## Create User Profiles

Create the latest profile for each user from the training and testing data, to serve as the base for later profile updates.

In [2]:
import pandas as pd

In [3]:
import sqlalchemy as db
from sqlalchemy import create_engine
import psycopg2

In [4]:
import os

# Connection string for the recommender database. It can be overridden through
# the DATABASE_URI environment variable so that credentials do not have to live
# in the notebook; the fallback is the original local Docker development setup.
# NOTE(review): the fallback still embeds a password -- rotate it and remove the
# default once every environment exports DATABASE_URI.
# The dialect is spelled "postgresql": the legacy "postgres" alias was removed
# in SQLAlchemy 1.4 and fails there with NoSuchModuleError.
DATABASE_URI = os.environ.get(
    'DATABASE_URI',
    'postgresql+psycopg2://postgres:power2020@host.docker.internal:5431/person_recommender',
)
engine = create_engine(DATABASE_URI)
# Metadata bound to the engine, so tables built on `meta` use it implicitly.
meta = db.MetaData(engine)

In [5]:
# Reflect the two encoded click tables; `autoload` reads their column
# definitions from the database instead of declaring them here.
training_enc, testing_enc = (
    db.Table(table_name, meta, autoload = True)
    for table_name in ("target_training_enc", "target_testing_enc")
)

### Create Table

In [6]:
from sqlalchemy import Column, Integer, String, DateTime, Float
from sqlalchemy import create_engine, MetaData, Table, inspect
from sqlalchemy.dialects import postgresql
import datetime

In [88]:
# Manual reset: uncomment to drop the `user_enc` table from the database.
# NOTE(review): `user_enc` is only defined by the Table(...) cell that follows,
# so on a fresh kernel this line cannot run at this position.
#user_enc.drop(engine)

In [7]:
# Schema of the user profile table, keyed by `userid` (one row per user).
user_enc = Table(
    'user_enc', meta,
    # info
    Column('userid', String, primary_key=True),
    Column('datum_click', DateTime),
    Column('clicked_before', postgresql.ARRAY(String)),
    # user
    Column('usermkt_enc', Integer),
    Column('anbieterid_enc_user', postgresql.ARRAY(Integer)),
    Column('anbietermarktplatz_enc_user', postgresql.ARRAY(Integer)),
    Column('warengruppe_enc_user', postgresql.ARRAY(Integer)),
    Column('text_vec_user', postgresql.ARRAY(Float)),
    Column('preis_std_user', Float),
    Column('minve_std_user', Float),
    Column('minve_log_std_user', Float),
    Column('preis_log_std_user', Float),
)
# checkfirst=True makes CREATE TABLE a no-op when `user_enc` already exists,
# so the cell no longer raises when the notebook is run against a database
# that was set up before. The engine is passed explicitly rather than relying
# on the metadata's implicit bind.
user_enc.create(bind=engine, checkfirst=True)

### Extract Max Representation for each User

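Test data first: for users that appear in both data sets, the profile from the test data is the one that is kept (training profiles are only added further below for users missing from the test data).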

In [8]:
# For every user in `target_testing_enc`, select the full row(s) whose
# `datum_click` equals that user's maximum `datum_click`: the inner query
# computes the per-user maximum and the join keeps the matching rows.
# Rows that tie on the maximum timestamp are all returned, so a user can
# appear more than once in the result.
query = """select t.*
from target_testing_enc as t
inner join (
    select tm.userid, max(tm.datum_click) as MaxDate
    from target_testing_enc as tm
    group by tm.userid
) tm on t.userid = tm.userid and t.datum_click = MaxDate"""

In [9]:
# Execute the query and load the result set into a DataFrame.
user_test = pd.read_sql(sql = query, con = engine)

In [10]:
# Preview the first five rows (five is the default for `head`).
user_test.head()

Unnamed: 0,index,datum_click,anbieter_artikelnummer,userid,clicked_before,pick,days_online_std,days_online_log_std,month_enc,anbietermarktplatz_enc,...,minve_log_std,usermkt_enc,anbieterid_enc_user,anbietermarktplatz_enc_user,warengruppe_enc_user,text_vec_user,preis_std_user,minve_std_user,preis_log_std_user,minve_log_std_user
0,3804,2019-06-29 02:28:14,00750627146305D,1677445,[],0.0,-0.152501,0.392636,6,4,...,0.196401,1,[],[],[],"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0,0.0,0.807366,-0.011449
1,179,2019-06-28 23:27:17,00511059GBV-9929BOLSA,777223,[],0.0,-0.870086,-3.604645,6,2,...,0.685555,1,[],[],[],"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0,0.0,0.807366,-0.011449
2,228,2019-06-28 23:29:02,005644296565333017,1656448,"[00477078FL433 MIX, 002274802310, 00739840SE10...",0.0,-0.65131,-0.393863,6,8,...,0.685555,5,"[31, 31, 31, 81, 132, 182, 205, 205, 205, 261,...","[1, 1, 1, 4, 1, 5, 2, 2, 2, 8, 8, 8, 8, 8, 8, ...","[71, 177, 177, 100, 187, 84, 156, 156, 156, 10...","[0.635829993796402, -0.163305881885546, 0.3139...",-0.065385,-0.002668,-0.086234,-0.236297
3,13120,2019-06-29 15:55:52,0034309252726,669575,[],0.0,-0.446536,0.043036,6,1,...,2.154267,3,[],[],[],"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0,0.0,0.807366,-0.011449
4,412,2019-06-28 23:37:58,0017115812.14.04,1627849,"[00750627160055, 0004335013732, 00217255090,29...",0.0,-0.353775,0.174284,6,6,...,-1.06371,7,"[3, 32, 76, 103, 103, 123, 124, 137, 177, 213,...","[1, 1, 1, 7, 7, 8, 1, 2, 1, 5, 4, 7, 8, 1, 1, ...","[120, 26, 53, 64, 32, 26, 140, 58, 140, 150, 2...","[0.00808942103758454, -0.347738123685122, -0.0...",-0.048112,0.000612,0.214271,0.035174


In [11]:
# Remove the columns that the `user_enc` table does not have (`pick` is
# dropped in the next cell). Same labels as before, listed alphabetically.
user_test.drop(
    columns = [
        'anbieter_artikelnummer', 'anbieterid_enc', 'anbietermarktplatz_enc',
        'days_online_log_std', 'days_online_std', 'index',
        'minve_log_std', 'minve_std', 'month_enc',
        'preis_log_std', 'preis_std', 'text_vec', 'warengruppe_enc',
    ],
    inplace = True,
)

In [12]:
# `pick` is not a column of `user_enc` either; remove it as well.
user_test.drop(columns = 'pick', inplace = True)

In [13]:
# Check the remaining columns on the first five rows.
user_test.head(n = 5)

Unnamed: 0,datum_click,userid,clicked_before,days_online_log_std,preis_log_std,minve_log_std,usermkt_enc,anbieterid_enc_user,anbietermarktplatz_enc_user,warengruppe_enc_user,text_vec_user,preis_std_user,minve_std_user,preis_log_std_user,minve_log_std_user
0,2019-06-29 02:28:14,1677445,[],0.392636,0.610464,0.196401,1,[],[],[],"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0,0.0,0.807366,-0.011449
1,2019-06-28 23:27:17,777223,[],-3.604645,-0.081139,0.685555,1,[],[],[],"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0,0.0,0.807366,-0.011449
2,2019-06-28 23:29:02,1656448,"[00477078FL433 MIX, 002274802310, 00739840SE10...",-0.393863,0.022744,0.685555,5,"[31, 31, 31, 81, 132, 182, 205, 205, 205, 261,...","[1, 1, 1, 4, 1, 5, 2, 2, 2, 8, 8, 8, 8, 8, 8, ...","[71, 177, 177, 100, 187, 84, 156, 156, 156, 10...","[0.635829993796402, -0.163305881885546, 0.3139...",-0.065385,-0.002668,-0.086234,-0.236297
3,2019-06-29 15:55:52,669575,[],0.043036,-1.423027,2.154267,3,[],[],[],"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0,0.0,0.807366,-0.011449
4,2019-06-28 23:37:58,1627849,"[00750627160055, 0004335013732, 00217255090,29...",0.174284,-0.017632,-1.06371,7,"[3, 32, 76, 103, 103, 123, 124, 137, 177, 213,...","[1, 1, 1, 7, 7, 8, 1, 2, 1, 5, 4, 7, 8, 1, 1, ...","[120, 26, 53, 64, 32, 26, 140, 58, 140, 150, 2...","[0.00808942103758454, -0.347738123685122, -0.0...",-0.048112,0.000612,0.214271,0.035174


In [14]:
# Number of rows returned by the query.
user_test.shape[0]

33921

In [15]:
# Number of distinct user ids; a smaller value than the row count means some
# users occur more than once. dropna=False counts a missing id like `unique()` does.
user_test['userid'].nunique(dropna = False)

33867

In [16]:
# Inspect one user id that occurs more than once in the result.
user_test.loc[user_test['userid'] == '1571223']

Unnamed: 0,datum_click,userid,clicked_before,days_online_log_std,preis_log_std,minve_log_std,usermkt_enc,anbieterid_enc_user,anbietermarktplatz_enc_user,warengruppe_enc_user,text_vec_user,preis_std_user,minve_std_user,preis_log_std_user,minve_log_std_user
744,2019-07-02 15:24:03,1571223,[],1.118607,-0.866595,1.175003,1,[],[],[],"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0,0.0,0.807366,-0.011449
777,2019-07-02 15:24:03,1571223,[],1.098187,-0.75731,1.175003,1,[],[],[],"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0,0.0,0.807366,-0.011449


In [17]:
# Keep a single row per user (the first occurrence), as `userid` is the
# primary key of `user_enc`.
user_test.drop_duplicates(subset = 'userid', keep = 'first', inplace = True)

In [23]:
# Final look at the frame before it is written to the database.
user_test.head(5)

Unnamed: 0,datum_click,userid,clicked_before,usermkt_enc,anbieterid_enc_user,anbietermarktplatz_enc_user,warengruppe_enc_user,text_vec_user,preis_std_user,minve_std_user,preis_log_std_user,minve_log_std_user
0,2019-06-29 02:28:14,1677445,[],1,[],[],[],"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0,0.0,0.807366,-0.011449
1,2019-06-28 23:27:17,777223,[],1,[],[],[],"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0,0.0,0.807366,-0.011449
2,2019-06-28 23:29:02,1656448,"[00477078FL433 MIX, 002274802310, 00739840SE10...",5,"[31, 31, 31, 81, 132, 182, 205, 205, 205, 261,...","[1, 1, 1, 4, 1, 5, 2, 2, 2, 8, 8, 8, 8, 8, 8, ...","[71, 177, 177, 100, 187, 84, 156, 156, 156, 10...","[0.635829993796402, -0.163305881885546, 0.3139...",-0.065385,-0.002668,-0.086234,-0.236297
3,2019-06-29 15:55:52,669575,[],3,[],[],[],"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0,0.0,0.807366,-0.011449
4,2019-06-28 23:37:58,1627849,"[00750627160055, 0004335013732, 00217255090,29...",7,"[3, 32, 76, 103, 103, 123, 124, 137, 177, 213,...","[1, 1, 1, 7, 7, 8, 1, 2, 1, 5, 4, 7, 8, 1, 1, ...","[120, 26, 53, 64, 32, 26, 140, 58, 140, 150, 2...","[0.00808942103758454, -0.347738123685122, -0.0...",-0.048112,0.000612,0.214271,0.035174


In [24]:
# Append the test-set profiles to the existing `user_enc` table. The DataFrame
# index is not written (`index = False` is the documented spelling of the
# previous `index = None`).
user_test.to_sql(name = 'user_enc', con = engine, if_exists = 'append', index = False)

Training data after: only users that did not get a profile from the test data are added.

In [25]:
# Same selection as for the test data, now on `target_training_enc`: for every
# user, the full row(s) whose `datum_click` equals that user's maximum
# `datum_click`. Ties on the maximum timestamp return several rows per user.
query = """select t.*
from target_training_enc as t
inner join (
    select tm.userid, max(tm.datum_click) as MaxDate
    from target_training_enc as tm
    group by tm.userid
) tm on t.userid = tm.userid and t.datum_click = MaxDate """

In [26]:
# Execute the training query and load the result set into a DataFrame.
user_train = pd.read_sql(sql = query, con = engine)

In [27]:
# Keep a single row per user (the first occurrence).
user_train.drop_duplicates(subset = 'userid', keep = 'first', inplace = True)

In [28]:
# Number of users with a training profile.
user_train.shape[0]

136965

In [29]:
# User ids that occur in the training data but not in the test data.
user_missing = set(user_train['userid']) - set(user_test['userid'])

In [30]:
# Number of users present in the training data but absent from the test data.
len(user_missing)

114630

In [31]:
# Training rows of the users without a test profile; copied so that the
# in-place edits below do not operate on a view of `user_train`.
user_train_fil = user_train.loc[user_train['userid'].isin(user_missing)].copy()

In [32]:
# Row count of the filtered frame, for comparison with the size of `user_missing`.
user_train_fil.shape[0]

114630

In [33]:
# Preview the first five rows of the filtered training frame.
user_train_fil.head(5)

Unnamed: 0,index,datum_click,anbieter_artikelnummer,userid,clicked_before,pick,days_online_std,days_online_log_std,month_enc,anbietermarktplatz_enc,...,minve_log_std,usermkt_enc,anbieterid_enc_user,anbietermarktplatz_enc_user,warengruppe_enc_user,text_vec_user,preis_std_user,minve_std_user,preis_log_std_user,minve_log_std_user
0,1551,2017-09-15 09:09:35,0041299094048,1580778,[],0.0,-0.836832,-1.618013,9,8,...,0.196401,8,[],[],[],"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0,0.0,0.807366,-0.011449
2,1601,2017-09-15 09:21:20,0034309263653,1546899,[],0.0,2.842107,1.484649,9,1,...,2.311914,7,[],[],[],"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0,0.0,0.807366,-0.011449
3,1694,2017-09-15 09:38:54,00511059BGV06525,1569657,[],0.0,0.10653,0.597235,9,2,...,0.067817,3,[],[],[],"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0,0.0,0.807366,-0.011449
4,2087,2017-09-15 10:39:14,00714655009208,1583997,[],0.0,-0.744071,-0.756934,9,7,...,0.685555,1,[],[],[],"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0,0.0,0.807366,-0.011449
5,2218,2017-09-15 10:51:23,0034309263983,829478,[],0.0,3.015377,1.514984,9,1,...,1.175003,2,[],[],[],"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0,0.0,0.807366,-0.011449


In [34]:
# Remove every column that the `user_enc` table does not have.
# Same labels as before, listed alphabetically.
user_train_fil.drop(
    columns = [
        'anbieter_artikelnummer', 'anbieterid_enc', 'anbietermarktplatz_enc',
        'days_online_log_std', 'days_online_std', 'index',
        'minve_log_std', 'minve_std', 'month_enc', 'pick',
        'preis_log_std', 'preis_std', 'text_vec', 'warengruppe_enc',
    ],
    inplace = True,
)

In [35]:
# Check the remaining columns on the first five rows.
user_train_fil.head(n = 5)

Unnamed: 0,datum_click,userid,clicked_before,usermkt_enc,anbieterid_enc_user,anbietermarktplatz_enc_user,warengruppe_enc_user,text_vec_user,preis_std_user,minve_std_user,preis_log_std_user,minve_log_std_user
0,2017-09-15 09:09:35,1580778,[],8,[],[],[],"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0,0.0,0.807366,-0.011449
2,2017-09-15 09:21:20,1546899,[],7,[],[],[],"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0,0.0,0.807366,-0.011449
3,2017-09-15 09:38:54,1569657,[],3,[],[],[],"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0,0.0,0.807366,-0.011449
4,2017-09-15 10:39:14,1583997,[],1,[],[],[],"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0,0.0,0.807366,-0.011449
5,2017-09-15 10:51:23,829478,[],2,[],[],[],"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0,0.0,0.807366,-0.011449


In [36]:
# Append the training-only profiles to `user_enc`. The DataFrame index is not
# written (`index = False` is the documented spelling of the previous `index = None`).
user_train_fil.to_sql(name = 'user_enc', con = engine, if_exists = 'append', index = False)

In [None]:
# Scratch input removed: the bare name `yes` is undefined and would raise
# NameError when the notebook is executed with Run All.