# Transform Training Data for Modelling

This notebook transforms the training data to the format needed in training. It saves the final data to the data base

In [2]:
import pandas as pd
import numpy as np
import pickle 
from datetime import timedelta
from keras.preprocessing.sequence import pad_sequences

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [3]:
import sqlalchemy 
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy import Column, Integer, String, DateTime, Float
from sqlalchemy import create_engine, MetaData, Table, inspect
import psycopg2
from sqlalchemy.dialects import postgresql
import datetime
import sqlite3

In [29]:
#DATABASE_URI = ###
#engine = create_engine(DATABASE_URI)
engine = sqlite3.connect('data/db.db')
c = engine.cursor()

### Postgres: Create Table for Result

In [8]:
meta = MetaData(engine)
target_training_enc = Table('target_training_enc', meta,
                     # info
                 Column('index', Integer, primary_key=True),
                 Column('datum_click', DateTime),
                 Column('anbieter_artikelnummer', String),
                 Column('userid', String),
                 Column('clicked_before', postgresql.ARRAY(String)),
                     # target
                 Column('pick', Float),
                     # context
                 Column('days_online_std', Float),
                 Column('month_enc', Integer),
                     # item
                 Column('anbietermarktplatz_enc', Integer),
                 Column('anbieterid_enc', Integer),
                 Column('warengruppe_enc', Integer),
                 Column('text_vec', postgresql.ARRAY(Float)),
                 Column('preis_std', Float),
                 Column('minve_std', Float), 
                     # user
                 Column('usermkt_enc', Integer),
                 Column('anbieterid_enc_user', postgresql.ARRAY(Integer)),
                 Column('anbietermarktplatz_enc_user', postgresql.ARRAY(Integer)),
                 Column('warengruppe_enc_user', postgresql.ARRAY(Integer)),
                 Column('text_vec_user', postgresql.ARRAY(Float)),
                 Column('preis_std_user', Float),
                 Column('minve_std_user', Float))
target_training_enc.create()

In [30]:
meta = MetaData(engine)
item_enc = Table('item_enc', meta, autoload=True)
target_training_enc = Table('target_training_enc', meta, autoload=True)

### Sqlite3: Create Table for Results

In [None]:
c.execute('''CREATE TABLE target_training_enc
             ([index] integer PRIMARY KEY, [datum_click] datetime, [anbieter_artikelnummer] text, 
             [userid] text, [clicked_before] blob, [pick] integer, [days_online_std] real, [month_enc] integer, 
             [anbietermarktplatz_enc] integer, [anbieterid_enc] integer, [warengruppe_enc] integer, [text_vec] blob, 
             [preis_std] real, [minve_std] real, [usermkt_enc] integer, [anbieterid_enc_user] blob, 
             [anbietermarktplatz_enc_user] blob, [warengruppe_enc_user] blob, [text_vec_user] blob, [preis_std_user] real, 
             [minve_std_user] real, [days_online_log_std] real, [preis_log_std] real, [preis_log_std_user] real, 
             [minve_log_std] real, [minve_log_std_user] real)''')

### Train and save scaler for days online

In [8]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

In [14]:
std_days_online = StandardScaler()

std_days_online.fit( data.days_online.values.reshape(-1,1))
pickle.dump(std_days_online, open("data/models/preprocessing/scaler_days_online.pkl", "wb"))
#transformed_cont = std.transform(transformed_cont)

# Full Pipeline

Run in batches and save data to db

In [5]:
data = pd.read_pickle('data/training_data/target_train_done.pkl')

In [18]:
len(data)

11264404

In [26]:
data_new = data.reset_index()

In [27]:
data_new.head()

Unnamed: 0,index,userID,anbieter_artikelnummer,datum_click,pick,days_online,month,erstRegMarktplatz
0,7150000,871340,00373548RL-CUT24_2,2018-11-14 12:55:25+00:00,0.0,193.0,11,ES
1,7150001,879456,0001017612096,2018-11-14 12:55:27+00:00,0.0,197.0,11,NL
2,7150002,1623689,0034309260767,2018-11-14 12:55:28+00:00,1.0,2646.0,11,FR
3,7150003,1625292,00477078EM12,2018-11-14 12:55:30+00:00,0.0,54.0,11,EU
4,7150004,1601824,00690052k858BRR,2018-11-14 12:55:32+00:00,1.0,40.0,11,FR


In [28]:
len(data_new)

4114404

Funtions to 
- encode the data in log
- extract the item from db
- extract item data to previous clicks from db

In [31]:
def transform_log(df):
    df1 = pd.DataFrame({"index" : df['index'], 
                        'datum_click' : df.datum_click, 
                        'anbieter_artikelnummer': df.anbieter_artikelnummer,
                       'userID' : df.userID, 
                      'month_enc' : df.month})
    df1['days_online_std'] = std_days_online.transform(df[['days_online']])
    #df1['month_enc'] = (label_enc_month.transform(df['month']) + 1).tolist()
    df1['userMkt_enc'] = (label_enc_mkt.transform(df.erstRegMarktplatz.fillna("EU").values) + 1 ).tolist() # not doo in real, already in userTable
    df1["pick"] = df.pick
    return(df1)

def get_item_and_user (df):
    # item
    item =  pd.read_sql("SELECT * from item_enc WHERE anbieter_artikelnummer = %s", engine , params = (df.anbieter_artikelnummer,))
    #item = item.fillnan(value = 0)
    df_item = pd.merge(df, item, how = 'left', on = 'anbieter_artikelnummer')
    df_item = df_item.dropna() # filter all rows without item-embedding
    # user data based on last clicks 
    # usually first need to get the user data (MKT & last clicked from db)
    if len(df.clicked_before.values[0]) > 0:
        items_clicked = engine.execute(sqlalchemy.select([item_enc]).where(item_enc.c.anbieter_artikelnummer.in_(df.clicked_before.values[0])))
        # reformat item data of user
        result_clicked = []
        for row in items_clicked:
            result_clicked.append(row)
        user_detail = pd.DataFrame(result_clicked)
        if len(user_detail) > 0:
            user_detail.columns = items_clicked.keys()
            # make list
            anbieterID_enc = user_detail.anbieterID_enc.values.tolist()
            anbietermarktplatz_enc = user_detail.anbietermarktplatz_enc.values.tolist()
            warengruppe_enc = user_detail.warengruppe_enc.values.tolist()
            text_vec = np.array((user_detail.text_vec).values.tolist()[-50:]).mean(axis = 0).tolist() ## only use last 50 !
            preis_std = np.array((user_detail.preis_std).values.tolist()).mean(axis = 0)
            minVE_std = np.array((user_detail.minVE_std).values.tolist()).mean(axis = 0)
            user = pd.DataFrame({'userID' : df.userID, 
                             'anbieterID_enc' : [anbieterID_enc], 
                             'anbietermarktplatz_enc' : [anbietermarktplatz_enc], 
                             'warengruppe_enc' : [warengruppe_enc], 
                             'text_vec' : [text_vec],
                             'preis_std' : preis_std, 
                             'minVE_std' : minVE_std})
        else:
            user = pd.DataFrame({'userID' : df.userID, 
                             'anbieterID_enc' : [[]], 
                             'anbietermarktplatz_enc' : [[]], 
                             'warengruppe_enc' : [[]], 
                             'text_vec' : [[0] * 150],
                             'preis_std' : 0, 
                             'minVE_std' : 0})
    else:
        user = pd.DataFrame({'userID' : df.userID, 
                         'anbieterID_enc' : [[]], 
                         'anbietermarktplatz_enc' : [[]], 
                         'warengruppe_enc' : [[]], 
                         'text_vec' : [[0] * 150],
                         'preis_std' : 0, 
                         'minVE_std' : 0})
        
    df_return = pd.merge(df_item, user, how = "left", on = "userID", suffixes = ("", "_user"))
    return (df_return)

### Batch processing: Apply funtions to log and save data to db

In [None]:
now = datetime.datetime.now()
print ("Start time : ")
print (now.strftime("%Y-%m-%d %H:%M:%S"))
label_enc_mkt = pickle.load( open( "data/models/preprocessing/label_mkt.pkl", "rb" ) )
label_enc_month = pickle.load (open( "data/models/preprocessing/label_month.pkl", "rb" ) )
std_days_online = pickle.load (open( "data/models/preprocessing/scaler_days_online.pkl", "rb" ) )

batches_start = list(range(0, len(data_new), 50000)) # data
batches_end = batches_start[1:]
batches_end.append(len(data_new)) # data


for i,v in zip(batches_start, batches_end):
    
    # Transform log data
    data_tr = transform_log(data_new[i:v]).reset_index(drop = True)
    
    # Create user_clicks
    clicked_before = []
    for r in range(len(data_tr)):
        clicked = data[(data.userID == data_tr.userID[r]) & (data.datum_click < (data_tr.datum_click[r] - timedelta(1)))]
        clicked_before.append(clicked.anbieter_artikelnummer.values.tolist()[-200:])
    data_tr['clicked_before'] = clicked_before
    
    # Get user & item infos 
    list_df =[]
    for n in range(len(data_tr)):
        transformed = get_item_and_user(data_tr[n:n+1]) # data_tr
        list_df.append(transformed)
    final_df = pd.concat(list_df, sort = True)
    final_df.to_sql('target_training_enc', engine, index = False, if_exists = 'append')
    now = datetime.datetime.now()
    print ("End time : ", v)
    print (now.strftime("%Y-%m-%d %H:%M:%S"))

Start time : 
2019-11-21 08:15:27
End time :  50000
2019-11-21 09:29:58
End time :  100000
2019-11-21 10:45:52
End time :  150000
2019-11-21 12:02:26
End time :  200000
2019-11-21 13:17:49
End time :  250000
2019-11-21 14:34:42
End time :  300000
2019-11-21 15:51:37
End time :  350000
2019-11-21 17:08:38
End time :  400000
2019-11-21 18:22:50
End time :  450000
2019-11-21 19:38:02
End time :  500000
2019-11-21 20:52:34
End time :  550000
2019-11-21 22:09:09
End time :  600000
2019-11-21 23:24:05
End time :  650000
2019-11-22 00:40:53
End time :  700000
2019-11-22 01:56:05
End time :  750000
2019-11-22 03:12:13
End time :  800000
2019-11-22 04:26:24
End time :  850000
2019-11-22 05:42:12
End time :  900000
2019-11-22 06:57:40
End time :  950000
2019-11-22 08:13:17
End time :  1000000
2019-11-22 09:32:01
End time :  1050000
2019-11-22 10:50:03
End time :  1100000
2019-11-22 12:07:24
End time :  1150000
2019-11-22 13:25:48
End time :  1200000
2019-11-22 14:43:41
End time :  1250000
2019-1