# Transform Item Features for Model

In [7]:
import pandas as pd
import numpy as np
import pickle 

## Postgres: Create Table item_enc 

In [8]:
import sqlalchemy 
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy import Column, Integer, String, DateTime, Float
import psycopg2
from sqlalchemy.dialects import postgresql

In [92]:
DATABASE_URI = 'postgres+psycopg2://postgres:*Klavier1@host.docker.internal:5432/recommender'
from sqlalchemy import create_engine, MetaData, Table, inspect
engine = create_engine(DATABASE_URI)

In [93]:
# Connect to already existing db table
meta = MetaData(engine)
item_vec = Table('item_enc', meta, autoload=True)

In [94]:
# if need to delete table
#item_enc.drop(engine)

Create table

In [95]:
meta = MetaData(engine)
item_enc = Table('item_enc', meta,
                 Column('anbieter_artikelnummer', String, primary_key=True),
                 Column('anbieterid_enc', Integer),
                 Column('anbietermarktplatz_enc', Integer),
                 Column('warengruppe_enc', Integer),
                 Column('text_vec', postgresql.ARRAY(Float)),
                 Column('preis_std', Float),
                 Column('minve_std', Float))
item_enc.create()

## Sqlite3: Create Table item_enc 

In [None]:
engine = sqlite3.connect('data/db.db')

In [None]:
c = engine.cursor()

In [None]:
c.execute('''CREATE TABLE item_enc
             ([anbieter_artikelnummer] text PRIMARY_KEY, [anbieterid_enc] integer, [anbietermarktplatz_enc] integer,
             [warengruppe_enc] integer, [text_vec] blob, [preis_std] real, [minve_std] real, [preis_log_std] real, [minve_log_std] real, [erstanlagedatum] datetime  )''')

## Import Item Data

In [79]:
item_data = pd.read_pickle('data/training_data/articles_info_final.pkl')

In [80]:
item_data[-50:]

Unnamed: 0,anbieter_artikelnummer,anbieterID,erstanlageDatum,stueck_pro_ve,anbietermarktplatz,warengruppe,preis_euro,text_vec
749885,00307346101524810,307346.0,2017-02-09 10:22:55+00:00,1.0,DE,Geschirr_5131,3.28,[]
749886,00307346101537022,307346.0,2016-03-11 16:04:41+00:00,1.0,DE,Küchenhelfer_5212,3.09,[]
749906,0030734623044177,307346.0,2018-01-08 07:11:41+00:00,1.0,DE,USB-Zubehör_5011,3.02,[]
750023,00307346101101532,307346.0,2017-02-08 08:08:17+00:00,1.0,DE,Sonstige_5999,73.26,[]
750024,00307346101101533,307346.0,2016-12-09 07:45:29+00:00,1.0,DE,Sonstige_5999,96.54,[]
750043,00089493DEXPO01839,89493.0,2012-07-17 00:30:00+00:00,1.0,PL,,1.580911,"[0.0824216, -0.01688075, -0.98014474, 0.410894..."
750044,00089493DEXPO01845,89493.0,2012-07-17 00:30:00+00:00,1.0,PL,,1.20799,"[0.0824216, -0.01688075, -0.98014474, 0.410894..."
750045,00089493DEXPO01846,89493.0,2012-07-17 00:30:00+00:00,1.0,PL,,1.20799,"[0.0824216, -0.01688075, -0.98014474, 0.410894..."
750047,0001017627059,10176.0,2019-05-15 07:54:56+00:00,1.0,DE,USB-Zubehör_5011,4.54,"[0.08316308, 0.818199, -0.5049601, -0.29932803..."
750048,00672060CD-21-211,672060.0,2016-06-24 16:12:45+00:00,1.0,ES,,4.38,"[0.1761132, 0.29577896, 0.03661161, 1.002632, ..."


In [12]:
len(item_data)

749032

In [13]:
sum(pd.isna(item_data.erstanlageDatum))

0

In [6]:
#item_data = item_data[:10000]

## Train and Save Imputer, Encoder, and StandardScaler

In [14]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
#from sklearn.preprocessing import OneHotEncoder 
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder

### Categorical Data

anbieterID, anbietermarktplatz, warengruppe

In [16]:
# anbieterID
label_enc_id = LabelEncoder()

#Calling methods on our OneHotEncoder object
label_enc_id.fit( item_data.anbieterID.values) 
pickle.dump(label_enc_id, open("data/models/preprocessing/label_anbieterID.pkl", "wb"))


In [17]:
# anbietermarktplatz
label_enc_mkt = LabelEncoder()

#Calling methods on our OneHotEncoder object
label_enc_mkt.fit( item_data.anbietermarktplatz.values) 
pickle.dump(label_enc_mkt, open("data/models/preprocessing/label_mkt.pkl", "wb"))


In [18]:
# warengruppe
item_data.loc[:,'warengruppe'] = item_data.warengruppe.fillna("NN")
label_enc_wg = LabelEncoder()

#Calling methods on our OneHotEncoder object
label_enc_wg.fit( item_data.warengruppe.values) 
pickle.dump(label_enc_wg, open("data/models/preprocessing/label_warengruppe.pkl", "wb"))


### Continouus Data

stueck_pro_ve, preis_euro - realized that i needed to delete more outliers, trained new transformers in 02_2

In [192]:
# stueck_pro_ve
imputer_stueck = SimpleImputer (strategy = "mean")

imputer_stueck.fit(item_data[['stueck_pro_ve']])#.values.reshape(-1.1))
pickle.dump(imputer_stueck, open("data/models/preprocessing/imputer_stueck.pkl", "wb"))
transformed_stueck = imputer_stueck.transform(item_data[['stueck_pro_ve']])

In [193]:
std_stueck = StandardScaler()

std_stueck.fit(transformed_stueck)
pickle.dump(std_stueck, open("data/models/preprocessing/scaler_stueck.pkl", "wb"))
#transformed_cont = std.transform(transformed_cont)

In [194]:
# preis_euro
imputer_preis = SimpleImputer (strategy = "mean")

imputer_preis.fit(item_data[['preis_euro']])#.values.reshape(-1.1))
pickle.dump(imputer_preis, open("data/models/preprocessing/imputer_preis.pkl", "wb"))
transformed_preis = imputer_preis.transform(item_data[['preis_euro']])

In [195]:
std_preis = StandardScaler()

std_preis.fit(transformed_preis)
pickle.dump(std_preis, open("data/models/preprocessing/scaler_preis.pkl", "wb"))
#transformed_cont = std.transform(transformed_cont)

## Pipeline

In [67]:
test = item_data[0:1000].reset

In [68]:
test

Unnamed: 0,anbieter_artikelnummer,anbieterID,erstanlageDatum,stueck_pro_ve,anbietermarktplatz,warengruppe,preis_euro,text_vec
0,0000400435550,4004.0,2016-03-04 14:39:47+00:00,12.0,DE,Reiseartikel_5199,2.10,"[0.7984229, 0.09205151, -0.30848747, 0.669923,..."
1,000040053900017-002,4005.0,2018-07-09 12:51:10+00:00,1.0,DE,sonstige Taschen_5202,59.95,"[0.19540575, -0.07194629, -0.34633487, -0.2459..."
2,0000403433309,4034.0,2018-11-08 13:04:42+00:00,12.0,DE,Kerzen & Kerzenhalter_5107,2.79,"[0.5111147, 0.018874569, 0.22680217, 0.7675127..."
3,0000403434036,4034.0,2016-11-29 15:31:47+00:00,20.0,DE,Lichterketten_5114,2.10,"[0.34992826, -0.059115075, -0.25212124, -0.110..."
4,00004034LB-03,4034.0,2017-11-15 14:27:01+00:00,10.0,DE,Lichterketten_5114,1.89,"[0.13192588, -0.19249268, -0.3639368, -0.47083..."
...,...,...,...,...,...,...,...,...
1013,0035042810056417,350428.0,2014-02-12 15:28:41+00:00,1.0,DE,Figuren & Skulpturen_5105,23.95,"[0.28069854, -1.3078563, -0.374014, -0.2468079..."
1014,0035042810056811,350428.0,2012-06-14 00:30:00+00:00,48.0,DE,Figuren & Skulpturen_5105,0.50,"[-0.109327234, -0.4787696, -0.18124016, 0.7672..."
1015,0035042810056982,350428.0,2017-07-31 09:51:02+00:00,8.0,DE,Figuren & Skulpturen_5105,1.85,"[-0.36003414, -0.84029263, -0.18818657, 1.0293..."
1016,0035042810080901,350428.0,2018-07-02 09:14:17+00:00,12.0,DE,Lampen _5112,0.99,"[-0.2594132, -0.6955608, -0.18439299, 0.632958..."


In [88]:
# load all transformers
# cat
label_enc_id = pickle.load( open( "data/models/preprocessing/label_anbieterID.pkl", "rb" ) )
label_enc_mkt = pickle.load( open( "data/models/preprocessing/label_mkt.pkl", "rb" ) )
label_enc_wg = pickle.load( open( "data/models/preprocessing/label_warengruppe.pkl", "rb" ) )
# cont
imputer_stueck = pickle.load( open( "data/models/preprocessing/imputer_stueck.pkl", "rb" ) )
imputer_preis = pickle.load( open( "data/models/preprocessing/imputer_preis.pkl", "rb" ) )
std_stueck = pickle.load( open( "data/models/preprocessing/scaler_stueck.pkl", "rb" ) )
std_preis = pickle.load( open( "data/models/preprocessing/scaler_preis.pkl", "rb" ) )


In [89]:
def transform_data(df):
    # fill empty warengruppe
    #warengruppe = df.warengruppe.fillna("NN")
    df1 = pd.DataFrame({'anbieter_artikelnummer': df.anbieter_artikelnummer})
    # impute and scale preis and stueck_ve
    df1["minVE_std"] = imputer_stueck.transform(df[['stueck_pro_ve']])
    df1["preis_std"] = imputer_preis.transform(df[['preis_euro']])
    df1["minVE_std"] = std_stueck.transform(df1[['minVE_std']])
    df1["preis_std"] = std_preis.transform(df1[['preis_std']])
    # Create one-hot-encodings anbieterID, anbietermarktplatz, warengruppe
    anbieterID = (label_enc_id.transform(df['anbieterID']) + 1).tolist()
    anbietermarktplatz = (label_enc_mkt.transform(df['anbietermarktplatz']) + 1).tolist()
    warengruppe = (label_enc_wg.transform(df.warengruppe.fillna("NN").values) + 1).tolist()
    text_vec_list = []
    # Change text_vec to list
    for n in range(len(df.text_vec)):
        try:
            ls = df.text_vec[n].tolist()
        except: 
            ls = [0] * 150
        finally:
            text_vec_list.append(ls)
    # combine data to second df
    df2 = pd.DataFrame({"anbieter_artikelnummer": df.anbieter_artikelnummer,
                        'text_vec': text_vec_list, 
                        "anbieterID_enc" : anbieterID, 
                        "anbietermarktplatz_enc": anbietermarktplatz, 
                        "warengruppe_enc": warengruppe})
    df = pd.merge(df1, df2, how = "left", on = "anbieter_artikelnummer")
    return(df)

In [90]:
transform_data(item_data[-50:].reset_index())

Unnamed: 0,anbieter_artikelnummer,minVE_std,preis_std,text_vec,anbieterID_enc,anbietermarktplatz_enc,warengruppe_enc
0,00307346101524810,-0.015527,-0.099236,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",165,1,60
1,00307346101537022,-0.015527,-0.101185,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",165,1,112
2,0030734623044177,-0.015527,-0.101903,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",165,1,213
3,00307346101101532,-0.015527,0.618689,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",165,1,189
4,00307346101101533,-0.015527,0.857519,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",165,1,189
5,00089493DEXPO01839,-0.015527,-0.116667,"[0.08242160081863403, -0.01688075065612793, -0...",56,8,140
6,00089493DEXPO01845,-0.015527,-0.120492,"[0.08242160081863403, -0.01688075065612793, -0...",56,8,140
7,00089493DEXPO01846,-0.015527,-0.120492,"[0.08242160081863403, -0.01688075065612793, -0...",56,8,140
8,0001017627059,-0.015527,-0.086309,"[0.08316308259963989, 0.8181989789009094, -0.5...",13,1,213
9,00672060CD-21-211,-0.015527,-0.087951,"[0.17611320316791534, 0.29577895998954773, 0.0...",407,2,140


## Transform and Save Data in DB

use batches in order to make manageable

In [96]:
batches_start = list(range(0, len(item_data), 50000))
batches_end = batches_start[1:]
batches_end.append(len(item_data))

for i,v in zip(batches_start, batches_end):
    transformed = transform_data(item_data[i:v].reset_index())
    # save data
    transformed.to_sql('item_enc', engine, index = False, if_exists = 'append') # later need to check first if already in DB

In [97]:
transformed

Unnamed: 0,anbieter_artikelnummer,minVE_std,preis_std,text_vec,anbieterID_enc,anbietermarktplatz_enc,warengruppe_enc
0,007626506GMB424TA,-0.007314,-0.093696,"[0.4767080247402191, -0.5929063558578491, -0.3...",535,4,37
1,007651155473,0.022253,-0.126217,"[-0.9195021390914917, 0.16394507884979248, 0.2...",536,1,41
2,00766429WL9007,-0.015527,-0.105699,"[-0.37057003378868103, 0.6140706539154053, -0....",537,2,1
3,00767595128385D,0.022253,-0.115958,"[-0.26641350984573364, -0.5146241188049316, -0...",540,4,32
4,00767595142733,0.022253,-0.115958,"[0.13977789878845215, -0.7546696066856384, -0....",540,4,32
...,...,...,...,...,...,...,...
49027,00005599DY013/M6/53,-0.015527,-0.062611,"[1.145871877670288, 0.2456214874982834, -0.272...",7,1,166
49028,005644296575234549,-0.002386,-0.125499,"[0.6510516405105591, -0.7516795992851257, 0.11...",317,8,186
49029,0075090114865606,-0.008956,-0.105186,"[-0.24635718762874603, -0.6695142388343811, -0...",518,1,14
49030,007626501722542,-0.013884,0.077937,"[0.7181375026702881, 0.12319134175777435, 0.28...",535,4,101


Test if extraction works - yes