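# Training Data Statistics

Descriptive statistics for the click data in `target_training_enc`, excluding users listed in `training_user_filter`.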

In [23]:
import os

import pandas as pd
import psycopg2
import sqlalchemy
from matplotlib import pyplot as plt
from sqlalchemy import Column, Integer, String, DateTime, Float
from sqlalchemy import create_engine, MetaData, Table, inspect
from sqlalchemy.dialects import postgresql
from sqlalchemy.ext.declarative import declarative_base

In [4]:
# Database connection for every query in this notebook.
#
# * Credentials are no longer embedded in the notebook. Set DATABASE_URI to a
#   full SQLAlchemy URL, or keep the default below and supply the password via
#   libpq's standard mechanisms (PGPASSWORD environment variable or ~/.pgpass).
# * The dialect is spelled "postgresql": the legacy "postgres" alias was removed
#   in SQLAlchemy 1.4 and makes create_engine() raise NoSuchModuleError there.
DATABASE_URI = os.environ.get('DATABASE_URI') or (
    'postgresql+psycopg2://postgres@host.docker.internal:5431/person_recommender'
)
engine = create_engine(DATABASE_URI)

## Density

In [6]:
# Distinct user ids from target_training_enc that have no matching row in
# training_user_filter (anti-join expressed as LEFT JOIN ... IS NULL).
users = pd.read_sql(
    sql='''SELECT DISTINCT t.userid
            FROM    target_training_enc as t 
                    LEFT JOIN training_user_filter as f
                        ON t.userid = f.userid
            WHERE   f.userid IS NULL''',
    con=engine,
)

In [11]:
# Distinct article numbers (anbieter_artikelnummer) clicked by users that have
# no matching row in training_user_filter.
items = pd.read_sql(
    sql='''SELECT DISTINCT t.anbieter_artikelnummer
            FROM    target_training_enc as t 
                    LEFT JOIN training_user_filter as f
                        ON t.userid = f.userid
            WHERE   f.userid IS NULL''',
    con=engine,
)

In [12]:
# Total number of rows in target_training_enc whose user has no matching row
# in training_user_filter; returned as a one-row frame with a `count` column.
clicks = pd.read_sql(
    sql='''SELECT count(*)
            FROM    target_training_enc as t 
                    LEFT JOIN training_user_filter as f
                        ON t.userid = f.userid
            WHERE   f.userid IS NULL''',
    con=engine,
)

In [9]:
# Number of rows in the users frame (one row per user id).
users.shape[0]

117629

In [13]:
# Number of rows in the items frame (one row per article number).
items.shape[0]

647579

In [14]:
# Display the one-row frame holding the total click count.
clicks

Unnamed: 0,count
0,10019435


## Clicks per User

In [19]:
# Click count per user: one row per userid with a `count` column, restricted to
# users that have no matching row in training_user_filter.
# Note: this rebinds `users`, which an earlier cell used for the distinct-user list.
users = pd.read_sql(
    sql='''SELECT t.userid, count(*)
            FROM    target_training_enc as t 
                    LEFT JOIN training_user_filter as f
                        ON t.userid = f.userid
            WHERE   f.userid IS NULL GROUP BY t.userid''',
    con=engine,
)

In [27]:
# Users ordered from most to fewest clicks.
users.sort_values(by='count', ascending=False)

Unnamed: 0,userid,count
112189,864754,10845
91506,566204,10767
8851,1550835,10542
115186,888020,10176
116091,894451,9990
...,...,...
53803,1636239,1
65457,1652579,1
29877,1602275,1
18802,1585720,1


In [34]:
# Summary statistics of the clicks-per-user distribution.
users.loc[:, 'count'].describe()

count    117629.000000
mean         85.178272
std         334.455323
min           1.000000
25%           4.000000
50%          12.000000
75%          44.000000
max       10845.000000
Name: count, dtype: float64

In [42]:
# Fraction of users with at least 200 clicks: count the rows passing the
# threshold, then divide by the total number of users.
int((users['count'] >= 200).sum()) / len(users)

0.08226712800414863

## Clicks per Item

In [35]:
# Click count per article: one row per anbieter_artikelnummer with a `count`
# column, restricted to users that have no matching row in training_user_filter.
# Note: this rebinds `items`, which an earlier cell used for the distinct-item list.
items = pd.read_sql(
    sql='''SELECT t.anbieter_artikelnummer, count(*)
            FROM    target_training_enc as t 
                    LEFT JOIN training_user_filter as f
                        ON t.userid = f.userid
            WHERE   f.userid IS NULL GROUP BY t.anbieter_artikelnummer''',
    con=engine,
)

In [40]:
# Articles ordered from most to fewest clicks.
items.sort_values(by='count', ascending=False)

Unnamed: 0,anbieter_artikelnummer,count
229381,0034309263929,3161
226077,0034309244515,2595
224331,0034309216301,2506
224550,0034309218303,2386
331600,00477078FL433 MIX,2264
...,...,...
461760,00653973JARBITRE SS MEN 100,1
461775,00653973JARMIC LS MEN 100,1
461781,00653973JARRY SS LADY 100,1
146294,"00217255082,LT08",1


In [39]:
# Summary statistics of the clicks-per-article distribution.
items.loc[:, 'count'].describe()

count    647579.000000
mean         15.472143
std          34.682728
min           1.000000
25%           2.000000
50%           6.000000
75%          15.000000
max        3161.000000
Name: count, dtype: float64

## Pick rate

In [15]:
# Sum of the `pick` column over all rows whose user has no matching row in
# training_user_filter; returned as a one-row frame with a `sum` column.
picks = pd.read_sql(
    sql='''SELECT sum(t.pick)
            FROM    target_training_enc as t 
                    LEFT JOIN training_user_filter as f
                        ON t.userid = f.userid
            WHERE   f.userid IS NULL''',
    con=engine,
)

In [16]:
# Display the one-row frame holding the summed `pick` values.
picks

Unnamed: 0,sum
0,2861198.0


In [57]:
# Inspect every user_enc column for one hard-coded example user.
# NOTE(review): the rendered output contains per-user data; clear it before sharing.
pd.read_sql(
    sql="SELECT * from user_enc where userid = '1698332'",
    con=engine,
)

Unnamed: 0,userid,datum_click,clicked_before,usermkt_enc,anbieterid_enc_user,anbietermarktplatz_enc_user,warengruppe_enc_user,text_vec_user,preis_std_user,minve_std_user,minve_log_std_user,preis_log_std_user
0,1698332,2020-01-10 18:39:42,"[0072251517391, 00481348MB-10109, 0069462536.4...",6,"[264, 430, 430, 430, 430, 430, 430, 430, 430, ...","[8, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[221, 119, 204, 107, 88, 88, 88, 88, 189, 88, ...","[-0.120298618433514, -0.132995018749339, -0.32...",-0.043119,-0.007618,0.00038,0.357306


In [59]:
import pickle

In [60]:
# Load the two fitted preprocessing imputers from disk.
# The files are opened in `with` blocks so the handles are closed right after
# unpickling (the bare open(...) calls previously left them open).
# NOTE(review): pickle.load can execute arbitrary code contained in the file;
# only load artifacts that come from a trusted source.
with open("models/preprocessing/imputer_stueck.pkl", "rb") as stueck_file:
    imputer_stueck = pickle.load(stueck_file)
with open("models/preprocessing/imputer_preis.pkl", "rb") as preis_file:
    imputer_preis = pickle.load(preis_file)

In [63]:
# Probe the loaded imputer with a single missing value to see what it fills in.
# NOTE(review): presumably a fitted scikit-learn imputer — confirm against the training code.
imputer_stueck.transform([[None]])

array([[10.45257975]])

In [64]:
# Probe the loaded imputer with a single missing value to see what it fills in.
# NOTE(review): presumably a fitted scikit-learn imputer — confirm against the training code.
imputer_preis.transform([[None]])

array([[12.95303852]])