# Statistics Final Training Data

The outputs shown below reflect the real (full) data distribution; re-running this notebook will instead show the distributions of the sample data set.

In [1]:
import pandas as pd
import sqlalchemy 
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy import Column, Integer, String, DateTime, Float
from sqlalchemy import create_engine, MetaData, Table, inspect
import psycopg2
from sqlalchemy.dialects import postgresql
from matplotlib import pyplot as plt 
import sqlite3

In [2]:
# Disabled alternative connection: builds a SQLAlchemy engine from a database URI
# (the URI itself is redacted). NOTE(review): presumably this pointed at the full
# database that produced the stored outputs - confirm before re-enabling.
#DATABASE_URI = ###
#engine = create_engine(DATABASE_URI)

In [None]:
# Open the local SQLite database file; every query below reads through this
# connection (kept under the name `engine` so the query cells work unchanged).
engine = sqlite3.connect(database='data/db.db')

## Density

In [3]:
# Distinct user ids in the training targets. The LEFT JOIN combined with
# `f.userid IS NULL` is an anti-join: it keeps only rows whose userid has no
# match in training_user_filter.
users = pd.read_sql(
    sql='''SELECT DISTINCT t.userid
            FROM    target_training_enc as t 
                    LEFT JOIN training_user_filter as f
                        ON t.userid = f.userid
            WHERE   f.userid IS NULL''',
    con=engine,
)

In [4]:
# Distinct item numbers (anbieter_artikelnummer) in the training targets,
# restricted to rows whose userid has no match in training_user_filter.
items = pd.read_sql(
    sql='''SELECT DISTINCT t.anbieter_artikelnummer
            FROM    target_training_enc as t 
                    LEFT JOIN training_user_filter as f
                        ON t.userid = f.userid
            WHERE   f.userid IS NULL''',
    con=engine,
)

In [5]:
# Total number of rows in the training targets after the same anti-join
# against training_user_filter; returned as a one-row, one-column frame.
clicks = pd.read_sql(
    sql='''SELECT count(*)
            FROM    target_training_enc as t 
                    LEFT JOIN training_user_filter as f
                        ON t.userid = f.userid
            WHERE   f.userid IS NULL''',
    con=engine,
)

In [6]:
# Row count of the distinct-user frame, i.e. the number of users.
users.shape[0]

117629

In [7]:
# Row count of the distinct-item frame, i.e. the number of items.
items.shape[0]

647579

In [8]:
# Display the one-row frame holding the total row count of the filtered targets.
clicks

Unnamed: 0,count
0,10019435


## Clicks per User

In [9]:
# Per-user row counts: one row per userid with the number of target rows it has,
# again excluding userids that appear in training_user_filter.
# Note: this rebinds `users`, replacing the distinct-user frame loaded earlier.
users = pd.read_sql(
    sql='''SELECT t.userid, count(*)
            FROM    target_training_enc as t 
                    LEFT JOIN training_user_filter as f
                        ON t.userid = f.userid
            WHERE   f.userid IS NULL GROUP BY t.userid''',
    con=engine,
)

In [10]:
# The aggregate column is not aliased in the query, so its label depends on the
# database backend; assign explicit labels before using it by name.
users.columns = [
    'userid',
    'count',
]

In [11]:
# Ten users with the highest counts, largest first.
users.sort_values(by='count', ascending=False).iloc[:10]

Unnamed: 0,userid,count
112189,864754,10845
91506,566204,10767
8851,1550835,10542
115186,888020,10176
116091,894451,9990
40245,1616603,9635
29578,1601824,9485
108985,835024,9404
44307,1622638,9313
93956,621510,9120


In [12]:
# Summary statistics (count, mean, std, quartiles, extremes) of the per-user counts.
users.loc[:, 'count'].describe()

count    117629.000000
mean         85.178272
std         334.455323
min           1.000000
25%           4.000000
50%          12.000000
75%          44.000000
max       10845.000000
Name: count, dtype: float64

Fraction of users with at least 200 clicks

In [13]:
# Share of users whose count is 200 or more (a fraction in [0, 1], not a percentage).
users.loc[users['count'] >= 200].shape[0] / users.shape[0]

0.08226712800414863

## Clicks per Item

In [14]:
# Per-item row counts: one row per anbieter_artikelnummer with the number of
# target rows it has, excluding userids that appear in training_user_filter.
# Note: this rebinds `items`, replacing the distinct-item frame loaded earlier.
items = pd.read_sql(
    sql='''SELECT t.anbieter_artikelnummer, count(*)
            FROM    target_training_enc as t 
                    LEFT JOIN training_user_filter as f
                        ON t.userid = f.userid
            WHERE   f.userid IS NULL GROUP BY t.anbieter_artikelnummer''',
    con=engine,
)

In [15]:
# The aggregate column is not aliased in the query, so its label depends on the
# database backend; assign explicit labels before using it by name.
items.columns = [
    'anbieter_artikelnummer',
    'count',
]

In [16]:
# Ten items with the highest counts, largest first.
items.sort_values(by='count', ascending=False).iloc[:10]

Unnamed: 0,anbieter_artikelnummer,count
229381,0034309263929,3161
226077,0034309244515,2595
224331,0034309216301,2506
224550,0034309218303,2386
331600,00477078FL433 MIX,2264
223915,003430921300,2239
545632,00714655011467,2066
326971,004770784014 MIX,2004
327715,00477078A1940 MIX,1927
542805,00714655004196,1809


In [17]:
# Summary statistics (count, mean, std, quartiles, extremes) of the per-item counts.
items.loc[:, 'count'].describe()

count    647579.000000
mean         15.472143
std          34.682728
min           1.000000
25%           2.000000
50%           6.000000
75%          15.000000
max        3161.000000
Name: count, dtype: float64

## Pick-Ratio

In [18]:
# Sum of the `pick` column over the filtered training targets (same anti-join
# against training_user_filter); returned as a one-row, one-column frame.
picks = pd.read_sql(
    sql='''SELECT sum(t.pick)
            FROM    target_training_enc as t 
                    LEFT JOIN training_user_filter as f
                        ON t.userid = f.userid
            WHERE   f.userid IS NULL''',
    con=engine,
)

In [19]:
# Display the one-row frame holding the summed `pick` column.
picks

Unnamed: 0,sum
0,2861198.0


Pick-Ratio: total picks divided by total clicks

In [20]:
# Both frames hold a single scalar in their first cell; divide the pick sum by
# the total row count to get the pick ratio.
picks.to_numpy()[0, 0] / clicks.to_numpy()[0, 0]

0.2855648048018676

In [21]:
# Spot check: show the user_enc row(s) stored for one specific userid.
pd.read_sql(
    sql="SELECT * from user_enc where userid = '1698332'",
    con=engine,
)

Unnamed: 0,userid,datum_click,clicked_before,usermkt_enc,anbieterid_enc_user,anbietermarktplatz_enc_user,warengruppe_enc_user,text_vec_user,preis_std_user,minve_std_user,minve_log_std_user,preis_log_std_user
0,1698332,2020-03-17 15:01:41,"[00714655012517, 00714655012272, 0071465501214...",6,"[103, 213, 264, 301, 430, 430, 430, 430, 430, ...","[7, 5, 8, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[139, 96, 221, 105, 119, 204, 107, 88, 88, 88,...","[-0.0835299999347297, -0.139140869329946, -0.4...",-0.030964,-0.006157,-0.021344,0.346582


In [22]:
import pickle

In [23]:
# Load the two pickled preprocessing objects from disk.
# NOTE(review): presumably fitted imputers for a quantity ("stueck") and a price
# ("preis") feature - confirm against the preprocessing code that wrote them.
#
# SECURITY: pickle.load can execute arbitrary code while unpickling; only load
# files that come from a trusted source.
#
# The files are opened in `with` blocks so the handles are closed immediately
# after loading instead of being left open until garbage collection.
with open("data/models/preprocessing/imputer_stueck.pkl", "rb") as stueck_file:
    imputer_stueck = pickle.load(stueck_file)

with open("data/models/preprocessing/imputer_preis.pkl", "rb") as preis_file:
    imputer_preis = pickle.load(preis_file)

In [24]:
# Transform a single row whose only value is missing (None).
# NOTE(review): presumably the result is the fill value this imputer substitutes
# for missing entries - confirm against the imputer's type and strategy.
imputer_stueck.transform([[None]])

array([[10.45257975]])

In [25]:
# Transform a single row whose only value is missing (None).
# NOTE(review): presumably the result is the fill value this imputer substitutes
# for missing entries - confirm against the imputer's type and strategy.
imputer_preis.transform([[None]])

array([[12.95303852]])