# Create Target Table

- join clicks and pick table in order to receive customer "rating" feedback concerning a product
- all items that are picked by the user = added to the shopping cart, are rated positive, the others negative
- divide data in training and testing data

In [1]:
import pandas as pd 
import numpy as np 
import pickle
import feather

# Clicks

A click is measured when a user clicks on a product on a listing page - it leads the user to the product detail page

In [4]:
clicks = feather.read_dataframe("data/raw_data/artikelklicks_current.feather")

In [5]:
clicks.head()

Unnamed: 0,userID,datum,artikelID,anbieterID,marktplatz,artikelnummer,anbieter_artikelnummer
0,1575009,2017-09-15 00:00:03+00:00,15649375,144119,IT,AHQ1067,00144119AHQ1067
1,804304,2017-09-15 00:00:04+00:00,15132316,714655,FR,006146,00714655006146
2,1563645,2017-09-15 00:00:07+00:00,14577286,682864,PL,,
3,1575009,2017-09-15 00:00:17+00:00,12947166,144119,IT,EP0157,00144119EP0157
4,498808,2017-09-15 00:00:19+00:00,10633528,513388,DE,D712 N,00513388D712 N


In [28]:
len(clicks) 

22182169

In [42]:
max(clicks.datum)

Timestamp('2019-09-16 04:01:01+0000', tz='UTC')

In [30]:
len(clicks.userID.unique())

150851

get unique combinations of userID and artikelID

In [31]:
len(clicks.groupby(['userID','artikelID']).size().reset_index())


14244393

get unique combinations of userID and anbieter_artikelnummer - a little more general than artikelID

In [32]:
len(clicks.groupby(['userID','anbieter_artikelnummer']).size().reset_index())


13269364

In [6]:
clicks.datum = pd.to_datetime(clicks.datum)

Only include each item and user combination once during training

In [7]:
clicks_unique = clicks.groupby(['userID','anbieter_artikelnummer'])['datum', 'anbieterID'].min().reset_index()

In [8]:
clicks_unique.head()

Unnamed: 0,userID,anbieter_artikelnummer,datum,anbieterID
0,0,,2018-07-03 15:25:52+00:00,10176
1,0,0000400435550,2018-10-28 15:52:32+00:00,4004
2,0,000040053900017-002,2019-04-21 17:59:46+00:00,4005
3,0,0000403433309,2018-11-08 13:10:58+00:00,4034
4,0,0000403434036,2018-10-22 20:39:31+00:00,4034


# Picks

a pick is the target variable of this model - it signals when the user adds a product to the shopping cart

In [9]:
picks = pd.read_csv("raw_data/artikelpicks.csv", sep = ";", encoding = "unicode_escape")
picks.head()

Unnamed: 0.1,Unnamed: 0,userID,datum,artikelID,anbieterID,marktplatz,artikelnummer,anbieter_artikelnummer
0,1,1575009,2017-09-15 02:00:00,15649375,144119,IT,AHQ1067,00144119AHQ1067
1,2,804304,2017-09-15 02:00:00,15132316,714655,FR,006146,00714655006146
2,3,1575009,2017-09-15 02:00:00,12947166,144119,IT,EP0157,00144119EP0157
3,4,1583984,2017-09-15 02:00:00,11202369,639793,FR,RT048,00639793RT048
4,5,1575009,2017-09-15 02:00:00,13213179,144119,IT,NH0373,00144119NH0373


In [10]:
len(picks)

4159338

In [44]:
max(picks.datum)

Timestamp('2019-09-16 02:00:00')

In [13]:
# number of unique user and item combinations!
len(picks.groupby(['userID','anbieter_artikelnummer']).size().reset_index())


3369708

In [10]:
picks.datum = pd.to_datetime(picks.datum, utc = True)


In [12]:
picks.head()

Unnamed: 0.1,Unnamed: 0,userID,datum,artikelID,anbieterID,marktplatz,artikelnummer,anbieter_artikelnummer
0,1,1575009,2017-09-15 02:00:00+00:00,15649375,144119,IT,AHQ1067,00144119AHQ1067
1,2,804304,2017-09-15 02:00:00+00:00,15132316,714655,FR,006146,00714655006146
2,3,1575009,2017-09-15 02:00:00+00:00,12947166,144119,IT,EP0157,00144119EP0157
3,4,1583984,2017-09-15 02:00:00+00:00,11202369,639793,FR,RT048,00639793RT048
4,5,1575009,2017-09-15 02:00:00+00:00,13213179,144119,IT,NH0373,00144119NH0373


In [53]:
len(picks.groupby(['userID','anbieter_artikelnummer', 'date'])['datum'].min().reset_index())

3898586

picks summarized on user and anbieter_artikelnummer - ready for outer-join with clicks

In [11]:
picks_unique = picks.groupby(['userID','anbieter_artikelnummer'])['datum', 'anbieterID'].min().reset_index()
picks_unique['pick'] = 1

In [12]:
print(len(picks_unique))
picks_unique.head()

3369708


Unnamed: 0,userID,anbieter_artikelnummer,datum,anbieterID,pick
0,0,000040051900052-007,2018-09-10 02:00:00+00:00,4005,1
1,0,000040053600123-001,2019-07-24 02:00:00+00:00,4005,1
2,0,000040053600123-003,2019-07-24 02:00:00+00:00,4005,1
3,0,000040053600123-004,2019-07-24 02:00:00+00:00,4005,1
4,0,000040053600124-002,2019-07-24 02:00:00+00:00,4005,1


# Merge clicks and picks as outer join on userID, anbieter_artikelnummer

In [27]:
len(clicks_unique)

13269364

In [28]:
len(picks_unique)

3369708

In [29]:
target = pd.merge(clicks_unique, picks_unique, how = "outer", on = ["userID", "anbieter_artikelnummer", "anbieterID"], suffixes = ("_click", "_pick")) 

In [6]:
(target.sample(100))

Unnamed: 0,userID,anbieter_artikelnummer,datum_click,anbieterID,datum_pick,pick
9387551,1621499,00757137WPEH3000,2018-07-27 23:20:42+00:00,757137,NaT,0.0
11613690,1651303,0022024520306,2019-01-24 01:46:32+00:00,220245,NaT,0.0
7233788,1593139,00043137YPM_404C_N,2019-07-22 23:05:52+00:00,43137,NaT,0.0
4840248,1543713,00382446SHQ1516,2018-08-15 19:54:40+00:00,382446,2018-08-15 02:00:00+00:00,1.0
6767215,1586861,00694944DK1731,2017-10-10 15:51:32+00:00,694944,NaT,0.0
...,...,...,...,...,...,...
8806909,1612603,00741889BDC-03,2018-07-15 22:05:13+00:00,741889,2018-07-15 02:00:00+00:00,1.0
5391766,1559225,00282370112788,2018-02-07 03:45:24+00:00,282370,NaT,0.0
492873,466985,00382446QEL673375,2019-08-16 23:09:53+00:00,382446,2019-08-16 02:00:00+00:00,1.0
174664,243292,0044709935976/OSB-17131,2019-06-13 20:41:10+00:00,447099,NaT,0.0


In [31]:
target.pick.sum() # works !

3369708.0

Fill NAs: 
- all picks that are not 1 need to be 0
- all rows that do not have a click date need to fill the click that with pick date (some items were picked without looking on the item detail page)


In [32]:
target.fillna(value = {'pick': 0}, inplace = True)
target.loc[pd.isnull(target.datum_click) == True, 'datum_click'] = target.datum_pick
#target.loc[target.datum_click == 0, 'datum_click'] = target.datum_pick
target.sample(100)

Unnamed: 0,userID,anbieter_artikelnummer,datum_click,anbieterID,datum_pick,pick
11146958,1642242,00694944KC2319,2019-02-25 17:23:32+00:00,694944,NaT,0.0
12517124,1663737,0044709944252/SN47198,2019-05-12 21:04:20+00:00,447099,NaT,0.0
9895208,1625470,007297418939,2018-08-17 21:33:37+00:00,729741,NaT,0.0
294946,355831,00745923554724-104,2018-08-02 15:15:57+00:00,745923,NaT,0.0
9430293,1622410,00665748600316,2018-11-03 17:48:55+00:00,665748,2018-11-03 01:00:00+00:00,1.0
...,...,...,...,...,...,...
54101,75493,00756467262683,2019-07-04 12:18:28+00:00,756467,2019-07-04 02:00:00+00:00,1.0
10155664,1627948,0043590002_uomo-0825,2018-09-02 02:28:18+00:00,435900,NaT,0.0
74742,125549,007651155590,2018-11-20 21:47:08+00:00,765115,NaT,0.0
5663609,1562280,00624991APS-380-0587,2018-08-14 20:33:19+00:00,624991,2018-08-14 02:00:00+00:00,1.0


In [7]:
target.to_pickle("data/processed_data/target_data_all.pkl")

# Analyse Target Data

How many differnt articles, users, how often do they appear

In [3]:
target = pd.read_pickle("data/processed_data/target_data_all.pkl")

In [30]:
len(target)

13289785

In [8]:
target_top = target.head(1000)

In [5]:
# Users for training
target['userID'].nunique()


151021

In [40]:
# Articles for training
target['anbieter_artikelnummer'].nunique()


756614

In [41]:
# number of positive feedback
target.pick.sum()

3369708.0

In [42]:
# pick-ratio
target.pick.sum()/len(target)

0.2535562463952577

## Users

find ouliers and filter training data

In [10]:
target_summary_user = target.groupby(['userID'])['datum_pick', 'pick'].count().reset_index()

In [11]:
target_summary_user.columns = ['userID', 'picks', 'clicks']
target_summary_user.describe()

Unnamed: 0,userID,picks,clicks
count,151021.0,151021.0,151021.0
mean,1328137.0,22.312844,87.999583
std,458242.6,159.549461,1450.062766
min,0.0,0.0,1.0
25%,854914.0,0.0,3.0
50%,1595791.0,0.0,9.0
75%,1642487.0,4.0,36.0
max,1689091.0,13028.0,271289.0


In [50]:
# users with only one click
(sum(target_summary_user.clicks ==1))

21105

14% of users only made one click

In [51]:
(sum(target_summary_user.clicks ==1))/len(target_summary_user)

0.13974877666019958

In [29]:
target_summary_user.sort_values(by=['picks'], ascending = False).head(10)

Unnamed: 0,userID,picks,clicks
63714,1576169,13028,19094
59266,1562316,10795,13726
45291,902588,10709,17566
74260,1593360,10487,19381
75308,1595551,9912,14003
82220,1604194,8479,18767
16225,621510,8107,9257
86569,1609410,7849,9125
43923,894451,7843,11008
98920,1624850,7545,8609


Many of these must be hackers that are scraping our website... no one clicks this much an never picks

In [None]:
hackers = target_summary_user.sort_values(by=['clicks'], ascending = False).head(50)
hackers

should filter out - only top 9 is suspicious

### Clean and Filter Users

In [22]:
# look at obvious hackers = thousands of clicks - no picks... not usuall behavior 
hackers_only = hackers[hackers['picks']==0]

In [26]:
userIDs_filter = hackers_only.userID.tolist()

In [28]:
target_filtered = target[~target.userID.isin(userIDs_filter)]

In [29]:
len(target_filtered) #13289785 before

12573323

## Articles

In [32]:
target_summary_article = target.groupby(['anbieter_artikelnummer'])['datum_pick', 'pick'].count().reset_index()

In [33]:
target_summary_article.columns = ['anbieter_artikelnummer', 'picks', 'clicks']
target_summary_article.describe()

Unnamed: 0,picks,clicks
count,756614.0,756614.0
mean,4.453669,17.564815
std,12.158631,75.573758
min,0.0,1.0
25%,0.0,3.0
50%,1.0,7.0
75%,4.0,18.0
max,1147.0,57328.0


In [34]:
(sum(target_summary_article.clicks ==1))

91878

In [35]:
(sum(target_summary_article.clicks ==1))/len(target_summary_article)

0.12143312177675802

In [46]:
target_summary_article.sort_values(by=['clicks'], ascending = False).head(10)

Unnamed: 0,anbieter_artikelnummer,picks,clicks
0,,0,57328
263299,0034309263929,1070,3697
259787,0034309244515,1147,2935
257897,0034309216301,809,2796
257421,003430921300,1047,2529
380655,00477078FL433 MIX,628,2504
258119,0034309218303,725,2487
628700,00714655011467,440,2468
375451,004770784014 MIX,606,2141
260433,0034309248681,811,2116


In [42]:
useless_articles = ['']

### Clean and filter articles

In [48]:
target_filtered = target_filtered[~target_filtered.anbieter_artikelnummer.isin(useless_articles)]

In [49]:
len(target_filtered) # 12573323

12516004

# Filtered Dataset - final for now 27.9.19

In [50]:
target_filtered.to_pickle("data/processed_data/target_data_filtered.pkl")

# Users & Items

extract userIDs that need embeddings

In [52]:
target_filtered['userID'].nunique()

148956

In [56]:
users = target_filtered['userID'].unique()

In [54]:
target_filtered['anbieter_artikelnummer'].nunique()

750206

In [57]:
articles = target_filtered['anbieter_artikelnummer'].unique()

In [58]:
with open('data/processed_data/users_list.pkl', 'wb') as f:
    pickle.dump(users, f)
    f.close()

In [59]:
with open('data/processed_data/articles_list.pkl', 'wb') as f:
    pickle.dump(articles, f)
    f.close()

# Train and Test Split

Create train and test split based on time - first 90% of data is training, the last 10% is testing data

In [74]:
target_filtered = target_filtered.sort_values(by='datum_click')

In [75]:
test_size = len(target_filtered)*0.1

In [76]:
test_size

1251600.4000000001

In [77]:
target_test = target_filtered[-1251600:]

In [78]:
len(target_test)

1251600

In [79]:
target_train = target_filtered[:-1251600]

In [80]:
len(target_train)

11264404

In [81]:
len(target_train)+len(target_test) == len(target_filtered)

True

In [82]:
target_test.sort_values(by='datum_click')

Unnamed: 0,userID,anbieter_artikelnummer,datum_click,anbieterID,datum_pick,pick
12992867,1677343,0070854200002534,2019-06-28 21:18:42+00:00,708542,2019-07-01 02:00:00+00:00,1.0
12995886,1677430,00100620101051000097,2019-06-28 21:18:45+00:00,100620,NaT,0.0
12003002,1657316,00147265eG6260A,2019-06-28 21:18:48+00:00,147265,NaT,0.0
12109054,1657316,006589155907619815092,2019-06-28 21:18:56+00:00,658915,NaT,0.0
5760844,1564925,00631847DD47563,2019-06-28 21:19:03+00:00,631847,2019-06-28 02:00:00+00:00,1.0
...,...,...,...,...,...,...
2519419,826409,00382446PNN91101,2019-09-16 03:57:46+00:00,382446,2019-09-16 02:00:00+00:00,1.0
2519420,826409,00382446PNN91102,2019-09-16 03:59:17+00:00,382446,2019-09-16 02:00:00+00:00,1.0
2519417,826409,00382446PNN91033,2019-09-16 03:59:31+00:00,382446,2019-09-16 02:00:00+00:00,1.0
2519568,826409,00382446SHS3007,2019-09-16 04:00:36+00:00,382446,2019-09-16 02:00:00+00:00,1.0


In [83]:
target_train.sort_values(by='datum_click', ascending = False)

Unnamed: 0,userID,anbieter_artikelnummer,datum_click,anbieterID,datum_pick,pick
12095390,1657316,00565596LRE-FW-0014-0002,2019-06-28 21:18:37+00:00,565596,NaT,0.0
12995885,1677430,00100620101051000083,2019-06-28 21:18:21+00:00,100620,NaT,0.0
12012672,1657316,001895759020121-12,2019-06-28 21:18:20+00:00,189575,NaT,0.0
12091484,1657316,00561756XF1901-03WJ Rot,2019-06-28 21:18:11+00:00,561756,NaT,0.0
12072648,1657316,00497049ABYTEX314-L,2019-06-28 21:18:09+00:00,497049,NaT,0.0
...,...,...,...,...,...,...
6064717,1573710,00714655010032,2017-09-15 00:00:24+00:00,714655,NaT,0.0
596157,498808,00513388D712 N,2017-09-15 00:00:19+00:00,513388,2018-02-13 01:00:00+00:00,1.0
6108414,1575009,00144119EP0157,2017-09-15 00:00:17+00:00,144119,2017-09-15 02:00:00+00:00,1.0
2263968,804304,00714655006146,2017-09-15 00:00:04+00:00,714655,2017-09-15 02:00:00+00:00,1.0


In [84]:
target_train.to_pickle("data/training_data/target_train.pkl")

In [85]:
target_test.to_pickle("data/testing_data/target_test.pkl")