We start by preprocessing the data to get it ready for the neural network (NN).

In [66]:
import pandas as pd
import numpy as np

In [67]:
# Read the raw visitor event log.
events = pd.read_csv('../data/events.csv')

# The item properties are stored as two CSV parts: read each part,
# then stack them into one frame with a fresh 0..n-1 index.
prop1, prop2 = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/item_properties_part1.csv',
        '../data/item_properties_part2.csv',
    )
)
properties = pd.concat([prop1, prop2], ignore_index=True)

In [68]:
# Preview the first rows of the combined item-property table.
properties.head()

Unnamed: 0,timestamp,itemid,property,value
0,1435460400000,460429,categoryid,1338
1,1441508400000,206783,888,1116713 960601 n277.200
2,1439089200000,395014,400,n552.000 639502 n720.000 424566
3,1431226800000,59481,790,n15360.000
4,1431831600000,156781,917,828513


In [69]:
# Preview the first rows of the event log (rows stay in their loaded order; no sort is applied).
events.head()

Unnamed: 0,timestamp,visitorid,event,itemid,transactionid
0,1433221332117,257597,view,355908,
1,1433224214164,992329,view,248676,
2,1433221999827,111016,view,318965,
3,1433221955914,483717,view,253185,
4,1433221337106,951259,view,367447,


The item properties are not going to be very useful to us right now, but we can incorporate them later in a content-based filtering system.

## Preprocessing

Now we build a data matrix that holds, for each visitor–item pair, the view count, a binary added-to-cart flag, and a binary transaction flag.

In [70]:
# Build one row per (visitorid, itemid) pair with three interaction signals:
#   view        - number of 'view' events recorded for the pair
#   cart        - 1 if the pair has at least one 'addtocart' event, else 0
#   transaction - 1 if the pair has at least one 'transaction' event, else 0
pair_keys = ['visitorid', 'itemid']

# Count views per pair.
views = events[events['event'] == 'view'][pair_keys].copy()
views = views.groupby(pair_keys).size().reset_index(name='view')

# De-duplicate before flagging: a visitor can add the same item to the cart
# several times, and repeated keys would otherwise multiply rows in the merge.
carts = events[events['event'] == 'addtocart'][pair_keys].drop_duplicates().copy()
carts['cart'] = 1

# Same reasoning for purchases: keep a single row per pair so the flag stays binary.
transactions = events[events['event'] == 'transaction'][pair_keys].drop_duplicates().copy()
transactions['transaction'] = 1

# Outer joins keep pairs that appear in any of the three event types.
merged = pd.merge(views, carts, on=pair_keys, how='outer')
merged = pd.merge(merged, transactions, on=pair_keys, how='outer')

# A pair missing from one of the inputs simply has no such interaction -> 0.
merged = merged.fillna(0)

# Sanity check: every (visitor, item) pair must appear exactly once.
assert not merged.duplicated(subset=pair_keys).any(), "duplicate visitor/item pairs in merged"

merged.head()

Unnamed: 0,visitorid,itemid,view,cart,transaction
0,0,67045,1.0,0.0,0.0
1,0,285930,1.0,0.0,0.0
2,0,357564,1.0,0.0,0.0
3,1,72028,1.0,0.0,0.0
4,2,216305,2.0,0.0,0.0


In [71]:
# Work on a copy so `merged` keeps only the raw interaction columns.
data = merged.copy()

# Combine the three signals into a single score per pair:
#   - views contribute log(1 + count), capped at 5
#   - an add-to-cart adds 5
#   - a transaction adds 10
view_term = np.minimum(np.log(1 + data['view']), 5)
cart_term = 5 * data['cart']
transaction_term = 10 * data['transaction']
data['score'] = view_term + cart_term + transaction_term

data.head()

Unnamed: 0,visitorid,itemid,view,cart,transaction,score
0,0,67045,1.0,0.0,0.0,0.693147
1,0,285930,1.0,0.0,0.0,0.693147
2,0,357564,1.0,0.0,0.0,0.693147
3,1,72028,1.0,0.0,0.0,0.693147
4,2,216305,2.0,0.0,0.0,1.098612


Boom. Next, we need to map the visitor and item IDs to contiguous integer indices for the NN.

In [72]:
# Number of distinct visitors and items in the scored table.
num_users, num_items = (
    data[id_column].nunique() for id_column in ('visitorid', 'itemid')
)

print(f"Unique users: {num_users}")
print(f"Unique items: {num_items}")

Unique users: 1407580
Unique items: 235061


In [65]:
# Map each original visitor id to a 0-based index, numbered in order of first appearance.
unique_users = data['visitorid'].unique()
user_to_index = dict(zip(unique_users, range(len(unique_users))))

# Same mapping for item ids.
unique_products = data['itemid'].unique()
product_to_index = dict(zip(unique_products, range(len(unique_products))))