In [None]:
# !mkdir /content/etc && git clone https://github.com/dao-v/Movie_Recommendation_System.git /content/etc

In [1]:
!mkdir -p /content/data
%cd /content/data
!curl -Lo yoochoose-data.7z https://s3-eu-west-1.amazonaws.com/yc-rdata/yoochoose-data.7z
!7z x yoochoose-data.7z

/content/data
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  273M  100  273M    0     0  24.6M      0  0:00:11  0:00:11 --:--:-- 28.1M

7-Zip [64] 16.02 : Copyright (c) 1999-2016 Igor Pavlov : 2016-05-21
p7zip Version 16.02 (locale=en_US.UTF-8,Utf16=on,HugeFiles=on,64 bits,2 CPUs Intel(R) Xeon(R) CPU @ 2.30GHz (306F0),ASM,AES-NI)

Scanning the drive for archives:
  0M Sca        1 file, 287211932 bytes (274 MiB)

Extracting archive: yoochoose-data.7z
--
Path = yoochoose-data.7z
Type = 7z
Physical Size = 287211932
Headers Size = 255
Method = LZMA:24
Solid = +
Blocks = 2

      0% - yoochoose-buys.da                          1% - yoochoose-buys.da                          2% - yoochoose-buys.da                          2% 1 - yoochoose-clicks.da                              3% 1 - yoochoose-clicks.da                              4% 1 - yoochoose-clicks.da                   

In [70]:
import numpy as np
import pandas as pd
import datetime
import matplotlib.pyplot as plt
import seaborn as sns

In [5]:
# Paths of the YooChoose data files under /content/data.
c_file = "/content/data/yoochoose-clicks.dat"
b_file = "/content/data/yoochoose-buys.dat"
test_data_file = "/content/data/yoochoose-test.dat"

# Column names passed to pd.read_csv (the files are read with header=None).
c_index = [
    "session_id",
    "timestamp",
    "item_id",
    "category",
]
b_index = [
    "session_id",
    "timestamp",
    "item_id",
    "price",
    "quantity",
]

In [6]:
# The clicks file has no header row, so the column names are supplied from c_index.
# low_memory=False makes pandas parse each column in a single pass instead of in chunks.
clicks = pd.read_csv(
    c_file,
    names=c_index,
    header=None,
    low_memory=False,
)
clicks.head(20)

Unnamed: 0,session_id,timestamp,item_id,category
0,1,2014-04-07T10:51:09.277Z,214536502,0
1,1,2014-04-07T10:54:09.868Z,214536500,0
2,1,2014-04-07T10:54:46.998Z,214536506,0
3,1,2014-04-07T10:57:00.306Z,214577561,0
4,2,2014-04-07T13:56:37.614Z,214662742,0
5,2,2014-04-07T13:57:19.373Z,214662742,0
6,2,2014-04-07T13:58:37.446Z,214825110,0
7,2,2014-04-07T13:59:50.710Z,214757390,0
8,2,2014-04-07T14:00:38.247Z,214757407,0
9,2,2014-04-07T14:02:36.889Z,214551617,0


In [7]:
# The buys file has no header row either; the column names come from b_index.
buys = pd.read_csv(
    b_file,
    names=b_index,
    header=None,
)
buys.head(20)

Unnamed: 0,session_id,timestamp,item_id,price,quantity
0,420374,2014-04-06T18:44:58.314Z,214537888,12462,1
1,420374,2014-04-06T18:44:58.325Z,214537850,10471,1
2,281626,2014-04-06T09:40:13.032Z,214535653,1883,1
3,420368,2014-04-04T06:13:28.848Z,214530572,6073,1
4,420368,2014-04-04T06:13:28.858Z,214835025,2617,1
5,140806,2014-04-07T09:22:28.132Z,214668193,523,1
6,140806,2014-04-07T09:22:28.176Z,214587399,1046,1
7,140806,2014-04-07T09:22:28.219Z,214586690,837,1
8,140806,2014-04-07T09:22:28.268Z,214774667,1151,1
9,140806,2014-04-07T09:22:28.280Z,214578823,1046,1


In [75]:
def parse_datetime(dt_str):
    """Convert a timestamp string such as ``2014-04-07T10:51:09.277Z`` to a datetime.

    The trailing ``Z`` is matched as a literal character, so the returned
    ``datetime.datetime`` is naive (it carries no tzinfo).

    Raises:
        ValueError: if ``dt_str`` does not match the expected layout.
    """
    timestamp_layout = '%Y-%m-%dT%H:%M:%S.%fZ'
    parsed = datetime.datetime.strptime(dt_str, timestamp_layout)
    return parsed


# The category field can be "S" (promotion), 0 (unknown),
# a number from 1 to 12 (the click came from a category on the page),
# or an 8-10 digit number that identifies a brand.

def assign_cat(x):
    """Map a raw ``category`` value to a coarse label.

    Args:
        x: the raw value; either the string ``"S"`` or something ``int()``
            can convert (an int, or a string of digits).

    Returns:
        ``"PROMOTION"`` for ``"S"``, ``"NONE"`` for 0, ``"CATEGORY"`` for any
        other value below 13, and ``"BRAND"`` for everything else.

    Raises:
        ValueError: if ``x`` is a string that is neither ``"S"`` nor an integer.
    """
    if x == "S":
        return "PROMOTION"

    # Built-in int() replaces np.int, an alias NumPy deprecated in 1.20 and
    # removed in 1.24 (it raises AttributeError there). Convert once and reuse.
    code = int(x)
    if code == 0:
        return "NONE"
    if code < 13:
        return "CATEGORY"
    return "BRAND"

def parse_clicks(df):
    """Return a typed copy of the raw clicks frame.

    ``session_id`` is cast to int, every ``timestamp`` string is converted with
    ``parse_datetime``, and every ``category`` value is replaced by the label
    that ``assign_cat`` returns for it. The frame passed in is left untouched
    because ``astype`` returns a new frame.
    """
    typed = df.astype({"session_id": int})
    typed["timestamp"] = typed["timestamp"].apply(parse_datetime)
    typed["category"] = typed["category"].map(assign_cat)
    return typed

def parse_buys(df):
    """Return a typed copy of the raw buys frame, tagged as purchases.

    ``session_id`` and ``quantity`` are cast to int and ``price`` to float, the
    ``timestamp`` column is removed, and a constant ``action`` column holding
    ``"BUY"`` is appended. The frame passed in is not modified.
    """
    column_types = {"session_id": int, "price": float, "quantity": int}
    typed = df.astype(column_types).drop(columns=["timestamp"])
    typed["action"] = "BUY"
    return typed

In [93]:
# Convert both raw frames with parse_clicks / parse_buys.
# NOTE: `clicks` and `buys` are rebound to their parsed versions, so this cell is
# not safe to run twice without re-reading the raw files first: parse_buys drops
# the "timestamp" column and will not find it on a second pass, and parse_clicks
# expects the timestamp and category columns to still hold their raw values.
clicks = parse_clicks(clicks)
buys = parse_buys(buys)

In [None]:
# Left-join the buys onto the clicks by (session_id, item_id): every click row is
# kept, and clicks without a matching buy get NaN in the columns coming from `buys`.
# Fully identical rows are then removed.
df = clicks.merge(
    buys,
    on=["session_id", "item_id"],
    how="left",
).drop_duplicates()
display(df.head())
df.info()

In [None]:
# Spot-check a single (session, item) pair in the clicks, the buys and the merged frame.
ItemId = 214821371
SessionId = 11
query = "item_id==@ItemId & session_id==@SessionId"
for _frame in (clicks, buys, df):
    display(_frame.query(query))

### Sub-select data based on threshold

In [None]:
# Row-count cut-offs for the sub-selection below; both are applied with a strict ">".
SESSION_THRESHOLD = 20    # a session must have more rows than this to be kept
ITEM_THRESHOLD = 1_000    # an item must have more rows than this to be kept

In [None]:
# Rows per session in the merged frame.
session_lengths = df.groupby(["session_id"]).size()
# Sessions with strictly more than SESSION_THRESHOLD rows, as a frame with a session_id column.
session_lengths_w_threshold = session_lengths.loc[
    session_lengths.gt(SESSION_THRESHOLD)
].reset_index()
# Keep only the rows that belong to those sessions.
df_with_session_threshold = df[
    df["session_id"].isin(session_lengths_w_threshold["session_id"])
]

In [None]:
# Rows per item, counted over the full merged frame `df` (not the session-filtered one).
item_lengths = df.groupby(["item_id"]).size()
# Items with strictly more than ITEM_THRESHOLD rows, as a frame with an item_id column.
item_lengths_w_threshold = item_lengths.loc[
    item_lengths.gt(ITEM_THRESHOLD)
].reset_index()
# From the session-filtered rows, keep only those whose item passed the item cut-off.
df_with_session_item_threshold = df_with_session_threshold[
    df_with_session_threshold["item_id"].isin(item_lengths_w_threshold["item_id"])
]

In [None]:
# Recount rows per session now that rows for infrequent items have been removed.
session_lengths_2 = df_with_session_item_threshold.groupby(["session_id"]).size()
# Sessions that still have strictly more than SESSION_THRESHOLD rows.
session_lengths_2_w_threshold = session_lengths_2.loc[
    session_lengths_2.gt(SESSION_THRESHOLD)
].reset_index()

In [None]:
# Final selection: rows whose session still passes the session cut-off after the item filter.
df_final = df_with_session_item_threshold[
    df_with_session_item_threshold["session_id"].isin(
        session_lengths_2_w_threshold["session_id"]
    )
]
df_final.head()

In [None]:
# Label every row that has no matching buy as a click, drop the buy-only columns,
# and derive an implicit rating: 1 for a click, 5 for anything else (i.e. a buy).
#
# This builds a new frame instead of mutating `df_final` in place. The previous
# `df_final.action.fillna(..., inplace=True)` was a chained in-place call, which no
# longer updates the frame under pandas Copy-on-Write; the NaN actions would then
# all be rated 5. Rebinding also avoids SettingWithCopyWarning on the filtered slice.
# errors="ignore" keeps the cell re-runnable once price/quantity are already gone.
df_final = (
    df_final
    .assign(action=lambda frame: frame["action"].fillna("CLICK"))
    .drop(columns=["price", "quantity"], errors="ignore")
    .assign(rating=lambda frame: frame["action"].eq("CLICK").map({True: 1, False: 5}))
)
df_final.head()