In [1]:
import sys
import os
import pandas as pd
import numpy as np
from tqdm import tqdm
import gc
tqdm.pandas()

In [2]:
candidate_items = pd.read_csv("../../Dataset/candidate_items.csv")
item_features = pd.read_csv("../../Dataset/item_features.csv")
train_purchases = pd.read_csv("../../Dataset/train_purchases.csv")
train_sessions = pd.read_csv("../../Dataset/train_sessions.csv")
test_final_sessions = pd.read_csv("../../Dataset/test_final_sessions.csv")
test_leaderboard_sessions = pd.read_csv("../../Dataset/test_leaderboard_sessions.csv")

In [3]:
session_attributes_train = pd.DataFrame(train_sessions.session_id.unique(), columns=['session_id'])
session_attributes_train['session_length'] = train_sessions.groupby('session_id')['item_id'].count().values

session_attributes_final = pd.DataFrame(test_final_sessions.session_id.unique(), columns=['session_id'])
session_attributes_final['session_length'] = test_final_sessions.groupby('session_id')['item_id'].count().values

session_attributes_leaderboard = pd.DataFrame(test_leaderboard_sessions.session_id.unique(), columns=['session_id'])
session_attributes_leaderboard['session_length'] = test_leaderboard_sessions.groupby('session_id')['item_id'].count().values

In [4]:
train_sessions_unique = train_sessions.drop_duplicates(['session_id', 'item_id'])
session_attributes_train['session_length_distinct'] = train_sessions_unique.groupby('session_id')['item_id'].count().values

final_sessions_unique = test_final_sessions.drop_duplicates(['session_id', 'item_id'])
session_attributes_final['session_length_distinct'] = final_sessions_unique.groupby('session_id')['item_id'].count().values

leaderboard_sessions_unique = test_leaderboard_sessions.drop_duplicates(['session_id', 'item_id'])
session_attributes_leaderboard['session_length_distinct'] = leaderboard_sessions_unique.groupby('session_id')['item_id'].count().values

In [5]:
session_attributes_train = session_attributes_train.merge(train_sessions_unique[['session_id', 'date']],
                              left_on=['session_id'], right_on=['session_id'])
session_attributes_train.date = pd.to_datetime(session_attributes_train.date)

session_attributes_final = session_attributes_final.merge(final_sessions_unique[['session_id', 'date']],
                              left_on=['session_id'], right_on=['session_id'])
session_attributes_final.date = pd.to_datetime(session_attributes_final.date)

session_attributes_leaderboard = session_attributes_leaderboard.merge(leaderboard_sessions_unique[['session_id', 'date']],
                              left_on=['session_id'], right_on=['session_id'])
session_attributes_leaderboard.date = pd.to_datetime(session_attributes_leaderboard.date)

In [6]:
session_attributes_train['day_of_week'] = session_attributes_train.date.map(lambda x: x.day_of_week)
session_attributes_train['month'] = session_attributes_train.date.map(lambda x: x.month)
session_attributes_train['year'] = session_attributes_train.date.map(lambda x: x.year)
session_attributes_train.drop(columns=['date'], inplace=True)

session_attributes_final['day_of_week'] = session_attributes_final.date.map(lambda x: x.day_of_week)
session_attributes_final['month'] = session_attributes_final.date.map(lambda x: x.month)
session_attributes_final['year'] = session_attributes_final.date.map(lambda x: x.year)
session_attributes_final.drop(columns=['date'], inplace=True)

session_attributes_leaderboard['day_of_week'] = session_attributes_leaderboard.date.map(lambda x: x.day_of_week)
session_attributes_leaderboard['month'] = session_attributes_leaderboard.date.map(lambda x: x.month)
session_attributes_leaderboard['year'] = session_attributes_leaderboard.date.map(lambda x: x.year)
session_attributes_leaderboard.drop(columns=['date'], inplace=True)

In [7]:
train_sessions_first = train_sessions.sort_values(by=['date'], ascending=True).drop_duplicates(['session_id']).sort_values(by=['session_id'], ascending=True)
train_sessions_first.date = pd.to_datetime(train_sessions_first.date)
train_sessions_first.rename(columns={"date": "date_first"}, inplace=True)
train_sessions_last = train_sessions.sort_values(by=['date'], ascending=False).drop_duplicates(['session_id']).sort_values(by=['session_id'], ascending=True)
train_sessions_last.date = pd.to_datetime(train_sessions_last.date)
train_sessions_last.rename(columns={"date": "date_last"}, inplace=True)

final_sessions_first = test_final_sessions.sort_values(by=['date'], ascending=True).drop_duplicates(['session_id']).sort_values(by=['session_id'], ascending=True)
final_sessions_first.date = pd.to_datetime(final_sessions_first.date)
final_sessions_first.rename(columns={"date": "date_first"}, inplace=True)
final_sessions_last = test_final_sessions.sort_values(by=['date'], ascending=False).drop_duplicates(['session_id']).sort_values(by=['session_id'], ascending=True)
final_sessions_last.date = pd.to_datetime(final_sessions_last.date)
final_sessions_last.rename(columns={"date": "date_last"}, inplace=True)

leaderboard_sessions_first = test_leaderboard_sessions.sort_values(by=['date'], ascending=True).drop_duplicates(['session_id']).sort_values(by=['session_id'], ascending=True)
leaderboard_sessions_first.date = pd.to_datetime(leaderboard_sessions_first.date)
leaderboard_sessions_first.rename(columns={"date": "date_first"}, inplace=True)
leaderboard_sessions_last = test_leaderboard_sessions.sort_values(by=['date'], ascending=False).drop_duplicates(['session_id']).sort_values(by=['session_id'], ascending=True)
leaderboard_sessions_last.date = pd.to_datetime(leaderboard_sessions_last.date)
leaderboard_sessions_last.rename(columns={"date": "date_last"}, inplace=True)

In [8]:
session_attributes_train = session_attributes_train.merge(train_sessions_first[['session_id', 'date_first']],
                              left_on=['session_id'], right_on=['session_id'])
session_attributes_train = session_attributes_train.merge(train_sessions_last[['session_id', 'date_last']],
                              left_on=['session_id'], right_on=['session_id'])

session_attributes_final = session_attributes_final.merge(final_sessions_first[['session_id', 'date_first']],
                              left_on=['session_id'], right_on=['session_id'])
session_attributes_final = session_attributes_final.merge(final_sessions_last[['session_id', 'date_last']],
                              left_on=['session_id'], right_on=['session_id'])

session_attributes_leaderboard = session_attributes_leaderboard.merge(leaderboard_sessions_first[['session_id', 'date_first']],
                              left_on=['session_id'], right_on=['session_id'])
session_attributes_leaderboard = session_attributes_leaderboard.merge(leaderboard_sessions_last[['session_id', 'date_last']],
                              left_on=['session_id'], right_on=['session_id'])

In [9]:
session_attributes_train['session_delta_time'] = session_attributes_train.apply(lambda x: (x.date_last - x.date_first).seconds, axis=1)

session_attributes_final['session_delta_time'] = session_attributes_final.apply(lambda x: (x.date_last - x.date_first).seconds, axis=1)

session_attributes_leaderboard['session_delta_time'] = session_attributes_leaderboard.apply(lambda x: (x.date_last - x.date_first).seconds, axis=1)

In [10]:
session_attributes_train['session_mean_item_time'] = session_attributes_train.apply(lambda x: x.session_delta_time/x.session_length, axis=1)

session_attributes_final['session_mean_item_time'] = session_attributes_final.apply(lambda x: x.session_delta_time/x.session_length, axis=1)

session_attributes_leaderboard['session_mean_item_time'] = session_attributes_leaderboard.apply(lambda x: x.session_delta_time/x.session_length, axis=1)

In [11]:
seen_items = set()
session_score_train = {key:0 for key in train_sessions.session_id.unique()}
session_score_final = {key:0 for key in test_final_sessions.session_id.unique()}
session_score_leaderboard = {key:0 for key in test_leaderboard_sessions.session_id.unique()}

train_sessions.sort_values(by='date', inplace=True, ascending=True)
length = train_sessions.shape[0]
for i in tqdm(range(length)):
    row = train_sessions.iloc[i]
    actual_len = len(seen_items)
    seen_items.add(row.item_id)
    if actual_len < len(seen_items):
        session_score_train[row.session_id] += 1

test_final_sessions.sort_values(by='date', inplace=True, ascending=True)
length = test_final_sessions.shape[0]
for i in tqdm(range(length)):
    row = test_final_sessions.iloc[i]
    if row.item_id not in seen_items:
        session_score_final[row.session_id] += 1

test_leaderboard_sessions.sort_values(by='date', inplace=True, ascending=True)
length = test_leaderboard_sessions.shape[0]
for i in tqdm(range(length)):
    row = test_leaderboard_sessions.iloc[i]
    if row.item_id not in seen_items:
        session_score_leaderboard[row.session_id] += 1

100%|██████████| 4743820/4743820 [07:06<00:00, 11120.22it/s]
100%|██████████| 226138/226138 [00:20<00:00, 11104.44it/s]
100%|██████████| 229354/229354 [00:20<00:00, 11280.23it/s]


In [12]:
session_attributes_train['num_never_seen_items'] = session_attributes_train.session_id.map(session_score_train)

session_attributes_final['num_never_seen_items'] = session_attributes_final.session_id.map(session_score_final)

session_attributes_leaderboard['num_never_seen_items'] = session_attributes_leaderboard.session_id.map(session_score_leaderboard)

In [13]:
purchased_items = set()
session_purchase_score_train = {key:0 for key in train_sessions.session_id.unique()}
session_purchase_score_final = {key:0 for key in test_final_sessions.session_id.unique()}
session_purchase_score_leaderboard = {key:0 for key in test_leaderboard_sessions.session_id.unique()}
train_sessions['purchase'] = False
train_purchases['purchase'] = True
train_interactions = pd.concat([train_sessions, train_purchases], axis=0)
train_interactions.sort_values(by='date', inplace=True, ascending=True)
length = train_interactions.shape[0]
for i in tqdm(range(length)):
    row = train_interactions.iloc[i]
    if row.purchase:
        purchased_items.add(row.item_id)
    if (not row.purchase) and (row.session_id not in purchased_items):
        session_purchase_score_train[row.session_id] += 1


test_final_sessions.sort_values(by='date', inplace=True, ascending=True)
length = test_final_sessions.shape[0]
for i in tqdm(range(length)):
    row = test_final_sessions.iloc[i]
    if row.session_id not in purchased_items:
        session_purchase_score_final[row.session_id] += 1

test_leaderboard_sessions.sort_values(by='date', inplace=True, ascending=True)
length = test_leaderboard_sessions.shape[0]
for i in tqdm(range(length)):
    row = test_leaderboard_sessions.iloc[i]
    if row.session_id not in purchased_items:
        session_purchase_score_leaderboard[row.session_id] += 1

100%|██████████| 5743820/5743820 [10:28<00:00, 9137.91it/s] 
100%|██████████| 226138/226138 [00:27<00:00, 8354.76it/s]
100%|██████████| 229354/229354 [00:25<00:00, 8875.40it/s]


In [14]:
session_attributes_train['num_never_purchased_items'] = session_attributes_train.session_id.map(session_purchase_score_train)

session_attributes_final['num_never_purchased_items'] = session_attributes_final.session_id.map(session_purchase_score_final)

session_attributes_leaderboard['num_never_purchased_items'] = session_attributes_leaderboard.session_id.map(session_purchase_score_leaderboard)

In [15]:
session_attributes_train.drop(columns=['date_first', 'date_last'], inplace=True)
session_attributes_final.drop(columns=['date_first', 'date_last'], inplace=True)
session_attributes_leaderboard.drop(columns=['date_first', 'date_last'], inplace=True)

In [16]:
session_attributes_train.drop_duplicates(['session_id'], inplace=True)
session_attributes_train

Unnamed: 0,session_id,session_length,session_length_distinct,day_of_week,month,year,session_delta_time,session_mean_item_time,num_never_seen_items,num_never_purchased_items
0,3,2,1,4,12,2020,312,156.000000,0,0
1,13,1,1,4,3,2020,0,0.000000,0,1
2,18,3,3,2,8,2020,163,54.333333,0,0
5,19,17,17,0,11,2020,1082,63.647059,0,17
22,24,9,8,2,2,2020,3703,411.444444,0,0
...,...,...,...,...,...,...,...,...,...,...
4108808,4439986,6,6,3,5,2021,533,88.833333,0,6
4108814,4439990,11,10,5,8,2020,3458,314.363636,0,11
4108824,4439994,1,1,4,11,2020,0,0.000000,0,1
4108825,4439999,7,7,4,11,2020,436,62.285714,0,7


In [17]:
session_attributes_final.drop_duplicates(['session_id'], inplace=True)
session_attributes_final

Unnamed: 0,session_id,session_length,session_length_distinct,day_of_week,month,year,session_delta_time,session_mean_item_time,num_never_seen_items,num_never_purchased_items
0,61,1,1,1,6,2021,0,0.0,0,1
1,96,5,5,5,6,2021,496,99.2,0,5
6,185,5,3,0,6,2021,116,23.2,0,0
9,224,2,2,0,6,2021,19892,9946.0,0,0
11,285,1,1,1,6,2021,0,0.0,0,0
...,...,...,...,...,...,...,...,...,...,...
186867,4439646,4,4,5,6,2021,260,65.0,1,4
186871,4439648,1,1,0,6,2021,0,0.0,0,1
186872,4439675,1,1,1,6,2021,0,0.0,0,1
186873,4439868,1,1,2,6,2021,0,0.0,0,1


In [18]:
session_attributes_leaderboard.drop_duplicates(['session_id'], inplace=True)
session_attributes_leaderboard

Unnamed: 0,session_id,session_length,session_length_distinct,day_of_week,month,year,session_delta_time,session_mean_item_time,num_never_seen_items,num_never_purchased_items
0,26,1,1,2,6,2021,0,0.000000,0,0
1,200,4,3,4,6,2021,69,17.250000,0,4
4,205,1,1,4,6,2021,0,0.000000,0,0
5,495,1,1,0,6,2021,0,0.000000,1,0
6,521,1,1,5,6,2021,0,0.000000,0,1
...,...,...,...,...,...,...,...,...,...,...
189269,4439446,3,2,4,6,2021,59,19.666667,0,3
189271,4439458,2,2,2,6,2021,162,81.000000,0,2
189273,4439550,1,1,2,6,2021,0,0.000000,0,1
189274,4439653,10,7,4,6,2021,205,20.500000,0,10


In [25]:
session_attributes_train.to_csv('../../Dataset/session_attributes_train.csv', index=False)
session_attributes_final.to_csv('../../Dataset/session_attributes_final.csv', index=False)
session_attributes_leaderboard.to_csv('../../Dataset/session_attributes_leaderboard.csv', index=False)