# 0. Import packages

In [1]:
import numpy as np
import pandas as pd
import sklearn
import gzip
import json
from tqdm import tqdm
import os
from collections import Counter
from datetime import datetime
import math
tqdm.pandas() #for progres_apply etc.

In [2]:
#read file line-by-line and parse json, returns dataframe
def parse_json(filename_gzipped_python_json, read_max=-1):
  #read gzipped content
  f=gzip.open(filename_gzipped_python_json,'r')
  
  #parse json
  parse_data = []
  for line in tqdm(f): #tqdm is for showing progress bar, always good when processing large amounts of data
    line = line.decode('utf-8')
    line = line.replace('true','True') #difference json/python
    line = line.replace('false','False')
    parsed_result = eval(line) #load python nested datastructure
    parse_data.append(parsed_result)
    if read_max !=-1 and len(parse_data) > read_max:
      print(f'Break reading after {read_max} records')
      break
  print(f"Reading {len(parse_data)} rows.")

  #create dataframe
  df= pd.DataFrame.from_dict(parse_data)
  return df

# 1. Load Steam data

In [3]:
steam_path = 'data/'
metadata_games = 'steam_games.json.gz' 
user_items = 'australian_users_items.json.gz'
user_reviews = 'australian_user_reviews.json.gz'
game_bundles = 'bundle_data.json.gz'
steam_reviews= 'steam_reviews.json.gz'

In [4]:
dataframes = []
for dataset in [ user_reviews, steam_reviews]:
# for dataset in [ metadata_games, user_items, user_reviews, game_bundles, steam_reviews]:
  print(f"----- {dataset}-----")
  size = os.path.getsize(steam_path + dataset) 
  print(f'Size of file is {size / 1000000}MB')
  df_metadata = parse_json(steam_path + dataset)
  dataframes.append(df_metadata)
  pd.set_option('display.max_colwidth', None)
  display(df_metadata.head(2))
  display(df_metadata.describe(include='all'))

----- australian_user_reviews.json.gz-----
Size of file is 6.940139MB


25799it [00:01, 17074.17it/s]


Reading 25799 rows.


Unnamed: 0,user_id,user_url,reviews
0,76561197970982479,http://steamcommunity.com/profiles/76561197970982479,"[{'funny': '', 'posted': 'Posted November 5, 2011.', 'last_edited': '', 'item_id': '1250', 'helpful': 'No ratings yet', 'recommend': True, 'review': 'Simple yet with great replayability. In my opinion does ""zombie"" hordes and team work better than left 4 dead plus has a global leveling system. Alot of down to earth ""zombie"" splattering fun for the whole family. Amazed this sort of FPS is so rare.'}, {'funny': '', 'posted': 'Posted July 15, 2011.', 'last_edited': '', 'item_id': '22200', 'helpful': 'No ratings yet', 'recommend': True, 'review': 'It's unique and worth a playthrough.'}, {'funny': '', 'posted': 'Posted April 21, 2011.', 'last_edited': '', 'item_id': '43110', 'helpful': 'No ratings yet', 'recommend': True, 'review': 'Great atmosphere. The gunplay can be a bit chunky at times but at the end of the day this game is definitely worth it and I hope they do a sequel...so buy the game so I get a sequel!'}]"
1,js41637,http://steamcommunity.com/id/js41637,"[{'funny': '', 'posted': 'Posted June 24, 2014.', 'last_edited': '', 'item_id': '251610', 'helpful': '15 of 20 people (75%) found this review helpful', 'recommend': True, 'review': 'I know what you think when you see this title ""Barbie Dreamhouse Party"" but do not be intimidated by it's title, this is easily one of my GOTYs. You don't get any of that cliche game mechanics that all the latest games have, this is simply good core gameplay. Yes, you can't 360 noscope your friends, but what you can do is show them up with your bad ♥♥♥ dance moves and put them to shame as you show them what True fashion and color combinations are.I know this game says for kids but, this is easily for any age range and any age will have a blast playing this.8/8'}, {'funny': '', 'posted': 'Posted September 8, 2013.', 'last_edited': '', 'item_id': '227300', 'helpful': '0 of 1 people (0%) found this review helpful', 'recommend': True, 'review': 'For a simple (it's actually not all that simple but it can be!) truck driving Simulator, it is quite a fun and relaxing game. Playing on simple (or easy?) its just the basic WASD keys for driving but (if you want) the game can be much harder and realistic with having to manually change gears, much harder turning, etc. And reversing in this game is a ♥♥♥♥♥, as I imagine it would be with an actual truck. Luckily, you don't have to reverse park it but you get extra points if you do cause it is bloody hard. But this is suprisingly a nice truck driving game and I had a bit of fun with it.'}, {'funny': '', 'posted': 'Posted November 29, 2013.', 'last_edited': '', 'item_id': '239030', 'helpful': '1 of 4 people (25%) found this review helpful', 'recommend': True, 'review': 'Very fun little game to play when your bored or as a time passer. Very gud. Do Recommend. pls buy'}]"


Unnamed: 0,user_id,user_url,reviews
count,25799,25799,25799
unique,25485,25485,25459
top,76561198027488037,http://steamcommunity.com/profiles/76561198027488037,[]
freq,3,3,28


----- steam_reviews.json.gz-----
Size of file is 1350.067901MB


7793069it [04:37, 28047.64it/s]


Reading 7793069 rows.


Unnamed: 0,username,product_id,page_order,text,hours,recommended,products,date,early_access,page,compensation,found_funny,user_id
0,Chaos Syren,725280,0,This would not be acceptable as an entertainment even back in the day when these graphics were all there was to be had. No effort has been made to bring the player into any story or even entertain.,0.1,True,41.0,2017-12-17,False,1,,,
1,₮ʜᴇ Wᴀʀᴛᴏɴ,328100,0,looks like a facebook game,51.1,True,769.0,2017-12-27,False,1,,,


Unnamed: 0,username,product_id,page_order,text,hours,recommended,products,date,early_access,page,compensation,found_funny,user_id
count,7793069.0,7793069.0,7793069.0,7793069.0,7766532.0,7793069,7778108.0,7793069,7793069,7793069.0,145623,1200756.0,3176223.0
unique,2567532.0,15474.0,,6350454.0,,1,,2628,2,,1,,1485611.0
top,123.0,440.0,,,,True,,2016-11-23,False,,Product received for free,,7.656119801196536e+16
freq,2045.0,183666.0,,17963.0,,7793069,,64245,6719736,,145623,,1254.0
mean,,,4.483445,,111.8365,,236.4839,,,893.0331,,7.896183,
std,,,2.872802,,392.8604,,485.7889,,,1927.711,,73.38724,
min,,,0.0,,0.0,,1.0,,,1.0,,1.0,
25%,,,2.0,,4.0,,45.0,,,52.0,,1.0,
50%,,,4.0,,15.3,,110.0,,,237.0,,1.0,
75%,,,7.0,,59.7,,246.0,,,829.0,,3.0,


In [10]:
t = dataframes[0]
k = t.explode("reviews").reset_index(drop=True)
k

Unnamed: 0,user_id,user_url,reviews
0,76561197970982479,http://steamcommunity.com/profiles/76561197970982479,"{'funny': '', 'posted': 'Posted November 5, 2011.', 'last_edited': '', 'item_id': '1250', 'helpful': 'No ratings yet', 'recommend': True, 'review': 'Simple yet with great replayability. In my opinion does ""zombie"" hordes and team work better than left 4 dead plus has a global leveling system. Alot of down to earth ""zombie"" splattering fun for the whole family. Amazed this sort of FPS is so rare.'}"
1,76561197970982479,http://steamcommunity.com/profiles/76561197970982479,"{'funny': '', 'posted': 'Posted July 15, 2011.', 'last_edited': '', 'item_id': '22200', 'helpful': 'No ratings yet', 'recommend': True, 'review': 'It's unique and worth a playthrough.'}"
2,76561197970982479,http://steamcommunity.com/profiles/76561197970982479,"{'funny': '', 'posted': 'Posted April 21, 2011.', 'last_edited': '', 'item_id': '43110', 'helpful': 'No ratings yet', 'recommend': True, 'review': 'Great atmosphere. The gunplay can be a bit chunky at times but at the end of the day this game is definitely worth it and I hope they do a sequel...so buy the game so I get a sequel!'}"
3,js41637,http://steamcommunity.com/id/js41637,"{'funny': '', 'posted': 'Posted June 24, 2014.', 'last_edited': '', 'item_id': '251610', 'helpful': '15 of 20 people (75%) found this review helpful', 'recommend': True, 'review': 'I know what you think when you see this title ""Barbie Dreamhouse Party"" but do not be intimidated by it's title, this is easily one of my GOTYs. You don't get any of that cliche game mechanics that all the latest games have, this is simply good core gameplay. Yes, you can't 360 noscope your friends, but what you can do is show them up with your bad ♥♥♥ dance moves and put them to shame as you show them what True fashion and color combinations are.I know this game says for kids but, this is easily for any age range and any age will have a blast playing this.8/8'}"
4,js41637,http://steamcommunity.com/id/js41637,"{'funny': '', 'posted': 'Posted September 8, 2013.', 'last_edited': '', 'item_id': '227300', 'helpful': '0 of 1 people (0%) found this review helpful', 'recommend': True, 'review': 'For a simple (it's actually not all that simple but it can be!) truck driving Simulator, it is quite a fun and relaxing game. Playing on simple (or easy?) its just the basic WASD keys for driving but (if you want) the game can be much harder and realistic with having to manually change gears, much harder turning, etc. And reversing in this game is a ♥♥♥♥♥, as I imagine it would be with an actual truck. Luckily, you don't have to reverse park it but you get extra points if you do cause it is bloody hard. But this is suprisingly a nice truck driving game and I had a bit of fun with it.'}"
...,...,...,...
59328,76561198312638244,http://steamcommunity.com/profiles/76561198312638244,"{'funny': '', 'posted': 'Posted July 10.', 'last_edited': '', 'item_id': '70', 'helpful': 'No ratings yet', 'recommend': True, 'review': 'a must have classic from steam definitely worth buying.'}"
59329,76561198312638244,http://steamcommunity.com/profiles/76561198312638244,"{'funny': '', 'posted': 'Posted July 8.', 'last_edited': '', 'item_id': '362890', 'helpful': 'No ratings yet', 'recommend': True, 'review': 'this game is a perfect remake of the original half life. personally one of the best remakes i have played in a long time. there are a few changes in the remake but for the most part its almost the same as the original half life.the game still needs Xen to be completed but all the other chapters are ready for you to play and enjoy. i say buy this game if you loved the original half life. but avoid it if you can t wait for xen to be completed.'}"
59330,LydiaMorley,http://steamcommunity.com/id/LydiaMorley,"{'funny': '1 person found this review funny', 'posted': 'Posted July 3.', 'last_edited': '', 'item_id': '273110', 'helpful': '1 of 2 people (50%) found this review helpful', 'recommend': True, 'review': 'had so much fun plaing this and collecting resources xD we won on my first try and killed final boss!'}"
59331,LydiaMorley,http://steamcommunity.com/id/LydiaMorley,"{'funny': '', 'posted': 'Posted July 20.', 'last_edited': '', 'item_id': '730', 'helpful': 'No ratings yet', 'recommend': True, 'review': ':D'}"


In [11]:
k = pd.concat([k.drop(['reviews'], axis=1), k['reviews'].progress_apply(pd.Series)], axis=1)
k

100%|██████████| 59333/59333 [00:13<00:00, 4357.91it/s] 


Unnamed: 0,user_id,user_url,0,funny,helpful,item_id,last_edited,posted,recommend,review
0,76561197970982479,http://steamcommunity.com/profiles/76561197970982479,,,No ratings yet,1250,,"Posted November 5, 2011.",True,"Simple yet with great replayability. In my opinion does ""zombie"" hordes and team work better than left 4 dead plus has a global leveling system. Alot of down to earth ""zombie"" splattering fun for the whole family. Amazed this sort of FPS is so rare."
1,76561197970982479,http://steamcommunity.com/profiles/76561197970982479,,,No ratings yet,22200,,"Posted July 15, 2011.",True,It's unique and worth a playthrough.
2,76561197970982479,http://steamcommunity.com/profiles/76561197970982479,,,No ratings yet,43110,,"Posted April 21, 2011.",True,Great atmosphere. The gunplay can be a bit chunky at times but at the end of the day this game is definitely worth it and I hope they do a sequel...so buy the game so I get a sequel!
3,js41637,http://steamcommunity.com/id/js41637,,,15 of 20 people (75%) found this review helpful,251610,,"Posted June 24, 2014.",True,"I know what you think when you see this title ""Barbie Dreamhouse Party"" but do not be intimidated by it's title, this is easily one of my GOTYs. You don't get any of that cliche game mechanics that all the latest games have, this is simply good core gameplay. Yes, you can't 360 noscope your friends, but what you can do is show them up with your bad ♥♥♥ dance moves and put them to shame as you show them what True fashion and color combinations are.I know this game says for kids but, this is easily for any age range and any age will have a blast playing this.8/8"
4,js41637,http://steamcommunity.com/id/js41637,,,0 of 1 people (0%) found this review helpful,227300,,"Posted September 8, 2013.",True,"For a simple (it's actually not all that simple but it can be!) truck driving Simulator, it is quite a fun and relaxing game. Playing on simple (or easy?) its just the basic WASD keys for driving but (if you want) the game can be much harder and realistic with having to manually change gears, much harder turning, etc. And reversing in this game is a ♥♥♥♥♥, as I imagine it would be with an actual truck. Luckily, you don't have to reverse park it but you get extra points if you do cause it is bloody hard. But this is suprisingly a nice truck driving game and I had a bit of fun with it."
...,...,...,...,...,...,...,...,...,...,...
59328,76561198312638244,http://steamcommunity.com/profiles/76561198312638244,,,No ratings yet,70,,Posted July 10.,True,a must have classic from steam definitely worth buying.
59329,76561198312638244,http://steamcommunity.com/profiles/76561198312638244,,,No ratings yet,362890,,Posted July 8.,True,this game is a perfect remake of the original half life. personally one of the best remakes i have played in a long time. there are a few changes in the remake but for the most part its almost the same as the original half life.the game still needs Xen to be completed but all the other chapters are ready for you to play and enjoy. i say buy this game if you loved the original half life. but avoid it if you can t wait for xen to be completed.
59330,LydiaMorley,http://steamcommunity.com/id/LydiaMorley,,1 person found this review funny,1 of 2 people (50%) found this review helpful,273110,,Posted July 3.,True,had so much fun plaing this and collecting resources xD we won on my first try and killed final boss!
59331,LydiaMorley,http://steamcommunity.com/id/LydiaMorley,,,No ratings yet,730,,Posted July 20.,True,:D


In [12]:
k.loc[k['recommend'] == False]
k.describe(include='all')
k = k[["user_id", "item_id", "recommend"]]
k

Unnamed: 0,user_id,item_id,recommend
0,76561197970982479,1250,True
1,76561197970982479,22200,True
2,76561197970982479,43110,True
3,js41637,251610,True
4,js41637,227300,True
...,...,...,...
59328,76561198312638244,70,True
59329,76561198312638244,362890,True
59330,LydiaMorley,273110,True
59331,LydiaMorley,730,True


In [13]:
l = dataframes[1]
l = l[["user_id", "product_id", "recommended"]]
l = l.rename(columns={'product_id': "item_id", "recommended": 'recommend'})
l

Unnamed: 0,user_id,item_id,recommend
0,,725280,True
1,,328100,True
2,,328100,True
3,,35140,True
4,76561198007483075,35140,True
...,...,...,...
7793064,,252490,True
7793065,76561198089897928,252490,True
7793066,76561198048207033,252490,True
7793067,,252490,True


In [31]:
# u = k.append(l)
u = l
u = u.dropna()
u

Unnamed: 0,user_id,item_id,recommend
4,76561198007483075,35140,True
8,76561197970402776,707610,True
11,76561198060686749,328100,True
13,76561198023491401,35140,True
16,76561198115331805,35140,True
...,...,...,...
7793058,76561197962161824,252490,True
7793061,76561198010660367,252490,True
7793062,76561197983773018,252490,True
7793065,76561198089897928,252490,True


In [32]:
dct = {}
def map_to_consecutive_id(uuid):
  if uuid in dct:
    return dct[uuid]
  else:
    id = len(dct)
    dct[uuid] = id
    return id

#1) convert user user_ids to consecutive integer ID's
u['user_id_int'] = u['user_id'].progress_apply(map_to_consecutive_id)

#2) convert item_ids to to consecutive integer ID's
dct.clear()
u['item_id_int'] = u['item_id'].progress_apply(map_to_consecutive_id)

u.head()

100%|██████████| 3176223/3176223 [00:03<00:00, 844291.53it/s] 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  u['user_id_int'] = u['user_id'].progress_apply(map_to_consecutive_id)
100%|██████████| 3176223/3176223 [00:02<00:00, 1282803.47it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  u['item_id_int'] = u['item_id'].progress_apply(map_to_consecutive_id)


Unnamed: 0,user_id,item_id,recommend,user_id_int,item_id_int
4,76561198007483075,35140,True,0,0
8,76561197970402776,707610,True,1,1
11,76561198060686749,328100,True,2,2
13,76561198023491401,35140,True,3,0
16,76561198115331805,35140,True,4,0


In [33]:
u

Unnamed: 0,user_id,item_id,recommend,user_id_int,item_id_int
4,76561198007483075,35140,True,0,0
8,76561197970402776,707610,True,1,1
11,76561198060686749,328100,True,2,2
13,76561198023491401,35140,True,3,0
16,76561198115331805,35140,True,4,0
...,...,...,...,...,...
7793058,76561197962161824,252490,True,1485609,13997
7793061,76561198010660367,252490,True,1212724,13997
7793062,76561197983773018,252490,True,1485596,13997
7793065,76561198089897928,252490,True,1485610,13997


# 4. Pre-process interactions
- Drop reconsumption items
- Remove items with fewer than x interactions
- Remove users with fewer than x interactions

In [52]:
def preprocess_classic(df, minsup=3):
  """
  Goal: - Remove reconsumption items
        - Remove items that have less than minsup interactions
        - Remove users that have less than minsup interactions

  :input df: Dataframe containing user_id, item_id and time
  """
  before = df.shape[0]
  #drop reconsumption items
  df = df.drop_duplicates(subset=["user_id","item_id"])
  print("After drop_duplicates (reconsumption items): {} -> {}".format(before,df.shape[0]))
  #compute item/user counts
  g1 = df.groupby('item_id', as_index=False)['user_id'].size() # nr of users interacted with item
  g1 = g1.rename({'size': 'count_item'}, axis='columns')
  g2 = df.groupby('user_id', as_index=False)['item_id'].size() # nr of items user interacted with
  g2 = g2.rename({'size': 'count_user'}, axis='columns')
  df = pd.merge(df, g1, how='left', on=['item_id'])
  df = pd.merge(df, g2, how='left', on=['user_id'])
  display(df.head(5))
  #drop items occurring less than minsup times
  before = df.shape[0]
  df = df[df['count_item'] >= minsup]
  print("After dropping items with less than {} interactions: {} -> {}".format(minsup, before,df.shape[0]))
  before = df.shape[0]
  #drop users with less then minsup items in history
  df = df[df['count_user'] >= minsup]
  df = df[['user_id','item_id', "recommend"]]
  print("After dropping users with less than {} interactions: {} -> {}".format(minsup, before,df.shape[0]))
  return df

#Remark: ignoring rating, considering all reviews as implicit possitive feedback
#print number of users and items
interactions_df_processed = u[['user_id_int','item_id_int', "recommend"]]
interactions_df_processed = interactions_df_processed.rename(columns={"user_id_int": "user_id", "item_id_int": "item_id"})
print(f"number of unique users: {interactions_df_processed['user_id'].nunique()}")
print(f"number of unique items: {interactions_df_processed['item_id'].nunique()}")
interactions_df_processed = preprocess_classic(interactions_df_processed)
display(interactions_df_processed.head(5))
print(f"number of unique users: {interactions_df_processed['user_id'].nunique()}")
print(f"number of unique items: {interactions_df_processed['item_id'].nunique()}")

number of unique users: 1485611
number of unique items: 14513
After drop_duplicates (reconsumption items): 3176223 -> 2832522


Unnamed: 0,user_id,item_id,recommend,count_item,count_user
0,0,0,True,3173,6
1,1,1,True,1,7
2,2,2,True,22,55
3,3,0,True,3173,109
4,4,0,True,3173,1


After dropping items with less than 3 interactions: 2832522 -> 2829087
After dropping users with less than 3 interactions: 2829087 -> 1344188


Unnamed: 0,user_id,item_id,recommend
0,0,0,True
2,2,2,True
3,3,0,True
6,6,4,True
7,7,2,True


number of unique users: 224690
number of unique items: 11992


In [53]:
interactions_df_processed

Unnamed: 0,user_id,item_id,recommend
0,0,0,True
2,2,2,True
3,3,0,True
6,6,4,True
7,7,2,True
...,...,...,...
2832503,278009,13997,True
2832508,492340,13997,True
2832510,424642,13997,True
2832513,100002,13997,True


# 5. Create train/test split
Different options:
- **Time-based split**, i.e. split interactions before/after certain date. Keep all users with both training and test interactions for evaluations. Repeat for different train/test window, i.e. repeat for a number of key dates.
- **Session-based split**, i.e. split and keep first $x$ interactions for training and $|I_u| - x$  for testing, where $I_u = {i_1,\ldots,i_k}$ represents the user's history sorted on time
- **Random**, i.e. take $x$ random items for training and $|I_u| -x$ for testing
- **Leave-one-out**, i.e. take last (or random) item for testing

Additionaly, use techniques such as cross-validation and create multiple train/test splits and report both average and standard deviation.

In [54]:
#Session-based split:
sessions_df = interactions_df_processed.groupby(by='user_id', as_index=False)[['item_id', "recommend"]].agg(list)
display(sessions_df.head(10))

def split(items, percentage_train):
  no_train_items = math.floor(len(items) * percentage_train)
  return items[0:no_train_items], items[no_train_items:]

percentage_train = 0.8
sessions_df['train'] = sessions_df['item_id'].apply(lambda items: split(items, percentage_train)[0])
sessions_df['test'] = sessions_df['item_id'].apply(lambda items: split(items, percentage_train)[1])

sessions_df.head(10)

Unnamed: 0,user_id,item_id,recommend
0,0,"[0, 3871, 6589, 11847, 12663, 14499]","[True, True, True, True, True, True]"
1,1,"[3228, 4392, 8009, 10348, 11846, 13666]","[True, True, True, True, True, True]"
2,2,"[2, 135, 735, 1035, 1284, 160, 1559, 2422, 2537, 2626, 2718, 2846, 2885, 3057, 3131, 3499, 3898, 3828, 4207, 4385, 4435, 4605, 5042, 5488, 5714, 5339, 5930, 6167, 6585, 6795, 6956, 7250, 7449, 7542, 7926, 8190, 8330, 8479, 8503, 8784, 8795, 8840, 8980, 9038, 9014, 10166, 9395, 10850, 11086, 11182, 12081, 12758, 13165, 13308, 14345]","[True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True]"
3,3,"[0, 135, 284, 494, 611, 724, 902, 1020, 1211, 1368, 1407, 1462, 1540, 1739, 1925, 1931, 1934, 2130, 2220, 2389, 2519, 2556, 2633, 2643, 2820, 2898, 3111, 3123, 3163, 3187, 3716, 3747, 3806, 3871, 4249, 4484, 4743, 4781, 4826, 4922, 5747, 5825, 6220, 6250, 6267, 6351, 6582, 6956, 7063, 7193, 7265, 7343, 7352, 7704, 7765, 7847, 7966, 7971, 8067, 8690, 8821, 8823, 8886, 8980, 9390, 9647, 9999, 10076, 10088, 10096, 10197, 10287, 10300, 10819, 11143, 11194, 11202, 11319, 11368, 11691, 11850, 11926, 12028, 12101, 12107, 12129, 12232, 12246, 12423, 12488, 12616, 12644, 12678, 12960, 13001, 13037, 13196, 13529, 13567, 13586, ...]","[True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, ...]"
4,6,"[4, 1622, 12464]","[True, True, True]"
5,7,"[2, 22, 29, 52, 50, 90, 92, 101, 120, 128, 161, 159, 40, 200, 212, 135, 218, 136, 102, 229, 240, 0, 283, 299, 316, 338, 344, 341, 371, 368, 402, 398, 385, 426, 424, 431, 429, 465, 469, 478, 479, 480, 491, 495, 501, 509, 528, 544, 561, 569, 572, 555, 573, 587, 600, 602, 615, 643, 656, 655, 611, 665, 663, 654, 673, 689, 697, 704, 726, 731, 742, 748, 777, 724, 716, 799, 817, 826, 900, 945, 946, 956, 973, 980, 982, 1004, 948, 1015, 898, 1024, 1037, 1079, 1092, 1095, 1089, 1075, 1110, 1119, 1187, 1171, ...]","[True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, ...]"
6,9,"[5, 3732, 3783, 8059]","[True, True, True, True]"
7,10,"[6, 11, 36, 121, 119, 65, 241, 268, 267, 385, 510, 540, 568, 645, 367, 745, 783, 885, 897, 909, 942, 945, 967, 898, 1096, 1169, 1210, 1225, 1247, 1155, 1375, 1396, 1442, 1690, 1805, 1806, 1907, 1938, 1963, 1968, 1974, 1967, 2127, 2061, 2384, 2517, 2525, 2629, 2657, 2701, 2712, 2732, 2762, 2828, 2862, 2857, 2678, 2938, 2926, 3082, 3127, 3141, 3180, 3202, 3369, 3435, 3449, 3507, 3572, 3629, 3678, 3680, 3822, 3899, 3906, 3977, 3989, 4099, 4038, 4141, 4379, 4404, 4493, 4604, 4653, 4768, 4754, 4820, 4895, 4510, 5014, 5105, 5173, 5186, 5225, 5253, 5288, 5368, 5467, 5454, ...]","[True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, ...]"
8,11,"[7, 222, 694, 259, 1096, 1183, 2053, 2868, 3159, 5458, 8081, 12628]","[True, True, True, True, True, True, True, True, True, True, True, True]"
9,13,"[36, 380, 473, 938, 1163, 1315, 1390, 1471, 1621, 2780, 3083, 4235, 4417, 4570, 4652, 5072, 5155, 5406, 5443, 5472, 5852, 6028, 6548, 7207, 7716, 7819, 8144, 8177, 8392, 8898, 9346, 9911, 10055, 10182, 10470, 10876, 10891, 10907, 11382, 11828, 12209, 12254, 12512, 13053, 12880, 13908, 14038]","[True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True]"


Unnamed: 0,user_id,item_id,recommend,train,test
0,0,"[0, 3871, 6589, 11847, 12663, 14499]","[True, True, True, True, True, True]","[0, 3871, 6589, 11847]","[12663, 14499]"
1,1,"[3228, 4392, 8009, 10348, 11846, 13666]","[True, True, True, True, True, True]","[3228, 4392, 8009, 10348]","[11846, 13666]"
2,2,"[2, 135, 735, 1035, 1284, 160, 1559, 2422, 2537, 2626, 2718, 2846, 2885, 3057, 3131, 3499, 3898, 3828, 4207, 4385, 4435, 4605, 5042, 5488, 5714, 5339, 5930, 6167, 6585, 6795, 6956, 7250, 7449, 7542, 7926, 8190, 8330, 8479, 8503, 8784, 8795, 8840, 8980, 9038, 9014, 10166, 9395, 10850, 11086, 11182, 12081, 12758, 13165, 13308, 14345]","[True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True]","[2, 135, 735, 1035, 1284, 160, 1559, 2422, 2537, 2626, 2718, 2846, 2885, 3057, 3131, 3499, 3898, 3828, 4207, 4385, 4435, 4605, 5042, 5488, 5714, 5339, 5930, 6167, 6585, 6795, 6956, 7250, 7449, 7542, 7926, 8190, 8330, 8479, 8503, 8784, 8795, 8840, 8980, 9038]","[9014, 10166, 9395, 10850, 11086, 11182, 12081, 12758, 13165, 13308, 14345]"
3,3,"[0, 135, 284, 494, 611, 724, 902, 1020, 1211, 1368, 1407, 1462, 1540, 1739, 1925, 1931, 1934, 2130, 2220, 2389, 2519, 2556, 2633, 2643, 2820, 2898, 3111, 3123, 3163, 3187, 3716, 3747, 3806, 3871, 4249, 4484, 4743, 4781, 4826, 4922, 5747, 5825, 6220, 6250, 6267, 6351, 6582, 6956, 7063, 7193, 7265, 7343, 7352, 7704, 7765, 7847, 7966, 7971, 8067, 8690, 8821, 8823, 8886, 8980, 9390, 9647, 9999, 10076, 10088, 10096, 10197, 10287, 10300, 10819, 11143, 11194, 11202, 11319, 11368, 11691, 11850, 11926, 12028, 12101, 12107, 12129, 12232, 12246, 12423, 12488, 12616, 12644, 12678, 12960, 13001, 13037, 13196, 13529, 13567, 13586, ...]","[True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, ...]","[0, 135, 284, 494, 611, 724, 902, 1020, 1211, 1368, 1407, 1462, 1540, 1739, 1925, 1931, 1934, 2130, 2220, 2389, 2519, 2556, 2633, 2643, 2820, 2898, 3111, 3123, 3163, 3187, 3716, 3747, 3806, 3871, 4249, 4484, 4743, 4781, 4826, 4922, 5747, 5825, 6220, 6250, 6267, 6351, 6582, 6956, 7063, 7193, 7265, 7343, 7352, 7704, 7765, 7847, 7966, 7971, 8067, 8690, 8821, 8823, 8886, 8980, 9390, 9647, 9999, 10076, 10088, 10096, 10197, 10287, 10300, 10819, 11143, 11194, 11202, 11319, 11368, 11691, 11850, 11926, 12028, 12101, 12107, 12129, 12232]","[12246, 12423, 12488, 12616, 12644, 12678, 12960, 13001, 13037, 13196, 13529, 13567, 13586, 13714, 13735, 13741, 13911, 14015, 14041, 14169, 14303, 14406]"
4,6,"[4, 1622, 12464]","[True, True, True]","[4, 1622]",[12464]
5,7,"[2, 22, 29, 52, 50, 90, 92, 101, 120, 128, 161, 159, 40, 200, 212, 135, 218, 136, 102, 229, 240, 0, 283, 299, 316, 338, 344, 341, 371, 368, 402, 398, 385, 426, 424, 431, 429, 465, 469, 478, 479, 480, 491, 495, 501, 509, 528, 544, 561, 569, 572, 555, 573, 587, 600, 602, 615, 643, 656, 655, 611, 665, 663, 654, 673, 689, 697, 704, 726, 731, 742, 748, 777, 724, 716, 799, 817, 826, 900, 945, 946, 956, 973, 980, 982, 1004, 948, 1015, 898, 1024, 1037, 1079, 1092, 1095, 1089, 1075, 1110, 1119, 1187, 1171, ...]","[True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, ...]","[2, 22, 29, 52, 50, 90, 92, 101, 120, 128, 161, 159, 40, 200, 212, 135, 218, 136, 102, 229, 240, 0, 283, 299, 316, 338, 344, 341, 371, 368, 402, 398, 385, 426, 424, 431, 429, 465, 469, 478, 479, 480, 491, 495, 501, 509, 528, 544, 561, 569, 572, 555, 573, 587, 600, 602, 615, 643, 656, 655, 611, 665, 663, 654, 673, 689, 697, 704, 726, 731, 742, 748, 777, 724, 716, 799, 817, 826, 900, 945, 946, 956, 973, 980, 982, 1004, 948, 1015, 898, 1024, 1037, 1079, 1092, 1095, 1089, 1075, 1110, 1119, 1187, 1171, ...]","[11782, 11807, 11717, 11831, 11843, 11842, 11847, 11869, 11864, 11882, 11799, 11896, 11898, 11913, 11920, 11926, 11932, 11959, 11962, 11945, 11956, 11979, 11994, 12012, 11997, 12025, 12033, 12015, 12068, 12050, 12093, 12102, 12101, 12104, 12120, 12159, 12186, 12189, 12078, 12097, 12216, 12227, 12226, 12204, 12028, 12273, 12246, 12323, 12284, 12325, 12107, 12331, 12354, 12370, 12361, 12352, 12394, 12405, 12416, 12415, 12420, 12433, 12443, 12454, 12465, 12472, 12474, 12448, 12493, 12497, 12513, 12535, 12232, 12533, 12509, 12571, 12567, 12582, 12610, 12609, 12612, 12586, 12616, 12629, 12637, 12639, 12659, 12652, 12664, 12689, 12685, 12705, 12711, 12695, 12703, 12720, 12745, 12758, 12771, 12787, ...]"
6,9,"[5, 3732, 3783, 8059]","[True, True, True, True]","[5, 3732, 3783]",[8059]
7,10,"[6, 11, 36, 121, 119, 65, 241, 268, 267, 385, 510, 540, 568, 645, 367, 745, 783, 885, 897, 909, 942, 945, 967, 898, 1096, 1169, 1210, 1225, 1247, 1155, 1375, 1396, 1442, 1690, 1805, 1806, 1907, 1938, 1963, 1968, 1974, 1967, 2127, 2061, 2384, 2517, 2525, 2629, 2657, 2701, 2712, 2732, 2762, 2828, 2862, 2857, 2678, 2938, 2926, 3082, 3127, 3141, 3180, 3202, 3369, 3435, 3449, 3507, 3572, 3629, 3678, 3680, 3822, 3899, 3906, 3977, 3989, 4099, 4038, 4141, 4379, 4404, 4493, 4604, 4653, 4768, 4754, 4820, 4895, 4510, 5014, 5105, 5173, 5186, 5225, 5253, 5288, 5368, 5467, 5454, ...]","[True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, ...]","[6, 11, 36, 121, 119, 65, 241, 268, 267, 385, 510, 540, 568, 645, 367, 745, 783, 885, 897, 909, 942, 945, 967, 898, 1096, 1169, 1210, 1225, 1247, 1155, 1375, 1396, 1442, 1690, 1805, 1806, 1907, 1938, 1963, 1968, 1974, 1967, 2127, 2061, 2384, 2517, 2525, 2629, 2657, 2701, 2712, 2732, 2762, 2828, 2862, 2857, 2678, 2938, 2926, 3082, 3127, 3141, 3180, 3202, 3369, 3435, 3449, 3507, 3572, 3629, 3678, 3680, 3822, 3899, 3906, 3977, 3989, 4099, 4038, 4141, 4379, 4404, 4493, 4604, 4653, 4768, 4754, 4820, 4895, 4510, 5014, 5105, 5173, 5186, 5225, 5253, 5288, 5368, 5467, 5454, ...]","[11961, 12054, 12065, 12133, 12180, 12258, 12272, 12209, 12254, 12342, 12372, 12462, 12564, 12694, 12693, 12795, 12819, 12856, 12892, 12962, 12994, 12232, 12644, 13082, 13151, 13204, 13095, 13230, 13296, 13368, 13435, 13455, 13547, 13551, 13601, 13630, 13615, 13642, 13667, 13705, 13707, 13743, 13824, 13755, 13902, 13878, 14016, 14031, 14140, 14237, 14336, 14404, 14440, 14303]"
8,11,"[7, 222, 694, 259, 1096, 1183, 2053, 2868, 3159, 5458, 8081, 12628]","[True, True, True, True, True, True, True, True, True, True, True, True]","[7, 222, 694, 259, 1096, 1183, 2053, 2868, 3159]","[5458, 8081, 12628]"
9,13,"[36, 380, 473, 938, 1163, 1315, 1390, 1471, 1621, 2780, 3083, 4235, 4417, 4570, 4652, 5072, 5155, 5406, 5443, 5472, 5852, 6028, 6548, 7207, 7716, 7819, 8144, 8177, 8392, 8898, 9346, 9911, 10055, 10182, 10470, 10876, 10891, 10907, 11382, 11828, 12209, 12254, 12512, 13053, 12880, 13908, 14038]","[True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True]","[36, 380, 473, 938, 1163, 1315, 1390, 1471, 1621, 2780, 3083, 4235, 4417, 4570, 4652, 5072, 5155, 5406, 5443, 5472, 5852, 6028, 6548, 7207, 7716, 7819, 8144, 8177, 8392, 8898, 9346, 9911, 10055, 10182, 10470, 10876, 10891]","[10907, 11382, 11828, 12209, 12254, 12512, 13053, 12880, 13908, 14038]"


# 6. Evaluate quantitatively
Options are:
- **Hitrate@k**, i.e. percentage of users where top-$k$ recommendations is relevant
- **Recall@k**, i.e. percentage of top-$k$ recommendation that are relevant
- **NDCG@k**, i.e. like recall but rank of top-$k$ recommendation is weighted

Compare using relative gain, i.e. recall@10 from 10% tot 20% is a 100% gain (20-10/10 * 100)

In [55]:
import scipy.sparse

#Create scipy csr matrix
def create_sparse_matrix(sessions_df, column='train', shape=None):
  #flatten
  user_ids = []
  item_ids = []
  for idx, row in sessions_df.iterrows():
    items = row[column]
    user = row['user_id']
    user_ids.extend([user] * len(items))
    item_ids.extend(items)
  #create csr matrix
  values = np.ones(len(user_ids))
  matrix = scipy.sparse.csr_matrix((values, (user_ids, item_ids)), shape=shape, dtype=np.int32)
  return matrix


shape = (interactions_df_processed['user_id'].max() +1,  interactions_df_processed['item_id'].max() +1)
print(shape)
train_x = create_sparse_matrix(sessions_df, column='train', shape=shape)
y_true = create_sparse_matrix(sessions_df, column='test', shape=shape)
print(train_x)

(1465783, 14513)
  (0, 0)	1
  (0, 3871)	1
  (0, 6589)	1
  (0, 11847)	1
  (1, 3228)	1
  (1, 4392)	1
  (1, 8009)	1
  (1, 10348)	1
  (2, 2)	1
  (2, 135)	1
  (2, 160)	1
  (2, 735)	1
  (2, 1035)	1
  (2, 1284)	1
  (2, 1559)	1
  (2, 2422)	1
  (2, 2537)	1
  (2, 2626)	1
  (2, 2718)	1
  (2, 2846)	1
  (2, 2885)	1
  (2, 3057)	1
  (2, 3131)	1
  (2, 3499)	1
  (2, 3828)	1
  :	:
  (1422904, 13913)	1
  (1422904, 14041)	1
  (1422904, 14229)	1
  (1425457, 13999)	1
  (1425457, 14335)	1
  (1425989, 13997)	1
  (1425989, 14139)	1
  (1427307, 14302)	1
  (1427307, 14465)	1
  (1428166, 13997)	1
  (1428166, 14459)	1
  (1428271, 13586)	1
  (1428271, 14169)	1
  (1428806, 14303)	1
  (1428806, 14512)	1
  (1435954, 14139)	1
  (1435954, 14459)	1
  (1436934, 13911)	1
  (1436934, 14303)	1
  (1441085, 14235)	1
  (1441085, 14303)	1
  (1454134, 14335)	1
  (1454134, 14493)	1
  (1465782, 13586)	1
  (1465782, 13997)	1


In [56]:
#popularity recommender
class Popularity():
  def __init__(self, K=10):
    self.K = K

  def fit(self, X):
    items = list(X.nonzero()[1])
    sorted_scores = Counter(items).most_common()
    self.sorted_scores_ = [
      (item, score / sorted_scores[0][1]) for item, score in sorted_scores
    ]

  def predict(self, X):
    items, values = zip(*self.sorted_scores_[: self.K])

    users = set(X.nonzero()[0])

    U, I, V = [], [], []

    for user in users:
      U.extend([user] * self.K)
      I.extend(items)
      V.extend(values)

    score_matrix = scipy.sparse.csr_matrix((V, (U, I)), shape=X.shape)
    return score_matrix

K = 20
pop = Popularity(K=K)
pop.fit(train_x)
y_pred = pop.predict(train_x)
print(y_pred)

  (0, 48)	0.41013224064071524
  (0, 94)	0.2877630843732539
  (0, 160)	0.6483516483516484
  (0, 440)	0.5027938163531384
  (0, 1367)	0.31234866828087166
  (0, 1622)	0.30387409200968524
  (0, 2204)	0.4041720990873533
  (0, 2621)	0.37595455392065563
  (0, 2678)	0.4811883032222015
  (0, 3004)	0.46479791395045633
  (0, 3716)	0.9021233004283852
  (0, 3783)	0.3388899236356864
  (0, 3998)	0.3067610355745949
  (0, 4176)	0.3113242689513876
  (0, 5339)	0.5815794375116409
  (0, 5747)	0.4938536040230955
  (0, 6769)	0.36775935928478304
  (0, 7063)	1.0
  (0, 9395)	0.32287204321102625
  (0, 9466)	0.3309741106351276
  (1, 48)	0.41013224064071524
  (1, 94)	0.2877630843732539
  (1, 160)	0.6483516483516484
  (1, 440)	0.5027938163531384
  (1, 1367)	0.31234866828087166
  :	:
  (1454134, 5747)	0.4938536040230955
  (1454134, 6769)	0.36775935928478304
  (1454134, 7063)	1.0
  (1454134, 9395)	0.32287204321102625
  (1454134, 9466)	0.3309741106351276
  (1465782, 48)	0.41013224064071524
  (1465782, 94)	0.28776308437

In [57]:
#Evaluate recall@k
#Do elementwise multiplication of top K predicts and true interactions
def sparse_divide_nonzero(a: scipy.sparse.csr_matrix, b: scipy.sparse.csr_matrix) -> scipy.sparse.csr_matrix:
  return a.multiply(sparse_inverse_nonzero(b))

def sparse_inverse_nonzero(a: scipy.sparse.csr_matrix) -> scipy.sparse.csr_matrix:
  inv_a = a.copy()
  inv_a.data = 1 / inv_a.data
  return inv_a

scores = scipy.sparse.lil_matrix(y_pred.shape)
scores[y_pred.multiply(y_true).astype(bool)] = 1
scores = scores.tocsr()
scores = sparse_divide_nonzero(scores, scipy.sparse.csr_matrix(y_true.sum(axis=1))).sum(axis=1)
print("recall @ {}: {:.4f}".format(K, scores.mean()))

recall @ 20: 0.0042


In [None]:
from sklearn.metrics import ndcg_score

ndcg = ndcg_score(y_true.toarray(), y_pred.toarray())

MemoryError: Unable to allocate 79.2 GiB for an array with shape (1465783, 14513) and data type int32