# 0. Import packages

In [1]:
import numpy as np
import pandas as pd
import sklearn
import gzip
import json
from tqdm import tqdm
import os
from collections import Counter
from datetime import datetime
import math
tqdm.pandas() #for progres_apply etc.

In [2]:
#read file line-by-line and parse json, returns dataframe
def parse_json(filename_gzipped_python_json, read_max=-1):
  #read gzipped content
  f=gzip.open(filename_gzipped_python_json,'r')
  
  #parse json
  parse_data = []
  for line in tqdm(f): #tqdm is for showing progress bar, always good when processing large amounts of data
    line = line.decode('utf-8')
    line = line.replace('true','True') #difference json/python
    line = line.replace('false','False')
    parsed_result = eval(line) #load python nested datastructure
    parse_data.append(parsed_result)
    if read_max !=-1 and len(parse_data) > read_max:
      print(f'Break reading after {read_max} records')
      break
  print(f"Reading {len(parse_data)} rows.")

  #create dataframe
  df= pd.DataFrame.from_dict(parse_data)
  return df

# 1. Load Steam data

In [1]:
steam_path = 'data/'
metadata_games = 'steam_games.json.gz' 
user_items = 'australian_users_items.json.gz'
user_reviews = 'australian_user_reviews.json.gz'
game_bundles = 'bundle_data.json.gz'
steam_reviews= 'steam_reviews.json.gz'

In [4]:
dataframes = []
for dataset in [ user_reviews, steam_reviews]:
# for dataset in [ metadata_games, user_items, user_reviews, game_bundles, steam_reviews]:
  print(f"----- {dataset}-----")
  size = os.path.getsize(steam_path + dataset) 
  print(f'Size of file is {size / 1000000}MB')
  df_metadata = parse_json(steam_path + dataset)
  dataframes.append(df_metadata)
  pd.set_option('display.max_colwidth', None)
  display(df_metadata.head(2))
  #display(df_metadata.describe(include='all'))

----- australian_user_reviews.json.gz-----
Size of file is 6.940139MB


25799it [00:01, 14557.32it/s]

Reading 25799 rows.





Unnamed: 0,user_id,user_url,reviews
0,76561197970982479,http://steamcommunity.com/profiles/76561197970982479,"[{'funny': '', 'posted': 'Posted November 5, 2011.', 'last_edited': '', 'item_id': '1250', 'helpful': 'No ratings yet', 'recommend': True, 'review': 'Simple yet with great replayability. In my opinion does ""zombie"" hordes and team work better than left 4 dead plus has a global leveling system. Alot of down to earth ""zombie"" splattering fun for the whole family. Amazed this sort of FPS is so rare.'}, {'funny': '', 'posted': 'Posted July 15, 2011.', 'last_edited': '', 'item_id': '22200', 'helpful': 'No ratings yet', 'recommend': True, 'review': 'It's unique and worth a playthrough.'}, {'funny': '', 'posted': 'Posted April 21, 2011.', 'last_edited': '', 'item_id': '43110', 'helpful': 'No ratings yet', 'recommend': True, 'review': 'Great atmosphere. The gunplay can be a bit chunky at times but at the end of the day this game is definitely worth it and I hope they do a sequel...so buy the game so I get a sequel!'}]"
1,js41637,http://steamcommunity.com/id/js41637,"[{'funny': '', 'posted': 'Posted June 24, 2014.', 'last_edited': '', 'item_id': '251610', 'helpful': '15 of 20 people (75%) found this review helpful', 'recommend': True, 'review': 'I know what you think when you see this title ""Barbie Dreamhouse Party"" but do not be intimidated by it's title, this is easily one of my GOTYs. You don't get any of that cliche game mechanics that all the latest games have, this is simply good core gameplay. Yes, you can't 360 noscope your friends, but what you can do is show them up with your bad ♥♥♥ dance moves and put them to shame as you show them what True fashion and color combinations are.I know this game says for kids but, this is easily for any age range and any age will have a blast playing this.8/8'}, {'funny': '', 'posted': 'Posted September 8, 2013.', 'last_edited': '', 'item_id': '227300', 'helpful': '0 of 1 people (0%) found this review helpful', 'recommend': True, 'review': 'For a simple (it's actually not all that simple but it can be!) truck driving Simulator, it is quite a fun and relaxing game. Playing on simple (or easy?) its just the basic WASD keys for driving but (if you want) the game can be much harder and realistic with having to manually change gears, much harder turning, etc. And reversing in this game is a ♥♥♥♥♥, as I imagine it would be with an actual truck. Luckily, you don't have to reverse park it but you get extra points if you do cause it is bloody hard. But this is suprisingly a nice truck driving game and I had a bit of fun with it.'}, {'funny': '', 'posted': 'Posted November 29, 2013.', 'last_edited': '', 'item_id': '239030', 'helpful': '1 of 4 people (25%) found this review helpful', 'recommend': True, 'review': 'Very fun little game to play when your bored or as a time passer. Very gud. Do Recommend. pls buy'}]"


----- steam_reviews.json.gz-----
Size of file is 1350.067901MB


7793069it [05:05, 25546.70it/s]


Reading 7793069 rows.


Unnamed: 0,username,product_id,page_order,text,hours,recommended,products,date,early_access,page,compensation,found_funny,user_id
0,Chaos Syren,725280,0,This would not be acceptable as an entertainment even back in the day when these graphics were all there was to be had. No effort has been made to bring the player into any story or even entertain.,0.1,True,41.0,2017-12-17,False,1,,,
1,₮ʜᴇ Wᴀʀᴛᴏɴ,328100,0,looks like a facebook game,51.1,True,769.0,2017-12-27,False,1,,,


In [5]:
user_reviews_df = dataframes[0]
user_reviews_df = user_reviews_df.explode("reviews").reset_index(drop=True)
user_reviews_df

Unnamed: 0,user_id,user_url,reviews
0,76561197970982479,http://steamcommunity.com/profiles/76561197970982479,"{'funny': '', 'posted': 'Posted November 5, 2011.', 'last_edited': '', 'item_id': '1250', 'helpful': 'No ratings yet', 'recommend': True, 'review': 'Simple yet with great replayability. In my opinion does ""zombie"" hordes and team work better than left 4 dead plus has a global leveling system. Alot of down to earth ""zombie"" splattering fun for the whole family. Amazed this sort of FPS is so rare.'}"
1,76561197970982479,http://steamcommunity.com/profiles/76561197970982479,"{'funny': '', 'posted': 'Posted July 15, 2011.', 'last_edited': '', 'item_id': '22200', 'helpful': 'No ratings yet', 'recommend': True, 'review': 'It's unique and worth a playthrough.'}"
2,76561197970982479,http://steamcommunity.com/profiles/76561197970982479,"{'funny': '', 'posted': 'Posted April 21, 2011.', 'last_edited': '', 'item_id': '43110', 'helpful': 'No ratings yet', 'recommend': True, 'review': 'Great atmosphere. The gunplay can be a bit chunky at times but at the end of the day this game is definitely worth it and I hope they do a sequel...so buy the game so I get a sequel!'}"
3,js41637,http://steamcommunity.com/id/js41637,"{'funny': '', 'posted': 'Posted June 24, 2014.', 'last_edited': '', 'item_id': '251610', 'helpful': '15 of 20 people (75%) found this review helpful', 'recommend': True, 'review': 'I know what you think when you see this title ""Barbie Dreamhouse Party"" but do not be intimidated by it's title, this is easily one of my GOTYs. You don't get any of that cliche game mechanics that all the latest games have, this is simply good core gameplay. Yes, you can't 360 noscope your friends, but what you can do is show them up with your bad ♥♥♥ dance moves and put them to shame as you show them what True fashion and color combinations are.I know this game says for kids but, this is easily for any age range and any age will have a blast playing this.8/8'}"
4,js41637,http://steamcommunity.com/id/js41637,"{'funny': '', 'posted': 'Posted September 8, 2013.', 'last_edited': '', 'item_id': '227300', 'helpful': '0 of 1 people (0%) found this review helpful', 'recommend': True, 'review': 'For a simple (it's actually not all that simple but it can be!) truck driving Simulator, it is quite a fun and relaxing game. Playing on simple (or easy?) its just the basic WASD keys for driving but (if you want) the game can be much harder and realistic with having to manually change gears, much harder turning, etc. And reversing in this game is a ♥♥♥♥♥, as I imagine it would be with an actual truck. Luckily, you don't have to reverse park it but you get extra points if you do cause it is bloody hard. But this is suprisingly a nice truck driving game and I had a bit of fun with it.'}"
...,...,...,...
59328,76561198312638244,http://steamcommunity.com/profiles/76561198312638244,"{'funny': '', 'posted': 'Posted July 10.', 'last_edited': '', 'item_id': '70', 'helpful': 'No ratings yet', 'recommend': True, 'review': 'a must have classic from steam definitely worth buying.'}"
59329,76561198312638244,http://steamcommunity.com/profiles/76561198312638244,"{'funny': '', 'posted': 'Posted July 8.', 'last_edited': '', 'item_id': '362890', 'helpful': 'No ratings yet', 'recommend': True, 'review': 'this game is a perfect remake of the original half life. personally one of the best remakes i have played in a long time. there are a few changes in the remake but for the most part its almost the same as the original half life.the game still needs Xen to be completed but all the other chapters are ready for you to play and enjoy. i say buy this game if you loved the original half life. but avoid it if you can t wait for xen to be completed.'}"
59330,LydiaMorley,http://steamcommunity.com/id/LydiaMorley,"{'funny': '1 person found this review funny', 'posted': 'Posted July 3.', 'last_edited': '', 'item_id': '273110', 'helpful': '1 of 2 people (50%) found this review helpful', 'recommend': True, 'review': 'had so much fun plaing this and collecting resources xD we won on my first try and killed final boss!'}"
59331,LydiaMorley,http://steamcommunity.com/id/LydiaMorley,"{'funny': '', 'posted': 'Posted July 20.', 'last_edited': '', 'item_id': '730', 'helpful': 'No ratings yet', 'recommend': True, 'review': ':D'}"


In [6]:
user_reviews_df = pd.concat([user_reviews_df.drop(['reviews'], axis=1), user_reviews_df['reviews'].progress_apply(pd.Series)], axis=1)
user_reviews_df

100%|███████████████████████████████████| 59333/59333 [00:17<00:00, 3314.35it/s]


Unnamed: 0,user_id,user_url,0,funny,helpful,item_id,last_edited,posted,recommend,review
0,76561197970982479,http://steamcommunity.com/profiles/76561197970982479,,,No ratings yet,1250,,"Posted November 5, 2011.",True,"Simple yet with great replayability. In my opinion does ""zombie"" hordes and team work better than left 4 dead plus has a global leveling system. Alot of down to earth ""zombie"" splattering fun for the whole family. Amazed this sort of FPS is so rare."
1,76561197970982479,http://steamcommunity.com/profiles/76561197970982479,,,No ratings yet,22200,,"Posted July 15, 2011.",True,It's unique and worth a playthrough.
2,76561197970982479,http://steamcommunity.com/profiles/76561197970982479,,,No ratings yet,43110,,"Posted April 21, 2011.",True,Great atmosphere. The gunplay can be a bit chunky at times but at the end of the day this game is definitely worth it and I hope they do a sequel...so buy the game so I get a sequel!
3,js41637,http://steamcommunity.com/id/js41637,,,15 of 20 people (75%) found this review helpful,251610,,"Posted June 24, 2014.",True,"I know what you think when you see this title ""Barbie Dreamhouse Party"" but do not be intimidated by it's title, this is easily one of my GOTYs. You don't get any of that cliche game mechanics that all the latest games have, this is simply good core gameplay. Yes, you can't 360 noscope your friends, but what you can do is show them up with your bad ♥♥♥ dance moves and put them to shame as you show them what True fashion and color combinations are.I know this game says for kids but, this is easily for any age range and any age will have a blast playing this.8/8"
4,js41637,http://steamcommunity.com/id/js41637,,,0 of 1 people (0%) found this review helpful,227300,,"Posted September 8, 2013.",True,"For a simple (it's actually not all that simple but it can be!) truck driving Simulator, it is quite a fun and relaxing game. Playing on simple (or easy?) its just the basic WASD keys for driving but (if you want) the game can be much harder and realistic with having to manually change gears, much harder turning, etc. And reversing in this game is a ♥♥♥♥♥, as I imagine it would be with an actual truck. Luckily, you don't have to reverse park it but you get extra points if you do cause it is bloody hard. But this is suprisingly a nice truck driving game and I had a bit of fun with it."
...,...,...,...,...,...,...,...,...,...,...
59328,76561198312638244,http://steamcommunity.com/profiles/76561198312638244,,,No ratings yet,70,,Posted July 10.,True,a must have classic from steam definitely worth buying.
59329,76561198312638244,http://steamcommunity.com/profiles/76561198312638244,,,No ratings yet,362890,,Posted July 8.,True,this game is a perfect remake of the original half life. personally one of the best remakes i have played in a long time. there are a few changes in the remake but for the most part its almost the same as the original half life.the game still needs Xen to be completed but all the other chapters are ready for you to play and enjoy. i say buy this game if you loved the original half life. but avoid it if you can t wait for xen to be completed.
59330,LydiaMorley,http://steamcommunity.com/id/LydiaMorley,,1 person found this review funny,1 of 2 people (50%) found this review helpful,273110,,Posted July 3.,True,had so much fun plaing this and collecting resources xD we won on my first try and killed final boss!
59331,LydiaMorley,http://steamcommunity.com/id/LydiaMorley,,,No ratings yet,730,,Posted July 20.,True,:D


In [7]:
#user_reviews_df.loc[user_reviews_df['recommend'] == False]
user_reviews_df = user_reviews_df[["user_id", "item_id", "recommend"]]
user_reviews_df

Unnamed: 0,user_id,item_id,recommend
0,76561197970982479,1250,True
1,76561197970982479,22200,True
2,76561197970982479,43110,True
3,js41637,251610,True
4,js41637,227300,True
...,...,...,...
59328,76561198312638244,70,True
59329,76561198312638244,362890,True
59330,LydiaMorley,273110,True
59331,LydiaMorley,730,True


In [8]:
steam_reviews_df = dataframes[1]
steam_reviews_df = steam_reviews_df[["user_id", "product_id", "recommended"]]
steam_reviews_df = steam_reviews_df.rename(columns={'product_id': "item_id", "recommended": 'recommend'})
steam_reviews_df

Unnamed: 0,user_id,item_id,recommend
0,,725280,True
1,,328100,True
2,,328100,True
3,,35140,True
4,76561198007483075,35140,True
...,...,...,...
7793064,,252490,True
7793065,76561198089897928,252490,True
7793066,76561198048207033,252490,True
7793067,,252490,True


In [9]:
# df = user_reviews_df.append(steam_reviews_df)
interactions_df = steam_reviews_df
interactions_df = interactions_df.dropna()
interactions_df

Unnamed: 0,user_id,item_id,recommend
4,76561198007483075,35140,True
8,76561197970402776,707610,True
11,76561198060686749,328100,True
13,76561198023491401,35140,True
16,76561198115331805,35140,True
...,...,...,...
7793058,76561197962161824,252490,True
7793061,76561198010660367,252490,True
7793062,76561197983773018,252490,True
7793065,76561198089897928,252490,True


# 4. Pre-process interactions
- Drop reconsumption items
- Remove items with fewer than x interactions
- Remove users with fewer than x interactions

In [10]:
def preprocess_classic(df, minsup=5):
  """
  Goal: - Remove reconsumption items
        - Remove items that have less than minsup interactions
        - Remove users that have less than minsup interactions

  :input df: Dataframe containing user_id, item_id and time
  """
  before = df.shape[0]
  #drop reconsumption items
  df = df.drop_duplicates(subset=["user_id","item_id"])
  print("After drop_duplicates (reconsumption items): {} -> {}".format(before,df.shape[0]))
  #compute item/user counts
  g1 = df.groupby('item_id', as_index=False)['user_id'].size() # nr of users interacted with item
  g1 = g1.rename({'size': 'count_item'}, axis='columns')
  g2 = df.groupby('user_id', as_index=False)['item_id'].size() # nr of items user interacted with
  g2 = g2.rename({'size': 'count_user'}, axis='columns')
  df = pd.merge(df, g1, how='left', on=['item_id'])
  df = pd.merge(df, g2, how='left', on=['user_id'])
  display(df.head(5))
  #drop items occurring less than minsup times
  before = df.shape[0]
  df = df[df['count_item'] >= minsup]
  print("After dropping items with less than {} interactions: {} -> {}".format(minsup, before,df.shape[0]))
  before = df.shape[0]
  #drop users with less then minsup items in history
  df = df[df['count_user'] >= minsup]
  df = df[['user_id','item_id', "recommend"]]
  print("After dropping users with less than {} interactions: {} -> {}".format(minsup, before,df.shape[0]))
  return df

#Remark: ignoring rating, considering all reviews as implicit possitive feedback
#print number of users and items
interactions_df_processed = interactions_df[['user_id','item_id', "recommend"]]
print(f"number of unique users: {interactions_df_processed['user_id'].nunique()}")
print(f"number of unique items: {interactions_df_processed['item_id'].nunique()}")
interactions_df_processed = preprocess_classic(interactions_df_processed)
display(interactions_df_processed.head(5))
print(f"number of unique users: {interactions_df_processed['user_id'].nunique()}")
print(f"number of unique items: {interactions_df_processed['item_id'].nunique()}")

number of unique users: 1485611
number of unique items: 14513
After drop_duplicates (reconsumption items): 3176223 -> 2832522


Unnamed: 0,user_id,item_id,recommend,count_item,count_user
0,76561198007483075,35140,True,3173,6
1,76561197970402776,707610,True,1,7
2,76561198060686749,328100,True,22,55
3,76561198023491401,35140,True,3173,109
4,76561198115331805,35140,True,3173,1


After dropping items with less than 5 interactions: 2832522 -> 2823984
After dropping users with less than 5 interactions: 2823984 -> 901061


Unnamed: 0,user_id,item_id,recommend
0,76561198007483075,35140,True
2,76561198060686749,328100,True
3,76561198023491401,35140,True
7,76561198011965365,328100,True
11,76561197999294964,506510,True


number of unique users: 92428
number of unique items: 10551


In [11]:
interactions_df_processed

Unnamed: 0,user_id,item_id,recommend
0,76561198007483075,35140,True
2,76561198060686749,328100,True
3,76561198023491401,35140,True
7,76561198011965365,328100,True
11,76561197999294964,506510,True
...,...,...,...
2832480,76561198000361299,252490,True
2832485,76561198068325674,252490,True
2832502,76561198039073768,252490,True
2832508,76561197970622242,252490,True


In [12]:
dct = {}
def map_to_consecutive_id(uuid):
  if uuid in dct:
    return dct[uuid]
  else:
    id = len(dct)
    dct[uuid] = id
    return id

#1) convert user user_ids to consecutive integer ID's
interactions_df_processed['user_id_int'] = interactions_df_processed['user_id'].progress_apply(map_to_consecutive_id)

#2) convert item_ids to to consecutive integer ID's
dct.clear()
interactions_df_processed['item_id_int'] = interactions_df_processed['item_id'].progress_apply(map_to_consecutive_id)

interactions_df_processed.head()

interactions_df_processed = interactions_df_processed[["user_id_int", "item_id_int", "recommend"]]
interactions_df_processed = interactions_df_processed.rename(columns={"item_id_int": "item_id", "user_id_int": "user_id"})
interactions_df_processed

100%|███████████████████████████████| 901061/901061 [00:01<00:00, 680241.61it/s]
100%|██████████████████████████████| 901061/901061 [00:00<00:00, 1284583.75it/s]


Unnamed: 0,user_id,item_id,recommend
0,0,0,True
2,1,1,True
3,2,0,True
7,3,1,True
11,4,2,True
...,...,...,...
2832480,33664,10175,True
2832485,19272,10175,True
2832502,37868,10175,True
2832508,79147,10175,True


In [13]:
import pickle
pickle.dump(interactions_df_processed, open("interactions.p", "wb"))

In [2]:
import pickle
interactions_df_processed = pickle.load(open("interactions.p", "rb"))

# 5. Create train/test split
Different options:
- **Time-based split**, i.e. split interactions before/after certain date. Keep all users with both training and test interactions for evaluations. Repeat for different train/test window, i.e. repeat for a number of key dates.
- **Session-based split**, i.e. split and keep first $x$ interactions for training and $|I_u| - x$  for testing, where $I_u = {i_1,\ldots,i_k}$ represents the user's history sorted on time
- **Random**, i.e. take $x$ random items for training and $|I_u| -x$ for testing
- **Leave-one-out**, i.e. take last (or random) item for testing

Additionaly, use techniques such as cross-validation and create multiple train/test splits and report both average and standard deviation.

In [3]:
#Session-based split:
sessions_df = interactions_df_processed.groupby(by='user_id', as_index=False)[['item_id', "recommend"]].agg(list)
display(sessions_df.head(10))

def split(items, percentage_train):
  no_train_items = math.floor(len(items) * percentage_train)
  return items[0:no_train_items], items[no_train_items:]

percentage_train = 0.8
sessions_df['train'] = sessions_df['item_id'].apply(lambda items: split(items, percentage_train)[0])
sessions_df['test'] = sessions_df['item_id'].apply(lambda items: split(items, percentage_train)[1])

sessions_df.head(10)

Unnamed: 0,user_id,item_id,recommend
0,0,"[0, 2837, 4830, 8609, 9214, 10541]","[True, True, True, True, True, True]"
1,1,"[1, 100, 546, 769, 941, 116, 1138, 1770, 1852,...","[True, True, True, True, True, True, True, Tru..."
2,2,"[0, 100, 212, 359, 451, 536, 666, 757, 891, 10...","[True, True, True, True, True, True, True, Tru..."
3,3,"[1, 13, 18, 36, 34, 65, 68, 74, 90, 95, 115, 1...","[True, True, True, True, True, True, True, Tru..."
4,4,"[2, 165, 512, 194, 812, 871, 1499, 2101, 2318,...","[True, True, True, True, True, True, True, Tru..."
5,5,"[0, 1954, 3174, 5947, 6161, 8520, 8747, 10176,...","[True, True, True, True, True, True, True, Tru..."
6,6,"[3, 37, 39, 63, 45, 104, 123, 162, 194, 224, 2...","[True, True, True, True, True, True, True, Tru..."
7,7,"[4, 33, 529, 515, 1166, 1190, 1331, 1614, 2767...","[True, True, True, True, True, True, True, Tru..."
8,8,"[4, 2117, 2425, 2610, 2601, 2723, 3620, 5194, ...","[True, True, True, True, True, True, True, Tru..."
9,9,"[5, 519, 854, 1248, 1448, 1853, 2414, 2723, 32...","[True, True, True, True, True, True, True, Tru..."


Unnamed: 0,user_id,item_id,recommend,train,test
0,0,"[0, 2837, 4830, 8609, 9214, 10541]","[True, True, True, True, True, True]","[0, 2837, 4830, 8609]","[9214, 10541]"
1,1,"[1, 100, 546, 769, 941, 116, 1138, 1770, 1852,...","[True, True, True, True, True, True, True, Tru...","[1, 100, 546, 769, 941, 116, 1138, 1770, 1852,...","[6611, 6592, 7422, 6869, 7905, 8079, 8788, 928..."
2,2,"[0, 100, 212, 359, 451, 536, 666, 757, 891, 10...","[True, True, True, True, True, True, True, Tru...","[0, 100, 212, 359, 451, 536, 666, 757, 891, 10...","[8918, 9045, 9087, 9175, 9199, 9227, 9430, 945..."
3,3,"[1, 13, 18, 36, 34, 65, 68, 74, 90, 95, 115, 1...","[True, True, True, True, True, True, True, Tru...","[1, 13, 18, 36, 34, 65, 68, 74, 90, 95, 115, 1...","[8583, 8520, 8600, 8608, 8607, 8609, 8622, 861..."
4,4,"[2, 165, 512, 194, 812, 871, 1499, 2101, 2318,...","[True, True, True, True, True, True, True, Tru...","[2, 165, 512, 194, 812, 871, 1499, 2101, 2318]","[4011, 5906, 9187]"
5,5,"[0, 1954, 3174, 5947, 6161, 8520, 8747, 10176,...","[True, True, True, True, True, True, True, Tru...","[0, 1954, 3174, 5947, 6161, 8520, 8747]","[10176, 10296]"
6,6,"[3, 37, 39, 63, 45, 104, 123, 162, 194, 224, 2...","[True, True, True, True, True, True, True, Tru...","[3, 37, 39, 63, 45, 104, 123, 162, 194, 224, 2...","[8857, 8869, 8879, 8894, 8825, 8921, 8926, 896..."
7,7,"[4, 33, 529, 515, 1166, 1190, 1331, 1614, 2767...","[True, True, True, True, True, True, True, Tru...","[4, 33, 529, 515, 1166, 1190, 1331, 1614, 2767...","[7609, 7890, 9581, 10117, 10176, 10404, 10550]"
8,8,"[4, 2117, 2425, 2610, 2601, 2723, 3620, 5194, ...","[True, True, True, True, True, True, True, Tru...","[4, 2117, 2425, 2610, 2601, 2723, 3620]","[5194, 8420]"
9,9,"[5, 519, 854, 1248, 1448, 1853, 2414, 2723, 32...","[True, True, True, True, True, True, True, Tru...","[5, 519, 854, 1248, 1448, 1853, 2414, 2723, 32...","[6016, 6410, 9199, 10117]"


# 6. Evaluate quantitatively
Options are:
- **Hitrate@k**, i.e. percentage of users where top-$k$ recommendations is relevant
- **Recall@k**, i.e. percentage of top-$k$ recommendation that are relevant
- **NDCG@k**, i.e. like recall but rank of top-$k$ recommendation is weighted

Compare using relative gain, i.e. recall@10 from 10% tot 20% is a 100% gain (20-10/10 * 100)

In [4]:
import scipy.sparse

#Create scipy csr matrix
def create_sparse_matrix(sessions_df, column='train', shape=None):
  #flatten
  user_ids = []
  item_ids = []
  for idx, row in sessions_df.iterrows():
    items = row[column]
    user = row['user_id']
    user_ids.extend([user] * len(items))
    item_ids.extend(items)
  #create csr matrix
  values = np.ones(len(user_ids))
  matrix = scipy.sparse.csr_matrix((values, (user_ids, item_ids)), shape=shape, dtype=np.int32)
  return matrix


shape = (interactions_df_processed['user_id'].max() +1,  interactions_df_processed['item_id'].max() +1)
print(shape)
train_x = create_sparse_matrix(sessions_df, column='train', shape=shape)
y_true = create_sparse_matrix(sessions_df, column='test', shape=shape)
print(train_x)

(92428, 10551)
  (0, 0)	1
  (0, 2837)	1
  (0, 4830)	1
  (0, 8609)	1
  (1, 1)	1
  (1, 100)	1
  (1, 116)	1
  (1, 546)	1
  (1, 769)	1
  (1, 941)	1
  (1, 1138)	1
  (1, 1770)	1
  (1, 1852)	1
  (1, 1925)	1
  (1, 1982)	1
  (1, 2085)	1
  (1, 2111)	1
  (1, 2243)	1
  (1, 2300)	1
  (1, 2561)	1
  (1, 2802)	1
  (1, 2857)	1
  (1, 3087)	1
  (1, 3217)	1
  (1, 3258)	1
  :	:
  (92422, 8907)	1
  (92422, 8948)	1
  (92422, 9734)	1
  (92422, 9844)	1
  (92422, 9965)	1
  (92423, 8825)	1
  (92423, 9084)	1
  (92423, 9323)	1
  (92423, 9830)	1
  (92424, 8907)	1
  (92424, 9099)	1
  (92424, 9982)	1
  (92424, 10070)	1
  (92425, 9226)	1
  (92425, 9596)	1
  (92425, 9609)	1
  (92425, 9671)	1
  (92426, 9657)	1
  (92426, 9836)	1
  (92426, 10002)	1
  (92426, 10277)	1
  (92427, 10054)	1
  (92427, 10345)	1
  (92427, 10422)	1
  (92427, 10522)	1


In [5]:
#popularity recommender
class Popularity():
  def __init__(self, K=10):
    self.K = K

  def fit(self, X):
    items = list(X.nonzero()[1])
    sorted_scores = Counter(items).most_common()
    self.sorted_scores_ = [
      (item, score / sorted_scores[0][1]) for item, score in sorted_scores
    ]

  def predict(self, X):
    items, values = zip(*self.sorted_scores_[: self.K])

    users = set(X.nonzero()[0])

    U, I, V = [], [], []

    for user in users:
      U.extend([user] * self.K)
      I.extend(items)
      V.extend(values)

    score_matrix = scipy.sparse.csr_matrix((V, (U, I)), shape=X.shape)
    return score_matrix

K = 100
pop = Popularity(K=K)
pop.fit(train_x)
y_pred = pop.predict(train_x)
print(y_pred)

  (0, 0)	0.24233128834355827
  (0, 33)	0.37661895023858216
  (0, 73)	0.29703476482617586
  (0, 100)	0.2634628493524199
  (0, 116)	0.6540558963871848
  (0, 119)	0.17007498295841855
  (0, 194)	0.32498295841854125
  (0, 238)	0.30214723926380366
  (0, 324)	0.5509543285616906
  (0, 422)	0.18711656441717792
  (0, 854)	0.26942740286298567
  (0, 863)	0.1663258350374915
  (0, 1005)	0.3771301976823449
  (0, 1006)	0.17467620995228358
  (0, 1064)	0.2065439672801636
  (0, 1108)	0.18779822767552828
  (0, 1166)	0.18404907975460122
  (0, 1190)	0.29226312201772325
  (0, 1316)	0.29720518064076346
  (0, 1317)	0.31441717791411045
  (0, 1543)	0.32515337423312884
  (0, 1570)	0.2834014996591684
  (0, 1614)	0.3832651670074983
  (0, 1676)	0.2119972733469666
  (0, 1798)	0.2091002044989775
  :	:
  (92427, 5770)	0.20824812542603954
  (92427, 5791)	0.22869802317655077
  (92427, 5809)	0.24573960463531017
  (92427, 5823)	0.22324471710974778
  (92427, 5825)	0.2339809134287662
  (92427, 5858)	0.20756646216768918
  (92

In [6]:
#Evaluate recall@k
#Do elementwise multiplication of top K predicts and true interactions
def sparse_divide_nonzero(a: scipy.sparse.csr_matrix, b: scipy.sparse.csr_matrix) -> scipy.sparse.csr_matrix:
  return a.multiply(sparse_inverse_nonzero(b))

def sparse_inverse_nonzero(a: scipy.sparse.csr_matrix) -> scipy.sparse.csr_matrix:
  inv_a = a.copy()
  inv_a.data = 1 / inv_a.data
  return inv_a

scores = scipy.sparse.lil_matrix(y_pred.shape)
scores[y_pred.multiply(y_true).astype(bool)] = 1
scores = scores.tocsr()
scores = sparse_divide_nonzero(scores, scipy.sparse.csr_matrix(y_true.sum(axis=1))).sum(axis=1)
print("recall @ {}: {:.4f}".format(K, scores.mean()))

recall @ 100: 0.0618


In [7]:
#from sklearn.metrics import ndcg_score

#ndcg = ndcg_score(y_true.toarray(), y_pred.toarray())

## Algorithm

In [5]:
from rules import Rules, Rule
from condset import Condset

In [6]:
b = Rules()
b.numrules

0

In [11]:
print(train_x)

  (0, 0)	1
  (0, 2837)	1
  (0, 4830)	1
  (0, 8609)	1
  (1, 1)	1
  (1, 100)	1
  (1, 116)	1
  (1, 546)	1
  (1, 769)	1
  (1, 941)	1
  (1, 1138)	1
  (1, 1770)	1
  (1, 1852)	1
  (1, 1925)	1
  (1, 1982)	1
  (1, 2085)	1
  (1, 2111)	1
  (1, 2243)	1
  (1, 2300)	1
  (1, 2561)	1
  (1, 2802)	1
  (1, 2857)	1
  (1, 3087)	1
  (1, 3217)	1
  (1, 3258)	1
  :	:
  (92422, 8907)	1
  (92422, 8948)	1
  (92422, 9734)	1
  (92422, 9844)	1
  (92422, 9965)	1
  (92423, 8825)	1
  (92423, 9084)	1
  (92423, 9323)	1
  (92423, 9830)	1
  (92424, 8907)	1
  (92424, 9099)	1
  (92424, 9982)	1
  (92424, 10070)	1
  (92425, 9226)	1
  (92425, 9596)	1
  (92425, 9609)	1
  (92425, 9671)	1
  (92426, 9657)	1
  (92426, 9836)	1
  (92426, 10002)	1
  (92426, 10277)	1
  (92427, 10054)	1
  (92427, 10345)	1
  (92427, 10422)	1
  (92427, 10522)	1


In [162]:
k = train_x
print(k.getcol(0))

  (0, 0)	1
  (2, 0)	1
  (3, 0)	1
  (5, 0)	1
  (21, 0)	1
  (31, 0)	1
  (34, 0)	1
  (35, 0)	1
  (62, 0)	1
  (64, 0)	1
  (68, 0)	1
  (73, 0)	1
  (74, 0)	1
  (76, 0)	1
  (81, 0)	1
  (94, 0)	1
  (96, 0)	1
  (98, 0)	1
  (125, 0)	1
  (127, 0)	1
  (135, 0)	1
  (136, 0)	1
  (174, 0)	1
  (191, 0)	1
  (211, 0)	1
  :	:
  (12666, 0)	1
  (12673, 0)	1
  (12680, 0)	1
  (12683, 0)	1
  (12685, 0)	1
  (12686, 0)	1
  (12687, 0)	1
  (12744, 0)	1
  (12755, 0)	1
  (12759, 0)	1
  (12781, 0)	1
  (12782, 0)	1
  (12823, 0)	1
  (12825, 0)	1
  (12826, 0)	1
  (12828, 0)	1
  (12857, 0)	1
  (12876, 0)	1
  (12938, 0)	1
  (12939, 0)	1
  (12941, 0)	1
  (12954, 0)	1
  (13067, 0)	1
  (13069, 0)	1
  (13070, 0)	1


In [9]:
q = train_x.getrow(0)
t = scipy.sparse.csc_matrix(([1, 1, 1, 1],([0, 1, 1, 1],[4830, 8609, 0, 8])), shape=(2,q.shape[1]))
print(q.shape)
print(q)
print(t.shape)
print(t)
print("--- output ---")

for i in range(t.shape[0]):
    print(f"Number of items in session {i}:", t.getrow(i).nnz)
    print(f"Number of matches for session {i}:", q.multiply(t.getrow(i)).astype(bool).sum())
    
    
print(q.multiply(t))

(1, 10551)
  (0, 0)	1
  (0, 2837)	1
  (0, 4830)	1
  (0, 8609)	1
(2, 10551)
  (1, 0)	1
  (1, 8)	1
  (0, 4830)	1
  (1, 8609)	1
--- output ---
Number of items in session 0: 1
Number of matches for session 0: 1
Number of items in session 1: 3
Number of matches for session 1: 2
  (1, 0)	1
  (0, 4830)	1
  (1, 8609)	1


In [6]:
%load_ext line_profiler

In [27]:
def freq1condset(transactions, targetItem, minsup):
    condsets = []
    consup_list = transactions.getnnz(0)
    for i in tqdm(range(transactions.shape[1])):
        condsup = consup_list[i]

        if condsup < minsup:
            continue
            
            
        tmp = scipy.sparse.csr_matrix((np.ones(len([i] + [targetItem])), (np.zeros(len([i] + [targetItem])), [i] + [targetItem])), shape=(1,transactions.shape[1]))

        rulesup = transactions.dot(tmp.transpose())
        rulesup = rulesup.multiply(rulesup == 2) 
        rulesup.eliminate_zeros()
        rulesup = rulesup.nnz
    
        if rulesup > 0:
            print(rulesup)
    
        if rulesup > minsup:
            c = Condset([i])
            c.condsupCount = condsup
            c.rulesupCount = rulesup
            condsets.append(c)
        
    return condsets




In [56]:
freq_condset = freq1condset(train_x, 8, 20)


 30%|███       | 3166/10551 [00:07<00:18, 409.41it/s]


KeyboardInterrupt: 

In [None]:
i = 1
# print(freq_condset[i].items, freq_condset[i].condsupCount)
print(freq_condset[i].items, freq_condset[i].condsupCount, freq_condset[i].rulesupCount)

In [None]:
%lprun -f freq1condset freq1condset(train_x.transpose(), 8, 20)

In [None]:
print(len(req_sets))

In [7]:
def genRules(condsets, targetItem, maxNumrules, minsup):
    R = Rules()
    for condset in condsets:
        if condset.rulesupCount > minsup:
            R.rules.append(Rule(condset, targetItem))
            if len(R) >= maxNumrules:
                R.aboveMaxRulenumFlag = True
                return R
    
    return R

In [8]:
for i in range(5):
    for j in range(i+1, 5):
        print(i, j)

0 1
0 2
0 3
0 4
1 2
1 3
1 4
2 3
2 4
3 4


In [37]:
"""
(145), (247) -> (1245) (1457) (1247) (2457)
"""

def candidateGEN(Fk):
    candidate_conds = []
    
    for i in range(len(Fk)):
        for j in range(i+1, len(Fk)):
            item_i = Fk[i].items
            item_j = Fk[j].items

            if item_i[:-1] == item_j[:-1]:
                new_condset = item_i + [item_j[-1]]
                candidate_conds.append(Condset(new_condset))
                
    return candidate_conds

In [38]:
train_x.transpose().getrow(1).toarray()

array([[0, 1, 0, ..., 0, 0, 0]], dtype=int32)

In [39]:
def condsetsInt(condsets, transaction, transactions):
    # loop over condsets
    # if condset in transaction, store condset in list
    conds_in_t = []
    
    t = transactions.getrow(transaction).toarray()
    for c in condsets:
        if c in t:
            conds_in_t.append(c)
            
    return conds_in_t
    

In [40]:
def frequentConds(Ck):
    pass

In [41]:
def maxNumrulesWithSup(maxNumrules, R, R1):
    pass

In [60]:
def setMinsupportCount(transactions, targetItem):
    pass

In [61]:
def ASARM2(T, targetItem, minConf, minNumRules, maxNumRules, minsupCount):
    F1 = freq1condset(T, targetItem, minsupCount)
    R = genRules(F1, targetItem, maxNumRules, minsupCount)
    
    k = 2
    Fk = F1

    print(len(Fk))
    
    while len(Fk) > 0 and not R.aboveMaxRulenumFlag:
        Ck = candidateGEN(Fk)
        for t in (range(T.shape[0])):
            print("HIER")
            Ct = condsetsInt(Ck, t, T)
            print(len(Ct))
            
            for c in Ct:
                c.condsupCount += 1
                if targetItem in t:
                    c.rulesupCount += 1
        
        Fk = frequentConds(Ck, minsupCount)
        R1 = genRules(Fk, targetItem, maxNumRules)
        if R.numrules + R1.numrules > maxNumRules or R1.aboveMaxRumenumFlag:
            R.aboveMaxRulenumFlag = True
        R = maxNumrulesWithSup(maxNumRules, R, R1)
        
        k += 1
    
    if R.numrules < minNumRules:
        R.belowMinNumrulesFlag = True
    
    return R

In [73]:
def ASARM1(T, targetItem, minConf, minNumrules, maxNumrules):
#     minsupportCount = setMinsupportCount(T, targetItem)
    minsupportCount = 10

    R = ASARM2(T, targetItem, minConf, minNumrules, maxNumrules, minsupportCount)
    while R.aboveMaxRulenumFlag or R.belowMinNumrulesFlag:
        print(minsupportCount)
        if R.aboveMaxRulenumFlag:
            if minsupportCount == len(T):
                return R.rules
            minsupportCount += 1
            R1 = R
            R = ASARM2(T, targetItem, minConf, minNumrules, maxNumrules, minsupportCount)
            if R.belowMinNumrulesFlag:
                return maxNumrulesWithSup(maxNumrules, R, R1)
        else:
            if minsupportCount == 0:
                return R.rules
            minsupportCount -= 1
            R1 = R
            R = ASARM2(T, targetItem, minConf, minNumrules, maxNumrules, minsupportCount)
            if R.aboveMaxRulenumFlag:
                return maxNumrulesWithSup(maxNumrules, R, R1)
            
    return R.rules
            

In [74]:
r = ASARM1(train_x.transpose(), 578, 0.8, 100, 500)

100%|██████████| 92428/92428 [03:20<00:00, 460.82it/s]


144
HIER
10296
HIER
10296
HIER
10296
HIER
10296
HIER
10296
HIER
10296
HIER
10296
HIER
10296
HIER
10296
HIER
10296
HIER
10296
HIER
10296
HIER
10296
HIER
10296
HIER
10296
HIER
10296


KeyboardInterrupt: 