In [336]:
import pandas as pd
import numpy as np
import time
import os.path as path
from ast import literal_eval
from itertools import product
import matplotlib.pyplot as plt

from surprise import BaselineOnly
from surprise import Dataset
from surprise import Reader
from surprise.model_selection.split import train_test_split
from surprise.model_selection import cross_validate, GridSearchCV
import pandas as pd
import numpy as np
import os, io
from surprise import KNNBasic, KNNWithMeans
from surprise import SVDpp
from surprise import SVD
from surprise import accuracy

## KNN model

In [337]:
## read in json files 
ratings_df  = pd.read_csv('../processed_data/ratings.csv')

In [338]:
reader = Reader(rating_scale=(1,5))  #invoke reader instance of surprise library
data=Dataset.load_from_df(ratings_df,reader) #load dataset into Surprise datastructure Dataset

In [339]:
ratings_df.drop_duplicates()

Unnamed: 0,user_id,item_id,rating
0,76561197970982479,10,1
1,76561197970982479,20,1
2,76561197970982479,30,1
3,76561197970982479,40,1
4,76561197970982479,50,1
...,...,...,...
5035430,76561198329548331,346330,1
5035431,76561198329548331,373330,1
5035432,76561198329548331,388490,1
5035433,76561198329548331,521570,1


In [340]:
ratings_df.head()

Unnamed: 0,user_id,item_id,rating
0,76561197970982479,10,1
1,76561197970982479,20,1
2,76561197970982479,30,1
3,76561197970982479,40,1
4,76561197970982479,50,1


### Fit the model using full data

In [341]:
# build training data on full dataset.
trainset = data.build_full_trainset()

In [342]:
## Best parameter for KNN model 

sim_options = {
    'name': 'pearson', #similarity measure default is MSD
    'user_based': False, #user-based CF
    'min_support':25
}

In [343]:
#KNN
algo = KNNBasic(sim_options=sim_options,k=10) # number of neighbours = 45
algo.fit(trainset) #fit model to the training set

Computing the pearson similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBasic at 0x18e13eca0>

In [344]:
# Get list of item ID
item_id_list = ratings_df["item_id"].unique()

In [345]:
item_id_list

array([    10,     20,     30, ..., 354280, 433920, 485270])

In [346]:
## test algo predict
algo.predict('js41637',"10")

Prediction(uid='js41637', iid='10', r_ui=None, est=1.9174700894758845, details={'was_impossible': True, 'reason': 'User and/or item is unknown.'})

### Recommend using KNN

In [347]:
from collections import defaultdict

known_items = user_item_df[user_item_df.user_id == user_id].sort_values(by="rating", ascending=False).item_id.unique()
def knnRecommendation(userId, topN = 3):
    userRatingList = defaultdict(list)
    for itemId in item_id_list:
        if itemId not in known_items:
            predicted = algo.predict(userId, itemId)
            userRatingList[uid].append((itemId,predicted[3]))
    userRatingList[uid].sort(key = lambda x:x[1], reverse = True)
    
    return userRatingList[uid][:topN]

In [348]:
knnRecommendation('js41637')

[(412450, 4.0), (322680, 4.0), (409590, 4.0)]

## Hybrid Model

### Prepare datasets(user_item_ratings, game_metadata, game_id_name_pairs)

In [349]:
# pip install lightfm
from lightfm import LightFM
from lightfm.data import Dataset
from lightfm.cross_validation import random_train_test_split
from lightfm.evaluation import precision_at_k
from lightfm.evaluation import auc_score

In [350]:
user_item_df = pd.read_csv("../processed_data/ratings.csv")

In [351]:
print("There are {} unique users and {} unique items in user item ratings".format(user_item_df.user_id.nunique(), user_item_df.item_id.nunique()))

There are 70329 unique users and 10974 unique items in user item ratings


In [352]:
games = pd.read_csv("../processed_data/games_metadata.csv")

In [353]:
# creating output format

all_games = pd.read_csv("../processed_data/all_games_id_name_pair.csv")

In [354]:
user_item_df.item_id.nunique()

10974

### Re-evaluate and training the best model
Mean average precision(MAP)@K and AUC score for train and test

In [355]:
def build_game_features_list(item_features_cols):
    total_features = set()
    game_features_list = []
    for row in games.itertuples(index=False):
        features = set()
        for col in item_features_cols:
            val = getattr(row, col)
            if pd.isnull(val):
                continue
            if col in {"genres", "tags", "specs", "bundles"}:
                features.update(literal_eval(val))
            else:
                features.add(val)
        game_features_list.append(list(features))
        total_features.update(features)
    return game_features_list, list(total_features)

In [356]:
def getGameFeaturesData(item_features_cols):
    game_features_list, total_features = build_game_features_list(item_features_cols)
    game_tuple = list(zip(games.id, game_features_list))
    return (total_features, game_tuple)

In [357]:
# Features1(['genres'])
item_features_cols1 = ['genres']
total_features1, game_tuple1 = getGameFeaturesData(item_features_cols1)
len(total_features1)

22

In [358]:
dataset1 = Dataset()
dataset1.fit(user_item_df['user_id'].unique(), # all the steam users
            all_games["item_id"], # all steam games
            item_features=total_features1)

In [359]:
item_features1 = dataset1.build_item_features(game_tuple1)

In [360]:
(interactions1, weights1) = dataset1.build_interactions([(x[0], x[1]) for x in user_item_df.values])
item_features = item_features1

In [361]:
# data for best model 
interactions=interactions1


In [362]:
# best parameters for the model
best_paramaters = {'no_components': 50, 'loss': 'warp', 'random_state': 1, 'learning_rate': 0.11}

In [363]:
best_model = LightFM(**best_paramaters)

In [364]:
best_model.fit(interactions,
          item_features=item_features,
          epochs=5)

<lightfm.lightfm.LightFM at 0x18e13e910>

### Recommend using Hybrid model

In [375]:
# set the dataset to the final model dataset
dataset = dataset1

In [376]:
# mappings: (user id map, user feature map, item id map, item feature map)
mapping = dataset.mapping()

In [387]:
# series used to use index(internal id) to locate the real id
user_series = pd.Series(mapping[0])
item_series = pd.Series(mapping[2])

In [388]:
def getInternalId(real_id, series):
    return series[real_id]

In [389]:
def getRawId(internal_id, series):
    return series[series == internal_id].index[0]

- Make recommendations

In [390]:
def hybridRecommendataions(user_id, topK = 3):
    print(user_id)
    n_users, n_items = interactions.shape
    internal_user_id = getInternalId(user_id, user_series)
    all_scores = pd.Series(best_model.predict(internal_user_id, np.arange(n_items)))
    all_ranks = list(pd.Series(all_scores.sort_values(ascending=False).index))
    known_items = user_item_df[user_item_df.user_id == user_id].sort_values(by="rating", ascending=False).item_id
    known_items_internal_ids = known_items.apply(getInternalId, args=(item_series,))
    unknown_ranks = [x for x in all_ranks if x not in known_items_internal_ids]
    recommended_list = unknown_ranks[0:10]
    recommended_list_ids = [getRawId(id, item_series) for id in recommended_list]
    known_items_names = all_games.loc[all_games['item_id'].isin(known_items[:10])]
    recommended_items_names = all_games.loc[all_games['item_id'].isin(recommended_list_ids)]
#     print("Known items:")
#     print(known_items_names)
#     print("Recommended items:")
#     print(recommended_items_names)
    return recommended_items_names

### Final model 
Use both KNN and hybrid model for recommendation

In [391]:
def final_model_rs(user_id, topK = 5):
    
    final_list = []
    knn_rs_list = knnRecommendation(user_id, topK)
    knn_rs_df= pd.DataFrame(knn_rs_list, columns = ["item_id", "ratings"])
    hybrid_rs_df = hybridRecommendataions(user_id, topK)
    hybrid_rs_series = hybrid_rs_df["item_id"]
    knn_rs_series = knn_rs_df["item_id"]
    common_rs_games = pd.Series(list(set(knn_rs_series).intersection(set(hybrid_rs_series))))
    for game in common_rs_games:
        final_list.append(game)
    rs_left = topK - common_rs_games.size
    for game in knn_rs_series[:int((rs_left + 1)/2)]:
        final_list.append(game)
    for game in hybrid_rs_series[:int(rs_left/2)]:
        final_list.append(game)
    
    game_list = []
    for game in final_list:
        game_list.append(all_games[all_games["item_id"] == game]["item_name"].values[0])
    return game_list
    

In [392]:
final_model_rs("js41637")

js41637


  common_rs_games = pd.Series(list(set(knn_rs_series).intersection(set(hybrid_rs_series))))


['The Black Death',
 'BLACKHOLE',
 'Kingdoms',
 'Terraria',
 'The Binding of Isaac']

### Popularity model   (for new users)

In [393]:
popularity_df  = pd.read_csv('../processed_data/popularity.csv')

In [394]:
popularity_df.head()

Unnamed: 0,item_id,popularity
0,730,43279
1,4000,42074
2,550,34899
3,304930,34465
4,105600,28541


In [395]:
def rec_popularity(topK = 10):
    popular_list = []
    for game in popularity_df["item_id"].head(topK):
        popular_list.append(all_games[all_games["item_id"] == game]["item_name"].values[0])
    return popular_list

In [396]:
rec_popularity(3)

['Counter-Strike: Global Offensive', "Garry's Mod", 'Left 4 Dead 2']