## Super Mario Maker Dataset - Recommender System

### 1. Prerequisites

Import required libraries.

In [1]:
#!pip install surprise

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, confusion_matrix, precision_recall_fscore_support
import os
import gc
from sklearn import model_selection
from surprise import NormalPredictor, SVD, KNNBasic, SVDpp
from surprise import Dataset
from surprise import Reader
import surprise
from surprise.model_selection import cross_validate, train_test_split
from surprise import accuracy
from surprise.model_selection import cross_validate

import scipy
from scipy.sparse import csr_matrix


Specify data files location.

In [3]:
# Directory that holds plays.csv and clears.csv.
# Set the SMM_DATA_PATH environment variable to point the notebook at another
# location; when it is unset the original location is used.
data_path = os.environ.get('SMM_DATA_PATH', '/Users/pj/Documents/671_Project/')
# File names are appended with plain string concatenation, so guarantee a
# trailing path separator (a no-op when one is already present).
data_path = os.path.join(data_path, '')

Read data of games played.

In [4]:
# Load one (level id, player) row per recorded play.
# The 'id' column is renamed by name rather than by assigning `.columns`
# positionally: `usecols` returns columns in file order, so a positional
# assignment would silently swap the labels if 'player' came before 'id'.
plays_df = (
    pd.read_csv(data_path + 'plays.csv', sep="\t", usecols=['id', 'player'])
    .rename(columns={'id': 'game'})
    [['game', 'player']]  # fixed column order for the cells that follow
)

In [5]:
# Preview the first rows of the plays table.
plays_df.head()

Unnamed: 0,game,player
0,0000-0000-0353-3D35,darter60k
1,0000-0000-0353-3D35,davidchofis
2,0000-0000-0353-3D35,fabioviana
3,0000-0000-0353-3D35,gorigokky
4,0000-0000-0353-3D35,groebenzell


Add a column with value of 0 to start with indicating game is played but not cleared by the player.

In [6]:
# Label every play with 0 ("played, not cleared") as the starting value.
plays_df['score'] = 0

Read the data of games cleared.

In [7]:
# Load one (level id, player) row per cleared level.
# The file is read from `data_path`, like plays.csv, instead of repeating the
# absolute directory here, so the data location is configured in one place.
# The 'id' column is renamed by name rather than positionally because
# `usecols` returns columns in file order, not in the order listed.
clears_df = (
    pd.read_csv(data_path + 'clears.csv', sep='\t', usecols=['id', 'player'])
    .rename(columns={'id': 'game'})
    [['game', 'player']]  # fixed column order for the cells that follow
)

In [8]:
# Preview the first rows of the clears table.
clears_df.head()

Unnamed: 0,game,player
0,0000-0000-0353-3D35,darter60k
1,0000-0000-0353-3D35,davidchofis
2,0000-0000-0353-3D35,fabioviana
3,0000-0000-0353-3D35,groebenzell
4,0000-0000-0353-3D35,igatake0229


Add a column with value of 1 indicating that the game is cleared by the player.

In [9]:
# Label every row of the clears table with 1 ("cleared").
clears_df['score'] = 1

Next we merge the 2 dataframes, so it now has information about games cleared as well as games played, but not cleared.

In [10]:
# Outer join on (game, player) keeps pairs found in either table.
# Both inputs carry a 'score' column, so the result has 'score_x' (from
# clears_df) and 'score_y' (from plays_df).
join_keys = ['game', 'player']
all_data = pd.merge(clears_df, plays_df, how='outer', on=join_keys)

Replace *NaN* values with $0$.

In [11]:
# Pairs present in only one table have NaN in the other table's score column;
# replace those gaps with 0.
all_data = all_data.fillna(0)

Drop last column *score_y* as it contains only zeroes, all required information is now available in *score_x*. It is $0$ when the player has just played the game, but not cleared. It is $1$ when the player has cleared that game.

In [12]:
# 'score_x' now carries the full label (1 = cleared, 0 = only played), so the
# plays-side score column is no longer needed.
all_data = all_data.drop(columns=['score_y'])

Presently, the $0$ and $1$ labels are segregated, so let's shuffle the dataframe to distribute them.

In [13]:
# Shuffle all rows and renumber the index.
# A fixed random_state makes the row order, and everything derived from it,
# identical after a kernel restart.
all_data = all_data.sample(frac=1, random_state=42).reset_index(drop=True)

In [14]:
# 80% of the rows go to training; the remaining 20% is halved into validation
# and test sets (10% each).
# Fixed seeds keep the three partitions identical across kernel restarts, and
# the intermediate 20% gets its own name so `test_data` always means the final
# test partition.
train_data, holdout_data = model_selection.train_test_split(all_data, test_size=0.2, random_state=42)
val_data, test_data = model_selection.train_test_split(holdout_data, test_size=0.5, random_state=42)

### 2. Exploratory Data Analysis.

Let us check some data attributes before building the recommendation system.

In [15]:
# Distinct players and distinct game levels across the merged data.
n_players = all_data['player'].nunique()
n_games = all_data['game'].nunique()
print('Total number of players : {}'.format(n_players))
print('Total number of games   : {}'.format(n_games))

Total number of players : 861465
Total number of games   : 115032


In [16]:
# Count rows per label by summing boolean masks instead of materialising
# filtered copies of the frame.
n_cleared = int((all_data['score_x'] == 1).sum())
n_not_cleared = int((all_data['score_x'] == 0).sum())
print('Number of games played and cleared     : {}'.format(n_cleared))
print('Number of games played but not cleared : {}'.format(n_not_cleared))

Number of games played and cleared     : 2051809
Number of games played but not cleared : 2036348


**Comment:** As we can observe from the output of the above cell, there is an approximately $50:50$ split between game levels cleared and game levels only played.

### 3. Recommendation system data preparation.

In [17]:
# The rating is the binary label in 'score_x': 0 = played only, 1 = cleared.
reader = Reader(rating_scale=(0, 1))

Load train, validation and test data from dataframes.

In [18]:
# Columns are passed in (user, item, rating) order: player, game, label.
rating_cols = ['player', 'game', 'score_x']

rec_data = Dataset.load_from_df(train_data[rating_cols], reader)    # training
rec_data_v = Dataset.load_from_df(val_data[rating_cols], reader)    # validation
rec_data_t = Dataset.load_from_df(test_data[rating_cols], reader)   # test

Train/Val/Test split = $80:10:10$.

In [19]:
# Train on every training rating: build_full_trainset() turns the whole Dataset
# into a Trainset, so no 0.01% sliver is discarded just to satisfy
# train_test_split().
train_set = rec_data.build_full_trainset()

# Evaluation sets are lists of (user, item, rating) tuples, the form accepted
# by algo.test(). build_testset() produces that list from all ratings in the
# corresponding Dataset.
val_set = rec_data_v.build_full_trainset().build_testset()
test_set = rec_data_t.build_full_trainset().build_testset()

In [20]:
#train_set.split(n_folds=5)

### 4. Baseline recommendation system.

We will use normal predictor as baseline recommender. It is an algorithm predicting a random rating based on the distribution of the training set, which is assumed to be normal.

The prediction $\hat r_{ui}$ is generated from a normal distribution $N(\hat \mu, \hat \sigma)$ where $\hat \mu$ and $\hat \sigma$ are estimated from the training data using Maximum Likelihood Estimation:

$\hat \mu = \frac{1}{|R_{train}|} \sum_{r_{ui} \in R_{train}} r_{ui}$

$\hat \sigma = \sqrt{\sum_{r_{ui} \in R_{train}} \frac{(r_{ui}-{\hat \mu})^2}{|R_{train}|}}$

In [20]:
# Baseline recommender, constructed with the library defaults.
algo_normal = NormalPredictor()

In [21]:
# 5-fold cross-validation of the baseline on the training Dataset; prints a
# per-fold RMSE/MAE table (verbose=True) and returns the results as a dict.
cross_validate(algo_normal, rec_data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm NormalPredictor on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.6156  0.6155  0.6158  0.6159  0.6151  0.6156  0.0003  
MAE (testset)     0.5000  0.5000  0.5003  0.5004  0.4994  0.5000  0.0003  
Fit time          5.08    7.72    7.06    8.41    7.53    7.16    1.12    
Test time         9.38    7.59    9.55    9.44    8.45    8.88    0.76    


{'test_rmse': array([0.6156369 , 0.61549122, 0.61583449, 0.61589171, 0.61508054]),
 'test_mae': array([0.49995859, 0.5000054 , 0.5003468 , 0.50036926, 0.49942038]),
 'fit_time': (5.0842201709747314,
  7.719686985015869,
  7.062654972076416,
  8.406746864318848,
  7.532561302185059),
 'test_time': (9.378026008605957,
  7.591669082641602,
  9.549007654190063,
  9.43704605102539,
  8.44702696800232)}

In [23]:
# Fit the baseline on the training set before scoring the validation set.
algo_normal.fit(train_set)

In [24]:
# Generate baseline predictions for the validation set.
pred_normal = algo_normal.test(val_set)

In [25]:
# Validation error of the baseline; the values are kept for the report below.
rmse_normal = accuracy.rmse(pred_normal)
mse_normal = accuracy.mse(pred_normal)

In [26]:
# Report the baseline's validation metrics to four decimal places.
for template, value in (
    ("Validation RMSE using Normal Predictor: {0:5.4f}", rmse_normal),
    ("Validation MSE using Normal Predictor : {0:5.4f}", mse_normal),
):
    print(template.format(value))

### 4. Using kNN

In [27]:
# Basic k-nearest-neighbours recommender, constructed with library defaults.
algo_knn = KNNBasic()

In [None]:
# 3-fold cross-validation of the kNN model on the training Dataset.
# NOTE(review): this cell has no execution count and no recorded output, so it
# presumably never ran to completion on the full data — confirm whether kNN is
# feasible at this scale before relying on it.
cross_validate(algo_knn, rec_data, measures=['RMSE', 'MAE'], cv=3, verbose=True)

### 5. Recommendation system using SVD

In [22]:
# SVD matrix-factorisation recommender, constructed with library defaults.
algo_svd = SVD()

In [23]:
# Fit SVD on the training set.
algo_svd.fit(train_set)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x11f141d90>

In [24]:
# Generate SVD predictions for the validation set.
pred_svd = algo_svd.test(val_set)

In [25]:
# Validation error of SVD. Each call also prints its metric (the "RMSE:" and
# "MSE:" lines in the cell output) in addition to returning the value.
rmse_svd = accuracy.rmse(pred_svd)
mse_svd  = accuracy.mse(pred_svd)

RMSE: 0.4032
MSE: 0.1626


In [26]:
# Report SVD's validation metrics to four decimal places.
for template, value in (
    ("Validation RMSE using SVD : {0:5.4f}", rmse_svd),
    ("Validation MSE using SVD  : {0:5.4f}", mse_svd),
):
    print(template.format(value))

Validation RMSE using SVD : 0.4032
Validation MSE using SVD  : 0.1626


In [27]:
# 5-fold cross-validation of SVD on the training Dataset; prints a per-fold
# RMSE/MAE table (verbose=True) and returns the results as a dict.
cross_validate(algo_svd, rec_data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.4086  0.4085  0.4085  0.4087  0.4085  0.4086  0.0001  
MAE (testset)     0.3452  0.3451  0.3449  0.3451  0.3449  0.3451  0.0001  
Fit time          187.19  185.82  185.95  182.31  194.44  187.14  3.99    
Test time         9.35    8.37    8.64    5.57    8.51    8.09    1.30    


{'test_rmse': array([0.40864899, 0.40852473, 0.40846658, 0.40866519, 0.40854241]),
 'test_mae': array([0.34516982, 0.3451255 , 0.3449396 , 0.34511894, 0.34490493]),
 'fit_time': (187.19080424308777,
  185.8187770843506,
  185.94905877113342,
  182.306006193161,
  194.43745374679565),
 'test_time': (9.348040103912354,
  8.373914003372192,
  8.636219024658203,
  5.566976070404053,
  8.507369995117188)}