In [1]:
from math import sqrt

import fastai
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error

from IPython.display import set_matplotlib_formats
set_matplotlib_formats('png', 'pdf')
DPI = 120

In [2]:
game_info_data = pd.read_csv("src/data/game_info.csv")
train_data = pd.read_csv("src/data/user_data_train_no_comments.csv")
test_data = pd.read_csv("src/data/user_data_test_no_comments.csv")
valid_data = pd.read_csv("src/data/user_data_validation_no_comments.csv")

In [3]:
train_data

Unnamed: 0.1,Unnamed: 0,Userscore,Username,Game_ID
0,47246,9,SergeantSoz,119
1,199743,9,SergeantSoz,1540
2,235823,8,SergeantSoz,2227
3,263595,8,SergeantSoz,2941
4,72338,9,tomcrew10,238
...,...,...,...,...
105853,65974,10,Supersaiyan9000,198
105854,69938,10,Supersaiyan9000,218
105855,165164,10,Supersaiyan9000,1063
105856,231077,9,Supersaiyan9000,2137


In [9]:
def format_data(data):
    """
    previously dropped columns and concatenated game id to platform
    now only drops `Unnamed: 0`
    """
    #cp = data.copy(deep=True)
    #cp["Title"] = cp["Title"] + " (" + cp["Platform"] + ")"
    #fmt = cp.drop(["Comment", "Platform", "Unnamed: 0"], axis=1)
    fmt = data.drop(["Unnamed: 0"], axis=1)
    return fmt

def group_data(data):
    """
    dict that maps username -> dataframe containing the user reviews
    """
    return dict(tuple(data.groupby('Username')))


train_data_fmt = format_data(train_data)
test_data_fmt = format_data(test_data)
valid_data_fmt = format_data(valid_data)

train_group = group_data(train_data_fmt)
test_group = group_data(test_data_fmt)
valid_group = group_data(valid_data_fmt)

In [10]:
def restrict_data(data, restrict):
    """
    gives the dataset where (# of reviews for a user) >= restrict
    """
    dfs = group_data(data)
    pop_keys = []
    for k in dfs:
        if len(dfs[k]) < restrict:
            pop_keys.append(k)
            
    for k in pop_keys:
        #print(k)
        dfs.pop(k)
    
    return pd.concat(dfs.values(), ignore_index=True)
    #for k in dfs:
    #    if new_df is None:
    #        new_df = dfs[k]
    #    else:
    #        new_df.append(dfs[k])
    
    #return new_df
   

In [12]:
train_data_fmt

Unnamed: 0,Userscore,Username,Game_ID
0,9,SergeantSoz,119
1,9,SergeantSoz,1540
2,8,SergeantSoz,2227
3,8,SergeantSoz,2941
4,9,tomcrew10,238
...,...,...,...
105853,10,Supersaiyan9000,198
105854,10,Supersaiyan9000,218
105855,10,Supersaiyan9000,1063
105856,9,Supersaiyan9000,2137


In [13]:
print(len(train_data_fmt))
print(len(test_data_fmt))

105858
31221


In [14]:
# base rmse
# RMSE = \sqrt(MSE)
# MSE = (1/n) * \sum (Y_pred - Y_expected)^2
def get_base_rmse():
    # y_pred = expected value of the train score
    y_pred = train_data['Userscore'].mean()
    n = len(test_reviews_group)
    
    scores_squared_sum = 0
    for user in test_reviews_group:
        # y_expected probably incorrect: select .iloc[0]?
        #y_expected = test_reviews_group[user]["Userscore"].iloc[0]
        y_expected = test_reviews_group[user]["Userscore"].mean()
        scores_squared_sum += (y_pred - y_expected)**2
        
    return sqrt(scores_squared_sum / n)
        
    #expected_val_score
    
def get_base_rmse2(train, test):
    y_pred = train['Userscore'].mean()
    rep = np.repeat(y_pred, len(test['Userscore']))
    return mean_squared_error(test['Userscore'], rep, squared=False)
    
get_base_rmse2(train_data_fmt, test_data_fmt)
#get_base_rmse2(train_data_fmt_3, test_data_fmt_3)

2.7464267971419734

# AI stuff here

In [15]:
from fastai.tabular.all import *
from fastai.collab import *
#from fastai.learner import *
#from fastai.column_data import *

In [19]:
user_name_var = "Username"
rating_name_var = "Userscore"
item_name_var = "Game_ID"
batch_size = 64 # hyperparam?
valid_pct=0 # sets validation set to 0 lmao
seed = 69

# TODO attempt to normalize????
dls = CollabDataLoaders.from_df(train_data_fmt,
                                user_name=user_name_var,
                                rating_name=rating_name_var,
                                item_name=item_name_var,
                                valid_pct=valid_pct,
                                bs=batch_size,
                                seed=seed
                                #procs=[Normalize]
                               )

# what if instead of using 10% of the brain, we used 0% of the brain?
# hack to set the validation set of dls to whatever dataset we want
dls_test = CollabDataLoaders.from_df(test_data_fmt,
                                user_name=user_name_var,
                                rating_name=rating_name_var,
                                item_name=item_name_var,
                                valid_pct=valid_pct,
                                bs=batch_size,
                                seed=seed
                                #procs=[Normalize]
                               )

dls_valid = CollabDataLoaders.from_df(valid_data_fmt,
                                user_name=user_name_var,
                                rating_name=rating_name_var,
                                item_name=item_name_var,
                                valid_pct=valid_pct,
                                bs=batch_size,
                                seed=seed
                                #procs=[Normalize]
                               )

In [20]:
# hyperparam?
n_factors = 50

dls.valid = dls_valid.train

# use_nn = False to use embedded matrices (EmbeddingDotBias)
learn = collab_learner(dls, n_factors=50, y_range=(0, 10), use_nn=False)

In [21]:
learn.fit_one_cycle(5, 5e-3, wd=0.1)
#learn.fit_one_cycle(10, 10e-4, wd=0.1)

epoch,train_loss,valid_loss,time
0,7.949372,11.123631,00:07
1,5.624401,9.542181,00:07
2,4.422774,9.430603,00:07
3,3.338381,9.40868,00:07
4,2.67721,9.422246,00:07


In [22]:
dls.valid = dls_test.train
preds, targs = learn.get_preds()
mean_squared_error(preds.numpy(), np.ravel(targs.numpy()), squared=False)

3.1522613