## Evaluation of CDL_* predictions

In [6]:
import pickle
import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_error
from collections import defaultdict, namedtuple
from experiment_out_utils import precision_recall_at_k_4df, write_to_csv, XPData, XPRow, write_row
import pandas as pd
from sklearn.model_selection import train_test_split

### Load data

In [None]:
## define paths
df = pd.read_json(r'D:\Datasets\amazon_reviews\processed\reviews_Video_Games_5.json')
XP_PATH = 'D:/Models/thesis/conv_mf/test1/'
# XP_PATH = 'D:/Models/thesis/sdae/sdae_optimized/'
U_V_PATH = '%spickles/' % XP_PATH

In [9]:
df_train, df_test = train_test_split(df, test_size = 0.3, stratify=df['reviewerID'], random_state=42)

In [10]:
df_train.head(2)

Unnamed: 0,asin,helpful,overall,reviewText,reviewTextProc,reviewTime,reviewerID,reviewerName,summary,summaryProc,unixReviewTime
147833,B002WSR8CQ,"[0, 0]",4,I BROUGHT THIS GAME FOR MY SON AND HE LOVES IT...,i bring this game for my son and he love it th...,"01 7, 2011",A3P3XF9FS9AJCH,C. SESSIONS,VERY FUN,very fun,1294358400
95387,B000X1TC0U,"[4, 4]",5,"I'm very late to the party with this review, s...",i be late party review i will informal short i...,"01 20, 2009",A3KKM0T1KY42HA,Sky,Aliens,alien,1232409600


In [11]:
pivoted = df_train.pivot(index = 'reviewerID', columns = 'asin', values = 'overall')
pivoted = pivoted.fillna(0.)

In [12]:
## Load matricies
with open(U_V_PATH + 'U_final.pickle', 'rb') as handle:
    U = pickle.load(handle)  
    
with open(U_V_PATH + 'V_final.pickle', 'rb') as handle:
    V = pickle.load(handle)
    
with open(U_V_PATH + 'beta_u_final.pickle', 'rb') as handle:
    beta_u = pickle.load(handle)
    
with open(U_V_PATH + 'beta_v_final.pickle', 'rb') as handle:
    beta_v = pickle.load(handle)
    
with open(XP_PATH + 'rating_matrix.pickle', 'rb') as handle:
    rating_matrix = pickle.load(handle)

In [13]:
predictions = np.dot(U, V.T) + beta_u.reshape(-1, 1) + beta_v.reshape(1, -1)

In [14]:
print("Shape of rating matrix: %s x %s" % rating_matrix.shape)
print("Shape of predicted matrix: %s x %s" % predictions.shape)

Shape of rating matrix: 24303 x 10668
Shape of predicted matrix: 24303 x 10668


### For training set

In [15]:
print("MSE (non zero): %s" % mean_squared_error(rating_matrix[rating_matrix > 0], predictions[rating_matrix > 0]) ** 0.5)
print("MAE (non zero): %s" % mean_absolute_error(rating_matrix[rating_matrix > 0], predictions[rating_matrix > 0]) ** 0.5)

MSE (non zero): 0.11504065707800937
MAE (non zero): 0.2913370284785628


In [16]:
print("MSE (all): %s" % mean_squared_error(rating_matrix, predictions) ** 0.5)
print("MAE (all): %s" % mean_absolute_error(rating_matrix, predictions) ** 0.5)

MSE (all): 3.824807636110569
MAE (all): 1.922837208995061


### For test set

In [17]:
preds_df_unmelt = pd.DataFrame(predictions, columns = pivoted.columns, index = pivoted.index)
preds_df_unmelt.index.name = 'reviewerID'
preds_df_unmelt.columns.name = 'asin'
preds_df_unmelt.head(2)

asin,0700099867,6050036071,7100027950,7293000936,8176503290,907843905X,9625990674,9861019731,9882155456,B000003SQQ,...,B00J128FPA,B00J226358,B00J6DLPLK,B00J9P3KBS,B00JM3R6M6,B00JQ8YH6A,B00JQHU9RC,B00JXW6GE0,B00KAI3KW2,B00KHECZXO
reviewerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A00263941WP7WCIL7AKWL,3.438583,4.737486,5.069465,4.760501,4.416206,4.603362,4.607568,4.41733,4.658463,4.228462,...,4.73128,4.11877,4.264728,3.847696,4.567197,3.485262,5.257329,4.51751,4.459819,4.171296
A005481137I9SCAWEF7ON,4.650667,4.708318,4.90516,4.701447,4.779674,3.157856,4.814225,4.043312,5.464767,4.098143,...,4.336104,4.246659,1.325666,4.744281,2.786003,3.878325,5.332501,4.119889,4.632941,3.221304


In [18]:
df_test_val = df_test.copy()
df_test_val['value'] = 0

In [19]:
def get_val(x):
    if x['reviewerID'] in preds_df_unmelt.index:
        if x['asin'] in preds_df_unmelt.columns:
            return preds_df_unmelt.loc[x['reviewerID'], x['asin']]
    return None

In [20]:
df_test_val['value'] = df_test_val.apply(get_val, axis = 1)

In [21]:
df_test_val[['overall', 'value']].head(10)

Unnamed: 0,overall,value
193452,4,4.623163
166816,4,4.366657
65300,5,4.025418
156132,4,3.072125
125773,1,2.303838
206313,4,4.616288
160819,5,3.340348
105469,2,0.79786
217821,4,4.359666
63640,2,4.088295


In [22]:
df_test_val[~df_test_val.value.isnull()].head(2)

Unnamed: 0,asin,helpful,overall,reviewText,reviewTextProc,reviewTime,reviewerID,reviewerName,summary,summaryProc,unixReviewTime,value
193452,B005QA98JS,"[1, 1]",4,This is another set of games that sort of surp...,this set game sort surprise good set this set ...,"10 18, 2012",AFXTKAO0CB354,C. Weaver,Aonther Set Of Games That Surprised Me....,aonther set of games that surprise me,1350518400,4.623163
166816,B0043QL2FE,"[4, 7]",4,"After playing this game a lot more, I have dec...",after play game lot i decide change review ini...,"03 16, 2011",A4E0I88T1MS4O,Fani,Solid improvement from Top Spin 3 but still ha...,solid improvement top spin 3 flaw,1300233600,4.366657


In [23]:
mse = mean_squared_error(df_test[~df_test_val.value.isnull()].overall, df_test_val[~df_test_val.value.isnull()].value) ** 0.5
mae = mean_absolute_error(df_test[~df_test_val.value.isnull()].overall, df_test_val[~df_test_val.value.isnull()].value)

print("MSE: %s" % mse)
print("MAE: %s" % mae)

MSE: 1.23490158837977
MAE: 0.9320325889665328


In [24]:
k_prec = {}
k_rec = {}

for k in range(0, 200):
    precisions, recalls = precision_recall_at_k_4df(df_test_val, k=k, threshold=3) 
    p_mean = np.mean(list(precisions.values()))
    r_mean = np.mean(list(recalls.values()))
    k_prec[k] = p_mean
    k_rec[k] = r_mean

In [35]:
row = XPRow(dataset='Video Games', xpdata=XPData(predictor=None, label='CDL-SDAE', nfactors=10), rmse=mse, mae=mae, precision=k_prec, recall=k_rec)

NameError: name 'mse' is not defined

In [None]:
write_to_csv(row, XP_NAME)