In [1]:
# import modules
import copy
import time
from sys import path
from collections import defaultdict

In [2]:
import pandas as pd
from surprise import SVD
from surprise import Reader
from surprise import Dataset
from surprise import accuracy
from surprise.model_selection import train_test_split

In [3]:
path.append('../code/src')

In [4]:
import svd_constraint
from post_rec import PostRec
from inter_rec import InterRec
from evaluate import Evaluation as ev

In [5]:
# Notebook-wide configuration.

# Directories (relative to the notebook) and input file names.
input_path, output_path = "../data/", '../result/'
rate_file, attr_file = 'reduced_rating_data.csv', 'recipe_data.csv'

# Upper bounds for users / food items.
user_max, food_max = 1000, 5000

# Number of constraint files per constraint type (const_<type>.1 .. const_<type>.N).
const_count = 3

In [6]:
# Labels used as keys of the `result` dictionary.

# Outer keys: one per recommender variant. The two prefixes get the
# constraint-type number appended when the keys are built.
rs1, rs2, rs3 = 'w/o Constraint', 'Post-Single-', 'Inter-Single-'

# Inner keys: the metrics recorded for every variant.
val_r1, val_r2 = 'RMSE', 'nDCG'
val_t1, val_t2 = 'train time(s)', 'exec time(s)'

In [7]:
# Metric template: every metric starts at zero.
val_dict = dict.fromkeys((val_r1, val_r2, val_t1, val_t2), 0)

# Row labels: the unconstrained baseline first, then Post-Single-1..3,
# then Inter-Single-1..3.
keys = [rs1] + [
    prefix + str(i)
    for prefix in (rs2, rs3)
    for i in range(1, 4)
]

# Each row gets its own independent copy of the template.
result = {key: copy.deepcopy(val_dict) for key in keys}

In [8]:
# Seed `result` with the metrics saved by an earlier run, if there are any.
# evaluation.csv is only written by the final cell of this notebook, so it is
# missing on a first run; in that case the zero-initialised entries are kept
# instead of aborting the whole notebook with FileNotFoundError.
try:
    # The file has one row per method; transposing makes to_dict() yield
    # {method: {metric: value}}, the same layout as `result`.
    result_df = pd.read_csv(output_path + 'evaluation.csv', index_col = 0).transpose()
except FileNotFoundError:
    pass
else:
    result.update(result_df.to_dict())
result

{'w/o Constraint': {'RMSE': 0.843046724743406,
  'nDCG': 0.0064600564955147,
  'train time(s)': 0.2715747356414795,
  'exec time(s)': 14.656277418136597},
 'Post-Single-1': {'RMSE': 4.699165956844706,
  'nDCG': 0.0691716426770259,
  'train time(s)': 0.2614331245422363,
  'exec time(s)': 48.98903234799703},
 'Post-Single-2': {'RMSE': 4.699165956844706,
  'nDCG': 0.0096320025599238,
  'train time(s)': 0.2636383374532063,
  'exec time(s)': 17.429793119430542},
 'Post-Single-3': {'RMSE': 0.8389790618943304,
  'nDCG': 0.0203135874572876,
  'train time(s)': 0.261168638865153,
  'exec time(s)': 32.93567609786987},
 'Inter-Single-1': {'RMSE': 3.9436658434502223,
  'nDCG': 0.0595799518591563,
  'train time(s)': 253.56926131248477,
  'exec time(s)': 9.6801544825236},
 'Inter-Single-2': {'RMSE': 2.9195610475642666,
  'nDCG': 0.0141510659035121,
  'train time(s)': 257.82492852211,
  'exec time(s)': 17.210232098897297},
 'Inter-Single-3': {'RMSE': 0.8289835314402675,
  'nDCG': 0.0210017769144157,
 

# Evaluate RS w/o Constraint
Check Surprise SVD performance with the given rating data.

In [9]:
# Source From: https://github.com/NicolasHug/Surprise/blob/master/examples/top_n_recommendations.py
def get_top_n(predictions, n=10):
    """Build each user's top-``n`` item list from a flat list of predictions.

    Args:
        predictions: iterable of 5-tuples ``(uid, iid, true_r, est, details)``;
            only ``uid``, ``iid`` and ``est`` are used.
        n: maximum number of items kept per user.

    Returns:
        defaultdict(list) mapping ``int(uid)`` to a list of ``int(iid)``,
        ordered by estimated rating, highest first.
    """
    # Group (item, estimate) pairs by user.
    per_user = defaultdict(list)
    for uid, iid, _true_r, est, _details in predictions:
        per_user[uid].append((iid, est))

    # Rank each user's candidates and keep only the ids of the best n.
    ranked = defaultdict(list)
    for uid, scored in per_user.items():
        best = sorted(scored, key=lambda pair: pair[1], reverse=True)[:n]
        ranked[int(uid)] = [int(iid) for iid, _ in best]

    return ranked

In [10]:
def run_SVD(n=10, rel_threshold=4, test_size=0.25, random_state=None):
    """Train and evaluate a plain Surprise SVD (no constraint) on the rating file.

    Args:
        n: length of each user's recommendation list; also used as the number
            of columns of the top-n frame and as the nDCG cut-off.
        rel_threshold: a held-out rating ``>=`` this value marks the item as
            relevant for the nDCG computation.
        test_size: fraction of the ratings held out as the test set.
        random_state: seed forwarded to ``train_test_split`` and ``SVD``.
            ``None`` (the default) keeps the original unseeded behaviour;
            pass an int to make a run reproducible.

    Returns:
        Tuple ``(rmse, ndcg, train_time, exec_time)``: the two values returned
        by ``ev.calculate_rmse`` / ``ev.calculate_ndcg`` and two wall-clock
        durations in seconds.
    """
    # get data
    reader = Reader(line_format='user item rating', sep=',', rating_scale=(0, 5))
    data = Dataset.load_from_file(input_path + rate_file, reader=reader)

    train_set, test_set = train_test_split(
        data, test_size=test_size, random_state=random_state
    )
    # Candidate pool for recommendation, derived from the training set.
    anti_set = train_set.build_anti_testset()

    # use SVD algorithm
    algo = SVD(random_state=random_state)

    # train (timed)
    start = time.time()
    algo.fit(train_set)
    t1 = time.time() - start

    # predict rating for test-set
    predict_test = algo.test(test_set)
    r1 = ev.calculate_rmse(predict_test)

    # get top-n for anti-test-set (timed: prediction plus ranking)
    start = time.time()
    predict_full = algo.test(anti_set)
    top_n = get_top_n(predict_full, n)
    t2 = time.time() - start

    # Relevant items per user: held-out ratings at or above the threshold.
    rel_dict = defaultdict(list)
    for (u, i, r) in test_set:
        if r >= rel_threshold:
            rel_dict[int(u)].append(int(i))

    # One row per user, columns 0..n-1 hold the ranked item ids; reindex
    # guarantees all n columns exist even if every list is shorter.
    top_n_df = pd.DataFrame.from_dict(top_n, orient='index')
    top_n_df = top_n_df.reindex(columns=list(range(n)))
    r2 = ev.calculate_ndcg(rel_dict, top_n_df, n)

    return r1, r2, t1, t2

In [11]:
# Run the unconstrained baseline once and record its four metrics.
r1, r2, t1, t2 = run_SVD()
result[rs1].update({
    val_r1: r1,
    val_r2: r2,
    val_t1: t1,
    val_t2: t2,
})

# Evaluate Post-Rec w/ Single Type Constraint

In [12]:
def run_post(ctype, idx, save_result = True):
    """Run one PostRec experiment with constraint file ``const_<ctype>.<idx>.csv``.

    Per the original author's note, PostRec applies the constraint after the
    rating of each item is predicted.

    Args:
        ctype: constraint type, used in the constraint and output file names.
        idx: index of the constraint file within its type.
        save_result: when true, write the top-n frame to
            ``PostRec_<ctype>.<idx>.csv`` under ``output_path``.

    Returns:
        Tuple ``(rmse, ndcg, train_time, exec_time)``; both times are
        wall-clock seconds.
    """
    tag = f'{ctype}.{idx}'

    recommender = PostRec(
        input_path + rate_file,
        input_path + attr_file,
        f'{input_path}const_{tag}.csv',
        split = True,
    )

    # get rating, attribute, recipe data
    recommender.get_data()

    # Timed section 1: training only.
    train_began = time.time()
    recommender.train()
    train_secs = time.time() - train_began

    # RMSE on the test-set predictions (outside both timed sections).
    rmse = ev.calculate_rmse(recommender.test_rmse())

    # Timed section 2: prediction plus top-n extraction.
    exec_began = time.time()
    recommender.test()
    ranked_df = recommender.get_top_n()
    exec_secs = time.time() - exec_began

    # nDCG at cut-off 10.
    ndcg = ev.calculate_ndcg(recommender.get_rel(), ranked_df, 10)

    if save_result:
        ranked_df.to_csv(f'{output_path}PostRec_{tag}.csv')

    return rmse, ndcg, train_secs, exec_secs

In [13]:
# For each constraint type, run PostRec on every constraint file of that type
# and store the mean of each metric under 'Post-Single-<type>'.
metric_names = (val_r1, val_r2, val_t1, val_t2)
for i in range(1, 4):
    totals = [0, 0, 0, 0]  # running sums, same order as metric_names
    for j in range(1, const_count + 1):
        scores = run_post(i, j, True)
        totals = [acc + score for acc, score in zip(totals, scores)]
        print(f'Const_{i}.{j} done')

    entry = result[rs2 + str(i)]
    for name, total in zip(metric_names, totals):
        entry[name] = total / const_count

Const_1.1 done
Const_1.2 done
Const_1.3 done
Const_2.1 done
Const_2.2 done
Const_2.3 done
Const_3.1 done
Const_3.2 done
Const_3.3 done


### Sample Dataframe

In [14]:
# First rows of the top-n table saved by PostRec for constraint 1.1.
pd.read_csv(f'{output_path}PostRec_1.1.csv', index_col=0).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,i1,i2,nl
0,1835,1715,1751,2657,4336,3600,1454,3942,828,425,1505,,
2,1224,1581,1493,3866,4318,1957,15,4895,634,4233,7233,,
3,1860,1060,870,396,4714,4138,1277,1650,4302,3431,3668,,
4,267,1561,4674,689,2651,1181,3923,4408,1395,2221,4574,,
5,1984,234,2779,1704,4570,2723,4207,3187,1714,2075,332,,


In [15]:
# First rows of the top-n table saved by PostRec for constraint 2.1.
pd.read_csv(f'{output_path}PostRec_2.1.csv', index_col=0).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,i1,i2,nl
0,4992,4954,2848,2496,2280,2232,193,145,112,4005,,840,
2,3813,827,4865,4005,486,4870,1611,4016,4372,246,,6270,
3,4352,3459,2949,2820,2280,1983,1936,1705,1690,240,,7449,
4,203,116,4683,3318,18,3164,3491,3883,519,2614,,5010,
5,1921,388,3258,3483,4177,3066,2522,2509,2274,3731,,6270,


In [16]:
# First rows of the top-n table saved by PostRec for constraint 3.1.
pd.read_csv(f'{output_path}PostRec_3.1.csv', index_col=0).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,i1,i2,nl
0,1578,4005,4578,1580,4169,1690,2509,2792,416,276,,,"[204.8, 5.0, 9.0, 26.0, 24.0, 2.0, 10.0]"
2,2949,2075,3227,1854,1813,4782,4340,138,2811,382,,,"[132.3, 11.0, 39.0, 5.0, 4.0, 11.0, 5.0]"
3,2130,4695,3392,2468,3317,3212,3318,2280,3877,1241,,,"[29.4, 0.0, 23.0, 4.0, 0.0, 0.0, 2.0]"
4,4791,2949,2632,3342,4913,4340,171,3923,430,391,,,"[90.4, 6.0, 27.0, 4.0, 1.0, 11.0, 4.0]"
5,2280,2453,2194,4106,4341,827,4511,3212,3713,1893,,,"[206.4, 0.0, 201.0, 0.0, 1.0, 0.0, 18.0]"


# Evaluate Inter-Rec w/ Single Type Constraint

In [17]:
def run_inter(ctype, idx, save_result = True):
    """Run one InterRec experiment with constraint file ``const_<ctype>.<idx>.csv``.

    InterRec is constructed with a ``svd_constraint.CnstSVD`` model.
    NOTE(review): presumably the constraint is therefore applied during
    training rather than after prediction; confirm in inter_rec.

    Args:
        ctype: constraint type, used in the constraint and output file names.
        idx: index of the constraint file within its type.
        save_result: when true, write the top-n frame to
            ``InterRec_<ctype>.<idx>.csv`` under ``output_path``.

    Returns:
        Tuple ``(rmse, ndcg, train_time, exec_time)``; both times are
        wall-clock seconds.
    """
    tag = f'{ctype}.{idx}'

    recommender = InterRec(
        input_path + rate_file,
        input_path + attr_file,
        f'{input_path}const_{tag}.csv',
        svd_constraint.CnstSVD(),
        split = True,
    )

    # get rating, attribute, recipe data
    recommender.get_data()

    # Timed section 1: training only.
    train_began = time.time()
    recommender.train()
    train_secs = time.time() - train_began

    # RMSE on the test-set predictions (outside both timed sections).
    rmse = ev.calculate_rmse(recommender.test_rmse())

    # Timed section 2: prediction plus top-n extraction.
    exec_began = time.time()
    recommender.test()
    ranked_df = recommender.get_top_n()
    exec_secs = time.time() - exec_began

    # nDCG at cut-off 10.
    ndcg = ev.calculate_ndcg(recommender.get_rel(), ranked_df, 10)

    if save_result:
        ranked_df.to_csv(f'{output_path}InterRec_{tag}.csv')

    return rmse, ndcg, train_secs, exec_secs

In [18]:
# For each constraint type, run InterRec on every constraint file of that type
# and store the mean of each metric under 'Inter-Single-<type>'.
metric_names = (val_r1, val_r2, val_t1, val_t2)
for i in range(1, 4):
    totals = [0, 0, 0, 0]  # running sums, same order as metric_names
    for j in range(1, const_count + 1):
        scores = run_inter(i, j, True)
        totals = [acc + score for acc, score in zip(totals, scores)]
        print(f'Const_{i}.{j} done')

    entry = result[rs3 + str(i)]
    for name, total in zip(metric_names, totals):
        entry[name] = total / const_count

Const_1.1 done
Const_1.2 done
Const_1.3 done
Const_2.1 done
Const_2.2 done
Const_2.3 done
Const_3.1 done
Const_3.2 done
Const_3.3 done


In [19]:
# First rows of the top-n table saved by InterRec for constraint 1.1.
pd.read_csv(f'{output_path}InterRec_1.1.csv', index_col=0).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,i1,i2,nl
309,2246,181,3386,2773,1482,4877,2273,3047,2168,2294,6426,,
564,461,1029,1667,4338,174,1611,2639,885,2970,4640,2320,,
560,4560,4101,3338,458,1069,3781,1686,4579,3342,2507,7233,,
25,484,193,4910,80,2792,2086,1734,1983,1517,361,5006,,
873,361,4482,2681,2880,1818,1791,4683,127,3371,4209,6276,,


In [20]:
# First rows of the top-n table saved by InterRec for constraint 2.1.
pd.read_csv(f'{output_path}InterRec_2.1.csv', index_col=0).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,i1,i2,nl
309,2949,361,3145,1646,2784,3102,309,1418,1667,2975,,6270,
564,2280,1690,1951,648,4352,4652,1357,3459,1611,2820,,332,
560,1601,193,4783,945,4799,465,4218,453,2342,647,,5180,
25,2508,3565,276,2551,1122,4767,2520,903,4106,4091,,332,
873,2949,3459,3102,4270,3085,4686,2894,1740,1129,1559,,6270,


In [21]:
# First rows of the top-n table saved by InterRec for constraint 3.1.
pd.read_csv(f'{output_path}InterRec_3.1.csv', index_col=0).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,i1,i2,nl
309,484,4609,3022,2988,3201,4814,3670,269,1069,1276,,,"[245.1, 7.0, 7.0, 28.0, 83.0, 6.0, 1.0]"
564,1690,3459,1611,983,4152,4945,617,1169,276,4177,,,"[35.9, 1.0, 5.0, 6.0, 5.0, 3.0, 1.0]"
560,1690,2890,2595,4567,4338,3068,1353,303,4352,3942,,,"[9.2, 0.0, 2.0, 1.0, 2.0, 0.0, 0.0]"
25,3531,3337,2396,484,3519,4683,2991,4847,1471,2055,,,"[32.2, 5.0, 0.0, 0.0, 0.0, 2.0, 0.0]"
873,2280,740,4524,3479,2274,1618,1899,19,2120,2803,,,"[61.8, 0.0, 42.0, 7.0, 0.0, 0.0, 5.0]"


# Result

In [25]:
# Turn the collected metrics into a table (one row per method), save it as
# evaluation.csv under output_path, and display it.
result_df = pd.DataFrame.from_dict(
    result,
    orient='index',
    columns=['RMSE', 'nDCG', 'train time(s)', 'exec time(s)'],
)
result_df.to_csv(f'{output_path}evaluation.csv')
result_df

Unnamed: 0,RMSE,nDCG,train time(s),exec time(s)
w/o Constraint,0.768282,0.005193,0.259307,14.500491
Post-Single-1,4.592464,0.091282,0.255975,48.29167
Post-Single-2,0.987992,0.013738,0.255359,17.16264
Post-Single-3,0.840098,0.017505,0.256979,31.868904
Inter-Single-1,3.89751,0.07447,253.9508,9.562867
Inter-Single-2,0.926631,0.013082,265.938126,15.94048
Inter-Single-3,0.827239,0.017014,380.395853,18.645201
