In [1]:
# import modules
import copy
import time
from sys import path
from collections import defaultdict

In [2]:
import pandas as pd
from surprise import SVD
from surprise import Reader
from surprise import Dataset
from surprise import accuracy
from surprise.model_selection import train_test_split

In [3]:
# Add the project's source directory to the module search path so the local
# modules imported in the next cell can be found
# (presumably post_rec, inter_rec and TF_algo live there -- confirm).
path.append('../code/src')

In [4]:
import post_rec
import inter_rec
import TF_algo

In [5]:
# constants
# file path
# input_path: directory holding the rating, attribute and constraint CSV files
# output_path: directory where result CSV files are written and read back
# rate_file / attr_file: file names appended to input_path when loading data
input_path = "../data/"
output_path = '../result/'
rate_file = 'reduced_rating_data.csv'
attr_file = 'recipe_data.csv'

# user/item max
# NOTE(review): neither value is referenced in the cells visible here --
# presumably upper bounds on the number of users / food items; confirm or remove.
user_max = 1000
food_max = 5000

# const count
# Number of constraint files evaluated per constraint type; the evaluation
# loops iterate j = 1..const_count and divide the summed metrics by it.
const_count = 3

In [6]:
# Dictionary keys
# Row labels of the result table. rs1 is used as-is; rs2 and rs3 are prefixes
# that get the constraint type number (1-3) appended, e.g. 'Post-Single-1'.
rs1 = 'w/o Constraint'
rs2 = 'Post-Single-'
rs3 = 'Inter-Single-'
# Metric names stored under each row label.
val_r = 'RMSE'
val_t1 = 'train time(s)'
val_t2 = 'exec time(s)'

In [7]:
# Zeroed metric template shared by every row of the result table.
val_dict = dict.fromkeys((val_r, val_t1, val_t2), 0)

# Row labels: the unconstrained baseline first, then the Post- and Inter-
# rows for constraint types 1 to 3.
keys = [rs1] + [prefix + str(ctype) for prefix in (rs2, rs3) for ctype in range(1, 4)]

# Each row gets its own independent copy of the template.
result = {label: copy.deepcopy(val_dict) for label in keys}

In [8]:
# read previous result
# eval_RMSE.csv is only created by the final cell of this notebook, so it does
# not exist on a first run. In that case keep the zero-initialised `result`
# instead of aborting the whole notebook with FileNotFoundError.
try:
    result_df = pd.read_csv(output_path + 'eval_RMSE.csv', index_col = 0).transpose()
except FileNotFoundError:
    print('No previous result found at ' + output_path + 'eval_RMSE.csv; starting from zeros')
else:
    # The frame is transposed so that to_dict() yields
    # {row label: {metric name: value}}, the same layout as `result`.
    result.update(result_df.to_dict())
result

{'w/o Constraint': {'RMSE': 0.7707426668304809,
  'train time(s)': 0.2732963562011719,
  'exec time(s)': 14.870160579681396},
 'Post-Single-1': {'RMSE': 0.8064826363063325,
  'train time(s)': 0.0883428255716959,
  'exec time(s)': 16.762537240982056},
 'Post-Single-2': {'RMSE': 0.8103881016749224,
  'train time(s)': 0.0844310919443766,
  'exec time(s)': 5.9057542483011884},
 'Post-Single-3': {'RMSE': 0.8061883460149426,
  'train time(s)': 0.0857704480489095,
  'exec time(s)': 9.389020363489786},
 'Inter-Single-1': {'RMSE': 0, 'train time(s)': 0, 'exec time(s)': 0},
 'Inter-Single-2': {'RMSE': 0, 'train time(s)': 0, 'exec time(s)': 0},
 'Inter-Single-3': {'RMSE': 0, 'train time(s)': 0, 'exec time(s)': 0}}

# RMSE of RS w/o Constraint
Check Surprise SVD performance with the given rating data

In [9]:
# Source From: https://github.com/NicolasHug/Surprise/blob/master/examples/top_n_recommendations.py
def get_top_n(predictions, n=10):
    """Group predictions by user and keep each user's n best-estimated items.

    Args:
        predictions: iterable of 5-tuples ``(uid, iid, true_r, est, details)``;
            only ``uid``, ``iid`` and ``est`` are used.
        n: number of items to keep per user (default 10).

    Returns:
        A ``defaultdict(list)`` mapping each uid to a list of ``(iid, est)``
        pairs ordered by estimate, highest first, truncated to ``n`` entries.
    """
    per_user = defaultdict(list)

    # Bucket every (item, estimate) pair under the user it was predicted for.
    for uid, iid, _true_r, est, _details in predictions:
        per_user[uid].append((iid, est))

    # Rank each user's bucket by estimated rating and keep the n highest ones.
    # Only existing keys are reassigned, so iterating the dict here is safe.
    for uid in per_user:
        ranked = sorted(per_user[uid], key=lambda pair: pair[1], reverse=True)
        per_user[uid] = ranked[:n]

    return per_user

In [10]:
def run_SVD(random_state=None):
    """Evaluate the unconstrained Surprise SVD recommender on the rating file.

    Loads ``input_path + rate_file``, splits it 75/25 into train and test
    sets, fits SVD on the train set, measures RMSE on the test set and times
    the top-n generation over the anti-test set.

    Args:
        random_state: optional seed forwarded to both ``train_test_split`` and
            ``SVD`` so the split and the factor initialisation (and therefore
            the reported RMSE) can be reproduced. The default ``None`` keeps
            the previous unseeded behaviour.

    Returns:
        Tuple ``(rmse, train_seconds, topn_seconds)``.
    """
    # get data
    reader = Reader(line_format='user item rating', sep=',', rating_scale=(0, 5))
    data = Dataset.load_from_file(input_path + rate_file, reader = reader)

    train_set, test_set = train_test_split(data, test_size=0.25,
                                           random_state=random_state)
    # Built before the timers start, so its construction is not part of t2.
    anti_set = train_set.build_anti_testset()

    # use SVD algorithm
    algo = SVD(random_state=random_state)

    # train
    start = time.time()
    algo.fit(train_set)
    t1 = time.time() - start

    # predict rating for test-set (verbose=False: do not print the RMSE)
    predict_test = algo.test(test_set)
    r = accuracy.rmse(predict_test, verbose=False)

    # get top-n for anti-test-set; only the elapsed time is kept,
    # the top-n result itself is discarded
    start = time.time()
    predict_full = algo.test(anti_set)
    get_top_n(predict_full)
    t2 = time.time() - start

    return r, t1, t2

In [11]:
# Evaluate the unconstrained SVD baseline and record its three metrics.
r, t1, t2 = run_SVD()
result[rs1].update({val_r: r, val_t1: t1, val_t2: t2})

# RMSE of Post-Rec w/ Single Type Constraint

In [12]:
def run_post(ctype, idx, save_result = True):
    """Evaluate PostRec on one constraint file.

    Args:
        ctype: constraint type number; together with ``idx`` it selects the
            constraint file ``const_<ctype>.<idx>.csv`` under ``input_path``.
        idx: index of the constraint within its type.
        save_result: when true, write the top-n table to
            ``PostRec_<ctype>.<idx>.csv`` under ``output_path``.

    Returns:
        Tuple ``(rmse, train_seconds, topn_seconds)``.
    """
    const_tag = str(ctype) + '.' + str(idx)
    const_file = input_path + 'const_' + const_tag + '.csv'

    # PostRec applies constraint after the rating of each item is predicted
    rec = post_rec.PostRec(input_path + rate_file, input_path + attr_file,
                           const_file, split = True)

    # Load rating, attribute and recipe data.
    rec.get_data()

    # Time the training step only.
    train_started = time.time()
    rec.train()
    t1 = time.time() - train_started

    # RMSE over the test-set predictions, computed silently.
    r = accuracy.rmse(rec.test_rmse(), False)

    # Time prediction plus top-n extraction for the anti-test-set.
    topn_started = time.time()
    rec.test()
    top_n_df = rec.get_top_n()
    t2 = time.time() - topn_started

    if save_result:
        top_n_df.to_csv(output_path + 'PostRec_' + const_tag + '.csv')

    return r, t1, t2

In [13]:
# Average RMSE, train time and exec time over the const_count constraint
# files of each constraint type (1-3) and store them in the Post-Single rows.
metric_keys = (val_r, val_t1, val_t2)
for ctype in range(1, 4):
    totals = [0, 0, 0]
    for idx in range(1, const_count + 1):
        metrics = run_post(ctype, idx, True)
        # Accumulate element-wise, in the same left-to-right order as a running sum.
        totals = [acc + value for acc, value in zip(totals, metrics)]
        print('Const_' + str(ctype) + '.' + str(idx) + " done")

    row = result[rs2 + str(ctype)]
    for metric_key, total in zip(metric_keys, totals):
        row[metric_key] = total / const_count

Const_1.1 done
Const_1.2 done
Const_1.3 done
Const_2.1 done
Const_2.2 done
Const_2.3 done
Const_3.1 done
Const_3.2 done
Const_3.3 done


### Sample Dataframe

In [14]:
# Preview the first rows of the top-n table saved by run_post for constraint type 1, index 1.
pd.read_csv(output_path + 'PostRec_1.1.csv', index_col = 0).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,i1,i2,nl
309,421,2551,3286,4336,2862,3900,3846,1202,925,3802,6426,,
564,3086,332,2525,1880,173,4032,4097,3003,3782,2639,2320,,
560,4895,2996,1686,1089,3985,1481,1493,3866,1069,3342,7233,,
25,3102,193,361,246,598,474,4652,2082,4814,4950,5006,,
873,2931,569,4517,361,157,885,265,127,4482,3497,6276,,


In [15]:
# Preview the first rows of the top-n table saved by run_post for constraint type 2, index 1.
pd.read_csv(output_path + 'PostRec_2.1.csv', index_col = 0).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,i1,i2,nl
309,3823,917,4791,3226,303,300,989,1829,1048,1918,,6270,
564,2280,1690,1951,648,586,4290,4652,2949,3459,2455,,332,
560,193,2610,708,903,2355,3338,4005,4683,4836,3900,,5180,
25,3459,3565,4091,181,3102,983,704,2356,1205,4960,,332,
873,648,2949,1921,112,361,989,4501,4807,1362,3820,,6270,


In [16]:
# Preview the first rows of the top-n table saved by run_post for constraint type 3, index 1.
pd.read_csv(output_path + 'PostRec_3.1.csv', index_col = 0).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,i1,i2,nl
309,2691,1829,2904,4842,1740,217,4634,4188,388,4305,,,"[245.1, 7.0, 7.0, 28.0, 83.0, 6.0, 1.0]"
564,1152,2794,4826,1053,1690,4995,3305,1984,2609,3557,,,"[35.9, 1.0, 5.0, 6.0, 5.0, 3.0, 1.0]"
560,1957,331,183,855,1871,4338,1637,3873,617,3268,,,"[9.2, 0.0, 2.0, 1.0, 2.0, 0.0, 0.0]"
25,4169,2454,361,246,4118,225,960,852,647,4493,,,"[32.2, 5.0, 0.0, 0.0, 0.0, 2.0, 0.0]"
873,2274,2194,740,2549,2311,3452,1241,104,203,475,,,"[61.8, 0.0, 42.0, 7.0, 0.0, 0.0, 5.0]"


# RMSE of Inter-Rec w/ Single Type Constraint

In [24]:
def run_inter(ctype, idx, save_result = True):
    """Evaluate InterRec on one constraint file.

    Builds an ``inter_rec.InterRec`` recommender backed by ``TF_algo.SVDtf()``
    for the constraint file ``const_<ctype>.<idx>.csv``, trains it, measures
    RMSE on its test predictions and times the top-n generation.

    Args:
        ctype: constraint type number; together with ``idx`` it selects the
            constraint file under ``input_path``.
        idx: index of the constraint within its type.
        save_result: when true, write the top-n table to
            ``InterRec_<ctype>.<idx>.csv`` under ``output_path``.

    Returns:
        Tuple ``(rmse, train_seconds, topn_seconds)``.
    """
    const_tag = str(ctype) + '.' + str(idx)

    # InterRec is given the TF_algo.SVDtf() model along with the constraint file.
    # NOTE(review): presumably the constraint is applied during prediction
    # rather than afterwards (unlike PostRec) -- confirm in inter_rec.
    rec = inter_rec.InterRec(input_path + rate_file, input_path + attr_file,
                             input_path + 'const_' + const_tag + '.csv',
                             TF_algo.SVDtf(), split = True)

    rec.get_data()  # get rating, attribute, recipe data

    # train with data
    start = time.time()
    rec.train()
    t1 = time.time() - start

    # predict rating for test-set
    predict_test = rec.test_rmse()
    r = accuracy.rmse(predict_test, False)

    # get top-n for anti-test-set
    start = time.time()
    rec.test()
    top_n_df = rec.get_top_n()
    t2 = time.time() - start

    if save_result:
        # The prefix is 'InterRec_' (with underscore) so the file name matches
        # the 'InterRec_<ctype>.<idx>.csv' files read back later in the
        # notebook and mirrors the 'PostRec_' naming used by run_post.
        top_n_df.to_csv(output_path + 'InterRec_' + const_tag + '.csv')

    return r, t1, t2

In [25]:
# Average RMSE, train time and exec time over the const_count constraint
# files of each constraint type (1-3) and store them in the Inter-Single rows.
# NOTE(review): the recorded run of this cell ended with an IndexError; its
# origin is not visible in this notebook.
metric_keys = (val_r, val_t1, val_t2)
for ctype in range(1, 4):
    totals = [0, 0, 0]
    for idx in range(1, const_count + 1):
        metrics = run_inter(ctype, idx, True)
        # Accumulate element-wise, in the same left-to-right order as a running sum.
        totals = [acc + value for acc, value in zip(totals, metrics)]
        print('Const_' + str(ctype) + '.' + str(idx) + " done")

    row = result[rs3 + str(ctype)]
    for metric_key, total in zip(metric_keys, totals):
        row[metric_key] = total / const_count

IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices

In [None]:
# Preview the first rows of the Inter-Rec top-n table for constraint type 1, index 1.
pd.read_csv(output_path + 'InterRec_1.1.csv', index_col = 0).head()

In [None]:
# Preview the first rows of the Inter-Rec top-n table for constraint type 2, index 1.
pd.read_csv(output_path + 'InterRec_2.1.csv', index_col = 0).head()

In [None]:
# Preview the first rows of the Inter-Rec top-n table for constraint type 3, index 1.
pd.read_csv(output_path + 'InterRec_3.1.csv', index_col = 0).head()

# Result

In [None]:
# Turn the nested result dict into a table (one row per recommender variant,
# one column per metric), persist it for later runs, and display it.
metric_columns = ['RMSE', 'train time(s)', 'exec time(s)']
result_df = pd.DataFrame.from_dict(
    result,
    orient='index',
    columns=metric_columns,
)
result_df.to_csv(output_path + 'eval_RMSE.csv')
result_df