## Imports

In [2]:
import pandas as pd
import numpy as np

from surprise import Reader
from surprise import Dataset

from surprise import SVD, KNNBasic, NMF, SlopeOne, CoClustering #other knn, randoms, not svdpp because were not doing implicit               # importer ici les algo qu'on testera
from surprise import model_selection
from surprise import dump

from sklearn.model_selection import train_test_split

In [3]:
def load_ratings(csv_path):
    """Read a ratings CSV and split its ``Id`` column into ``user`` and ``item``.

    The ``Id`` values look like ``r44_c1``: the part before the underscore
    becomes ``user`` (with the ``r`` removed) and the part after becomes
    ``item`` (with the ``c`` removed). Both new columns stay strings.

    Parameters
    ----------
    csv_path : str
        Path of the CSV file to read; it must contain an ``Id`` column.

    Returns
    -------
    pandas.DataFrame
        The file content with the two extra columns ``user`` and ``item``.
    """
    frame = pd.read_csv(csv_path)
    frame[["user", "item"]] = frame["Id"].str.split("_", expand=True)
    # regex=False: "r" and "c" are literal characters, and the default value of
    # `regex` differs between pandas versions, so state it explicitly.
    frame["user"] = frame["user"].str.replace("r", "", regex=False)
    frame["item"] = frame["item"].str.replace("c", "", regex=False)
    return frame


# Training ratings.
df = load_ratings("Datasets/data_train.csv")

# Submission template: the (user, item) pairs for which a rating is predicted.
df2 = load_ratings("Datasets/sample_submission.csv")

# Row count before any cleaning; the data-cleaning cell reports against it.
prev_size = df.shape[0]
df

Unnamed: 0,Id,Prediction,user,item
0,r44_c1,4,44,1
1,r61_c1,3,61,1
2,r67_c1,4,67,1
3,r72_c1,3,72,1
4,r86_c1,5,86,1
...,...,...,...,...
1176947,r9990_c1000,4,9990,1000
1176948,r9992_c1000,5,9992,1000
1176949,r9994_c1000,3,9994,1000
1176950,r9997_c1000,4,9997,1000


## Data cleaning

In [None]:
# Drop sparsely rated users, then sparsely rated items.
#
# This cell overwrites `df`, so before re-running it, re-run the loading cell
# (which also resets `prev_size`).
# To disable the cleaning, set both min_... variables to 0.
# Keep them at 0 for the moment -Alix 03/12/19.
min_nb_of_users=0
min_nb_of_items=0

# Number of ratings per user; users with at most `min_nb_of_users` ratings go.
occur_user = df.user.value_counts()
rare_users = occur_user[occur_user <= min_nb_of_users].index
df_user_mod = df[~df.user.isin(rare_users)]

# Item counts are taken AFTER the user filter, so they reflect the kept rows.
occur_item = df_user_mod.item.value_counts()
rare_items = occur_item[occur_item <= min_nb_of_items].index
df = df_user_mod[~df_user_mod.item.isin(rare_items)]

# The filter removes rows (ratings), not columns.
print(f"Removed {prev_size-df.shape[0]} rows")

### Plotting

In [None]:
# Bar chart of how many times each rating value occurs in the training data.
df["Prediction"].value_counts().plot(kind="bar")

## Creating the train and test datasets

In [4]:
# Reader declaring the rating scale (1 to 5) used by both frames.
reader = Reader(rating_scale=(1, 5))

# Both Surprise datasets are built from the same three columns, in this order.
rating_columns = ["user", "item", "Prediction"]
data_train = Dataset.load_from_df(df[rating_columns], reader)
data_test = Dataset.load_from_df(df2[rating_columns], reader)

## Training each algorithm

### SVD

In [None]:
# Grid-search SVD with cross-validation, refit the best configuration on the
# full training set and pickle it. Every list holds a single value, so this
# grid evaluates exactly one configuration.
param_grid = dict(
    n_factors=[200],
    n_epochs=[1000],
    lr_all=[0.004],
    reg_all=[0.07],
)
cv = 5
algorithm = SVD

# Only RMSE is measured: MAE is not used in the project, skipping it saves time.
gs = model_selection.GridSearchCV(
    algorithm,
    param_grid,
    measures=["rmse"],
    cv=cv,
    n_jobs=-5,
    joblib_verbose=10,
    return_train_measures=True,
)
gs.fit(data_train)  # TODO: check which data is used for each algorithm
print(gs.best_params, gs.best_score, sep="\n")

# Refit the best RMSE estimator on all training ratings before saving it.
algo = gs.best_estimator["rmse"]
algo.fit(data_train.build_full_trainset())
dump.dump("dump/SVDfitted_dump", algo=algo, verbose=1)

[Parallel(n_jobs=-5)]: Using backend LokyBackend with 4 concurrent workers.


### KNN

In [None]:
# Grid-search KNNBasic over the neighbourhood size `k` and the minimum number
# of neighbours `min_k`, refit the best configuration on the full training
# set and pickle it.
param_grid = dict(
    k=[10, 40, 100],
    min_k=[1, 3, 5],
)
cv = 5
algorithm = KNNBasic

# Only RMSE is measured: MAE is not used in the project, skipping it saves time.
gs = model_selection.GridSearchCV(
    algorithm,
    param_grid,
    measures=["rmse"],
    cv=cv,
    n_jobs=-7,
    joblib_verbose=10,
)
gs.fit(data_train)
print(gs.best_params, gs.best_score, sep="\n")

# Refit the best RMSE estimator on all training ratings before saving it.
algo = gs.best_estimator["rmse"]
algo.fit(data_train.build_full_trainset())
dump.dump("dump/KNN_basic_fitted_dump", algo=algo, verbose=1)

### NMF

In [None]:
# Grid-search NMF over the number of factors and epochs, refit the best
# configuration on the full training set and pickle it.
param_grid = dict(
    n_factors=[15, 50],
    n_epochs=[10, 50],
)
cv = 5
algorithm = NMF

# Only RMSE is measured: MAE is not used in the project, skipping it saves time.
gs = model_selection.GridSearchCV(
    algorithm,
    param_grid,
    measures=["rmse"],
    cv=cv,
    n_jobs=-5,
    joblib_verbose=10,
)
gs.fit(data_train)
print(gs.best_params, gs.best_score, sep="\n")

# Refit the best RMSE estimator on all training ratings before saving it.
algo = gs.best_estimator["rmse"]
algo.fit(data_train.build_full_trainset())
dump.dump("dump/NMFfitted_dump", algo=algo, verbose=1)

### SlopeOne

In [None]:
# SlopeOne is created with its defaults (no grid search here): fit it on all
# training ratings and pickle the fitted model.
full_trainset = data_train.build_full_trainset()
algo = SlopeOne()
algo.fit(full_trainset)
dump.dump(file_name="dump/SlopeOne_fitted_dump", algo=algo, verbose=1)

### CoClustering

In [None]:
# Grid-search CoClustering over the number of user clusters, item clusters and
# epochs, refit the best configuration on the full training set and pickle it.
param_grid = dict(
    n_cltr_u=[3, 5],
    n_cltr_i=[3, 5],
    n_epochs=[50, 100],
)
cv = 5
algorithm = CoClustering

# Only RMSE is measured: MAE is not used in the project, skipping it saves time.
gs = model_selection.GridSearchCV(
    algorithm,
    param_grid,
    measures=["rmse"],
    cv=cv,
    n_jobs=-5,
    joblib_verbose=10,
)
gs.fit(data_train)
print(gs.best_params, gs.best_score, sep="\n")

# Refit the best RMSE estimator on all training ratings before saving it.
algo = gs.best_estimator["rmse"]
algo.fit(data_train.build_full_trainset())
dump.dump("dump/CoClustering_fitted_dump", algo=algo, verbose=1)

## Loading the pickled models

In [None]:
# Release the last trained estimator and grid search before reloading the
# models from disk. A plain `del algo, gs` raises NameError when the training
# cells were skipped (i.e. when only the saved dumps are being reused), so the
# names are removed only if they exist.
for _stale_name in ("algo", "gs"):
    globals().pop(_stale_name, None)
del _stale_name

In [None]:
# Reload every fitted model from its dump file. `dump.load` returns a
# (predictions, algo) pair; only the algorithm (index 1) is needed here.
# NOTE(review): these dumps are pickles — only load files you produced yourself.
algo_svd = dump.load("dump/SVDfitted_dump")[1]
algo_knn = dump.load("dump/KNN_basic_fitted_dump")[1]
algo_nmf = dump.load("dump/NMFfitted_dump")[1]
algo_slopeone = dump.load("dump/SlopeOne_fitted_dump")[1]
algo_coclustering = dump.load("dump/CoClustering_fitted_dump")[1]

## Estimations

In [None]:
# Build the test set as a list of (user, item, rating) tuples in the exact row
# order of `df2`.
#
# Going through `data_test.build_full_trainset().build_testset()` regroups the
# ratings per user, so the predictions no longer line up with the rows of
# `df2` when they are later written back into `df2.Prediction`. `algo.test`
# accepts a plain list of such tuples and returns predictions in the same
# order, which keeps every prediction on its own submission row.
test = list(zip(df2["user"], df2["item"], df2["Prediction"].astype(float)))

In [None]:
# Run every fitted model on the same test set, in a fixed model order; each
# result is whatever that model's `test` call returns for `test`.
array_SVD, array_KNN, array_NMF, array_SlopeOne, array_CoClustering = (
    model.test(test)
    for model in (algo_svd, algo_knn, algo_nmf, algo_slopeone, algo_coclustering)
)

In [None]:
# DISABLED CODE: everything below is one string literal, so none of it runs.
# It keeps an earlier row-by-row estimation loop over `df2` that fell back to
# a user mean, an item mean or the global mean when the user and/or the item
# was not present in the training frame.
# NOTE(review): if this is ever re-enabled, it refers to `algo_SlopeOne`,
# while the loading cell defines `algo_slopeone`.
'''valid_items=df.item.unique()
valid_users=df.user.unique()
global_mean=df["Prediction"].mean()
count_bad_users=0
count_bad_items=0
count_bad_both=0
global_mean=df["Prediction"].mean()

for i in df2.iterrows():
    if i[0]%100000==0:
        print(i[0])
    if (i[1]["user"] in valid_users and i[1]["item"] in valid_items):
        array_SVD[i[0]] = algo_svd.estimate(int(i[1][2])-1, int(i[1][3])-1)
        array_KNN[i[0]]= algo_knn.estimate(int(i[1][2])-1, int(i[1][3])-1)[0]
        array_NMF[i[0]] = algo_nmf.estimate(int(i[1][2])-1, int(i[1][3])-1)
        array_SlopeOne[i[0]]=algo_SlopeOne.estimate(int(i[1][2])-1, int(i[1][3])-1)
        array_CoClustering[i[0]] = algo_coclustering.estimate(int(i[1][2])-1, int(i[1][3])-1)
    elif (i[1]["user"] in valid_users):
        user_mean=df[df["user"]==i[1]["user"]].mean()
        array_SVD[i[0]] = user_mean
        array_KNN[i[0]]= user_mean
        array_NMF[i[0]] = user_mean
        array_SlopeOne[i[0]]=user_mean
        array_CoClustering[i[0]] = user_mean
        count_bad_users+=1
    elif (i[1]["item"] in valid_items):
        item_mean=df[df["item"]==i[1]["item"]].mean()
        array_SVD[i[0]] = item_mean
        array_KNN[i[0]]= item_mean
        array_NMF[i[0]] = item_mean
        array_SlopeOne[i[0]]= item_mean
        array_CoClustering[i[0]] = item_mean
        count_bad_items+=1
    else:
        array_SVD[i[0]] = global_mean
        array_KNN[i[0]]= global_mean
        array_NMF[i[0]] = global_mean
        array_SlopeOne[i[0]]= global_mean
        array_CoClustering[i[0]] = global_mean
        count_bad_both+=1
print(f"Done with {count_bad_users} bad users, {count_bad_items} bad items, {count_bad_both} bad both.")'''

### Handling values that could not be estimated

In [None]:
# DISABLED CODE: this cell holds only two string literals, so nothing here
# runs. The second one keeps an earlier step that replaced the -1000
# placeholder in each model's array by the mean of that model's other values.
'''Here we treat the value of -1000'''
'''
mean_svd = np.mean(array_SVD[array_SVD != -1000])
array_SVD[array_SVD==-1000] = mean_svd

mean_knn = np.mean(array_KNN[array_KNN != -1000])
array_KNN[array_KNN==-1000] = mean_knn

mean_nmf = np.mean(array_NMF[array_NMF != -1000])
array_NMF[array_NMF==-1000] = mean_nmf

mean_slopeone = np.mean(array_SlopeOne[array_SlopeOne != -1000])
array_SlopeOne[array_SlopeOne==-1000] = mean_slopeone

mean_coclustering = np.mean(array_CoClustering[array_CoClustering != -1000])
array_CoClustering[array_CoClustering==-1000] = mean_coclustering
'''

# Apparently there are no more errors now that the -1 is applied in the previous cell.

### Blending 

In [None]:
# Blend the per-model predictions into one final rating per test row.
#
# TODO: full blending still to do, either by
#   - using some model like linear regression,
#   - a weighted average,
#   - something else (can be worked on in the blending branch).
# The result must end up in `final_array`.
#
# For now the blend is simply the mean over all models.
#
# Each `array_*` comes from `algo.test(...)`, i.e. a list of Surprise
# `Prediction` tuples (uid, iid, r_ui, est, details). Only the estimated
# rating `est` may be averaged, so it is extracted first; concatenating the
# raw tuples would mix ids, placeholder ratings and detail dicts into the mean.
model_outputs = (array_SVD, array_KNN, array_NMF, array_SlopeOne, array_CoClustering)

# One row per test rating, one column per model.
tmp = np.column_stack([[pred.est for pred in preds] for preds in model_outputs])

# Average the models, round to the nearest integer rating, keep the result in
# the valid 1..5 range and store it as integers, like the sample submission.
final_array = np.clip(np.rint(tmp.mean(axis=1)), 1, 5).astype(int)
final_array

## Building the submission file

In [None]:
# Overwrite the placeholder ratings of the submission frame with the blend.
df2["Prediction"] = final_array

In [None]:
# The helper columns split out of `Id` are not written to the submission file.
df2 = df2.drop(["user", "item"], axis="columns")

In [None]:
# Write the submission CSV without the DataFrame index.
df2.to_csv(path_or_buf="Datasets/submission_pipeline.csv", index=False)