In [None]:
import numpy as np
import pandas as pd
import re # Used for the conversion of "r##_c##" in only the numbers --> TODO: check where it comes from
from IPython.display import display
from helpers import *
from play_with_data import *
from pre_processing import *
from matrix_factorization import *
from cross_validation import *
import scipy.sparse as sp # In order to use sparse 
# Predictors imported in performance order (best to worst, according to http://surpriselib.com/)
from surprise import SVDpp
from surprise import KNNBaseline
from surprise import SVD
from surprise import SlopeOne
from surprise import BaselineOnly
from surprise import KNNWithMeans
from surprise import NMF
from surprise import CoClustering
from surprise import KNNBasic
from surprise import KNNWithZScore # not scored --> to be tested quickly
from surprise import dataset
from surprise import Dataset
from surprise import evaluate, print_perf
from surprise import GridSearch

from __future__ import (absolute_import, division, print_function,
                        unicode_literals)
%load_ext autoreload
%autoreload 2

# Done before the exercise
- This builds a NumPy matrix from the ratings given in "data_train.csv"
- We then use this matrix to show some statistics about the data

In [None]:
#********* Load the given data **********
# load_data_old returns two parallel sequences read from the training file:
#   r_c -- the position ID (user, movie) of each rating
#   x   -- the rating value found at that position
r_c, x = load_data_old('./data_train.csv') # r_c contains the positions (user, movie) and x contains the ratings

In [None]:
#********* Creation of a matrix of the data ********
# Dimensions of the full rating matrix (these numbers were given with the task).
nUser = 10000
nItem = 1000
# Entries that are never assigned below stay at 0, i.e. positions without a rating in the file.
data = np.zeros([nUser, nItem])

# Compile the pattern once, as a raw string: '\d' inside a normal string literal is an
# invalid escape sequence, and the original parsed and ran the regex twice per row.
_id_numbers = re.compile(r'\d+')

for ind, position_id in enumerate(r_c): # Loop over all the IDs in order to fill the numpy matrix
    # The first two numbers found in the ID are the 1-based row (user) and column (movie).
    row, col = _id_numbers.findall(position_id)[:2]
    data[int(row) - 1, int(col) - 1] = x[ind]




In [None]:
#********** Data preview ************
# Check whether any data is missing.
# We were told that the data covers 10'000 users and 1000 films, but not every
# (user, film) position actually has a rating.
info_general(nUser, nItem, x, data)
print('\n')
# NOTE(review): presumably draws the bar plot of the rating distribution that the
# following markdown cell discusses -- confirm in the helper module.
info_ratings(data)


With this barplot, we can see that the ratings are not uniformly distributed; this suggests that there is a bias in the rating matrix that has to be taken into account.

# Done after the exercise

In [None]:
#******** Creation of a sparse matrix of the data **********
# Reads the same training file as above, this time through load_data
# (which, per this cell's title, yields a sparse matrix -- confirm in helpers).
ratings = load_data('./data_train.csv')

In [None]:
#***** Data preview *********
# plot_raw_data originally comes from the course material (ex10 'plots.py').
num_items_per_user, num_users_per_item = plot_raw_data(ratings)

# Report the extremes of both count vectors returned by plot_raw_data.
items_report = "Maximum number of items per user:\t{}\nMinimum number of items per user:\t{}\n"
users_report = "Maximum number of users per item:\t{}\nMinimum number of users per item:\t{}"
print(items_report.format(np.max(num_items_per_user), np.min(num_items_per_user)))
print(users_report.format(np.max(num_users_per_item), np.min(num_users_per_item)))

## Importing the testset

In [None]:
# Load the sample submission with the same loader as the training data; it is used
# further down (section "For submission") to build the set of positions to predict.
test = load_data('./sample_submission.csv')

# Done so far
- Preview of the data
- Pre-processing:
    - Choosing only the "valid ratings", i.e. the users and items that have more than min_num_ratings ratings
    - Splitting the data into train and test sets, by choosing 90% of the ratings from valid_ratings (only the non-zero values) for training

In [None]:
# Convert the rating matrix into a surprise dataset and prepare 3 cross-validation
# folds; these folds are the ones consumed by evaluate() in the next cell.
ratings_surpr = formating_data_surprise(ratings)
ratings_surpr.split(n_folds=3)


In [None]:
################ Evaluate all the algorithms ########################
# Display names and estimator instances are kept in the same order and paired with zip.
algo_str = ['SVDpp', 'KNN Baseline','SVD', 'Slope One', 'BaselineOnly', 'KNN with Z score']
algos = [SVDpp(),KNNBaseline(),SVD(),SlopeOne(),BaselineOnly(),KNNWithZScore()]
perf = {}

for label, candidate in zip(algo_str, algos):
    # Cross-validated RMSE of this algorithm on the folds of ratings_surpr.
    scores = evaluate(candidate, ratings_surpr, measures=['RMSE'])
    perf[label] = scores
    print_perf(scores)


In [None]:
# perf is a plain dict, so np.save stores it as a pickled object array; reading it back
# needs np.load('perf_dictionary.npy', allow_pickle=True).item() on recent NumPy versions.
np.save('perf_dictionary.npy', perf)

# GRID SEARCH

In [None]:
# Hyper-parameter grid explored for SVDpp: 3 * 3 * 5 * 5 = 225 combinations,
# each evaluated on 3 folds.
param_grid = {'init_mean': [0,2,4], 
              'init_std_dev': [0.1, 0.3, 0.5],
              'lr_all': [0.002, 0.004, 0.006, 0.008, 0.01],
              'reg_all': [0.01, 0.03, 0.05, 0.07, 0.1]}

grid_search = GridSearch(SVDpp, param_grid, measures=['RMSE', 'FCP'],
                         verbose=False)


# Build a fresh surprise dataset from our ratings and split it into 3 folds.
# (The algorithm searched here is SVDpp on the project data, not SVD on movielens.)
ratings_ = formating_data_surprise(ratings)
#trainset = ratings_.build_full_trainset()

ratings_.split(n_folds=3)
grid_search.evaluate(ratings_)

# best RMSE score
print(grid_search.best_score['RMSE'])
# NOTE(review): a previous comment quoted 0.96117566386 here; it is not known to come from this grid.

# combination of parameters that gave the best RMSE score
print(grid_search.best_params['RMSE'])
# NOTE(review): a previous comment quoted {'reg_all': 0.4, 'lr_all': 0.005, 'n_epochs': 10},
# which cannot be produced by param_grid above (0.4 and n_epochs are not in the grid).

## Majority Creation


In [None]:
#******** Creation of a sparse matrix of the data ******
ratings = load_data('./data_train.csv')
train_ratings, test_ratings = split_data(ratings, prob_test=0.15) # splitting in train and test set


########### Define: algo, dataset (trainset) ##############
# With the second argument set to True, formating_data_surprise returns a DataFrame
# together with the surprise dataset (training set).
dataF_train, ratings_train = formating_data_surprise(train_ratings, True)

trainset_algo = ratings_train.build_full_trainset() # Build trainset
trainset_pred = trainset_algo.build_testset() # Same ratings as an iterable, so that we can predict on the training set

# This is our test set, to be used later.
ratings_test = formating_data_surprise(test_ratings)
validationset = ratings_test.build_full_trainset()
# -------------------------------------------------

########## Train and test the algo ###########
algorithm = [SVDpp(),KNNBaseline(),NMF(),SVD(),SlopeOne(),BaselineOnly(),KNNWithZScore()]

# One single-column frame per algorithm. Every column is deliberately named
# 'prediction': a later cell selects all of them at once with prediction_df['prediction'].
# NOTE(review): the columns are glued to dataF_train by position, which assumes that
# build_testset() yields the ratings in the same order as the rows of dataF_train -- confirm.
prediction_columns = []
for algo in algorithm:
    algo.train(trainset_algo) # Training of the algo
    pred = algo.test(trainset_pred) # Make the prediction
    estim = [p.est for p in pred] # Estimated rating of every prediction
    prediction_columns.append(pd.DataFrame({'prediction': pd.Series(estim)}))

# Concatenate once instead of re-copying the growing frame on every iteration.
prediction_df = pd.concat([dataF_train] + prediction_columns, axis=1)

display(prediction_df.head())

In [None]:
# Keep only the real ratings (column 1) followed by the predictions of all the algorithms;
# columns 0 and 2 of prediction_df are dropped (presumably the user / item ids -- confirm).
second_step_dataset_df = prediction_df.drop(prediction_df.columns[[0, 2]], axis=1)
# TODO: should be adapted -- the names must follow the order of the algorithms that were trained.
second_step_dataset_df.columns = ['Label','SVDpp','KNNBaseline','NMF','SVD','SlopeOne','BaselineOnly', 'KNNWithZScore']
display(second_step_dataset_df.head())
# np.save would silently append '.npy' and write a binary array without the column names;
# write an actual CSV file under the intended name instead.
second_step_dataset_df.to_csv('predictions_allAlgos.csv', index=False)

In [None]:
# Cleaning the dataframe
algo_names = ['SVDpp','KNNBaseline','NMF','SVD','SlopeOne','BaselineOnly','KNNWithZScore']

# Selecting 'prediction' returns every algorithm's column (they all share that name);
# the estimates are rounded so that they can be compared as votes.
only_prediction_df = prediction_df['prediction'].round()
only_prediction_df.columns = algo_names # One column name per algorithm
only_prediction_df['Majority'] = 0 # Placeholder column for the majority vote

In [None]:
def _majority_vote(votes):
    """Return the most frequent value in ``votes``.

    When several values are tied for the highest count, the highest value wins,
    because we saw that the rating distribution is shifted upwards.
    """
    values, counts = np.unique(votes, return_counts=True) # values come back sorted in ascending order
    return values[counts == counts.max()][-1] # last tied value == highest rating


# Vote over the algorithm columns only: the placeholder 'Majority' column (0) must not
# be counted as if it were a prediction.
vote_columns = [col for col in only_prediction_df.columns if col != 'Majority']
# Go through all the rows of the underlying array (.values replaces the deprecated
# as_matrix()) and assign the whole column at once instead of one .loc write per row.
only_prediction_df['Majority'] = [_majority_vote(votes)
                                  for votes in only_prediction_df[vote_columns].values]

display(only_prediction_df.head())

In [None]:
# Persist both frames: the rounded per-algorithm votes with their majority,
# and the full frame holding the raw predictions of every algorithm.
only_prediction_df.to_csv('only_pred.csv')
prediction_df.to_csv('all_pred.csv')

# For submission

In [None]:
########### Define: algo, dataset (trainset) ##############
# For the submission the model is trained on all the available ratings (no hold-out).
ratings_ = formating_data_surprise(ratings)
trainset = ratings_.build_full_trainset()


In [None]:
########### Define: testset ##############
# The sample submission is turned into a surprise trainset only as an intermediate
# step: build_testset() is called on it in the next cell.
test_ratings_ = formating_data_surprise(test)
test_trainset = test_ratings_.build_full_trainset()


In [None]:
# Iterable of the positions listed in the sample submission, in the form expected by algo.test().
testset = test_trainset.build_testset()

In [None]:
########## Train and test the algo ###########

# Alternative hyper-parameters kept for reference: n_epochs = 20, lr_all = 0.002, reg_all = 0.2

#algorithm = SVD(n_epochs = 20, lr_all = 0.002, reg_all = 0.2)
# NOTE: this rebinds `algorithm`, which held the list of all algorithms in the majority section.
algorithm = SVD()
algorithm.train(trainset)

In [None]:
# Predict a rating for every position of the submission test set.
pred = algorithm.test(testset)

In [None]:
########## Creation of the lists: row_users, col_movies, estim ########
# Split the predictions made on the test set into three parallel lists.
row_users = [p.uid for p in pred] # indices of the users
col_movies = [p.iid for p in pred] # indices of the movies
estim = [p.est for p in pred] # estimated ratings


In [None]:
######### Create the CSV files ##########
name = 'SVD.csv' # Name of the submission file

# Alternatives for `estim` (disabled): reuse the majority vote computed earlier ...
#estim = only_prediction_df['Majority'] # This is only here in order to use the estimation done previously

# ... or the mean of the predictions of all the algorithms.
#estim = prediction_df['prediction'].mean(axis = 1)
#print(estim.shape)
create_csv_submission(row_users, col_movies, estim, name) # Write the users, movies and estimated ratings to the CSV file
    

In [None]:
# Sanity check: number of predictions that were produced for the submission.
print(len(pred))

# DON'T CARE

### Question

   - Il est normal de mettre un threshold (min_num_ratings) parce qu'on veut un certain nombre de données pour se prononcer quant à la note à donner ; ceci implique qu'on enlève des users et des items, du coup notre matrice des ratings va être plus petite. Quand on la remplit et qu'on fait un submit, comment est-ce qu'on gère ça ?
       - Ici je suppose que c'est bien de faire la sélection des ratings pour le train et le test --> comme ceci on n'est pas, ou moins, biaisé par les movies et users qui n'ont que des 0 et qui donc n'apportent rien à part du "bruit"
       - Pour le remplissage de ce que l'on doit submit, j'ai fait une petite comparaison entre les data que l'on nous donne et ce qui se trouve dans le sample_submission (enfin, je vais le faire)

In [None]:
#********** Pre-processing **********
min_num_ratings = 10 # or 15 this is based on the information given above
# Uses the per-user / per-item counts returned earlier by plot_raw_data.
ratings_valid = valid_ratings(ratings, num_items_per_user, num_users_per_item,min_num_ratings)
# NOTE: this rebinds `test`, which previously held the sample-submission data.
train, test = split_data(ratings_valid) # This will put 90% of the items for the users that have at least one non-zero entry and the 10% in test


In [None]:
# NOTE(review): the meaning of the positional arguments is not visible in this notebook;
# check the signature of cross_validation_application in cross_validation.py.
cross_validation_application(train ,False ,4,0.01, 20, 0.2, 0.3,1)

In [None]:
#********* Try ************
# Alternative that was tried (disabled): RMSE obtained from rmse_movie_mean.
#rmse = rmse_movie_mean(train, test)
# The value returned by matrix_factorization_SGD on the train / test split is kept as the RMSE.
rmse = matrix_factorization_SGD(train, test)

In [None]:
# Display the value assigned to rmse in the previous cell.
rmse