In [None]:
import numpy as np
import pandas as pd
import re # Used for the conversion of "r##_c##" in only the numbers --> TODO: check where it comes from
from IPython.display import display
from helpers import *
from play_with_data import *
from pre_processing import *
from matrix_factorization import *
from cross_validation import *
from apply_classifiers import *
from trainings_submissions import *
from regressions_models import *
from majority_mean import *
import scipy.sparse as sp # In order to use sparse 
# Predictors imported in performance order (best to worst, according to http://surpriselib.com/)
from surprise import SVDpp
from surprise import KNNBaseline
from surprise import SVD
from surprise import SlopeOne
from surprise import BaselineOnly
from surprise import KNNWithMeans
from surprise import NMF
from surprise import CoClustering
from surprise import KNNBasic
from surprise import KNNWithZScore # not scored --> to be tested quickly
from surprise import dataset
from surprise import Dataset
from surprise import evaluate, print_perf
from surprise import GridSearch
from surprise import accuracy
from sklearn.metrics import mean_squared_error
from sklearn import linear_model
from numpy import loadtxt
from xgboost import XGBClassifier
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from __future__ import (absolute_import, division, print_function,
                        unicode_literals)

%load_ext autoreload
%autoreload 2

# Load the given data

In [None]:
#******** Creation of a sparse matrix of the data (training set)**********
# `ratings` is reused below for the statistics plots, the Surprise formatting and the train/validation split
ratings = load_data('./data_train.csv')

In [None]:
#******** Creation of a sparse matrix of the data (test set) ********
# NOTE(review): `test` is not referenced again in the visible cells - confirm whether it is still needed
test = load_data('./sample_submission.csv')

# Preview of the given data and some statistics
- We load the training data with another method in order to compute some statistics
- Everything in this section is only meant to help us learn more about the given data

In [None]:
#********* Load the given data **********
# r_c contains the positions (userID_movieID) and x contains the ratings;
# the numbers inside each position string are extracted with a regex in the next cell
r_c, x = load_data_old('./data_train.csv')

In [None]:
#********* Creation of a matrix of the data ********
nUser = 10000 # These numbers were given
nItem = 1000 # These numbers were given
data = np.zeros([nUser, nItem]) # Initialization of the matrix; positions without a rating stay at 0

# Raw string so that "\d" is read as the regex digit class and not as an (invalid) string escape;
# compiled once because the same pattern is applied to every ID.
id_numbers = re.compile(r'\d+')

for ind, position_id in enumerate(r_c): # Loop over all the IDs, in order to create a numpy matrix
    numbers = id_numbers.findall(position_id) # Parse the ID only once: first number = row, second number = column
    # IDs are 1-based, the matrix is 0-based
    data[int(numbers[0])-1, int(numbers[1])-1] = x[ind]




In [None]:
#********** Data preview ************
# Check whether any data is missing:
# we were told that the data covers 10'000 users and 1'000 films, but not every (user, film) pair has a rating
info_general(nUser, nItem, x, data) # Call of a method that will print some general information about the data
print('\n')
info_ratings(data) # Call a method that will print some information about the ratings


With this barplot, we can see that the ratings are not uniformly distributed. This may suggest that there is a bias in the rating matrix that has to be taken into account.

In [None]:
#***** Data preview (cont'd) *********
# Number of ratings per user and per movie
# (the original code of plot_raw_data is from the course, ex10 'plots.py')
num_movies_per_user, num_users_per_movie = plot_raw_data(ratings)

# Report templates, filled with the extreme counts on each axis
per_user_report = "Maximum number of movies per user:\t{}\nMinimum number of movies per user:\t{}\n"
per_movie_report = "Maximum number of users per movie:\t{}\nMinimum number of users per movie:\t{}"

print(per_user_report.format(np.max(num_movies_per_user), np.min(num_movies_per_user)))
print(per_movie_report.format(np.max(num_users_per_movie), np.min(num_users_per_movie)))

# Algorithms from "Surprise"
- First, cross validation on the training set in order to get an idea of the performance of each algorithm WITHOUT any optimization
- Then we optimize the algorithms; this can be seen in the notebook: algorithm_optimization.ipynb
- After that, we run the cross validation on the training set again in order to show the performance of each optimized algorithm

## Cross validation to evaluate the performance of the algorithms (without any optimization)

In [None]:
#*********** Formatting the data correctly for Surprise + Cross Validation *************
# `ratings_surpr` (with its folds) is shared by both evaluation cells below
ratings_surpr = formating_data_surprise(ratings) # Call a method that will transform the ratings in the right format
ratings_surpr.split(n_folds=3) # Will create the 3 folds for cross validation


In [None]:
################ Evaluate all the algorithms ########################
# Names and instances are listed in the same order so that they can be paired up below
algo_str = ['SVDpp','KNNBaseline','NMF','SVD','SlopeOne','BaselineOnly','KNNWithZScore']
algos = [
    SVDpp(),
    KNNBaseline(),
    NMF(),
    SVD(),
    SlopeOne(),
    BaselineOnly(),
    KNNWithZScore()
]
perf = {}

for name, algo in zip(algo_str, algos):
    # RMSE of this "Surprise" algorithm (constructed without arguments) on the folds of ratings_surpr
    scores = evaluate(algo, ratings_surpr, measures=['RMSE'])
    perf[name] = scores
    print_perf(scores) # Print the performance of this algorithm


In [None]:
#----------- SAVE -----------------
# Uncomment the line just below if you want to save the variable
#np.save('perf_dictionary.npy', perf) # Saving the dictionary that contains the RMSE of all the algos evaluated above

## Optimization of the algorithms 

#### ---- Grid search and the optimisation is in the Jupyter Notebook: algorithm_optimization.ipynb

## Cross validation to evaluate the performance of the algorithms (with optimization) 

In [None]:
################ Evaluate all the algorithms ########################
# Same evaluation loop as for the default algorithms, this time with explicit parameters
algo_str = [
    'SVDpp_optimized',
    'KNNBaseline_optimized',
    'NMF_optimized',
    'SVD_optimized',
    'SlopeOne_optimized',
    'BaselineOnly_optimized',
    'KNNWithZScore_optimized'
]
algos = [
    SVDpp(n_factors=10, lr_all=0.00177827941004, reg_all=0.001),
    KNNBaseline(
        k=96,
        min_k=8,
        sim_options={'name': 'pearson_baseline', 'user_based': False, 'shrinkage': 500},
        bsl_options={'method': 'als', 'reg_u': 14.4, 'reg_i': 0.3}
    ),
    NMF(n_factors=35, reg_pu=10**(-1.5), reg_qi=10**(-0.5)),
    SVD(n_factors=100, lr_all=0.001, reg_all=10**(-1.5)),
    SlopeOne(),
    BaselineOnly(bsl_options={'method': 'als', 'reg_u': 14.4, 'reg_i': 0.3}),
    KNNWithZScore(
        k=100,
        min_k=7,
        sim_options={'name': 'pearson_baseline', 'user_based': False, 'shrinkage': 500}
    )
]
perf = {}

for name, algo in zip(algo_str, algos):
    # RMSE of this "Surprise" algorithm on the folds of ratings_surpr
    scores = evaluate(algo, ratings_surpr, measures=['RMSE'])
    perf[name] = scores
    print_perf(scores) # Print the performance of this algorithm


# Use algorithms' predictions to find the best predicted ratings

First, we set up all the variables, algorithms, regressors and classifiers.
Then we apply these different strategies:
1. Majority vote
2. Mean of the predictions
3. Apply classifiers on the predictions (from sklearn)
4. Apply regressors on the predictions (from sklearn and XGBoost)

Then we will choose the best one of the four cited just above and try to optimize some parameters to have an even better prediction

What we will do in details:
- Define the algorithms with the best parameters (found with the optimization done in "algorithm_optimization.ipynb")
- Create a training set (70%) and a validation set (30%) from the "data_train.csv"
- Train the algorithms on the training set
- Apply the trained algorithms on the validation set in order to have the predictions of each algorithm
- Apply the different strategies quoted above in order to have the final prediction
    - The strategies are explained below


## Set up everything (algorithms, regressors and classifiers)

In [None]:
############# Algos with optimized parameters ##############

############ Define some variables #################
# Name of the algorithms, in the same order as `algos`; used to name the dataframe columns
columns_name = ['SVDpp','KNNBaseline','NMF','SVD','SlopeOne','BaselineOnly','KNNWithZScore']

#***** All the algorithms we are using (optimized) *******
algos = [
    SVDpp(n_factors=10, lr_all=0.00177827941004, reg_all=0.001),
    KNNBaseline(
        k=96,
        min_k=8,
        sim_options={'name': 'pearson_baseline', 'user_based': False, 'shrinkage': 500},
        bsl_options={'method': 'als', 'reg_u': 14.4, 'reg_i': 0.3}
    ),
    NMF(n_factors=35, reg_pu=10**(-1.5), reg_qi=10**(-0.5)),
    SVD(n_factors=100, lr_all=0.001, reg_all=10**(-1.5)),
    SlopeOne(),
    BaselineOnly(bsl_options={'method': 'als', 'reg_u': 14.4, 'reg_i': 0.3}),
    KNNWithZScore(
        k=100,
        min_k=7,
        sim_options={'name': 'pearson_baseline', 'user_based': False, 'shrinkage': 500}
    )
]

#*********** Regressors ***************
# Candidate models (all with default parameters) to be fitted on the algorithms' predictions
regressions_method = [
    XGBRegressor(),
    linear_model.LinearRegression(),
    linear_model.Ridge(),
    linear_model.Lasso(),
    linear_model.BayesianRidge(),
    linear_model.ElasticNet(),
    linear_model.HuberRegressor(),
    linear_model.LassoLars(),
    linear_model.PassiveAggressiveRegressor(),
    linear_model.SGDRegressor()
]

#*********** Classifiers ***************
# These names are not defined in this notebook; they come from the wildcard imports of the first cell
classifiers_method = [
    naive_bayes,
    kNearestNeigh,
    decision_tree,
    neural_net,
    support_vectorMachine,
    discr_analysis,
    lin_discr_analysis
]


In [None]:
# ============= TRAINING ====================
#********* Creation of train and validation ratings *********
# NOTE: the split is random (see the "Observations" section further down), so results differ from run to run
train_ratings, validation_ratings = split_data(ratings, prob_test=0.3) # splitting in train and validation set

#********* Do the training with all the algos ****************
print('-----START -----\nTraining of the algos\n')
# algos_trained is iterated in the next cell, one trained algorithm per prediction column
algos_trained = first_train(train_ratings,algos) # Call function first_train --> train the algorithms


In [None]:
# ============= Validation set up ============
#******** prepare the validation set *********
validation_df, validation_surprise = formating_data_surprise(validation_ratings, True) # Formatting the data in order to use Surprise
validation_set = validation_surprise.build_full_trainset() # Build trainset
validation_set_pred = validation_set.build_testset() # Build iterable object in order to test

######### Predictions by the trained algorithms #############
# One single-column frame of estimated ratings per trained algorithm.
# NOTE(review): the estimates carry a default 0..n-1 index and are aligned with validation_df on the index,
# which assumes that build_testset() yields the ratings in the same order as the rows of validation_df - confirm.
estimate_frames = [
    pd.DataFrame({'prediction': pd.Series([p.est for p in algo_t.test(validation_set_pred)])})
    for algo_t in algos_trained
]

# Concatenate everything once instead of growing the frame inside a loop;
# pd.concat returns a new frame, so validation_df itself is kept intact.
prediction_validation_df = pd.concat([validation_df] + estimate_frames, axis=1)

first_col = ['movies ID', 'Label', 'users ID'] # Name of the first columns
all_col = first_col + columns_name # All the columns name (requires one name per trained algorithm)
prediction_validation_df.columns = all_col # Put the names of the column



In [None]:
# Full validation frame: the ID columns, 'Label' and one prediction column per algorithm
print('This is the data we get after the predictions on the validation set')
display(prediction_validation_df.head())

# Same frame without the two ID columns; drop() returns a new frame, so the full one stays unchanged
print('\nThis is the data we get after the predictions on the validation set and after just removed "movies ID" and "users ID"')
prediction_label_df = prediction_validation_df.drop(['movies ID', 'users ID'], axis = 1)
display(prediction_label_df.head())

#----------- SAVE -----------------
# Uncomment these lines just below if you want to save the variables

#prediction_validation_df.to_csv('prediction_df_validation.csv')
#np.save('algos_trained_training', algos_trained)

## 1. Majority vote
- We use the algorithms trained on the training set to create predictions on the validation set. Then we apply the majority vote algorithm in order to obtain a final prediction. We also compute the rmse with the real ratings.

This strategy is quite simple and will choose the rating that is the most often predicted amongst the algorithms

In [None]:
#************* Apply majority on the validation set *************
# Receives the full frame (ID columns, 'Label' and the per-algorithm predictions);
# returns the final predictions and their RMSE
validation_pred_majority, validation_rmse_majority = majority_vote(prediction_validation_df)

In [None]:
# Report the RMSE returned by majority_vote in the previous cell
print('RMSE obtained on the validation set with Majority vote: {}'.format(validation_rmse_majority))

## 2. Mean vote
- We use the algorithms trained on the training set to create predictions on the validation set. Then we apply the mean vote algorithm in order to obtain a final prediction. We also compute the rmse with the real ratings.

This strategy is also quite simple, we take the mean of the ratings across all the algorithms for each user and movie pair.

In [None]:
#************* Apply mean on the validation set *************
# Receives the full frame (ID columns, 'Label' and the per-algorithm predictions);
# returns the final predictions and their RMSE
validation_pred_mean, validation_rmse_mean = mean_vote(prediction_validation_df)

In [None]:
# Report the RMSE returned by mean_vote in the previous cell
print('RMSE obtained on the validation set with Mean vote: {}'.format(validation_rmse_mean))

## 3. Classifiers on predictions
- We use the predictions done by the trained (optimized) algorithms and try to apply several different classifiers on them.
- We will choose the best classifier based on the RMSE

In [None]:
#********* Find the best classifier (with the lowest RMSE) **********
print('-----START -----\nSelection of classifier\n')

# Works on the frame without the ID columns ('Label' + one prediction column per algorithm)
clf, best_rmse_clf = apply_classifier(prediction_label_df, classifiers_method) # Call a method that will find the best classifier and its RMSE


In [None]:
######## Print the best classifier and its RMSE ############
# clf and best_rmse_clf are the two values returned by apply_classifier in the previous cell
print(clf)
print('\nRMSE obtained with the best classifier: {}'.format(best_rmse_clf))

## 4. Regressors on predictions
- We use the predictions done by the trained (optimized) algorithms and try to apply several different regressors on them.
- We will choose the best regressor based on the RMSE

In [None]:
#********* Find the best regressor (with the lowest RMSE) **********
print('-----START -----\nSelection of regressor\n')
# Works on the frame without the ID columns ('Label' + one prediction column per algorithm)
reg, best_rmse_reg = lin_regressors(prediction_label_df, regressions_method)

In [None]:
######## Print the best regressor and its RMSE ############
print(reg)
print('\nRMSE obtained with the best regressor: {}'.format(best_rmse_reg))

# The winning regressor changes from run to run and XGBRegressor is one of the candidates;
# unlike the sklearn linear models, it may not provide `coef_` (with a tree booster, accessing
# it raises AttributeError), so guard the access instead of letting the cell crash.
try:
    coefficients = reg.coef_
except AttributeError:
    print('\nThe best regressor ({}) does not expose coefficients'.format(type(reg).__name__))
else:
    print('\nThe coefficients are: {}'.format(coefficients))

## Observations
- We can see that the regression technique is the one that works best, but we also see that when we run the whole notebook multiple times, the best regressor is not always the same. This is because the split_data method uses a random function, so the training/validation split is different from one run to the next.
- So now we will try to improve the best regressor (obtained in one run) by optimizing its parameters

## Optimization
- We take the best regressor obtained in one run of the whole notebook, and now we will try to find the best parameters for this regressor

In [None]:
############# Define the best regressor and some parameters ############
regressor_toOptimize = linear_model.SGDRegressor()
# Grid of 5 x 5 x 5 = 125 parameter combinations.
# NOTE(review): per the scikit-learn documentation, `l1_ratio` is only used with penalty='elasticnet'
# and `epsilon` only with the huber / epsilon-insensitive losses; with a default SGDRegressor() these
# two axes may have no effect on the fit - confirm, or set penalty/loss accordingly.
hyperparams = {'alpha': np.logspace(-6,-2,5) , 
               'l1_ratio': np.linspace(0,0.6,5),  
               'epsilon': np.logspace(-3,1,5)
               }


In [None]:
############# Do the Grid Search optimization ###########

# NOTE(review): the regressor was selected on prediction_label_df (no ID columns) but is optimized here on
# prediction_validation_df, which still contains 'movies ID' and 'users ID' - confirm that
# optimization_regressor drops these columns itself, otherwise the IDs are used as features.
regressor_optimized = optimization_regressor(prediction_validation_df, regressor_toOptimize, hyperparams)
print(regressor_optimized) # will show the best param and also all the possible values of the parameters

In [None]:
# best_params_ is read from the object returned by optimization_regressor
# (presumably a fitted scikit-learn grid search - confirm)
print('These are the best parameters: {}'.format(regressor_optimized.best_params_))

#--------- SAVE -----------
# Uncomment if you want to save 
#np.save('best_regressor_param', regressor_optimized.best_params_)
#np.save('regressor_optimized', regressor_optimized)

# Best model
As explained in the report, we tried quite complex strategies without succeeding in improving our score, so we fell back on something much simpler: the optimized SVDpp algorithm trained on the whole training set.

You can run just the following cell on its own.

In [None]:
####### Call our final model
# The import stays in this cell on purpose, so that the cell can be run on its own.
# Only the entry point is imported (instead of `from run import *`), so that running this cell
# cannot overwrite notebook variables with same-named globals of run.py.
from run import run
run()