In [None]:
import numpy as np
import pandas as pd
import re # Used for the conversion of "r##_c##" in only the numbers --> TODO: check where it comes from
from IPython.display import display
from helpers import *
from play_with_data import *
from pre_processing import *
from matrix_factorization import *
from cross_validation import *
from apply_classifiers import *
from trainings_submissions import *
from regressions_models import *
from majority_mean import *
import scipy.sparse as sp # In order to use sparse 
# Predictors imported in performance order (best to worst, according to http://surpriselib.com/)
from surprise import SVDpp
from surprise import KNNBaseline
from surprise import SVD
from surprise import SlopeOne
from surprise import BaselineOnly
from surprise import KNNWithMeans
from surprise import NMF
from surprise import CoClustering
from surprise import KNNBasic
from surprise import KNNWithZScore # not scored --> to be tested quickly
from surprise import dataset
from surprise import Dataset
from surprise import evaluate, print_perf
from surprise import GridSearch
from surprise import accuracy
from sklearn.metrics import mean_squared_error
from sklearn import linear_model
from numpy import loadtxt
from xgboost import XGBClassifier
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from __future__ import (absolute_import, division, print_function,
                        unicode_literals)

%load_ext autoreload
%autoreload 2

# Load the given data

In [None]:
#******** Creation of a sparse matrix of the data (training set) **********
# load_data is a project helper (star-imported above); per the original note it
# returns the ratings as a sparse matrix -- TODO confirm the exact type in helpers.
ratings = load_data('./data_train.csv')

In [None]:
#******** Creation of a sparse matrix of the data (test set) ********
# The sample submission lists the (user, movie) positions that must be predicted.
test = load_data('./sample_submission.csv')

# Preview of the given data and some statistics
- We load the training data with another method in order to do some statistics
- All we do here is in order to learn more about the given data

In [None]:
#********* Load the given data (second loader, used only for the statistics below) **********
# r_c holds the position IDs (strings containing the user and movie numbers),
# x holds the ratings, in the same order as r_c.
r_c, x = load_data_old('./data_train.csv')

In [None]:
#********* Creation of a dense (user x movie) matrix of the data ********
nUser = 10000 # Number of users (given with the dataset)
nItem = 1000 # Number of movies (given with the dataset)
data = np.zeros([nUser, nItem]) # Entries that never receive a rating stay at 0

# Every ID in r_c contains two integers: the 1-based row (user) and the 1-based
# column (movie). The pattern is compiled once and written as a raw string so
# that "\d" is not treated as an (invalid) string escape sequence.
id_numbers = re.compile(r'\d+')

for ind, i in enumerate(r_c): # Loop over all the IDs in order to fill the matrix
    user_idx, movie_idx = (int(n) for n in id_numbers.findall(i)[:2]) # Parse the ID only once
    data[user_idx - 1, movie_idx - 1] = x[ind] # IDs are 1-based, numpy indices are 0-based




In [None]:
#********** Data preview ************
# Check how much data is missing.
# We were told that the data covers 10'000 users and 1'000 films, but not every
# (user, film) pair has a rating.
info_general(nUser, nItem, x, data) # Project helper: prints some general information about the data
print('\n')
info_ratings(data) # Project helper: prints some information about the ratings


This bar plot shows that the ratings are not uniformly distributed, which suggests that there is a bias in the rating matrix that has to be taken into account.

In [None]:
#***** Data preview (cont'd) *********
# Number of ratings per user and per movie.
# plot_raw_data comes from the course material (ex10 'plots.py'); it returns the
# two count vectors whose extrema are printed below.
num_movies_per_user, num_users_per_movie = plot_raw_data(ratings)
print("Maximum number of movies per user:\t{}\nMinimum number of movies per user:\t{}\n".format(np.max(num_movies_per_user), np.min(num_movies_per_user)))
print("Maximum number of users per movie:\t{}\nMinimum number of users per movie:\t{}".format(np.max(num_users_per_movie), np.min(num_users_per_movie)))

# Algorithms from "Surprise"
- First, cross-validation on the training set in order to get an idea of the performance of each algorithm WITHOUT any optimization
- Second, trying to optimize the algorithms using Grid Search (from Surprise) --> TODO: Ivan?

## Cross validation to evaluate the performance of the algorithms (without any optimization)

In [None]:
#*********** Formatting the data for Surprise + cross-validation *************
ratings_surpr = formating_data_surprise(ratings) # Project helper: converts the ratings into a Surprise dataset
ratings_surpr.split(n_folds=3) # Create the 3 folds used by the cross-validation below


In [None]:
################ Evaluate all the algorithms (default parameters) ########################
# Algorithm instances and, in the same order, the names used as keys of `perf`.
algos = [SVDpp(),KNNBaseline(),NMF(),SVD(),SlopeOne(),BaselineOnly(),KNNWithZScore()]
algo_str = ['SVDpp','KNNBaseline','NMF','SVD','SlopeOne','BaselineOnly','KNNWithZScore']
perf = {}

for algo_name, algo in zip(algo_str, algos): # Walk names and instances together
    # Cross-validated RMSE of this "Surprise" algorithm on the 3 folds
    perf[algo_name] = evaluate(algo, ratings_surpr, measures=['RMSE'])
    print_perf(perf[algo_name]) # Print the per-fold performance


In [None]:
#----------- SAVE -----------------
# Uncomment the line just below if you want to save the variable
#np.save('perf_dictionary.npy', perf) # Saving the dictionary that contains the RMSE of all the algos evaluated above

## Optimization of the algorithms 

### Grid search
- Optimization of the parameters for a given algorithm with given parameters
- TODO: IVAN'S code

In [None]:
#********** Define the parameters' grid and the grid search ***********
# 3 x 3 x 5 x 5 = 225 parameter combinations are evaluated for SVDpp.
param_grid = {
    'init_mean': [0,2,4],
    'init_std_dev': [0.1, 0.3, 0.5],
    'lr_all': [0.002, 0.004, 0.006, 0.008, 0.01],
    'reg_all': [0.01, 0.03, 0.05, 0.07, 0.1],
}

# Put the ratings in the Surprise format and create the 3 cross-validation folds
ratings_ = formating_data_surprise(ratings)
ratings_.split(n_folds=3)

# Run the grid search: every combination is cross-validated on the folds
grid_search = GridSearch(SVDpp, param_grid, measures=['RMSE', 'FCP'], verbose=False)
grid_search.evaluate(ratings_)

# Best (lowest) RMSE, followed by the parameter combination that produced it
print(grid_search.best_score['RMSE'])
print(grid_search.best_params['RMSE'])


# Use algorithms' predictions to find the best predicted ratings

First we set up all the variables, algorithms, regressors and classifiers

1. Majority vote
2. Mean of the predictions
3. Apply classifiers on the predictions (from sklearn)
4. Apply regressors on the predictions (from sklearn)
5. TODO: algorithm Home made??? --> If yes, add it in the "Structure of the code"

Then we will choose the best one of the five cited just above and try to optimize some parameters to have an even better prediction

## Set up everything (algorithms, regressors and classifiers)

In [None]:
############# Algos with optimized parameters ##############
# TODO --> put the correct parameters

#****** Options shared by several algorithms ********
# Item-based Pearson-baseline similarity (used by KNNBaseline and KNNWithZScore)
sim_options = {'name': 'pearson_baseline',
               'user_based': False ,
               'shrinkage': 500
              }
# ALS baseline estimates (used by KNNBaseline and BaselineOnly)
als_bsl_options = {'method': 'als', 'reg_u': 14.4, 'reg_i': 0.3}

#***** All the algorithms we are using (optimized) *******
# Every algorithm receives its own copy of the option dicts so that no state is
# shared between the instances.
algos = [SVDpp(n_factors=10,lr_all=0.00177827941004,reg_all=0.001),
         KNNBaseline(k=96, min_k=8, sim_options=dict(sim_options), bsl_options=dict(als_bsl_options)),
         NMF(n_factors=35,reg_pu=10**(-1.5),reg_qi=10**(-0.5)),
         SVD(n_factors=100,lr_all=0.001,reg_all=10**(-1.5)),
         SlopeOne(),
         BaselineOnly(bsl_options=dict(als_bsl_options)),
         KNNWithZScore(k=100, min_k=7, sim_options=dict(sim_options))
        ]


############ Define some variables #################
# Names of the algorithms, in the same order as `algos`; used as DataFrame column names
columns_name = ['SVDpp','KNNBaseline','NMF','SVD','SlopeOne','BaselineOnly','KNNWithZScore']

#*********** Regressors (sklearn, default parameters) ***************
regressions_method = [linear_model.LinearRegression(),
                      linear_model.Ridge(),
                      linear_model.Lasso(),
                      linear_model.BayesianRidge(),
                      linear_model.ElasticNet(),
                      linear_model.HuberRegressor(),
                      linear_model.LassoLars(),
                      linear_model.PassiveAggressiveRegressor(),
                      linear_model.SGDRegressor()
                     ]

#*********** Classifiers ***************
# These names come from the project's star imports (presumably apply_classifiers -- confirm).
classifiers_method = [naive_bayes,
                      kNearestNeigh,
                      decision_tree,
                      neural_net,
                      support_vectorMachine,
                      discr_analysis,
                      lin_discr_analysis
                     ]


In [None]:
# ============= TRAINING ====================
#********* Creation of train and validation ratings *********
train_ratings, validation_ratings = split_data(ratings, prob_test=0.3) # Split into a training and a validation set (prob_test=0.3)

#********* Do the training with all the algos ****************
print('-----START -----\nTraining of the algos\n')
algos_trained = first_train(train_ratings,algos) # Project helper: returns the algorithms trained on train_ratings
# NOTE(review): `prediction_df` has no visible assignment before this cell (the
# only one in this notebook is in the "For submission" section near the end).
# Unless a star-imported module exports it, this line raises NameError on
# "Restart Kernel -> Run All" -- TODO: pass the intended predictions DataFrame.
second_df, moviesID_userID_df = second_train_df(prediction_df, columns_name) # Call function second_train_df


In [None]:
# ============= Validation set up ============
#******** prepare the validation set *********
validation_df, validation_surprise = formating_data_surprise(validation_ratings, True) # DataFrame + Surprise dataset of the validation ratings
validation_set = validation_surprise.build_full_trainset() # Build trainset
validation_set_pred = validation_set.build_testset() # Build iterable object in order to test

######### Predictions by the trained algorithms #############
# One single-column frame per algorithm is collected and everything is
# concatenated once at the end (instead of re-copying the frame on every iteration).
# NOTE(review): the predictions are attached by position, which assumes that
# algo.test() returns them in the row order of validation_df -- TODO confirm.
prediction_frames = [validation_df]
for algo_t in algos_trained: # Loop over all the trained algorithms
    pred = algo_t.test(validation_set_pred) # Make the prediction
    estim = [p.est for p in pred] # Keep only the estimated ratings
    prediction_frames.append(pd.DataFrame({'prediction' : pd.Series(estim)}))
prediction_validation_df = pd.concat(prediction_frames, axis=1)

# Rename the columns: the three original columns followed by one column per algorithm
first_col = ['movies ID', 'Label', 'users ID']
all_col = first_col + columns_name
prediction_validation_df.columns = all_col
# prediction_validation_df contains "movies ID", "users ID", the real ratings ("Label") and the predictions


In [None]:
# Full validation frame: IDs, label and one prediction column per algorithm
print('This is the data we get after the predictions on the validation set')
display(prediction_validation_df.head())

# Same data without the two ID columns (label + predictions only);
# drop() already returns a new frame, so the original stays untouched.
print('\nThis is the data we get after the predictions on the validation set and after just removed "movies ID" and "users ID"')
prediction_label_df = prediction_validation_df.drop(['movies ID', 'users ID'], axis = 1)
display(prediction_label_df.head())
#----------- SAVE -----------------
# Uncomment the lines just below if you want to save the variables

#prediction_validation_df.to_csv('prediction_df_validation.csv')
#np.save('algos_trained_training', algos_trained)

## 1. Majority vote
- We use the algorithms trained on the training set to create predictions on the validation set. Then we apply the majority vote algorithm in order to obtain a final prediction. We also compute the rmse with the real ratings

In [None]:
#************* Apply majority vote on the validation set *************
# majority_vote is a project helper; it returns the combined prediction and its RMSE.
validation_pred_majority, validation_rmse_majority = majority_vote(prediction_validation_df)

In [None]:
# Report the RMSE of the majority-vote prediction
print('RMSE obtained on the validation set with Majority vote: {}'.format(validation_rmse_majority))

## 2. Mean vote
- We use the algorithms trained on the training set to create predictions on the validation set. Then we apply the mean vote algorithm in order to obtain a final prediction. We also compute the rmse with the real ratings

In [None]:
#************* Apply mean vote on the validation set *************
# mean_vote is a project helper; it returns the combined prediction and its RMSE.
validation_pred_mean, validation_rmse_mean = mean_vote(prediction_validation_df)

In [None]:
# Report the RMSE of the mean-vote prediction
print('RMSE obtained on the validation set with Mean vote: {}'.format(validation_rmse_mean))

## 3. Classifiers on predictions
- We use the predictions done by the trained (optimized) algorithms and try to apply several different classifiers on them.
- We will choose the best classifier based on the RMSE

In [None]:
#********* Find the best classifier (with the lowest RMSE) **********
print('-----START -----\nSelection of classifier\n')

# Project helper: tries every entry of classifiers_method on the label + predictions
# frame and returns the best classifier together with its RMSE.
clf, best_rmse_clf = apply_classifier(prediction_label_df, classifiers_method)


In [None]:
######## Print the best classifier and its RMSE ############
print(clf) # Shows the estimator with all its parameters
print('\nRMSE obtained with the best classifier: {}'.format(best_rmse_clf))


MLPClassifier(activation='relu', alpha=1e-05, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='lbfgs', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)

RMSE obtained with the best classifier: 1.4029965619933586

## 4. Regressors on predictions
- We use the predictions done by the trained (optimized) algorithms and try to apply several different regressors on them.
- We will choose the best regressor based on the RMSE

In [None]:
#********* Find the best regressor (with the lowest RMSE) **********
print('-----START -----\nSelection of regressor\n')
# Project helper: tries every entry of regressions_method on the label + predictions
# frame and returns the best regressor together with its RMSE.
reg, best_rmse_reg = lin_regressors(prediction_label_df, regressions_method)

In [None]:
######## Print the best regressor and its RMSE ############
print(reg) # Shows the estimator with all its parameters
print('\nRMSE obtained with the best regressor: {}'.format(best_rmse_reg))
print('\nThe coefficients are: {}'.format(reg.coef_)) # One weight per input column (the intercept is stored separately)

SGDRegressor(alpha=0.0001, average=False, epsilon=0.1, eta0=0.01,
       fit_intercept=True, l1_ratio=0.15, learning_rate='invscaling',
       loss='squared_loss', n_iter=5, penalty='l2', power_t=0.25,
       random_state=None, shuffle=True, verbose=0, warm_start=False)

RMSE obtained with the best regressor: 0.982891103210869

## Observations
- We can clearly see that the regression technique is the one that works best
- So now we will try to improve the best regressor by optimizing its parameters

# ============================================

# ---------- NEW -------------
Should be where there is everything done

In [None]:
#******** Train the best classifier and the best regressor on the whole training set **********
# get_label_predictions is a project helper; it splits the frame into the
# predictions (features) and the labels (targets).
# NOTE(review): second_df comes from the TRAINING cell, which depends on
# `prediction_df` being defined -- verify on a fresh kernel.
training_prediction_set, training_prediction_label = get_label_predictions(second_df)

clf.fit(training_prediction_set, training_prediction_label)
reg.fit(training_prediction_set, training_prediction_label)

In [None]:
############### Creation of Submission ##############
prediction_test_df = first_train(ratings,algos, test,submit = True) # Training on the whole set
train_df, moviesID_userID_df = second_train_df(prediction_test_df, columns_name) # Get the predictions from the algos

# Drop the first column so that only the algorithms' predictions remain
onlyPrediction = train_df.drop(train_df.columns[[0]], axis=1)

y_prediction_clf = clf.predict(onlyPrediction) # Apply the classifier
# A linear model predicts X.coef_ + intercept_. `reg` was fitted with an
# intercept, so it has to be added to the dot product (it was missing before).
# np.ravel handles both scalar and shape-(1,) intercepts.
y_prediction_reg = onlyPrediction.dot(reg.coef_) + np.ravel(reg.intercept_)[0]

######### Create the CSV files ##########
# Classifier submission
moviesID_userID_df['Prediction'] = y_prediction_clf # Add a prediction column
name = 'prediction_clf.csv' # Name of the file
create_csv_submission(moviesID_userID_df['users ID'], moviesID_userID_df['movies ID'], moviesID_userID_df['Prediction'], name) # To create the CSV file

# Regressor submission (the 'Prediction' column is overwritten)
name = 'prediction_reg.csv'
moviesID_userID_df['Prediction'] = y_prediction_reg
create_csv_submission(moviesID_userID_df['users ID'], moviesID_userID_df['movies ID'], moviesID_userID_df['Prediction'], name) # To create the CSV file


In [None]:
# SAVE
#prediction_test_df.to_csv('prediction_test_df_createSub.csv')
#train_df.to_csv('train_df_createSub.csv')
#moviesID_userID_df.to_csv('moviesID_userID_df_createSub.csv')

## Optimization
- We take the best regressor and now we will try to find the best parameters for this regressor

In [None]:
## TODO /////////////// DELETE \\\\\\\\\\\\\\\\\\\\\\\\\\\

# Reload the second-level training frame saved by a previous run
second_df_imp = pd.read_csv('./ALL_SAVE_BEST_RUN/second_df_training.csv')
# read_csv brings back the saved index as an extra first column: remove it.
# drop() returns a new frame, so second_df_imp itself is left unchanged.
second_df = second_df_imp.drop(second_df_imp.columns.values[0], axis=1)
# Prepend a column of ones (offset term)
col_one_df = pd.DataFrame(np.ones(len(second_df)))
second_df = pd.concat([col_one_df, second_df], axis=1)
display(second_df.head())





In [None]:
############# Define the best regressor and some parameters ############
# NOTE(review): the SGDRegressor printed earlier in this notebook uses the default
# loss='squared_loss' and penalty='l2'. In scikit-learn, `l1_ratio` is only used
# with penalty='elasticnet' and `epsilon` only with the huber / epsilon-insensitive
# losses, so with these defaults only `alpha` can change the result while the grid
# is 25 times larger than needed -- TODO: set loss/penalty or trim the grid.
regressor_toOptimize = linear_model.SGDRegressor()
hyperparams = {'alpha': np.logspace(-6,-2,5) , # 1e-6 ... 1e-2 (5 values)
               'l1_ratio': np.linspace(0,0.6,5),  # 0 ... 0.6 (5 values)
               'epsilon': np.logspace(-3,1,5) # 1e-3 ... 10 (5 values)
               }
# Earlier grids, kept for reference (note: the literal 1e-400 underflows to 0.0):
#hyperparams = {'epsilon': [1,2,3,4,5,6], 'alpha': [0.0001, 0.001,0.01, 0.1], 'tol': [1e-400,1e-100, 1e-6, 1e-5, 1e-4, 1e-2] }
#hyperparams = {'epsilon': [1, 1.35, 1.6], 'alpha': [0.00001, 0.0001, 0.001], 'tol': [1e-04,1e-05, 1e-06] }
#hyperparams = {'epsilon': [1, 1.35, 1.6]}

In [None]:
############# Do the Grid Search optimization ###########
# optimization_regressor is a project helper; the returned object exposes
# `best_params_` (read in the next cell).
regressor_optimized = optimization_regressor(second_df, regressor_toOptimize, hyperparams)
print(regressor_optimized) # will show the best param and also all the possible values of the parameters

In [None]:
# Show the parameter combination selected by the grid search
print('These are the best parameters: {}'.format(regressor_optimized.best_params_))

# Uncomment if you want to save
#np.save('best_regressor_param', regressor_optimized.best_params_)

# STOP HERE --> because now we have everything we need to define our final model and do a prediction

## Here I think we can just call "run()" and it is done!

### Then we need to define how we want to put all the "optimization" steps

In [None]:
####### Call our final model
# NOTE(review): star import placed in the middle of the notebook; it can shadow
# names defined above. Consider `from run import run` in the top import cell.
from run import *
run()

In [None]:
# Second variant of the final pipeline (project module run_2).
# NOTE(review): same remark as for `run` -- prefer an explicit import at the top.
from run_2 import *
run_2()

# START RUN

In [None]:
############# Load the data #############
print('Loading the data')
ratings = load_data('./data_train.csv') # Training set
test = load_data('./sample_submission.csv') # Positions to predict, needed later for the submission file

############# Apply algorithms from Surprise #############
print('Initializing the algorithms')
#************ Options shared by several algorithms (already optimized) ************
item_pearson_sim = {'name': 'pearson_baseline', 'user_based': False, 'shrinkage': 500}
als_baseline = {'method': 'als', 'reg_u': 14.4, 'reg_i': 0.3}

#************ The algorithms, each with its own copy of the option dicts ************
algorithms = [
    SVDpp(n_factors=10, lr_all=0.00177827941004, reg_all=0.001),
    KNNBaseline(k=96, min_k=8, sim_options=dict(item_pearson_sim), bsl_options=dict(als_baseline)),
    NMF(n_factors=35, reg_pu=10**(-1.5), reg_qi=10**(-0.5)),
    SVD(n_factors=100, lr_all=0.001, reg_all=10**(-1.5)),
    SlopeOne(),
    BaselineOnly(bsl_options=dict(als_baseline)),
    KNNWithZScore(k=100, min_k=7, sim_options=dict(item_pearson_sim)),
]

# Column names of the predictions, in the same order as `algorithms`
columns_name = ['SVDpp', 'KNNBaseline', 'NMF', 'SVD', 'SlopeOne', 'BaselineOnly', 'KNNWithZScore']

In [None]:
#*********** TRAINING **************

#----------- Splitting ------------
# The first part trains the Surprise algorithms, the second part trains the regressor
train_algo_ratings, train_reg_ratings = split_data(ratings, prob_test=0.3)
#----------- Training the algorithms on "train_algo_ratings" and apply them on "train_reg_ratings" ---------------
print('Training of the algorithms')
algos_trained = first_train(train_algo_ratings, algorithms) # Train the algorithms on the "train_algo_ratings" set
#algos_trained = np.load('algos_trained_training.npy')
#******** prepare the set used to train the regressor *********
print('---- Start the predictions -----')
train_reg_df, train_reg_surprise = formating_data_surprise(train_reg_ratings, True) # DataFrame + Surprise dataset
train_reg_set = train_reg_surprise.build_full_trainset() # Build trainset
reg_set_pred = train_reg_set.build_testset() # Build iterable object in order to test

######### Predictions by the trained algorithms #############
# Collect one single-column frame per algorithm and concatenate once at the end
# (avoids re-copying the growing frame on every iteration).
# NOTE(review): predictions are attached by position; this assumes algo.test()
# returns them in the row order of train_reg_df -- TODO confirm.
prediction_frames = [train_reg_df]
for algo_t in algos_trained: # Loop over all the trained algorithms
    pred = algo_t.test(reg_set_pred) # Make the prediction
    estim = [p.est for p in pred] # Keep only the estimated ratings
    prediction_frames.append(pd.DataFrame({'prediction' : pd.Series(estim)}))
prediction_reg_df = pd.concat(prediction_frames, axis=1)

# Put the right names on the columns
first_col = ['movies ID', 'Label', 'users ID']
all_col = first_col + columns_name
prediction_reg_df.columns = all_col
print('---- End of the predictions -----')

In [None]:
#----------- Training the regressor on the predictions on the "train_reg_ratings" ---------------

# Keep only the label and the algorithms' predictions (drop() returns a new frame,
# so prediction_reg_df stays intact)
prediction_reg_cleaned = prediction_reg_df.drop(['movies ID', 'users ID'], axis = 1)

print('Apply the regressor')
# Best regressor found by the optimization step.
# NOTE(review): with the default squared loss / l2 penalty, `epsilon` and
# `l1_ratio` are ignored by scikit-learn -- confirm these values are intended.
regressor = linear_model.SGDRegressor(alpha = 0.0001, epsilon= 0.01, l1_ratio= 0.3)
#regressor = linear_model.HuberRegressor()
training_predictions_set, training_predictions_label = get_label_predictions(prediction_reg_cleaned) # Features and labels

#___________ Adding the offset parameter (column of 1) __________________
col_one = pd.DataFrame(np.ones(training_predictions_set.shape[0]))
training_predictions_set = pd.concat([col_one, pd.DataFrame(training_predictions_set)], axis=1) # Ones column first, predictions after

#___________ Training of the regressor _______________
regressor.fit(training_predictions_set, training_predictions_label) # Find the weights of the regression


In [None]:
############ Predict the unknown ratings #########
print('------ Start the predictions on the unknown --------')
#*********** Prepare the test set ***************
test_df, test_surprise = formating_data_surprise(test, True) # DataFrame + Surprise dataset of the positions to predict
test_set = test_surprise.build_full_trainset() # Build trainset
test_set_pred = test_set.build_testset() # Build iterable object in order to test
print('\tApply the algorithms')
#*********** Prediction **************
# Collect one single-column frame per algorithm and concatenate once at the end
# (avoids re-copying the growing frame on every iteration).
# NOTE(review): predictions are attached by position; this assumes algo.test()
# returns them in the row order of test_df -- TODO confirm.
prediction_frames = [test_df]
for algo_t in algos_trained: # Loop over all the trained algorithms
    pred = algo_t.test(test_set_pred) # Make the prediction
    estim = [p.est for p in pred] # Keep only the estimated ratings
    prediction_frames.append(pd.DataFrame({'prediction' : pd.Series(estim)}))
prediction_test_df = pd.concat(prediction_frames, axis=1)

# Put the right names on the columns
first_col = ['movies ID', 'Label', 'users ID']
all_col = first_col + columns_name
prediction_test_df.columns = all_col

In [None]:
#___________ Remove the not wanted columns _______________
# Keep only the algorithms' predictions (drop() returns a new frame, the original is untouched)
predictions_only = prediction_test_df.drop(['movies ID','users ID', 'Label'], axis = 1)

#___________ Adding the offset parameter (column of 1) __________________
col_one_unknown = pd.DataFrame(np.ones(predictions_only.shape[0])) # Column of ones, as used when fitting the regressor
predictions_only = pd.concat([col_one_unknown, predictions_only], axis=1)

#----------- Apply regression on the predictions of the unknown ratings ------------------
print('\tApply the regressor')
# .copy() gives an independent frame, so adding a column below does not trigger
# pandas' SettingWithCopyWarning.
moviesID_usersID_prediction = prediction_test_df[['movies ID','users ID']].copy()

# A linear model predicts X.coef_ + intercept_. The regressor is fitted with
# fit_intercept=True (default), so its intercept_ is added to the dot product
# (it was missing before). np.ravel handles scalar and shape-(1,) intercepts.
predicted = predictions_only.dot(regressor.coef_) + np.ravel(regressor.intercept_)[0]
moviesID_usersID_prediction['Prediction'] = predicted # Now this frame contains everything needed for the submission file

############ Creation of the submission file #############
print('Create the submission file')
name = 'best_submission.csv'
create_csv_submission(moviesID_usersID_prediction['users ID'], moviesID_usersID_prediction['movies ID'], moviesID_usersID_prediction['Prediction'], name)


In [None]:
# Save the per-algorithm predictions on the test positions.
# NOTE(review): the file name has no ".csv" extension and the index is written
# as an extra first column (it comes back as a column when read with read_csv).
prediction_test_df.to_csv('prediction_test_df')

# END OF RUN

In [None]:
############## Use the optimized regressor ###########
#reg_opt = linear_model.HuberRegressor(alpha=0.0001, epsilon=1.5, tol=1e-05)
# NOTE(review): the literal 1e-400 underflows to 0.0 in double precision, so this
# is effectively tol=0.0 -- confirm that this is intended.
reg_opt = linear_model.HuberRegressor(alpha=0.01, epsilon=5, tol=1e-400)
print(reg_opt)

# Features / labels of the second-level training set
# NOTE(review): if the "TODO DELETE" cell above was run, second_df already starts
# with a column of ones; check whether get_label_predictions keeps it before
# another one is added here.
training_prediction_set, training_prediction_label = get_label_predictions(second_df)
col_one_np = np.ones(training_prediction_set.shape[0])

# Prepend the column of ones (offset term) to the features
col_one = pd.DataFrame(col_one_np)
training_prediction_set= pd.DataFrame(training_prediction_set)
training_prediction_set = pd.concat([col_one, training_prediction_set], axis=1)
print(training_prediction_set.head())
reg_opt.fit(training_prediction_set, training_prediction_label)

In [None]:
# Sanity checks: fitted weights (the intercept is stored separately in
# reg_opt.intercept_) and the shapes of the features and labels
print(reg_opt.coef_)
print(training_prediction_set.shape)
print(training_prediction_label.shape)


In [None]:
############### Creation of Submission ##############
# Same two calls as in the earlier submission cell: retrain on all the ratings
# and predict the test positions.
prediction_test_df = first_train(ratings,algos, test,submit = True) # Training on the whole set
train_df, moviesID_userID_df = second_train_df(prediction_test_df, columns_name) # Get the predictions from the algos



In [None]:
# Keep only the algorithms' predictions and prepend the column of ones,
# exactly as was done for the training features of reg_opt
onlyPrediction = train_df.drop(train_df.columns[[0]], axis=1)
col_one_sub = pd.DataFrame(np.ones(onlyPrediction.shape[0]))
onlyPrediction = pd.concat([col_one_sub, onlyPrediction], axis = 1)
print(onlyPrediction.head())
print(onlyPrediction[[0]].head()) # Preview of the ones column only (no full dump)

# A linear model predicts X.coef_ + intercept_. HuberRegressor is fitted with
# fit_intercept=True (default), so its intercept_ is added to the dot product
# (it was missing before). np.ravel handles scalar and shape-(1,) intercepts.
y_prediction_reg = onlyPrediction.dot(reg_opt.coef_) + np.ravel(reg_opt.intercept_)[0]

name = 'prediction_regOPTIMIZED_final_with1.csv'
moviesID_userID_df['Prediction'] = y_prediction_reg
create_csv_submission(moviesID_userID_df['users ID'], moviesID_userID_df['movies ID'], moviesID_userID_df['Prediction'], name) # To create the CSV file


# ---------- END of new -------------

# For submission

In [None]:
########### Define: dataset (trainset) ##############
ratings_ = formating_data_surprise(ratings) # Ratings in the Surprise format
trainset = ratings_.build_full_trainset() # Use all the known ratings for training (no folds)


In [None]:
########### Define: testset ##############
# With the second argument set to True the helper returns a DataFrame and a Surprise dataset
dataF_test_ratings_, test_ratings_ = formating_data_surprise(test, True)
test_trainset = test_ratings_.build_full_trainset() # Intermediate trainset, only needed to derive the testset


In [None]:
# Iterable of the test positions that can be passed to algo.test()
testset = test_trainset.build_testset()

In [None]:
########## STEF ADD ###################
########## Train and test the algo ###########
# Algorithms with their default parameters
algorithm = [SVDpp(),KNNBaseline(),NMF(),SVD(),SlopeOne(),BaselineOnly(),KNNWithZScore()]

# Collect one single-column frame per algorithm and concatenate once at the end
# (avoids re-copying the growing frame on every iteration). Every added column
# keeps the name 'prediction', as before.
# NOTE(review): predictions are attached by position; this assumes algo.test()
# returns them in the row order of dataF_test_ratings_ -- TODO confirm.
prediction_frames = [dataF_test_ratings_]
for algo in algorithm:
    algo.train(trainset) # Training of the algo
    pred = algo.test(testset) # Make the prediction
    estim = [p.est for p in pred] # Keep only the estimated ratings
    prediction_frames.append(pd.DataFrame({'prediction' : pd.Series(estim)}))
prediction_df = pd.concat(prediction_frames, axis=1)

In [None]:
######### STEF ADD ################
######### Create the submission #########
#******** Take the prediction of the algos ********
# Remove the first three columns; drop() returns a new frame, so prediction_df is not modified
prediction_df_clean = prediction_df.drop(prediction_df.columns.values[0:3], axis = 1)

#******** Apply the classifier ***********
prediction_clas = clf.predict(prediction_df_clean)

#******** Prepare the variables for submission *********
name = 'all_algos_SVM_noOptimization.csv' # The name of the csv file
usersID = prediction_df['users ID'] # User IDs of the positions to predict
moviesID = prediction_df['movies ID'] # Movie IDs of the positions to predict
create_csv_submission(usersID, moviesID, prediction_clas, name)

In [None]:
########## Train and test the algo ###########

# Parameters tried earlier for SVD, kept for reference:
#n_epochs': 20, 'lr_all': 0.002, 'reg_all': 0.2

#algorithm = SVD(n_epochs = 20, lr_all = 0.002, reg_all = 0.2)
# Item-based Pearson-baseline similarity with shrinkage
sim_options = {'name': 'pearson_baseline',
               'user_based': False ,
               'shrinkage': 500
              }
algorithm_sim = KNNBaseline(sim_options=sim_options)

# Train on all the known ratings
algorithm_sim.train(trainset)

In [None]:
# Predict every position of the test set with the trained KNNBaseline
pred = algorithm_sim.test(testset)

In [None]:
########## Creation of the lists: row_users, col_movies, estim ########
# Split the predictions made on the test set into three parallel lists
row_users = [p.uid for p in pred] # indices of the users
col_movies = [p.iid for p in pred] # indices of the movies
estim = [p.est for p in pred] # estimated ratings


In [None]:
######### Create the CSV files ##########
name = 'KNNBaseline.csv' # Name of the file

# Alternatives tried earlier (reuse a previous estimation instead of `estim`):
#estim = only_prediction_df['Majority'] # This is only here in order to use the estimation done previously

#estim = prediction_df['prediction'].mean(axis = 1)
#print(estim.shape)
create_csv_submission(row_users, col_movies, estim, name) # To create the CSV file 
    

In [None]:
# Sanity check: number of predictions produced
print(len(pred))

### Tests with different similarities

In [None]:
########### Define: dataset used for the baseline / similarity tests ##############
ratings_bsl = formating_data_surprise(ratings) # Ratings in the Surprise format
#trainset_bsl = ratings_bsl.build_full_trainset()

In [None]:
# Partition the dataset into 6 folds; the cells below read them back through ratings_bsl.folds()
# NOTE(review): no random seed is set in this cell, so fold membership may differ between runs -- confirm
ratings_bsl.split(n_folds=6)

In [None]:
# Baseline predictor with the library's default settings
algorithm1 = BaselineOnly()

# Quick check on one fold only: take the first (trainset, testset) pair
# directly instead of entering a loop and breaking out of it.
trainset1, testset1 = next(iter(ratings_bsl.folds()))

# Fit on the training part, then predict the held-out part
algorithm1.train(trainset1)
predictions1 = algorithm1.test(testset1)

# RMSE of that fold; verbose=True prints the value in addition to returning it
rmse1 = accuracy.rmse(predictions1, verbose=True)

# Evaluate performances of our algorithm on the dataset.

In [None]:
# Evaluate algorithm1 on ratings_bsl, measuring RMSE only
perf = evaluate(algorithm1, ratings_bsl, measures=['RMSE'])

# Print the scores collected by evaluate()
print_perf(perf)

In [None]:
# Settings for the baseline estimates, passed to BaselineOnly(bsl_options=...)
# in the next cell: ALS method, 10 epochs, reg_u = 10 and reg_i = 25.
bsl_options = dict(
    method='als',
    n_epochs=10,
    reg_u=10,
    reg_i=25,
)

In [None]:
# Baseline predictor configured with the `bsl_options` dictionary defined in the previous cell
algorithm_bsl = BaselineOnly(bsl_options=bsl_options)

# Only the first fold is scored: fetch it directly rather than looping and breaking.
trainset1, testset1 = next(iter(ratings_bsl.folds()))

# Fit on the training part, then predict the held-out part
algorithm_bsl.train(trainset1)
predictions1 = algorithm_bsl.test(testset1)

# RMSE of that fold; verbose=True prints the value in addition to returning it
rmse_bsl = accuracy.rmse(predictions1, verbose=True)

# Evaluate performances of our algorithm on the dataset.

In [None]:
# Evaluate the configured baseline on ratings_bsl, measuring RMSE only
perf = evaluate(algorithm_bsl, ratings_bsl, measures=['RMSE'])

# Print the scores collected by evaluate()
print_perf(perf)

In [None]:
### LONG

# KNNBaseline with default settings, scored on the first fold of ratings_bsl.
# The class has to be instantiated: binding the bare class (no parentheses)
# makes `algorithm2.train(trainset2)` pass the trainset as `self` and fail.
algorithm2 = KNNBaseline()

for trainset2, testset2 in ratings_bsl.folds():

    # train and test algorithm.
    algorithm2.train(trainset2)
    predictions2 = algorithm2.test(testset2)

    # Compute and print Root Mean Squared Error
    rmse2 = accuracy.rmse(predictions2, verbose=True)
    # Stop after the first fold (presumably to keep the runtime of this slow model down)
    break

# Evaluate performances of our algorithm on the dataset.

In [None]:
# Item-based similarities; no 'name' key, so the library's default similarity
# measure applies (MSD according to the Surprise documentation).
sim_options = dict(user_based=False)

In [None]:
# KNNBaseline built from the `sim_options` currently in scope
algorithm_sim = KNNBaseline(sim_options=sim_options)

# Score a single fold: take the first (trainset, testset) pair directly.
trainset_sim, testset_sim = next(iter(ratings_bsl.folds()))

# Fit on the training part, then predict the held-out part
algorithm_sim.train(trainset_sim)
predictions_sim = algorithm_sim.test(testset_sim)

# RMSE of that fold; verbose=True prints the value in addition to returning it
rmse_sim_item = accuracy.rmse(predictions_sim, verbose=True)

# Evaluate performances of our algorithm on the dataset.

In [None]:
# User-based "pearson_baseline" similarity with shrinkage disabled (0)
sim_options = dict(
    name='pearson_baseline',
    user_based=True,
    shrinkage=0,
)

In [None]:
##### LONG

# KNNBaseline built from the `sim_options` currently in scope
algorithm_sim = KNNBaseline(sim_options=sim_options)

# Score a single fold: take the first (trainset, testset) pair directly.
trainset_sim, testset_sim = next(iter(ratings_bsl.folds()))

# Fit on the training part, then predict the held-out part
algorithm_sim.train(trainset_sim)
predictions_sim = algorithm_sim.test(testset_sim)

# RMSE of that fold; verbose=True prints the value in addition to returning it.
# NOTE(review): `rmse_sim_pears` is assigned again by later cells that use
# other similarity settings, so this value is lost once they run.
rmse_sim_pears = accuracy.rmse(predictions_sim, verbose=True)

# Evaluate performances of our algorithm on the dataset.

In [None]:
# Item-based similarities with the library's default similarity measure.
# NOTE(review): the cell that consumes this stores its score as
# `rmse_sim_pears_user`, which suggests a user-based Pearson run was
# intended -- confirm whether these settings are the desired ones.
sim_options = {'user_based': False}

In [None]:
# KNNBaseline built from the `sim_options` currently in scope
algorithm_sim = KNNBaseline(sim_options=sim_options)

# Score a single fold: take the first (trainset, testset) pair directly.
trainset_sim, testset_sim = next(iter(ratings_bsl.folds()))

# Fit on the training part, then predict the held-out part
algorithm_sim.train(trainset_sim)
predictions_sim = algorithm_sim.test(testset_sim)

# RMSE of that fold; verbose=True prints the value in addition to returning it.
# NOTE(review): the name says "pears_user", yet the preceding cell sets
# user_based=False without a similarity name; later cells also reuse this name.
rmse_sim_pears_user = accuracy.rmse(predictions_sim, verbose=True)

# Evaluate performances of our algorithm on the dataset.

In [None]:
# Item-based "pearson_baseline" similarity with a shrinkage of 500
sim_options = dict(
    name='pearson_baseline',
    user_based=False,
    shrinkage=500,
)

In [None]:
# KNNBaseline built from the `sim_options` currently in scope
algorithm_sim = KNNBaseline(sim_options=sim_options)

# Score a single fold: take the first (trainset, testset) pair directly.
trainset_sim, testset_sim = next(iter(ratings_bsl.folds()))

# Fit on the training part, then predict the held-out part
algorithm_sim.train(trainset_sim)
predictions_sim = algorithm_sim.test(testset_sim)

# RMSE of that fold; verbose=True prints the value in addition to returning it.
# NOTE(review): `rmse_sim_pears_user` is shared with other cells, so each run
# overwrites the score of the previous configuration.
rmse_sim_pears_user = accuracy.rmse(predictions_sim, verbose=True)

# Evaluate performances of our algorithm on the dataset.

In [None]:
# Item-based "pearson_baseline" similarity; shrinkage is not set here,
# so the library default is used.
sim_options = dict(name='pearson_baseline', user_based=False)

In [None]:
# KNNBaseline built from the `sim_options` currently in scope
algorithm_sim = KNNBaseline(sim_options=sim_options)

# Score a single fold: take the first (trainset, testset) pair directly.
trainset_sim, testset_sim = next(iter(ratings_bsl.folds()))

# Fit on the training part, then predict the held-out part
algorithm_sim.train(trainset_sim)
predictions_sim = algorithm_sim.test(testset_sim)

# RMSE of that fold; verbose=True prints the value in addition to returning it.
# NOTE(review): `rmse_sim_pears_user` is shared with other cells, so each run
# overwrites the score of the previous configuration.
rmse_sim_pears_user = accuracy.rmse(predictions_sim, verbose=True)

# Evaluate performances of our algorithm on the dataset.

In [None]:
# Item-based plain Pearson similarity
sim_options = dict(name='pearson', user_based=False)

In [None]:
# KNNBaseline built from the `sim_options` currently in scope
algorithm_sim = KNNBaseline(sim_options=sim_options)

# Score a single fold: take the first (trainset, testset) pair directly.
trainset_sim, testset_sim = next(iter(ratings_bsl.folds()))

# Fit on the training part, then predict the held-out part
algorithm_sim.train(trainset_sim)
predictions_sim = algorithm_sim.test(testset_sim)

# RMSE of that fold; verbose=True prints the value in addition to returning it.
# NOTE(review): `rmse_sim_pears` is also assigned by other cells in this
# notebook, so scores of different configurations overwrite each other.
rmse_sim_pears = accuracy.rmse(predictions_sim, verbose=True)

# Evaluate performances of our algorithm on the dataset.

In [None]:
# Item-based cosine similarity
sim_options = dict(name='cosine', user_based=False)

In [None]:
# KNNBaseline built from the `sim_options` currently in scope
algorithm_sim = KNNBaseline(sim_options=sim_options)

# Score a single fold: take the first (trainset, testset) pair directly.
trainset_sim, testset_sim = next(iter(ratings_bsl.folds()))

# Fit on the training part, then predict the held-out part
algorithm_sim.train(trainset_sim)
predictions_sim = algorithm_sim.test(testset_sim)

# RMSE of that fold; verbose=True prints the value in addition to returning it.
# NOTE(review): the result is stored as `rmse_sim_pears` although the preceding
# cell selects the cosine similarity; it also replaces any earlier value.
rmse_sim_pears = accuracy.rmse(predictions_sim, verbose=True)

# Evaluate performances of our algorithm on the dataset.