In [None]:
from __future__ import (absolute_import, division, print_function,
                        unicode_literals) # Future statements must be the first statements of the cell, otherwise Python raises a SyntaxError

import numpy as np
import pandas as pd
import re # Used for the conversion of "r##_c##" in only the numbers --> TODO: check where it comes from
from IPython.display import display
# NOTE: the project star-imports are deliberately kept BEFORE the explicit imports below,
# so that the explicitly imported names win if a project module exports the same name.
from helpers import *
from play_with_data import *
from pre_processing import *
from matrix_factorization import *
from cross_validation import *
from apply_classifiers import *
from trainings_submissions import *
from regressions_models import *
import scipy.sparse as sp # In order to use sparse 
# Predictors imported in performance order (best to worst, according to http://surpriselib.com/)
from surprise import SVDpp
from surprise import KNNBaseline
from surprise import SVD
from surprise import SlopeOne
from surprise import BaselineOnly
from surprise import KNNWithMeans
from surprise import NMF
from surprise import CoClustering
from surprise import KNNBasic
from surprise import KNNWithZScore # not scored --> to be tested quickly
from surprise import dataset
from surprise import Dataset
from surprise import evaluate, print_perf
from surprise import GridSearch
from surprise import accuracy
from sklearn.metrics import mean_squared_error
from sklearn import linear_model

%load_ext autoreload
%autoreload 2

In [None]:
#******** Creation of a sparse matrix of the data **********
# Load the training ratings with the project helper `load_data`.
# NOTE(review): per the header above, `load_data` presumably returns a sparse matrix — confirm in helpers.
ratings = load_data('./data_train.csv')

In [None]:
# Load the sample submission with the same helper; `test` is later passed to
# `first_train(..., submit=True)` and `formating_data_surprise(test, True)` to build the submissions.
test = load_data('./sample_submission.csv')

# Done before the exercise
- This creates a numpy matrix from the given data file "data_train.csv"
- We then use this matrix to show some statistics about the data

In [None]:
#********* Load the given data **********
# `r_c` holds the position IDs (user, movie) and `x` holds the corresponding ratings.
r_c, x = load_data_old('./data_train.csv')

In [None]:
#********* Creation of a matrix of the data ********
# Build a dense (user x item) rating matrix from the IDs in `r_c` and the ratings in `x`.
nUser = 10000
nItem = 1000
data = np.zeros([nUser, nItem]) # These numbers were given

# Compile the pattern once; the raw string avoids the invalid-escape warning raised by '\d+'.
id_numbers = re.compile(r'\d+')

for ind, i in enumerate(r_c): # Loop over all the IDs, in order to fill the numpy matrix
    # The first two numbers found in the ID are the 1-based row and column.
    user_number, item_number = (int(n) for n in id_numbers.findall(i)[:2])
    data[user_number - 1, item_number - 1] = x[ind] # Shift to 0-based indices




In [None]:
#********** Data preview ************
# Check whether any data is missing.
# We were told that the data covers 10'000 users and 1000 films, but we do not have all of these ratings.
info_general(nUser, nItem, x, data)
print('\n')
# Project helper; the markdown note that follows refers to a bar plot of the ratings.
info_ratings(data)


This bar plot shows that the ratings are not uniformly distributed, which suggests that there is a bias in the rating matrix that has to be taken into account.

# Done after the exercise

In [None]:
#***** Data preview *********
# `plot_raw_data` returns the number of items per user and the number of users per item.
num_items_per_user, num_users_per_item = plot_raw_data(ratings) # Original code is from the course, ex10 'plots.py'
# Print the extremes of both counts.
print("Maximum number of items per user:\t{}\nMinimum number of items per user:\t{}\n".format(np.max(num_items_per_user), np.min(num_items_per_user)))
print("Maximum number of users per item:\t{}\nMinimum number of users per item:\t{}".format(np.max(num_users_per_item), np.min(num_users_per_item)))

# Done so far
- Preview of the data
- Pre-processing:
    - Keeping only the "valid ratings", i.e. the users and items that have more than min_num_ratings ratings
    - Splitting the data into train and test sets by choosing 90% of the ratings from valid_ratings, using only the non-zero values

In [None]:
# Convert the ratings into the object consumed by the `surprise` calls below
# (NOTE(review): presumably a surprise Dataset — confirm in `formating_data_surprise`).
ratings_surpr = formating_data_surprise(ratings)
# Request 3 folds for the evaluation that follows.
ratings_surpr.split(n_folds=3)


In [None]:
################ Evaluate all the algorithms ########################
# Display names and predictors are kept in two parallel lists of the same length.
algo_str = ['SVDpp', 'KNN Baseline','SVD', 'Slope One', 'BaselineOnly', 'KNN with Z score']
algos = [SVDpp(), KNNBaseline(), SVD(), SlopeOne(), BaselineOnly(), KNNWithZScore()]
perf = {}

for label, algo in zip(algo_str, algos):
    # Evaluate the RMSE of this predictor on the dataset and keep it under its display name.
    scores = evaluate(algo, ratings_surpr, measures=['RMSE'])
    perf[label] = scores
    print_perf(scores)


In [None]:
# Persist the results dict. np.save pickles non-array objects inside an object array,
# so reloading needs np.load('perf_dictionary.npy', allow_pickle=True).item().
np.save('perf_dictionary.npy', perf)

# GRID SEARCH

In [None]:
# Hyper-parameter grid explored for SVDpp.
param_grid = dict(
    init_mean=[0, 2, 4],
    init_std_dev=[0.1, 0.3, 0.5],
    lr_all=[0.002, 0.004, 0.006, 0.008, 0.01],
    reg_all=[0.01, 0.03, 0.05, 0.07, 0.1],
)

grid_search = GridSearch(SVDpp, param_grid, measures=['RMSE', 'FCP'], verbose=False)

# Evaluate every parameter combination on 3 folds of our ratings.
ratings_ = formating_data_surprise(ratings)
ratings_.split(n_folds=3)
grid_search.evaluate(ratings_)

# Print the best RMSE score first, then the parameter combination that produced it.
for best_result in (grid_search.best_score, grid_search.best_params):
    print(best_result['RMSE'])

## Majority Creation


In [None]:
#******** Creation of a sparse matrix of the data ******
ratings = load_data('./data_train.csv')
train_ratings, test_ratings = split_data(ratings, prob_test=0.15) # splitting in train and test set


########### Define: algo, dataset (trainset) ##############
dataF_train, ratings_train = formating_data_surprise(train_ratings, True) # Create the Dataset for surprise (training set)

trainset_algo = ratings_train.build_full_trainset() # Build trainset
trainset_pred = trainset_algo.build_testset() # Build iterable object in order to test

# This is our held-out test set, so it is only used later.
ratings_test = formating_data_surprise(test_ratings)
validationset = ratings_test.build_full_trainset()
# -------------------------------------------------

########## Train and test the algo ###########
algorithm = [SVDpp(),KNNBaseline(),NMF(),SVD(),SlopeOne(),BaselineOnly(),KNNWithZScore()]

# One single-column frame per algorithm. Every column is deliberately named
# 'prediction': a later cell selects them all at once with prediction_df['prediction'].
prediction_columns = []
for algo in algorithm:

    algo.train(trainset_algo) # Training of the algo
    pred = algo.test(trainset_pred) # Predict the ratings of the training set itself

    estim = [p.est for p in pred] # Estimated rating of every prediction
    prediction_columns.append(pd.DataFrame({'prediction': pd.Series(estim)}))

# Concatenate once instead of re-copying the growing frame on every iteration.
# NOTE(review): the columns are joined on the positional index, which assumes that
# build_testset() yields the ratings in the same order as the rows of dataF_train — confirm.
prediction_df = pd.concat([dataF_train.copy()] + prediction_columns, axis=1)

display(prediction_df.head())

In [None]:
########### Cleaning the dataframe ############
# Drop the columns at positions 0 and 2 so that only the real ratings and the
# predictions of all algos remain (work on a copy, prediction_df stays untouched).
dropped_columns = prediction_df.columns[[0, 2]]
second_step_dataset_df = prediction_df.copy().drop(dropped_columns, axis=1)

# TODO: should be adapted when the list of algos changes
second_step_dataset_df.columns = ['Label','SVDpp','KNNBaseline','NMF','SVD','SlopeOne','BaselineOnly', 'KNNWithZScore']
display(second_step_dataset_df.head()) # In order to give an idea of the values

# Saved as a CSV file because the predictions take long to compute
second_step_dataset_df.to_csv('predictions_allAlgos.csv')

In [None]:
################# Use the saved predictions_allAlgos.csv ################
# This should only be run if we don't run Majority

second_step_dataset_df_imp = pd.read_csv('predictions_allAlgos.csv')

# The first column of the file is dropped; work on a copy so the imported frame stays intact.
first_column = second_step_dataset_df_imp.columns.values[0]
second_step_dataset_df = second_step_dataset_df_imp.copy().drop(first_column, axis = 1)
display(second_step_dataset_df.head())


In [None]:
### Select a regressor on top of the algorithm predictions


# NOTE(review): these are estimator CLASSES, whereas the later "NEW" section passes
# INSTANCES to lin_regressors — confirm which form lin_regressors expects.
regressions_method = [linear_model.LinearRegression, linear_model.Ridge, linear_model.Lasso, linear_model.BayesianRidge]
regressors_find, rmse_find = lin_regressors(second_step_dataset_df, regressions_method)

print(regressors_find, rmse_find)

In [None]:
########### Apply the classifiers ###############
# Candidate classifiers are project helpers; apply_classifier returns a classifier and a test error.
# NOTE(review): presumably the one with the lowest test error is returned — confirm in apply_classifiers.
classifiers_method = [naive_bayes, kNearestNeigh, decision_tree, neural_net, support_vectorMachine, discr_analysis, lin_discr_analysis]
clf, test_error = apply_classifier(second_step_dataset_df, classifiers_method)

print(clf)

In [None]:
# Show the result of the selection and keep a second reference to the classifier.
print(test_error)
print(clf)
bestCLF = clf

In [None]:
# ////// DELETE \\\\\\\\\\\\\
# Cleaning the dataframe
# All algorithm columns share the label 'prediction', so this selects every one of them;
# the predictions are then rounded.
only_prediction_df = prediction_df['prediction'].round()
# Give each column the name of its algorithm
only_prediction_df.columns = ['SVDpp','KNNBaseline','NMF','SVD','SlopeOne','BaselineOnly','KNNWithZScore']
# Placeholder column, filled by the majority vote in the next cell
only_prediction_df['Majority'] = 0

In [None]:
# Majority vote: for every row, keep the rating predicted by most algorithms.
# The 'Majority' column itself is excluded from the vote, so re-running this cell
# gives the same result instead of counting the previous outcome as an extra vote.
vote_columns = [c for c in only_prediction_df.columns if c != 'Majority']

for i, row in only_prediction_df[vote_columns].iterrows():
    # np.unique returns the distinct ratings in ascending order with their counts.
    unique, counts = np.unique(row.values, return_counts=True)
    # Among the most frequent ratings, take the last one, i.e. the highest: ties are
    # broken upwards because the rating distribution is shifted towards high values.
    only_prediction_df.loc[i, 'Majority'] = unique[counts == counts.max()][-1]

display(only_prediction_df.head())

In [None]:
# Persist the rounded per-algorithm predictions and the full prediction frame as CSV files.
only_prediction_df.to_csv('only_pred.csv')
prediction_df.to_csv('all_pred.csv')

# ---------- NEW -------------
This section is meant to be the place where everything is done.

In [None]:
#********* Option for algos ***************
# Similarity settings shared by the two KNN predictors below
sim_options = dict(name='pearson_baseline', user_based=False, shrinkage=500)


#********* Define some variables **********
# One name per predictor, in the same order as `algos`
columns_name = ['SVDpp','KNNBaseline','NMF','SVD','SlopeOne','BaselineOnly','KNNWithZScore']

# Candidate regressors, instantiated with their default parameters
regressions_method = [
    regressor_class()
    for regressor_class in (
        linear_model.LinearRegression,
        linear_model.Ridge,
        linear_model.Lasso,
        linear_model.BayesianRidge,
        linear_model.ElasticNet,
        linear_model.HuberRegressor,
        linear_model.LassoLars,
        linear_model.PassiveAggressiveRegressor,
        linear_model.SGDRegressor,
    )
]

# Candidate classifiers (project helpers)
classifiers_method = [naive_bayes, kNearestNeigh, decision_tree, neural_net,
                      support_vectorMachine, discr_analysis, lin_discr_analysis]

# First-level predictors
algos = [
    SVDpp(),
    KNNBaseline(sim_options=sim_options),
    NMF(),
    SVD(),
    SlopeOne(),
    BaselineOnly(),
    KNNWithZScore(sim_options=sim_options),
]

In [None]:
################ Training and Validating the model ################
%reload_ext autoreload

In [None]:
# ============= TRAINING ====================
#********* Creation of train and validation ratings *********
train_ratings, validation_ratings =split_data(ratings, prob_test=0.15) # prob_test=0.15 goes to the validation part

#********* Do the training with all the algos ****************
print('-----START -----\nTraining of the algos\n')
# first_train returns the prediction frame and the trained predictors.
prediction_df, algos_trained = first_train(train_ratings,algos) # Call function first_train
# second_train_df returns the frame used by the second-level models and the (movie, user) ID frame.
second_df, moviesID_userID_df = second_train_df(prediction_df, columns_name) # Call function second_train_df

In [None]:
# SAVE the intermediate results of the training step
prediction_df.to_csv('prediction_df_training.csv')
second_df.to_csv('second_df_training.csv')
moviesID_userID_df.to_csv('moviesID_userID_df_training.csv')
# np.save pickles non-array objects; reloading needs np.load(..., allow_pickle=True).
np.save('algos_trained_training', algos_trained)

In [None]:
#********* Find the best classifier (with the lowest test error) **********
print('-----START -----\nSelection of classifier and regressor\n')
# NOTE(review): ratio=0.80 is presumably the train share of the internal split — confirm in apply_classifiers.
clf, best_test_error = apply_classifier(second_df, classifiers_method, ratio=0.80)


In [None]:
# Same selection for the regressors; returns a regressor and its RMSE.
reg, best_rmse = lin_regressors(second_df, regressions_method, ratio = 0.80)

In [None]:
#******** Train the best classifier and the best regressor on the whole training set **********
# get_label_predictions splits second_df into the prediction features and the labels.
training_prediction_set, training_prediction_label = get_label_predictions(second_df)
clf.fit(training_prediction_set, training_prediction_label)
reg.fit(training_prediction_set, training_prediction_label)

In [None]:
# SAVE the fitted models.
# np.save pickles non-array objects; reload with np.load(..., allow_pickle=True).item().
np.save('clf_training_wholeSet', clf)
np.save('reg_training_wholeSet', reg)

In [None]:
# ============ Validating ===================
print('-----START -----\nValidation set\n')
# FIXME (original author): something is not correct here, we should not train on the validation set.
prediction_val_df, _algo = first_train(validation_ratings,algos_trained)
# NOTE: this rebinds moviesID_userID_df, which previously held the training IDs.
second_df_val, moviesID_userID_df = second_train_df(prediction_val_df, columns_name)

# Drop the first column to keep only the algorithm predictions.
# NOTE(review): the first column is presumably 'Label' (read later as second_df_val['Label']) — confirm.
onlyPREDICTION = second_df_val.drop(second_df_val.columns[[0]], axis=1)

In [None]:
# SAVE the intermediate results of the validation step
prediction_val_df.to_csv('prediction_val_df_validation.csv')
second_df_val.to_csv('second_df_val_validation.csv')
moviesID_userID_df.to_csv('moviesID_userID_df_validation.csv')
onlyPREDICTION.to_csv('onlyPREDICTION_validation.csv')

In [None]:
#******* End of validation **************
print('-----START -----\nPREDICTION by best classifier and best regressor\n')
y_prediction_clf = clf.predict(onlyPREDICTION)  # Do the classification
# Use the regressor's own predict(): a plain dot product with reg.coef_ silently
# drops the fitted intercept (reg.intercept_) and therefore biases every prediction.
y_prediction_reg = reg.predict(onlyPREDICTION)
# Compare both second-level models against the true labels of the validation set
RMSE_clf = np.sqrt(mean_squared_error(second_df_val['Label'], y_prediction_clf))
RMSE_reg = np.sqrt(mean_squared_error(second_df_val['Label'], y_prediction_reg))
print('RMSE with classifier:\t{}\nRMSE with regressor:\t{}'.format(RMSE_clf, RMSE_reg))

In [None]:
# Show which classifier and regressor were selected.
print(clf)
print(reg)

In [None]:
############### Creation of Submission ##############
# NOTE(review): `test` must still be the sample submission loaded at the top of the
# notebook; the pre-processing cell near the end rebinds `test` to a data split.
prediction_test_df = first_train(ratings,algos, test,submit = True) # Training on the whole set
train_df, moviesID_userID_df = second_train_df(prediction_test_df, columns_name) # Get the predictions from the algos

# Drop the first column to keep only the algorithm predictions
onlyPrediction = train_df.drop(train_df.columns[[0]], axis=1)

y_prediction_clf = clf.predict(onlyPrediction) # Apply the classifier
# Use predict() rather than a dot product with reg.coef_, which would ignore reg.intercept_
y_prediction_reg = reg.predict(onlyPrediction)

######### Create the CSV files ##########
moviesID_userID_df['Prediction'] = y_prediction_clf # Add a prediction column
name = 'prediction_clf.csv' # Name of the file
create_csv_submission(moviesID_userID_df['users ID'], moviesID_userID_df['movies ID'], moviesID_userID_df['Prediction'], name) # To create the CSV file

# Same submission format, this time with the regressor output
name = 'prediction_reg.csv'
moviesID_userID_df['Prediction'] = y_prediction_reg
create_csv_submission(moviesID_userID_df['users ID'], moviesID_userID_df['movies ID'], moviesID_userID_df['Prediction'], name) # To create the CSV file


In [None]:
# SAVE the intermediate results of the submission step
prediction_test_df.to_csv('prediction_test_df_createSub.csv')
train_df.to_csv('train_df_createSub.csv')
moviesID_userID_df.to_csv('moviesID_userID_df_createSub.csv')

# ---------- END of new -------------

# For submission

In [None]:
########### Define: dataset and trainset ##############
# Build the surprise trainset from ALL the ratings (no fold is held out).
ratings_ = formating_data_surprise(ratings)
trainset = ratings_.build_full_trainset()


In [None]:
########### Define: testset ##############
# With the second argument set to True, formating_data_surprise returns two values,
# unpacked here as a frame and a surprise dataset.
dataF_test_ratings_, test_ratings_ = formating_data_surprise(test, True)
test_trainset = test_ratings_.build_full_trainset()


In [None]:
# Turn the sample-submission entries into the testset passed to algo.test() below.
testset = test_trainset.build_testset()

In [None]:
########## Train and test the algo ###########
algorithm = [SVDpp(),KNNBaseline(),NMF(),SVD(),SlopeOne(),BaselineOnly(),KNNWithZScore()]

# One single-column frame per algorithm; every column keeps the label 'prediction'.
prediction_columns = []
for algo in algorithm:

    algo.train(trainset) # Training of the algo on the full training set
    pred = algo.test(testset) # Predict the entries of the sample submission

    estim = [p.est for p in pred] # Estimated rating of every prediction
    prediction_columns.append(pd.DataFrame({'prediction': pd.Series(estim)}))

# Concatenate once instead of re-copying the growing frame on every iteration.
# NOTE(review): the columns are joined on the positional index, which assumes that
# build_testset() yields the entries in the same order as the rows of dataF_test_ratings_ — confirm.
prediction_df = pd.concat([dataF_test_ratings_.copy()] + prediction_columns, axis=1)

In [None]:
######### Create the submission #########
#******** Take the prediction of the algos ********
# Remove the first three columns on a copy, so that the original frame is not modified
leading_columns = prediction_df.columns.values[0:3]
prediction_df_clean = prediction_df.copy().drop(leading_columns, axis = 1)

#******** Apply the classifier ***********
prediction_clas = clf.predict(prediction_df_clean) # Classifier applied on the predictions of the algos

#******** Prepare the variables for submission *********
name = 'all_algos_SVM_noOptimization.csv' # The name of the csv file
usersID = prediction_df['users ID'] # To have the user ID
moviesID = prediction_df['movies ID'] # To have the movies ID
create_csv_submission(usersID, moviesID, prediction_clas, name)

In [None]:
########## Train and test the algo ###########

# Earlier experiment, kept for reference:
#   n_epochs: 20, lr_all: 0.002, reg_all: 0.2
#   algorithm = SVD(n_epochs = 20, lr_all = 0.002, reg_all = 0.2)

# KNNBaseline with the 'pearson_baseline' similarity, user_based=False and shrinkage=500.
sim_options = {'name': 'pearson_baseline',
               'user_based': False ,
               'shrinkage': 500
              }
algorithm_sim = KNNBaseline(sim_options=sim_options)

# Fit on the trainset built from all the ratings.
algorithm_sim.train(trainset)

In [None]:
# Predict every entry of the submission testset with the trained KNNBaseline.
pred = algorithm_sim.test(testset)

In [None]:
########## Creation of the lists: row_users, col_movies, estim ########
# Split the predictions made on the test set into three parallel lists.
row_users = [p.uid for p in pred] # indices of the users
col_movies = [p.iid for p in pred] # indices of the movies
estim = [p.est for p in pred] # estimated ratings


In [None]:
######### Create the CSV files ##########
name = 'KNNBaseline.csv' # Name of the file

# Alternative estimates tried earlier, kept for reference:
#   estim = only_prediction_df['Majority']            # majority vote computed previously
#   estim = prediction_df['prediction'].mean(axis = 1) # mean of all the algorithm predictions
#   print(estim.shape)
create_csv_submission(row_users, col_movies, estim, name) # To create the CSV file 
    

In [None]:
# Sanity check: number of predictions produced for the submission.
print(len(pred))

### Tests with different similarities

In [None]:
########### Define: dataset ##############
# Dataset used by all the similarity experiments below; it is split into folds
# in the next cell instead of being turned into a full trainset.
ratings_bsl = formating_data_surprise(ratings)

In [None]:
# Request 6 folds for the experiments below.
ratings_bsl.split(n_folds=6)

In [None]:
# Baseline predictor created without any options.
algorithm1 = BaselineOnly()

# Only the first fold is used: the `break` exits after one train/test round.
for trainset1, testset1 in ratings_bsl.folds():

    # train and test algorithm.
    algorithm1.train(trainset1)
    predictions1 = algorithm1.test(testset1)

    # Compute and print Root Mean Squared Error
    rmse1 = accuracy.rmse(predictions1, verbose=True)
    break

# Evaluate performances of our algorithm on the dataset.

In [None]:
# Evaluate the same predictor on the folded dataset.
# NOTE: this rebinds `perf`, which held the per-algorithm dict built earlier.
perf = evaluate(algorithm1, ratings_bsl, measures=['RMSE'])

print_perf(perf)

In [None]:
# Baseline options: 'als' method, 10 epochs, reg_u=10 and reg_i=25.
# NOTE(review): reg_u / reg_i are presumably the user / item regularization terms — confirm in the surprise docs.
bsl_options = {'method': 'als',
               'n_epochs': 10,
               'reg_u': 10,
               'reg_i': 25
               }

In [None]:
# Baseline predictor configured with the ALS options defined above.
algorithm_bsl = BaselineOnly(bsl_options=bsl_options)


# Only the first fold is used: the `break` exits after one train/test round.
for trainset1, testset1 in ratings_bsl.folds():

    # train and test algorithm.
    algorithm_bsl.train(trainset1)
    predictions1 = algorithm_bsl.test(testset1)

    # Compute and print Root Mean Squared Error
    rmse_bsl = accuracy.rmse(predictions1, verbose=True)
    break

# Evaluate performances of our algorithm on the dataset.

In [None]:
# Evaluate the ALS-configured baseline on the folded dataset (rebinds `perf`).
perf = evaluate(algorithm_bsl, ratings_bsl, measures=['RMSE'])

print_perf(perf)

In [None]:
### LONG

# Instantiate the predictor: the bare class `KNNBaseline` cannot be trained,
# because `.train(trainset2)` would then be called without an instance.
algorithm2 = KNNBaseline()

# Only the first fold is used: the `break` exits after one train/test round.
for trainset2, testset2 in ratings_bsl.folds():

    # train and test algorithm.
    algorithm2.train(trainset2)
    predictions2 = algorithm2.test(testset2)

    # Compute and print Root Mean Squared Error
    rmse2 = accuracy.rmse(predictions2, verbose=True)
    break

# Evaluate performances of our algorithm on the dataset.

In [None]:
# user_based=False with no similarity name given, so the library default measure is used.
sim_options = {'user_based': False
              }

In [None]:
# KNNBaseline with the similarity options of the previous cell.
algorithm_sim = KNNBaseline(sim_options=sim_options)

# Only the first fold is used: the `break` exits after one train/test round.
for trainset_sim, testset_sim in ratings_bsl.folds():

    # train and test algorithm.
    algorithm_sim.train(trainset_sim)
    predictions_sim = algorithm_sim.test(testset_sim)

    # Compute and print Root Mean Squared Error
    rmse_sim_item = accuracy.rmse(predictions_sim, verbose=True)
    break

# Evaluate performances of our algorithm on the dataset.

In [None]:
# 'pearson_baseline' similarity with user_based=True and shrinkage=0.
sim_options = {'name': 'pearson_baseline',
               'user_based': True ,
               'shrinkage': 0
               }

In [None]:
##### LONG

# KNNBaseline with the similarity options of the previous cell.
algorithm_sim = KNNBaseline(sim_options=sim_options)

# Only the first fold is used: the `break` exits after one train/test round.
for trainset_sim, testset_sim in ratings_bsl.folds():

    # train and test algorithm.
    algorithm_sim.train(trainset_sim)
    predictions_sim = algorithm_sim.test(testset_sim)

    # Compute and print Root Mean Squared Error
    rmse_sim_pears = accuracy.rmse(predictions_sim, verbose=True)
    break

# Evaluate performances of our algorithm on the dataset.

In [None]:
# Same options as the earlier {'user_based': False} experiment: no similarity name is given.
sim_options = {'user_based': False
              }

In [None]:
# KNNBaseline with the similarity options of the previous cell.
algorithm_sim = KNNBaseline(sim_options=sim_options)

# Only the first fold is used: the `break` exits after one train/test round.
for trainset_sim, testset_sim in ratings_bsl.folds():

    # train and test algorithm.
    algorithm_sim.train(trainset_sim)
    predictions_sim = algorithm_sim.test(testset_sim)

    # Compute and print Root Mean Squared Error
    # NOTE(review): the name says "pears_user", but sim_options here sets user_based=False
    # and no 'pearson' similarity — the variable name looks misleading.
    rmse_sim_pears_user = accuracy.rmse(predictions_sim, verbose=True)
    break

# Evaluate performances of our algorithm on the dataset.

In [None]:
# 'pearson_baseline' similarity with user_based=False and shrinkage=500
# (the same options as the final KNNBaseline submission).
sim_options = {'name': 'pearson_baseline',
               'user_based': False ,
               'shrinkage': 500
              }

In [None]:
# KNNBaseline with the similarity options of the previous cell.
algorithm_sim = KNNBaseline(sim_options=sim_options)

# Only the first fold is used: the `break` exits after one train/test round.
for trainset_sim, testset_sim in ratings_bsl.folds():

    # train and test algorithm.
    algorithm_sim.train(trainset_sim)
    predictions_sim = algorithm_sim.test(testset_sim)

    # Compute and print Root Mean Squared Error
    # NOTE: this overwrites rmse_sim_pears_user from an earlier experiment, and the
    # configuration here sets user_based=False despite the "_user" suffix.
    rmse_sim_pears_user = accuracy.rmse(predictions_sim, verbose=True)
    break

# Evaluate performances of our algorithm on the dataset.

In [None]:
# 'pearson_baseline' similarity with user_based=False; 'shrinkage' is left at the library default.
sim_options = {'name': 'pearson_baseline',
               'user_based': False 
              }

In [None]:
# KNNBaseline with the similarity options of the previous cell.
algorithm_sim = KNNBaseline(sim_options=sim_options)

# Only the first fold is used: the `break` exits after one train/test round.
for trainset_sim, testset_sim in ratings_bsl.folds():

    # train and test algorithm.
    algorithm_sim.train(trainset_sim)
    predictions_sim = algorithm_sim.test(testset_sim)

    # Compute and print Root Mean Squared Error
    # NOTE: this overwrites rmse_sim_pears_user again; the configuration sets user_based=False.
    rmse_sim_pears_user = accuracy.rmse(predictions_sim, verbose=True)
    break

# Evaluate performances of our algorithm on the dataset.

In [None]:
# Plain 'pearson' similarity with user_based=False.
sim_options = {'name': 'pearson',
               'user_based': False
               }

In [None]:
# KNNBaseline with the similarity options of the previous cell.
algorithm_sim = KNNBaseline(sim_options=sim_options)

# Only the first fold is used: the `break` exits after one train/test round.
for trainset_sim, testset_sim in ratings_bsl.folds():

    # train and test algorithm.
    algorithm_sim.train(trainset_sim)
    predictions_sim = algorithm_sim.test(testset_sim)

    # Compute and print Root Mean Squared Error
    # NOTE: this overwrites rmse_sim_pears from the user-based pearson_baseline experiment.
    rmse_sim_pears = accuracy.rmse(predictions_sim, verbose=True)
    break

# Evaluate performances of our algorithm on the dataset.

In [None]:
# 'cosine' similarity with user_based=False.
sim_options = {'name': 'cosine',
               'user_based': False
               }

In [None]:
# KNNBaseline with the similarity options of the previous cell.
algorithm_sim = KNNBaseline(sim_options=sim_options)

# Only the first fold is used: the `break` exits after one train/test round.
for trainset_sim, testset_sim in ratings_bsl.folds():

    # train and test algorithm.
    algorithm_sim.train(trainset_sim)
    predictions_sim = algorithm_sim.test(testset_sim)

    # Compute and print Root Mean Squared Error
    # NOTE(review): the cosine result is stored in `rmse_sim_pears`, overwriting the
    # Pearson result — the variable name looks like a copy-paste leftover.
    rmse_sim_pears = accuracy.rmse(predictions_sim, verbose=True)
    break

# Evaluate performances of our algorithm on the dataset.

# DON'T CARE

### Question

   - Il est normal de mettre un threshold (min_num_ratings) parce qu'on veut un certain nombre de données pour se prononcer sur la note à donner ; ceci implique qu'on enlève des users et des items, du coup notre matrice des ratings va être plus petite. Quand on la remplit et qu'on fait un submit, comment est-ce qu'on gère ça ?
       - Ici je suppose que c'est bien de faire la sélection des ratings pour le train et le test --> comme ceci on n'est pas, ou moins, biaisé par les movies et users qui n'ont que des 0 et qui n'apportent donc rien à part du "bruit"
       - Pour le remplissage de ce que l'on doit submit, j'ai fait une petite comparaison entre les données que l'on nous donne et ce qui se trouve dans le sample_submission (enfin, je vais la faire)

In [None]:
#********** Pre-processing **********
min_num_ratings = 10 # or 15 this is based on the information given above
# Keep only the users and items selected by valid_ratings with the threshold above.
ratings_valid = valid_ratings(ratings, num_items_per_user, num_users_per_item,min_num_ratings)
# WARNING: this rebinds `test`, which held the sample submission loaded at the top of the
# notebook; the submission cells must not be run after this cell without reloading it.
# NOTE(review): the original comment states a 90% / 10% train/test split of the non-zero entries — confirm against split_data's default.
train, test = split_data(ratings_valid)


In [None]:
# NOTE(review): the meaning of these positional arguments is defined in the project's
# cross_validation module — name them (or use keyword arguments) once confirmed.
cross_validation_application(train ,False ,4,0.01, 20, 0.2, 0.3,1)

In [None]:
#********* Try ************
# Alternative tried earlier: rmse = rmse_movie_mean(train, test)
# Matrix factorization with SGD on the train/test split; the returned value is stored as `rmse`.
rmse = matrix_factorization_SGD(train, test)

In [None]:
# Display the value returned by matrix_factorization_SGD in the previous cell.
rmse