# Import libraries and load data

In [59]:
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error, pairwise_distances
from sklearn.model_selection import train_test_split


In [60]:
# Load the three input files from the working directory.
ratings = pd.read_csv('train.csv')              # user/article ratings used for training
article_info = pd.read_csv('article_info.csv')  # article metadata (website, title, content)
test = pd.read_csv('test.csv')                  # user/article pairs that need a predicted rating

In [61]:
article_info.head()

Unnamed: 0,article_id,website,title,content
0,1025,uxmovement,Comment concevoir une procédure pas à pas que ...,par anthony le 18/07/16 à 8h02 Si une nouvelle...
1,2328,endeavor,Ressources humaines? Seulement si vous optez p...,"«Ambassadeurs», «avocats», «porte-parole» d'un..."
2,2469,linkedin,Deux motions de vente différentes. . . .,J'ai passé pas mal de temps récemment avec des...
3,2590,googleblog,Apprentissage large et profond: mieux avec Ten...,"""Apprenez les règles comme un pro, afin de pou..."
4,697,infoq,Agile: manque de compétences en tests,"Fran O'Hara, directeur et consultant principal..."


In [62]:
test

Unnamed: 0,user_id,article_id
0,1,2607
1,1,1445
2,1,911
3,1,857
4,1,2062
...,...,...
7238,1087,2089
7239,1087,504
7240,1087,1801
7241,1087,967


In [63]:
ratings.head()

Unnamed: 0,user_id,article_id,rating
0,1,456,1
1,1,2934,1
2,1,82,1
3,1,1365,1
4,1,221,1


# Merge article titles into the ratings data

In [64]:
# Attach each article's title to its rating rows. A left join keeps every
# rating row even when the article has no entry in article_info.
ratings = ratings.merge(
    article_info[['article_id', 'title']],
    on='article_id',
    how='left',
)

In [65]:
ratings

Unnamed: 0,user_id,article_id,rating,title
0,1,456,1,"Obtenez 6 mois d'accès à Pluralsight, la plus ..."
1,1,2934,1,La plateforme cloud de Google est désormais un...
2,1,82,1,La technologie derrière les photos d'aperçu
3,1,1365,1,Les VM préemptives de Google Cloud Platform so...
4,1,221,1,Ray Kurzweil: Le monde ne se détériore pas - n...
...,...,...,...,...
16726,1087,2242,1,Optimiser l'utilisation de la mémoire Arduino
16727,1087,419,1,Le plan national de l'Internet des objets sort...
16728,1087,784,1,Nintendo sortira Mini NES avec 30 jeux préchar...
16729,1087,1249,1,Coût d'erreur - Cinq raisons d'investir dans l...


In [66]:
ratings['rating'].max()

5

In [67]:
ratings.columns

Index(['user_id', 'article_id', 'rating', 'title'], dtype='object')

In [68]:
#ratings['article']=ratings['article_id'].map(str)+str(':')+ratings['title'].map(str)

In [69]:
# The model below only uses user_id, article_id and rating, so remove the title column again.
ratings = ratings.drop(columns=['title'])

In [70]:
ratings

Unnamed: 0,user_id,article_id,rating
0,1,456,1
1,1,2934,1
2,1,82,1
3,1,1365,1
4,1,221,1
...,...,...,...
16726,1087,2242,1
16727,1087,419,1
16728,1087,784,1
16729,1087,1249,1


# Creating the train/validation split

In [71]:
# Work on a copy so that nothing done to X can alter the original ratings frame.
X = ratings.copy()

# Hold out 25% of the rows for validation; the fixed seed makes the split repeatable.
X_train, X_valid = train_test_split(
    X,
    test_size=0.25,
    random_state=50,
)

In [72]:
def rmse(y_true, y_pred):
    """Return the root mean squared error (RMSE) between observed and predicted values.

    y_true: the observed values.
    y_pred: the predicted values, in the same order as y_true.
    """
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)

# Surprise lib for prediction

In [73]:
#Importing functions to be used in this notebook from Surprise Package
from surprise import Dataset, Reader, SVD
from surprise.model_selection import GridSearchCV

In [74]:
# Tell Surprise that ratings are on a 1-to-5 scale.
reader = Reader(rating_scale=(1, 5))

# Wrap the training rows as a Surprise dataset.
# load_from_df expects the columns in (user, item, rating) order.
data = Dataset.load_from_df(
    X_train[['user_id', 'article_id', 'rating']],
    reader,
)

In [75]:
# Baseline: train an SVD model with 100 latent factors (number chosen arbitrarily).
# SVD initialises its factors randomly, so the seed is fixed to make the
# validation RMSE reproducible; 42 matches the seed used for the tuned models.
model = SVD(n_factors=100, random_state=42)

# build_full_trainset() converts all of `data` into a trainset, so the SVD model
# is fitted on every training row instead of on one fold as in cross-validation.
model.fit(data.build_full_trainset())

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7fa438ae73a0>

In [78]:
# (user, article) id pairs of the validation split
id_pairs = zip(X_valid['user_id'], X_valid['article_id'])

# Predicted rating for every pair; `.est` is the estimated-rating field of
# the Prediction returned by Surprise's predict method
y_pred = [
    model.predict(uid=user_id, iid=article_id).est
    for user_id, article_id in id_pairs
]

# Observed ratings of the validation split
y_true = X_valid['rating']

# RMSE of the baseline model on the validation split
rmse(y_true, y_pred)

0.9333799973517466

# Using grid search for parameter tuning

In [83]:
# Search space for SVD: number of latent factors and number of epochs, with a fixed seed
param_grid = {
    'n_factors': list(range(1, 50, 5)),
    'n_epochs': [5, 10, 20],
    'random_state': [42],
}

# 5-fold cross-validated grid search over SVD, scored by RMSE, using all available cores
gs = GridSearchCV(SVD, param_grid, measures=['rmse'], cv=5, n_jobs=-1)

# Run the search on the training data
gs.fit(data)

# Best cross-validated RMSE
print(gs.best_score['rmse'])

# Parameter combination that achieved that score
print(gs.best_params['rmse'])

0.9360176195022314
{'n_factors': 6, 'n_epochs': 20, 'random_state': 42}


In [84]:
# Refit SVD with the best parameters found by the grid search. They are read
# from `gs` rather than copied by hand from its printed output, so this cell
# stays in sync whenever the parameter grid or the training data change.
model = SVD(**gs.best_params['rmse'])

# build_full_trainset() converts all of `data` into a trainset, so the SVD model
# is fitted on every training row instead of on one fold as in the grid search's
# cross-validation.
model.fit(data.build_full_trainset())

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7fa41fb4adf0>

In [86]:
# (user, article) id pairs of the validation split
id_pairs = zip(X_valid['user_id'], X_valid['article_id'])

# Predicted rating for every pair; `.est` is the estimated-rating field of
# the Prediction returned by Surprise's predict method
y_pred = [
    model.predict(uid=user_id, iid=article_id).est
    for user_id, article_id in id_pairs
]

# Observed ratings of the validation split
y_true = X_valid['rating']

# RMSE of the tuned model on the validation split
rmse(y_true, y_pred)

0.9295266231040932

In [87]:
# Score the competition test file with the fitted model.
# (user, article) id pairs listed in the test file
id_pairs = zip(test['user_id'], test['article_id'])

# Predicted rating for every pair; `.est` is the estimated-rating field of
# the Prediction returned by Surprise's predict method
y_pred = [
    model.predict(uid=user_id, iid=article_id).est
    for user_id, article_id in id_pairs
]

In [88]:
# Put the predicted ratings into a single-column frame named "ratings_test"
y_pred = pd.DataFrame(data={"ratings_test": y_pred})

In [89]:
# Write the predictions to disk without the row index
y_pred.to_csv(path_or_buf='solution_SVD.csv', index=False)