# import data and lib

In [21]:
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error, pairwise_distances
from sklearn.model_selection import train_test_split


In [30]:
#read rating file
ratings=pd.read_csv('train.csv')

#article info
article_info=pd.read_csv('article_info.csv')

#test file
test=pd.read_csv('test.csv')

In [31]:
article_info.head()

Unnamed: 0,article_id,website,title,content
0,1025,uxmovement,Comment concevoir une procédure pas à pas que ...,par anthony le 18/07/16 à 8h02 Si une nouvelle...
1,2328,endeavor,Ressources humaines? Seulement si vous optez p...,"«Ambassadeurs», «avocats», «porte-parole» d'un..."
2,2469,linkedin,Deux motions de vente différentes. . . .,J'ai passé pas mal de temps récemment avec des...
3,2590,googleblog,Apprentissage large et profond: mieux avec Ten...,"""Apprenez les règles comme un pro, afin de pou..."
4,697,infoq,Agile: manque de compétences en tests,"Fran O'Hara, directeur et consultant principal..."


In [91]:
test

Unnamed: 0,user_id,article_id
0,1,2607
1,1,1445
2,1,911
3,1,857
4,1,2062
...,...,...
7238,1087,2089
7239,1087,504
7240,1087,1801
7241,1087,967


In [34]:
ratings.head()

Unnamed: 0,user_id,article_id,rating
0,1,456,1
1,1,2934,1
2,1,82,1
3,1,1365,1
4,1,221,1


# Merge data set for organised data

In [35]:
ratings=ratings.merge(article_info[['article_id','title']],how='left',left_on='article_id',right_on='article_id')

In [36]:
ratings

Unnamed: 0,user_id,article_id,rating,title
0,1,456,1,"Obtenez 6 mois d'accès à Pluralsight, la plus ..."
1,1,2934,1,La plateforme cloud de Google est désormais un...
2,1,82,1,La technologie derrière les photos d'aperçu
3,1,1365,1,Les VM préemptives de Google Cloud Platform so...
4,1,221,1,Ray Kurzweil: Le monde ne se détériore pas - n...
...,...,...,...,...
16726,1087,2242,1,Optimiser l'utilisation de la mémoire Arduino
16727,1087,419,1,Le plan national de l'Internet des objets sort...
16728,1087,784,1,Nintendo sortira Mini NES avec 30 jeux préchar...
16729,1087,1249,1,Coût d'erreur - Cinq raisons d'investir dans l...


In [47]:
ratings['rating'].max()

5

In [37]:
ratings.columns

Index(['user_id', 'article_id', 'rating', 'title'], dtype='object')

In [38]:
#ratings['article']=ratings['article_id'].map(str)+str(':')+ratings['title'].map(str)

In [39]:
ratings=ratings.drop(['title'],axis=1)

In [40]:
ratings

Unnamed: 0,user_id,article_id,rating
0,1,456,1
1,1,2934,1
2,1,82,1
3,1,1365,1
4,1,221,1
...,...,...,...
16726,1087,2242,1
16727,1087,419,1
16728,1087,784,1
16729,1087,1249,1


# Creating train,test and split

In [42]:
#assisgn X as original dataset: to prevent any change in original dataset
X=ratings.copy()
X_train, X_valid=train_test_split(X,test_size=0.25,random_state=50)

In [62]:
X_valid

Unnamed: 0,user_id,article_id,rating
10236,647,319,1
5272,383,944,1
15689,1011,1433,1
15308,989,2645,1
9228,584,2094,1
...,...,...,...
12103,749,1308,1
4574,330,110,1
8003,522,1011,1
9768,616,17,1


In [45]:
#function to calculate RMSE for fast execution
def rmse(y_true,y_pred):
    return np.sqrt(mean_squared_error(y_true,y_pred))

# Using surprise library for prediction

In [51]:
from surprise import Dataset, Reader
from surprise.model_selection import GridSearchCV
from surprise.prediction_algorithms import KNNWithMeans

In [55]:
#surprise lib understnds the data in defined format****IMP***
reader=Reader(rating_scale=(1,5))
data=Dataset.load_from_df(X_train[['user_id','article_id','rating']],reader)

# Grid search for the hyperparameter tuning

In [57]:
param_grid={"k":list(range(1,70,5)),"sim_options":{"name":["cosine","pearson"]}}

gs = GridSearchCV(KNNWithMeans, 
                  param_grid, 
                  measures=['rmse'], 
                  cv=5, 
                  n_jobs = -1)

gs.fit(data)
#Printing the best score
print(gs.best_score['rmse'])

#Printing the best set of parameters
print(gs.best_params['rmse'])


0.9982913285260275
{'k': 6, 'sim_options': {'name': 'pearson', 'user_based': True}}


# Fit the model 

In [58]:
#Defining similarity measure as per the best parameters
sim_options = {'name': 'pearson'}

#Fitting the model on train data
model = KNNWithMeans(k = 46, sim_options = sim_options)

#Build full trainset will essentially fit the knnwithmeans on the complete train set instead of a part of it
#like we do in cross validation
model.fit(data.build_full_trainset())

Computing the pearson similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNWithMeans at 0x7fc911355850>

In [61]:
#id pairs for test set
id_pairs = zip(X_valid['user_id'], X_valid['article_id'])

#Making predictions for test set using predict method from Surprise
[model.predict(uid = user, iid = article_id) for (user, article_id) in id_pairs]

[Prediction(uid=647, iid=319, r_ui=None, est=1.1904761904761905, details={'actual_k': 0, 'was_impossible': False}),
 Prediction(uid=383, iid=944, r_ui=None, est=1.05, details={'actual_k': 0, 'was_impossible': False}),
 Prediction(uid=1011, iid=1433, r_ui=None, est=1, details={'actual_k': 1, 'was_impossible': False}),
 Prediction(uid=989, iid=2645, r_ui=None, est=1, details={'actual_k': 1, 'was_impossible': False}),
 Prediction(uid=584, iid=2094, r_ui=None, est=1.2142857142857142, details={'actual_k': 0, 'was_impossible': False}),
 Prediction(uid=978, iid=1700, r_ui=None, est=1.0588235294117647, details={'actual_k': 0, 'was_impossible': False}),
 Prediction(uid=397, iid=122, r_ui=None, est=2.119047619047619, details={'actual_k': 0, 'was_impossible': False}),
 Prediction(uid=408, iid=1536, r_ui=None, est=1, details={'actual_k': 0, 'was_impossible': False}),
 Prediction(uid=399, iid=979, r_ui=None, est=2.1818181818181817, details={'actual_k': 0, 'was_impossible': False}),
 Prediction(uid=

In [101]:
#id pairs for test set
id_pairs = zip(X_valid['user_id'], X_valid['article_id'])

#Making predictions for test set using predict method from Surprise
y_pred = [model.predict(uid = user, iid = movie)[3] for (user, movie) in id_pairs]

#Actual rating values for test set
y_true = X_valid['rating']

# Checking performance on test set
rmse(y_true, y_pred)

1.0156959255885387

In [102]:
#prediction using test dataset
#id pairs for test set
id_pairs = zip(test['user_id'], test['article_id'])

#Making predictions for test set using predict method from Surprise
y_pred = [model.predict(uid = user, iid = movie)[3] for (user, movie) in id_pairs]

In [103]:
y_pred=pd.DataFrame({"ratings_test":y_pred})

In [105]:
y_pred

Unnamed: 0,ratings_test
0,1.000000
1,1.793744
2,1.363636
3,1.363636
4,1.000000
...,...
7238,1.375000
7239,1.375000
7240,1.375000
7241,1.375000


In [107]:
test.dtypes

user_id       int64
article_id    int64
dtype: object

In [112]:
y_pred.to_csv('solution_1.csv',index=False)