# Import libraries and load data

In [2]:
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error, pairwise_distances
from sklearn.model_selection import train_test_split


In [3]:
#Load the three input tables in one pass:
#  ratings      <- train.csv         (known user/article ratings)
#  article_info <- article_info.csv  (article metadata)
#  test         <- test.csv          (user/article pairs to predict)
ratings, article_info, test = (
    pd.read_csv(csv_name)
    for csv_name in ('train.csv', 'article_info.csv', 'test.csv')
)

In [4]:
article_info.head()

Unnamed: 0,article_id,website,title,content
0,1025,uxmovement,Comment concevoir une procédure pas à pas que ...,par anthony le 18/07/16 à 8h02 Si une nouvelle...
1,2328,endeavor,Ressources humaines? Seulement si vous optez p...,"«Ambassadeurs», «avocats», «porte-parole» d'un..."
2,2469,linkedin,Deux motions de vente différentes. . . .,J'ai passé pas mal de temps récemment avec des...
3,2590,googleblog,Apprentissage large et profond: mieux avec Ten...,"""Apprenez les règles comme un pro, afin de pou..."
4,697,infoq,Agile: manque de compétences en tests,"Fran O'Hara, directeur et consultant principal..."


In [5]:
test

Unnamed: 0,user_id,article_id
0,1,2607
1,1,1445
2,1,911
3,1,857
4,1,2062
...,...,...
7238,1087,2089
7239,1087,504
7240,1087,1801
7241,1087,967


In [6]:
ratings.head()

Unnamed: 0,user_id,article_id,rating
0,1,456,1
1,1,2934,1
2,1,82,1
3,1,1365,1
4,1,221,1


# Merge article titles into the ratings data

In [7]:
#Left-join the article titles onto the ratings so every rating row is kept
ratings = ratings.merge(
    article_info[['article_id', 'title']],
    on='article_id',
    how='left',
)

In [8]:
ratings

Unnamed: 0,user_id,article_id,rating,title
0,1,456,1,"Obtenez 6 mois d'accès à Pluralsight, la plus ..."
1,1,2934,1,La plateforme cloud de Google est désormais un...
2,1,82,1,La technologie derrière les photos d'aperçu
3,1,1365,1,Les VM préemptives de Google Cloud Platform so...
4,1,221,1,Ray Kurzweil: Le monde ne se détériore pas - n...
...,...,...,...,...
16726,1087,2242,1,Optimiser l'utilisation de la mémoire Arduino
16727,1087,419,1,Le plan national de l'Internet des objets sort...
16728,1087,784,1,Nintendo sortira Mini NES avec 30 jeux préchar...
16729,1087,1249,1,Coût d'erreur - Cinq raisons d'investir dans l...


In [9]:
ratings['rating'].max()

5

In [10]:
ratings.columns

Index(['user_id', 'article_id', 'rating', 'title'], dtype='object')

In [11]:
#ratings['article']=ratings['article_id'].map(str)+str(':')+ratings['title'].map(str)

In [12]:
#The title column is not used by the models below, so remove it again
ratings = ratings.drop(columns=['title'])

In [13]:
ratings

Unnamed: 0,user_id,article_id,rating
0,1,456,1
1,1,2934,1
2,1,82,1
3,1,1365,1
4,1,221,1
...,...,...,...
16726,1087,2242,1
16727,1087,419,1
16728,1087,784,1
16729,1087,1249,1


# Create the train/validation split

In [14]:
#Work on a copy so that nothing below can modify the original ratings frame
X = ratings.copy()

#Hold out 25% of the rows for validation; the fixed seed keeps the split repeatable
X_train, X_valid = train_test_split(X, random_state=50, test_size=0.25)

In [15]:
def rmse(y_true, y_pred):
    """Return the root mean squared error (RMSE) between two rating sequences.

    y_true: the actual ratings.
    y_pred: the predicted ratings, in the same order as ``y_true``.
    """
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)

# Create baseline using average of all ratings

In [16]:
def baseline(user_id,article_id):
    """Baseline model: predict the mean of all training ratings for any pair.

    Both arguments are ignored; they are accepted so the function can be
    called as ``model(user_id, article_id)`` like the other models.
    """
    train_ratings = X_train['rating']
    return train_ratings.mean()

In [36]:
def rmse_score(model):
    """Return the RMSE that ``model`` achieves on the validation split.

    ``model`` is called as ``model(user_id, article_id)`` once for every row
    of ``X_valid`` and must return the predicted rating for that pair.
    """
    #Predict a rating for every (user, article) pair of the validation split
    predictions = []
    for user, article_id in zip(X_valid['user_id'], X_valid['article_id']):
        predictions.append(model(user, article_id))
    y_pred = np.array(predictions)

    #Ratings the users actually gave for those same pairs
    y_true = X_valid['rating'].to_numpy()

    return rmse(y_true, y_pred)


In [37]:
rmse_score(baseline)

0.97935326982758

In [38]:
#User x article rating matrix built from the training split only;
#cells are empty (NaN) where a user has no rating for an article
r_matrix = X_train.pivot_table(index='user_id', columns='article_id', values='rating')

r_matrix.head()

article_id,3,4,6,7,8,9,10,11,12,13,...,2962,2963,2964,2965,2966,2968,2969,2970,2974,2976
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,
7,,,,,,,,,,,...,,,,,,,,,,


In [39]:
#Mean-rating model: predict each user's own average training rating
def cf_item_mean(user_id, article_id):
    """Predict a rating as the mean of the ratings the user gave in training.

    Despite the name, the prediction depends only on ``user_id``;
    ``article_id`` is accepted so the function can be called as
    ``model(user_id, article_id)`` by ``rmse_score``.

    A user with no row in ``r_matrix`` (i.e. absent from the training split)
    cannot be looked up, so the mean of every rating in ``r_matrix`` is
    returned for them instead of raising ``KeyError``.
    """
    if user_id in r_matrix.index:
        #Unrated articles are NaN and are skipped by .mean()
        mean_rating = r_matrix.loc[user_id].mean()
        if not np.isnan(mean_rating):
            return mean_rating

    #Cold-start fallback: average over all ratings present in the matrix
    return np.nanmean(r_matrix.to_numpy(dtype=float))

In [40]:
#Compute RMSE for the Mean model
rmse_score(cf_item_mean)

0.9198258309893801

In [41]:
from surprise import Dataset, Reader
from surprise.model_selection import GridSearchCV
from surprise.prediction_algorithms import KNNWithMeans

In [42]:
#Surprise works on its own Dataset type: the Reader declares the 1-5 rating
#scale, and the frame is passed with columns in user, item, rating order
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(
    X_train.loc[:, ['user_id', 'article_id', 'rating']],
    reader,
)

In [43]:
#Surprise needs the ratings wrapped in its own Dataset type (important)
#NOTE(review): this cell repeats the previous cell (same reader and dataset
#construction), so running it only rebuilds the same two objects
reader=Reader(rating_scale=(1,5))
data=Dataset.load_from_df(X_train[['user_id','article_id','rating']],reader)

# Grid search for parameter tuning

In [50]:
#Parameter grid for KNNWithMeans: only the neighbourhood size k is varied.
#The similarity is fixed to cosine and user_based to False, so this search
#covers item-based collaborative filtering only.
param_grid = {
    "k": [*range(1, 50, 5)],
    "sim_options": {"name": ["cosine"], 'user_based': [False]},
}

#5-fold cross-validated grid search scored by RMSE, using all available cores
gs = GridSearchCV(KNNWithMeans, param_grid, measures=['rmse'], cv=5, n_jobs=-1)
gs.fit(data)

#Best RMSE on the first line, the parameters that achieved it on the second
print(gs.best_score['rmse'], gs.best_params['rmse'], sep='\n')

1.1077882293306536
{'k': 31, 'sim_options': {'name': 'cosine', 'user_based': False}}


# Fit the item-based model

In [51]:
#Take the winning hyperparameters straight from the grid search rather than
#copying them by hand from its printed output: the cross-validation folds are
#not seeded, so a re-run may select different values
best_params = gs.best_params['rmse']
sim_options = best_params['sim_options']

#Build the model with the best neighbourhood size and similarity options
model = KNNWithMeans(k = best_params['k'], sim_options = sim_options)

#build_full_trainset() fits KNNWithMeans on the complete training data
#instead of on one cross-validation fold at a time
model.fit(data.build_full_trainset())

Computing the cosine similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNWithMeans at 0x7fa438aee280>

In [54]:
#(user, article) pairs of the held-out validation split
id_pairs = zip(X_valid['user_id'], X_valid['article_id'])

#Estimated rating for each pair: the `est` field of the Surprise prediction
y_pred = [model.predict(user, article).est for user, article in id_pairs]

#Ratings the users actually gave
y_true = X_valid['rating']

#RMSE of the fitted model on the validation split
rmse(y_true, y_pred)

1.1102217323242622

In [55]:
#(user, article) pairs from the test file that need a predicted rating
id_pairs = zip(test['user_id'], test['article_id'])

#Estimated rating for each pair: the `est` field of the Surprise prediction
y_pred = [model.predict(user, article).est for user, article in id_pairs]

In [56]:
#Wrap the list of predicted ratings in a one-column frame for export
y_pred = pd.DataFrame(y_pred, columns=["ratings_test"])

In [57]:
#Write the predictions to disk without the row index
y_pred.to_csv(path_or_buf='solution_2_itemwise.csv', index=False)