In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from pandas.io.json import json_normalize
import os
import json
import re

# If the import below is not working, you might need to install uszipcode with pip first
#import sys
#!{sys.executable} -m pip install uszipcode 

from uszipcode import SearchEngine
from scipy.spatial.distance import cosine 
from pathlib import Path
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

On charge les données de la compétition

In [None]:
# Load the movie catalogue ('::'-separated, no header row) and name its three columns.
movie_df = (
    pd.read_csv('../input/movierecommendationcompetition/movies.dat', sep='::', header=None, engine='python')
    .rename(index=str, columns={0: "movieId", 1: "title", 2: "genre"})
    .dropna()
    .drop_duplicates(keep='first')  # the dataset documentation mentions duplicated movieId entries
)

In [None]:
# Load the user table ('::'-separated, no header row) and name its five columns.
user_df = (
    pd.read_csv('../input/movierecommendationcompetition/users.dat', sep='::', header=None, engine='python')
    .rename(index=str, columns={0: "userId", 1: "gender", 2: "age", 3: "occupation", 4: "zipcode"})
    .dropna()
)

In [None]:
# Load the training ratings, align the key names with the other tables and
# discard the submission "id" column.
rating_df = (
    pd.read_csv('../input/movierecommendationcompetitionrating/training_ratings_for_kaggle_comp.csv')
    .rename(index=str, columns={"user": "userId", "movie": "movieId"})
    .drop(["id"], axis=1)
    .dropna()
)

On crée les nouvelles colonnes dans le dataset des utilisateurs

In [None]:
# Human-readable versions of the coded demographic columns of user_df.
#
# Each derived column is built with `Series.replace(...)` and assigned back.
# The previous `user_df[col].replace(..., inplace=True)` mutated a temporary
# Series (chained assignment) and leaves user_df unchanged when pandas runs
# with Copy-on-Write.
age_labels = { 1 : "Under 18", 18 : "18-24", 25 : "25-34", 35 : "35-44", 45: "45-49", 50 : "50-55", 56 : "56+" }

user_df["age_desc"] = user_df["age"].replace(age_labels)

occupation_labels = {0:  "other or not specified",1:  "academic/educator",2:  "artist",3:  "clerical/admin",
                     4:  "college/grad student", 5:  "customer service",6:  "doctor/health care",
                     7:  "executive/managerial",8:  "farmer",9:  "homemaker", 10:  "K-12 student",
                     11:  "lawyer",12:  "programmer",13:  "retired",14:  "sales/marketing",15:  "scientist",
                     16:  "self-employed",17:  "technician/engineer",18:  "tradesman/craftsman",
                     19:  "unemployed",20:  "writer" }

user_df["occupation_desc"] = user_df["occupation"].replace(occupation_labels)

gender_labels = { 'F' : "Woman", "M" : "Man"}

user_df["gender_desc"] = user_df["gender"].replace(gender_labels)

# Numeric gender code (F -> 1.0, M -> 2.0); `gender_labels` is deliberately
# rebound so the module-level name ends with the same value as before.
gender_labels = { 'F' : "1", "M" : "2"}

user_df["gender_nr"] = user_df["gender"].replace(gender_labels).astype(float)

In [None]:
# Zipcode search engine used to enrich every user with location columns.
search = SearchEngine(simple_zipcode=True) # This is long because it downloads 9M of data... it breaks Kaggle sometimes

def get_zipcode_detail(row):                                                  
    """Look up ``row["zipcode"]`` and return ``(major_city, post_office_city, county, state)``.

    NOTE(review): assumes ``search.by_zipcode`` always returns an object that
    exposes these four attributes, even for unknown or malformed zipcodes --
    confirm against the installed uszipcode version.
    """
    zipcode = search.by_zipcode(row["zipcode"])
    return zipcode.major_city, zipcode.post_office_city, zipcode.county, zipcode.state

# One lookup per user row; zip(*...) transposes the per-row 4-tuples into the four new columns.
user_df['major_city'], user_df['post_office_city'], user_df["county"], user_df["state"]  = zip(*user_df.apply(get_zipcode_detail, axis=1))

In [None]:
# Numeric gender code (F -> 1.0, M -> 2.0). This repeats the computation done
# in the label-column cell above, so re-running it is harmless.
# The result is assigned back instead of using `replace(..., inplace=True)` on
# `user_df["gender_nr"]`, which is chained assignment and does not update
# user_df when pandas runs with Copy-on-Write.
gender_labels = { 'F' : "1", "M" : "2"}

user_df["gender_nr"] = user_df["gender"].replace(gender_labels).astype(float)

In [None]:
# Preview the user table with the derived label and location columns.
user_df.head()

On crée les nouvelles colonnes dans le dataset des films

In [None]:
# The release year is the last parenthesised group of the raw title.
# Extract it into its own column, then remove it (and anything after it) from
# the title. The cleaned title is assigned back rather than replaced in place:
# `movie_df["title"].replace(..., inplace=True)` is chained assignment and does
# not update movie_df when pandas runs with Copy-on-Write.
year_pattern = r'\(([^)]*)\)[^(]*$'

movie_df["year"] = movie_df["title"].str.extract(year_pattern, expand=False)
movie_df["title"] = (
    movie_df["title"]
    .replace(to_replace=year_pattern, value=r'', regex=True)
    .str.strip()
)

In [None]:
# One boolean column per genre: True when the genre name occurs in the movie's
# raw 'genre' string. Column names drop apostrophes and turn '-' into '_'.
genres = [
    "Action", "Adventure", "Animation", "Children's", "Comedy", "Crime",
    "Documentary", "Drama", "Fantasy", "Film-Noir", "Horror", "Musical",
    "Mystery", "Romance", "Sci-Fi", "Thriller", "War", "Western",
]
for genre in genres:
    genre_name = genre.replace("'", "").replace("-", "_")
    movie_df[genre_name] = movie_df['genre'].map(lambda listed: genre in listed)

In [None]:
# A trailing parenthesised group left in the title after the year was removed
# is an alternative title: keep it in its own column and strip it from the
# main title.
# The cleaned title is assigned back (in-place replace on `movie_df["title"]`
# is chained assignment and is a no-op under pandas Copy-on-Write) and is
# stripped, so no trailing space is left where the group used to be.
second_title_pattern = r'(\(.*\))?$'

movie_df["second_title"] = movie_df["title"].str.extract(second_title_pattern, expand=False)
movie_df["title"] = (
    movie_df["title"]
    .replace(to_replace=second_title_pattern, value=r'', regex=True)
    .str.strip()
)

In [None]:
def fix_title(text):
    """Move a trailing article back to the front of a catalogue-style title.

    ``"American President, The"`` becomes ``"The American President"``.

    Only the last ``", "``-separated segment is considered, and only when it
    is a known article. Titles that merely contain a comma are returned
    unchanged, and no part of a multi-comma title is dropped
    (``"Good, the Bad and the Ugly, The"`` becomes
    ``"The Good, the Bad and the Ugly"``).

    Parameters
    ----------
    text : str
        Title, possibly with surrounding whitespace.

    Returns
    -------
    str
        The whitespace-stripped title with its article moved to the front when
        applicable.
    """
    # Articles seen in "<Title>, <Article>" catalogue notation (lower-cased).
    leading_articles = {
        "the", "a", "an",
        "le", "la", "les", "l'", "un", "une",
        "el", "los", "las", "lo",
        "il", "i", "gli",
        "das", "der", "die",
    }

    cleaned = text.strip()
    head, separator, tail = cleaned.rpartition(', ')
    if separator and tail.lower() in leading_articles:
        return tail + " " + head
    return cleaned

# Normalise every title with fix_title, then preview the result.
movie_df['title'] = movie_df['title'].map(fix_title)

movie_df.head(10)

On fusionne le tout (films, notes et utilisateurs) dans un seul tableau

In [None]:
# One row per rating, enriched with the movie columns (joined on movieId) and
# the user columns (joined on userId).
df = movie_df.merge(rating_df, on='movieId').merge(user_df, on='userId')
df.head()

In [None]:
# Column dtypes and non-null counts of the merged table.
df.info()

In [None]:
# Summary statistics of the numeric columns of the merged table.
df.describe()

### Système de recommandation simple

Première chose : on calcule la note moyenne et le nombre de votes par film

In [None]:
# Per-movie vote count ('count') and mean rating ('mean_rating').
average_df = (
    df.groupby('movieId')
      .agg({'title': 'size', 'rating': 'mean'})
      .rename(columns={'title': 'count', 'rating': 'mean_rating'})
      .reset_index()
)

# Attach the statistics to the movie details; the raw genre string and the id
# are not kept in this view.
detail_average_df = (
    pd.merge(movie_df, average_df, on='movieId')
      .drop(['genre', 'movieId'], axis=1)
)

# The ten most-rated movies.
detail_average_df.sort_values(by='count', ascending=False).head(10)

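Si on veut utiliser le calcul de score IMDB, il faut calculer les valeurs suivantes : C, la note moyenne sur l'ensemble des films, et M, le nombre minimal de votes requis (ici le quantile 0.9 du nombre de votes) :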

In [None]:
# C: mean of the per-movie mean ratings; M: 90th percentile of the vote counts.
C = detail_average_df['mean_rating'].mean()
M = detail_average_df['count'].quantile(0.9)
print(C, M, sep='\n')

On ne garde que les films dont le nombre de votes atteint le quantile 0.9 (on évite ainsi ceux qui ont moins de 387 votes)

In [None]:
# Keep only the movies with at least M votes.
# The mask is built from detail_average_df itself. It used to come from
# average_df, which only selects the right rows while both frames happen to
# share the same row order and index.
# `.copy()` is applied after filtering so q_movies is an independent frame
# that later cells can add columns to.
q_movies = detail_average_df.loc[detail_average_df['count'] >= M].copy()
q_movies.shape

In [None]:
def weighted_rating(x, m=M, C=C):
    """Return the IMDB-style weighted rating of one movie row.

    score = v / (v + m) * R + m / (m + v) * C

    Parameters
    ----------
    x : mapping or pandas.Series
        Row providing ``'count'`` (number of votes, v) and ``'mean_rating'``
        (the movie's mean rating, R).
    m : number, optional
        Vote-count threshold that weights the prior; defaults to the global
        ``M`` as evaluated when this function was defined.
    C : number, optional
        Prior mean rating; defaults to the global ``C`` as evaluated when this
        function was defined.

    Returns
    -------
    number
        The weighted score.
    """
    v = x['count']
    R = x['mean_rating']
    # Use the `m` argument: the body previously read the global M, so a
    # caller-supplied threshold was silently ignored.
    return (v / (v + m) * R) + (m / (m + v) * C)

# Score every qualifying movie row by row, then list the ten best.
q_movies['score'] = q_movies.apply(weighted_rating, axis='columns')
q_movies.sort_values('score', ascending=False).head(10)

Avec ce tableau, il est possible de filtrer les meilleurs films par catégorie :

In [None]:
# Five best-scored qualifying movies flagged as Drama.
top_by_drama = q_movies[q_movies['Drama']]
top_by_drama.sort_values('score', ascending=False)[['title', 'score']].head(5)

Exemple avec le genre Action :

In [None]:
# Five best-scored qualifying movies flagged as Action.
# NOTE: the variable keeps its existing name although it now holds Action titles.
top_by_drama = q_movies[q_movies['Action']]
top_by_drama.sort_values('score', ascending=False)[['title', 'score']].head(5)

Ou même avec plusieurs conditions :

In [None]:
# Five best-scored qualifying Action movies released in 1999 ('year' is stored as text).
# NOTE: the variable keeps its existing name although the filter is on Action.
is_action = q_movies['Action'] == True
is_1999 = q_movies['year'] == "1999"
top_by_adventure_1999 = q_movies.loc[is_action & is_1999]
top_by_adventure_1999.sort_values('score', ascending=False)[['title', 'score']].head(5)

### Un système par similarité entre utilisateurs

On crée une matrice creuse userId × movieId, avec la note (rating) comme valeur

In [None]:
# User x movie rating matrix; a missing rating becomes 0.
rp = rating_df.pivot_table(columns=['movieId'], index=['userId'], values='rating')
rp = rp.fillna(0)

# Plain NumPy view of the matrix (faster to work with than the DataFrame).
# `.values` replaces `DataFrame.as_matrix()`, which was removed in pandas 1.0.
rp_mat = rp.values
rp.head()

Algorithme « force brute » pour créer une matrice de similarité cosinus entre utilisateurs à partir de la matrice creuse (c'est très long)

In [None]:
# User-user cosine similarity matrix, cached on disk because it is expensive to build.
users_file = Path("pd_users.csv")
if users_file.exists():
    pd_users = pd.read_csv(str(users_file), index_col='userId')
    # CSV headers are read back as strings; restore integer userId labels so
    # the cached matrix is labelled exactly like a freshly computed one.
    pd_users.columns = pd_users.columns.astype('int64')
else:
    mat_users = cosine_similarity(rp_mat)
    pd_users = pd.DataFrame(mat_users, index=rp.index, columns=rp.index)

    # A naive double loop over scipy's `cosine` (with the diagonal forced to 0)
    # was tried first and is far too slow, hence the vectorised call above.

    pd_users.to_csv(str(users_file), sep=',')

In [None]:
# Preview the user-user similarity matrix.
pd_users.head()

In [None]:
# Finding similar users
def topn_simusers(userId, n=10):
    """Return the ``n`` users most similar to ``userId``.

    Parameters
    ----------
    userId : int
        Row label to look up in the global ``pd_users`` similarity matrix.
    n : int, optional
        Maximum number of neighbours to return.

    Returns
    -------
    pandas.DataFrame
        One ``'score'`` column indexed by the neighbour's user id, sorted by
        decreasing similarity. Empty when ``userId`` is not in ``pd_users``.
    """
    try:
        users = pd_users.loc[userId, :]
    except KeyError:
        return pd.DataFrame()

    # A user is always their own best match (cosine similarity 1), so exclude
    # them. Labels are compared as strings because a matrix reloaded from CSV
    # may carry string column labels.
    others = users[users.index.astype(str) != str(userId)]
    topn_users = others.sort_values(ascending=False).iloc[:n].rename('score')
    return pd.DataFrame(topn_users)

# Example: the ten users closest to user 2785.
userId = 2785
print("Similar users as user:", userId)
print(topn_simusers(userId=userId, n=10))

In [None]:
# A userId that is absent from pd_users raises KeyError inside topn_simusers,
# which is caught there and turned into an empty DataFrame.
# NOTE(review): presumably user 1 has no training rating -- confirm.
print (topn_simusers(userId=1, n=10))

In [None]:
def topn_movieratings(userId, n_ratings=10):
    """Return the ``n_ratings`` highest ratings given by ``userId``, with titles.

    Parameters
    ----------
    userId : int
        User whose rows are selected from the global ``rating_df``.
    n_ratings : int, optional
        Maximum number of ratings to return.

    Returns
    -------
    pandas.DataFrame
        Columns ``userId``, ``movieId``, ``rating``, ``title``, ordered by
        decreasing rating. Ratings whose movie is missing from ``movie_df``
        are dropped (inner join).
    """
    uid_ratings = rating_df.loc[rating_df['userId'] == userId]
    top_ratings = uid_ratings.sort_values(by='rating', ascending=False).iloc[:n_ratings]
    # The ratings go on the left of the merge so the result keeps the rating
    # order; with movie_df on the left the rows came back in catalogue order.
    new_ratings = pd.merge(top_ratings, movie_df[['movieId', 'title']], on='movieId', how='inner')
    return new_ratings[['userId', 'movieId', 'rating', 'title']]

# Example: the ten best-rated movies of user 2783.
userId = 2783
print("Top movie ratings of user:", userId)
print(topn_movieratings(userId=userId, n_ratings=10))

### Un système par similarité entre films

Similaire à la matrice d'utilisateurs

In [None]:
# Movie x user rating matrix (the transpose of the user-based layout); a
# missing rating becomes 0.
rp_m = rating_df.pivot_table(columns=['userId'], index=['movieId'], values='rating')
rp_m = rp_m.fillna(0)

# Plain NumPy view of the matrix (faster to work with than the DataFrame).
# `.values` replaces `DataFrame.as_matrix()`, which was removed in pandas 1.0.
rp_mat_m = rp_m.values
rp_m.head()

In [None]:
# Movie-movie cosine similarity matrix, cached on disk because it is expensive to build.
movies_file = Path("pd_movies.csv")
if movies_file.exists():
    pd_movies = pd.read_csv(str(movies_file), index_col='movieId')
    # CSV headers are read back as strings; restore integer movieId labels so
    # the cached matrix is labelled exactly like a freshly computed one.
    pd_movies.columns = pd_movies.columns.astype('int64')
else:
    mat_movies = cosine_similarity(rp_mat_m)
    pd_movies = pd.DataFrame(mat_movies, index=rp_m.index, columns=rp_m.index)
    pd_movies.to_csv(str(movies_file), sep=',')

In [None]:
# Preview the movie-movie similarity matrix.
pd_movies.head()

In [None]:
# Finding similar movies
def topn_simovies(movieId, n=10):
    """Return the movies most similar to ``movieId`` according to ``pd_movies``.

    The ``n`` best matches are taken first and the queried movie is then
    removed, so up to ``n - 1`` rows are returned.

    Parameters
    ----------
    movieId : int
        Row label to look up in the global ``pd_movies`` similarity matrix.
    n : int, optional
        Number of best matches considered, the movie itself included.

    Returns
    -------
    pandas.DataFrame
        Columns ``score`` (cosine similarity multiplied by 5), ``movieId`` and
        ``title``, ordered by decreasing score. Empty when ``movieId`` is not
        in ``pd_movies``, matching what the other recommenders return for
        unknown ids.
    """
    try:
        similarities = pd_movies.loc[movieId, :]
    except KeyError:
        return pd.DataFrame()

    best = similarities.sort_values(ascending=False).iloc[:n]
    topn_movies = pd.DataFrame({
        'score': best.values * 5,
        # Column labels may be strings when the matrix was reloaded from CSV.
        'index1': best.index.astype('int64'),
    })
    topn_movies = pd.merge(topn_movies, movie_df[['movieId', 'title']],
                           how='left', left_on='index1', right_on='movieId')

    # Remove the queried movie by id rather than by position: with tied scores
    # it is not guaranteed to be the first row.
    topn_movies = topn_movies[topn_movies['index1'] != int(movieId)]
    return topn_movies[['score', 'movieId', 'title']]
 
# Example: movies rated similarly to movie 2.
movieId = 2
movie_title = movie_df.loc[movie_df['movieId'] == movieId, 'title'].to_string(index=False)
print("Movies similar to", movie_title)
print(topn_simovies(movieId=movieId, n=15))

### Similarité basée sur le contenu (genre)

TfidfVectorizer est un bon convertisseur de texte en valeurs numériques, ce qui facilite la comparaison.
On n'a malheureusement pas beaucoup de données textuelles dans notre matrice, mais on essaie avec le genre

In [None]:
# TF-IDF over the raw genre strings, unigrams only, English stop words removed.
tfidf = TfidfVectorizer(stop_words='english', ngram_range=(1, 1))
tfidf_matrix = tfidf.fit_transform(movie_df['genre'])
# Pairwise dot products between movies; with TfidfVectorizer's default L2
# normalisation these equal cosine similarities. Row i is the i-th row of movie_df.
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)
# Series of movie_df index labels keyed by title. drop_duplicates() removes
# repeated *values* (index labels), not repeated titles.
# NOTE(review): not used by get_recommendations below -- confirm whether it is still needed.
indices = pd.Series(movie_df.index, index=movie_df['title']).drop_duplicates()

In [None]:
def get_recommendations(movieId, cosine_sim=cosine_sim):
    """Return the titles of the ten movies whose genres are closest to ``movieId``.

    Parameters
    ----------
    movieId : int
        Value of the ``movieId`` column identifying the reference movie.
    cosine_sim : 2-D array-like, optional
        Pairwise similarity matrix whose rows and columns follow the row order
        of the global ``movie_df``.

    Returns
    -------
    pandas.Series
        Up to ten titles from ``movie_df``, most similar first. Empty when
        ``movieId`` is not in ``movie_df``.
    """
    # Translate the movieId into its row position in movie_df. The id itself
    # is not a valid row number: movieId 1 sits at position 0, and the ids
    # have gaps.
    positions = np.flatnonzero(movie_df['movieId'].values == movieId)
    if positions.size == 0:
        return movie_df['title'].iloc[[]]
    position = positions[0]

    scores = np.asarray(cosine_sim[position])
    # Stable descending sort, so tied movies keep their catalogue order.
    ranked = np.argsort(-scores, kind='mergesort')
    # Drop the reference movie explicitly: many movies share the same genres,
    # so it is not necessarily ranked first.
    movie_indices = [int(i) for i in ranked if i != position][:10]

    return movie_df['title'].iloc[movie_indices]

# Example: genre-based neighbours of movie 1.
movieId = 1
movie_title = movie_df.loc[movie_df['movieId'] == movieId, 'title'].to_string(index=False)
print("Movies similar to movie ", movie_title, ",are")
get_recommendations(movieId)

Pas très bon comme recommandation : cela ne nous sert pas

### Collaborative filtering 
#### Statistics for machine learning example

In [None]:
# User x movie rating matrix (0 = not rated) and its raw NumPy values.
rp = rating_df.pivot_table(values='rating', index=['userId'], columns=['movieId']).fillna(0)
A = rp.values

In [None]:
# Indicator of observed ratings: 1.0 where the user rated the movie, else 0.0.
W = (A > 0.5).astype(np.float64)

In [None]:
# Indicator of the (user, movie) pairs still to predict: 1.0 where the user has
# NOT rated the movie, else 0.0.
# The former `np.fill_diagonal(W_pred, val=0)` was dropped: A is a
# users x movies matrix, so its "diagonal" pairs user i with the i-th movie
# column, and zeroing it just hid arbitrary movies from the recommendations.
W_pred = (A < 0.5).astype(np.float64)

In [None]:
# Parameters 
# m, n: number of rows and columns of the rating matrix A.
m,n = A.shape 
# Number of passes of the alternating update loop.
n_iterations = 200 
# Inner dimension of the factor matrices X (m x n_factors) and Y (n_factors x n).
n_factors = 100 
# Coefficient of the identity term added to the linear systems solved for X and Y.
lmbda = 0.1 

In [None]:
# Random initial factors with entries in [0, 5).
# A dedicated, seeded generator makes the factorisation reproducible under
# "Restart & Run All" without touching NumPy's global random state.
_factor_rng = np.random.RandomState(42)
X = 5 * _factor_rng.rand(m, n_factors)
Y = 5 * _factor_rng.rand(n_factors, n)

In [None]:
def get_error(A, X, Y, W):
    """Return the RMSE between ``A`` and ``X @ Y`` over the entries weighted by ``W``.

    The element-wise residual is multiplied by ``W`` before being squared, and
    the sum of squares is divided by ``sum(W)``.
    """
    weighted_residual = W * (A - np.dot(X, Y))
    squared_total = np.sum(weighted_residual ** 2)
    return np.sqrt(squared_total / np.sum(W))

In [None]:
# Alternating least squares: solve for X with Y fixed, then for Y with X fixed,
# and record the RMSE on the observed ratings after every pass.
errors = []
# The regularisation term is the same in every solve, so build it once.
ridge = lmbda * np.eye(n_factors)
for itr in range(n_iterations):
    X = np.linalg.solve(np.dot(Y, Y.T) + ridge, np.dot(Y, A.T)).T
    Y = np.linalg.solve(np.dot(X.T, X) + ridge, np.dot(X.T, A))
    # Compute the error once per pass; it was evaluated twice on logged passes.
    rmse = get_error(A, X, Y, W)
    if itr % 10 == 0:
        print(itr, " iterations completed", "RMSError value is:", rmse)
    errors.append(rmse)

In [None]:
# Reconstructed rating matrix: the product of the two learned factor matrices.
A_hat = np.dot(X,Y) 

In [None]:
# Keep predictions only where W_pred is 1 (everything else becomes 0), then
# label the rows and columns like the rating pivot table rp.
pred_recos = A_hat*W_pred 
pd_predrecos = pd.DataFrame(pred_recos,index =rp.index ,columns= rp.columns ) 
pd_predrecos.head()

In [None]:
def collab_recommovies(userId=315, n_movies=15, pred_mat=A_hat, wpred_mat=W_pred ):
    """Return the ``n_movies`` best predicted movies for ``userId``.

    Parameters
    ----------
    userId : int, optional
        User id, looked up in the index of the global pivot table ``rp``.
    n_movies : int, optional
        Maximum number of movies to return.
    pred_mat : 2-D array-like, optional
        Predicted ratings, rows and columns aligned with ``rp``.
    wpred_mat : 2-D array-like, optional
        Mask multiplied element-wise with ``pred_mat``.

    Returns
    -------
    pandas.DataFrame
        Columns ``score``, ``movieId`` and ``title``, ordered by decreasing
        score. Empty when ``userId`` is not in ``rp``.
    """
    try:
        row = rp.index.get_loc(userId)
    except KeyError:
        return pd.DataFrame()

    # Mask only this user's row instead of multiplying the two full matrices
    # and wrapping the product in a DataFrame on every call.
    user_scores = np.asarray(pred_mat)[row] * np.asarray(wpred_mat)[row]
    pred_ratings = pd.Series(user_scores, index=rp.columns, name='score').sort_values(ascending=False)
    # Explicit positional slice; the former `pred_ratings[:n_movies,]` indexed
    # the Series with a tuple.
    pred_topratings = pred_ratings.iloc[:n_movies]

    pred_topratings = pd.DataFrame({
        'score': pred_topratings.values,
        'index1': pred_topratings.index.astype('int64'),
    })
    pred_topratings = pd.merge(pred_topratings, movie_df[['movieId', 'title']],
                               how='left', left_on='index1', right_on='movieId')
    return pred_topratings[['score', 'movieId', 'title']]
    
# Example: ten collaborative-filtering recommendations for user 2783.
userId = 2783
print("\nTop movies predicted for the user:", userId, "based on collaborative filtering\n")
predmtrx = collab_recommovies(userId=userId, n_movies=10, pred_mat=A_hat, wpred_mat=W_pred)
print(predmtrx)

### Hybrid 

In [None]:
def hybrid(userId, movieId, removeAlreadySeen=False):
    """Blend item-similarity and collaborative-filtering recommendations.

    Parameters
    ----------
    userId : int
        User the recommendations are for.
    movieId : int
        Reference movie; movies similar to it are favoured.
    removeAlreadySeen : bool, optional
        When True, movies the user has already rated are removed *before* the
        list is cut to its final length.

    Returns
    -------
    pandas.DataFrame
        At most ten rows in every case:

        * neither source has results: overall top movies from ``q_movies``
          (columns ``title``, ``score``);
        * collaborative results only: columns ``movieId``, ``title``, ``score``;
        * similar movies only: columns ``movieId``, ``title``;
        * both: columns ``movieId``, ``title``, ``score``, where
          ``score = (2 * similarity_score + 0.5 * collaborative_score) / 2``
          and a score missing from one source counts as 0.
    """
    top_n = 10

    try:
        similar_movies = topn_simovies(movieId, 10000)
    except KeyError:
        # Unknown movie id: treat it as "no similar movie".
        similar_movies = pd.DataFrame()
    similar_users = collab_recommovies(userId, 10000, pred_mat=A_hat, wpred_mat=W_pred)

    if similar_users.empty and similar_movies.empty:
        # Nothing known about the movie or the user: fall back to the global top list.
        resultat = q_movies.sort_values(by='score', ascending=False)[['title', 'score']]
    elif similar_movies.empty:
        # Only collaborative predictions: return them as a table. This branch
        # used to return a single row via `.iloc[0]`.
        resultat = similar_users[['movieId', 'title', 'score']]
    elif similar_users.empty:
        # Only movie-to-movie similarities are available.
        resultat = similar_movies[['movieId', 'title']]
    else:
        # Both sources available: outer-join them and blend the two scores.
        merged_result = pd.merge(similar_movies, similar_users, on='movieId', how='outer')
        movie_score = merged_result['score_x'].fillna(0)
        user_score = merged_result['score_y'].fillna(0)
        merged_result['score'] = ((movie_score * 2) + (user_score * 0.5)) / 2
        # Take the title from whichever source has it; it used to become 0 for
        # movies that only the collaborative source proposed.
        merged_result['title'] = merged_result['title_x'].fillna(merged_result['title_y'])
        resultat = merged_result[['movieId', 'title', 'score']].sort_values(by='score', ascending=False)

    # The global fallback has no movieId column, so there is nothing to filter on.
    if removeAlreadySeen and 'movieId' in resultat.columns:
        already_seen = rating_df.loc[rating_df['userId'] == userId, 'movieId']
        resultat = resultat[~resultat['movieId'].isin(already_seen)]

    return resultat.head(top_n)

In [None]:
# Example: hybrid recommendations for user 2783 around movie 1.
movieId = 1
userId = 2783
movieName = movie_df['title'][movie_df['movieId'] == movieId].to_string(index=False)
print("Movie recommendation for user", userId, "similar to movie `", movieName, "` are:")
# Pass the movieId variable (it was a hard-coded 1) so the call always matches
# the title printed above.
hybrid(userId=userId, movieId=movieId)

In [None]:
# Same example, this time hiding the movies the user has already rated.
movieId = 1
userId = 2783
movieName = movie_df['title'][movie_df['movieId'] == movieId].to_string(index=False)
print("Movie recommendation for user", userId, "similar to movie `", movieName, "` are:")
# Pass the movieId variable (it was a hard-coded 1) so the call always matches
# the title printed above.
hybrid(userId=userId, movieId=movieId, removeAlreadySeen=True)

### Collaborative with "surprise" library

In [None]:
import sys
# !{sys.executable} -m pip install scipy==1.2.1
# !{sys.executable} -m pip install surprise

In [None]:
from surprise import Reader, Dataset, SVD, evaluate

In [None]:
reader = Reader()
data = Dataset.load_from_df(rating_df[['userId', 'movieId', 'rating']], reader)
data.split(n_folds=5)

In [None]:
svd = SVD()
evaluate(svd, data, measures=['RMSE', 'MAE'])

In [None]:
trainset = data.build_full_trainset()
svd.train(trainset)

In [None]:
# All training ratings given by user 2873.
rating_df[rating_df['userId'] == 2873]

In [None]:
# Predicted rating of movie 2987 for user 2873.
# NOTE(review): the third positional argument (3) is presumably the reference rating
# reported back in the prediction, not an input to the estimate -- confirm.
svd.predict(2873, 2987, 3)

In [None]:
# Same prediction for user 1.
# NOTE(review): presumably user 1 has no training rating -- confirm.
svd.predict(1, 2987, 3)

Création d'une matrice de prédictions user/movie avec surprise (ici sur un échantillon de 10 films, avec 10 utilisateurs tirés au hasard pour chaque film)

In [None]:
# Predicted ratings for 10 random movies, each paired with 10 randomly drawn users.
# The rows are collected in a list and the frame is built once: growing a
# DataFrame with `.loc[len(df)]` copies it on every insertion.
records = []
for movieId in movie_df['movieId'].sample(10):
    for userId in user_df['userId'].sample(10):
        pred = svd.predict(userId, movieId, 3)
        records.append({'movieId': movieId, 'userId': userId, 'rating': pred.est})

pred_surp = pd.DataFrame(records, columns=['movieId', 'userId', 'rating'])

In [None]:
# Preview the sampled predictions.
pred_surp.head()