In [1]:
import numpy as np
import pandas as pd
import sklearn.metrics.pairwise as dist
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
import seaborn as sns


# NOTE(review): `df` is not defined anywhere in the visible cell (this is the
# recorded NameError). It must be loaded before this point; presumably its
# first three columns are user_id, item_id and the rating -- TODO confirm.

# Number of distinct users and items.
n_users = df.user_id.unique().shape[0]
n_items = df.item_id.unique().shape[0]

# Sparsity of the user-item matrix, in percent. It depends on n_users and
# n_items, so it has to be computed after them.
sparsity = round(1.0 - len(df) / float(n_users * n_items), 3) * 100

# `sklearn.cross_validation` was removed in scikit-learn 0.20;
# `train_test_split` now lives in `sklearn.model_selection`. The `cv` alias is
# kept so that any other code using it keeps working.
from sklearn import model_selection as cv

# Split the ratings into a training set (75 %) and a test set (25 %).
train_data, test_data = cv.train_test_split(df, test_size=0.25)

# Build the two user-item matrices. `itertuples()` yields the index at
# position 0, so positions 1, 2, 3 are the first three columns of the frame.
# The `- 1` assumes ids are 1-based and contiguous -- TODO confirm.
train_data_matrix = np.zeros((n_users, n_items))
for line in train_data.itertuples():
    train_data_matrix[line[1] - 1, line[2] - 1] = line[3]

test_data_matrix = np.zeros((n_users, n_items))
for line in test_data.itertuples():
    test_data_matrix[line[1] - 1, line[2] - 1] = line[3]

# Similarity matrices: rows of the training matrix are users, so the
# transposed matrix gives the item-item similarities.
user_similarity = dist.cosine_similarity(train_data_matrix)
item_similarity = dist.cosine_similarity(train_data_matrix.T)




########## 

def pred_user(ratings, user_similarity, k):
    """Predict every rating from the k most similar users.

    For each user ``u`` the k users with the largest similarity to ``u`` are
    selected (``u`` itself is not excluded) and each item score is the
    similarity-weighted average of those neighbours' ratings.

    Parameters
    ----------
    ratings : numpy.ndarray
        2-D matrix indexed as ``ratings[user, item]``.
    user_similarity : numpy.ndarray
        2-D matrix indexed by user on both axes. Neighbours are ranked on
        column ``u`` and weighted with row ``u``; the two agree when the
        matrix is symmetric.
    k : int
        Number of neighbours to use. Values larger than the number of users
        simply use every user.

    Returns
    -------
    numpy.ndarray
        Float matrix with the same shape as ``ratings``.
    """
    pred = np.zeros(ratings.shape)

    for u in range(ratings.shape[0]):
        # Indices of the k largest similarities, best first. This must stay a
        # plain 1-D index array: wrapping it in a list makes current NumPy
        # return (1, k) arrays, and the dot product below then fails.
        top_k_users = np.argsort(user_similarity[:, u])[:-k-1:-1]
        weights = user_similarity[u, top_k_users]

        # One vector-matrix product scores all items of user u at once.
        pred[u, :] = weights.dot(ratings[top_k_users, :])
        # The small constant avoids a division by zero.
        pred[u, :] /= np.sum(np.abs(weights)) + 0.000001

    return pred


def pred_item(ratings, item_similarity, k):
    """Predict every rating from the k most similar items.

    For each item ``i`` the k items with the largest similarity to ``i`` are
    selected (``i`` itself is not excluded) and each user's score for ``i`` is
    the similarity-weighted average of that user's ratings on those items.

    Parameters
    ----------
    ratings : numpy.ndarray
        2-D matrix indexed as ``ratings[user, item]``.
    item_similarity : numpy.ndarray
        2-D matrix indexed by item on both axes. Neighbours are ranked on
        column ``i`` and weighted with row ``i``; the two agree when the
        matrix is symmetric.
    k : int
        Number of neighbours to use. Values larger than the number of items
        simply use every item.

    Returns
    -------
    numpy.ndarray
        Float matrix with the same shape as ``ratings``.
    """
    pred = np.zeros(ratings.shape)

    for i in range(ratings.shape[1]):
        # Indices of the k largest similarities, best first. Kept as a plain
        # 1-D index array (no list wrapper) so the selections below are 1-D
        # instead of (1, k) matrices.
        top_k_items = np.argsort(item_similarity[:, i])[:-k-1:-1]
        weights = item_similarity[i, top_k_items]

        # One matrix-vector product scores item i for all users at once.
        pred[:, i] = ratings[:, top_k_items].dot(weights)
        # The small constant avoids a division by zero.
        pred[:, i] /= np.sum(np.abs(weights)) + 0.000001

    return pred


def predict_topk(ratings, similarity, kind='user', k=40):
    """Dispatch to the user-based or item-based top-k predictor.

    Parameters
    ----------
    ratings : numpy.ndarray
        Rating matrix, passed through unchanged to the selected predictor.
    similarity : numpy.ndarray
        Similarity matrix matching ``kind``, passed through unchanged.
    kind : str
        ``'user'`` calls ``pred_user``; ``'item'`` calls ``pred_item``.
    k : int
        Number of neighbours, passed through unchanged (default 40).

    Returns
    -------
    Whatever the selected predictor returns.

    Raises
    ------
    ValueError
        If ``kind`` is neither ``'user'`` nor ``'item'``. This used to return
        an all-zero matrix silently, which hid typos in ``kind``.
    """
    if kind == 'user':
        return pred_user(ratings, similarity, k)
    if kind == 'item':
        return pred_item(ratings, similarity, k)
    raise ValueError("kind must be 'user' or 'item', got {!r}".format(kind))


def get_rmse(pred, actual):
    """Root-mean-square error between ``pred`` and ``actual``.

    Only the positions where ``actual`` is non-zero are compared; entries of
    ``actual`` equal to zero are ignored (presumably they mark missing
    ratings -- confirm against how the matrices are built).
    """
    # Positions of the non-zero entries of `actual`.
    observed = actual.nonzero()
    predicted_values = pred[observed].flatten()
    true_values = actual[observed].flatten()
    return np.sqrt(mean_squared_error(predicted_values, true_values))

###########


Test et Affichage graphique

# Test
k_array = [5, 15, 30, 50, 100, 200]
user_train_rmse = []
user_test_rmse = []
item_test_rmse = []
item_train_rmse = []

for k in k_array:
    user_pred = predict_topk(train_data_matrix, user_similarity, kind='user', k=k)
    item_pred = predict_topk(train_data_matrix, item_similarity, kind='item', k=k)
    
    user_train_rmse += [get_rmse(user_pred, train_data_matrix)]
    user_test_rmse += [get_rmse(user_pred, test_data_matrix)]
    
    item_train_rmse += [get_rmse(item_pred, train_data_matrix)]
    item_test_rmse += [get_rmse(item_pred, test_data_matrix)] 
    
# Affichage
%matplotlib inline

sns.set()

pal = sns.color_palette("Set1", 4)

plt.figure(figsize=(8, 8))
plt.plot(k_array, user_train_rmse, c=pal[0], label='User-based train', alpha=0.5, linewidth=5)
plt.plot(k_array, user_test_rmse, c=pal[1], label='User-based test', linewidth=5)
plt.plot(k_array, item_train_rmse, c=pal[2], label='Item-based train', alpha=0.5, linewidth=5)
plt.plot(k_array, item_test_rmse, c=pal[3], label='Item-based test', linewidth=5)
plt.legend(loc='best', fontsize=20)
plt.xticks(fontsize=16);
plt.yticks(fontsize=16);
plt.xlabel('k', fontsize=30);
plt.ylabel('RMSE', fontsize=30);
plt.legend(loc='center left', bbox_to_anchor=(1, 0.8))
    

NameError: name 'df' is not defined