## Algorithme de SVD 

In [1]:
import numpy as np
from scipy.sparse.linalg import svds
from functools import partial


def emsvd(Y, k=None, tol=1E-3, maxiter=5):
    """
    Approximate SVD on data with missing values via expectation-maximization

    Inputs:
    -----------
    Y:          (nobs, ndim) data matrix, missing values denoted by NaN/Inf
    k:          number of singular values/vectors to find (default: k=ndim)
    tol:        convergence tolerance on change in trace norm
    maxiter:    maximum number of EM steps to perform (default: no limit)

    Returns:
    -----------
    Y_hat:      (nobs, ndim) reconstructed data matrix
    mu_hat:     (ndim,) estimated column means for reconstructed data
    U, s, Vt:   singular values and vectors (see np.linalg.svd and 
                scipy.sparse.linalg.svds for details)
    """

    if k is None:
        svdmethod = partial(np.linalg.svd, full_matrices=False)
    else:
        svdmethod = partial(svds, k=k)
    if maxiter is None:
        maxiter = np.inf

    # initialize the missing values to their respective column means
    mu_hat = np.nanmean(Y, axis=0, keepdims=1)
    valid = np.isfinite(Y)
    Y_hat = np.where(valid, Y, mu_hat)

    halt = False
    ii = 1
    v_prev = 0

    while not halt:
        
    # SVD on filled-in data
        U, s, Vt = svdmethod(Y_hat - mu_hat)

        # impute missing values
        Y_hat[~valid] = (U.dot(np.diag(s)).dot(Vt) + mu_hat)[~valid]

        # update bias parameter
        mu_hat = Y_hat.mean(axis=0, keepdims=1)

        # test convergence using relative change in trace norm
        v = s.sum()
        if v_prev==0:
            flag=1
        else:
            flag=v_prev
        if ii >= maxiter or ((v - v_prev) / flag) < tol:
            halt = True
        ii += 1
        #print(ii)
        #print((v - v_prev) / flag)
        v_prev = v
    return Y_hat, mu_hat, U, s, Vt

In [2]:
X=pd.DataFrame(emsvd(df3,k=2)[0])

NameError: name 'pd' is not defined

In [None]:
X=pd.DataFrame(emsvd(df3,k=2)[0])

In [None]:
Sum_of_squared_distances = []
K = range(1,15)
for k in K:
    km = KMeans(n_clusters=k)
    km = km.fit(X)
    Sum_of_squared_distances.append(km.inertia_)

In [None]:
plt.plot(K, Sum_of_squared_distances, 'bx-')
plt.xlabel('k')
plt.ylabel('Sum_of_squared_distances')
plt.title('Elbow Method For Optimal k')
plt.show()

In [None]:
La méthode du coude nous donne qu'il faut prendre k=4 clusters

In [None]:
kmeans = KMeans(n_clusters=4, random_state=0).fit_predict(X)

In [None]:
X=pd.DataFrame(X)
X['Cluster']=kmeans

In [None]:
sns.pairplot(pd.DataFrame(X),hue='Cluster')

Méthode de factorisation matricielle


 On décide maintenant d'utiliser la méthode de factorisation matricielle afin de résoudre notre problème initial, cette méthode non supervisée se base sur des latent features dont on cherchera à déterminer le nombre optimal, cette méthode nous permet de prédire l'ensemble des notes que les utilisateurs donneraient aux films.

In [3]:
import random as rd

In [None]:
def data_set(dat=ratings,col='UserID',alpha=0.8):
    data=dat.copy()
    users=data[col].unique()
    random=[rd.uniform(0,1) for k in range (0,data.shape[0])]
    data['rd']=random
    training=data[data['rd']<alpha]
    for i in users:
        index=data[col]==i
        training_user=data[index].reset_index()
        training_user=training_user.loc[0]
        training_user=training_user.to_frame().T
        if training.merge(training_user,how='inner').shape[0]==0:
            training=training.merge(training_user)

        #On garde au moins une note pour chaque utilisateur 
    training=pd.concat([training,data[data['rd']<alpha]],join='outer',sort=False)
    test=data[data['rd']>alpha]
    del training['rd']
    del test['rd']
    return(training,test)

In [None]:
#On split la base de données afin de vérifier nos résultats
training,test = data_set(ratings,'UserID',0.8)

In [None]:
notes_training=training.pivot(index='UserID',columns='MovieID',values='Rating')

In [None]:
notes_test=test.pivot(index='UserID',columns='MovieID',values='Rating')

In [None]:
notes_training_standard=standard(notes_training.T,notes_training.T)
notes_test_standard=standard(notes_test.T,notes_training.T)

On cherche à optimiser la valeur de nombre de latent feature. On regarde la variation d'écart pour un jeu de donnée selon le nombre choisi.

In [None]:
erreur=[]
x=[]
for i in range (1,15):
    note_estimate=(pd.DataFrame(emsvd(notes_training_standard.astype(float),k=i)[0]))
    e=(note_estimate-notes_test_standard)**2
    erreur.append(e.mean().mean())
    x.append(i)
plt.plot(x,erreur)

In [None]:
print(min(erreur))

On choisit un modèle à 1 latent feature

In [None]:
n_features=1
notes_test_standard['Cluster']=X['Cluster']
erreur=0
for i in range(0,4):
    notes_cluster=notes_training_standard[notes_test_standard['Cluster']==i]
    #del notes_cluster['UserID']
    test_cluster=notes_test_standard[notes_test_standard['Cluster']==i]
    #del test_cluster['UserID']
    del test_cluster['Cluster']
    notes_cluster_estimate=pd.DataFrame(emsvd(notes_cluster.astype(float),k=n_features)[0])
    e=(notes_cluster_estimate-notes_cluster)**2
    erreur+=e.mean().mean()

In [None]:
print(erreur/4)