<a href="https://colab.research.google.com/github/remixwithkj/Backupmac/blob/main/factorization/MovieLens_Analysis_v1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Matrix Factorization

In [None]:
import pandas as pd
import numpy as np

## Reading Ratings Data

In [None]:
# The ratings file is tab-separated and is given explicit column names right
# after loading, i.e. it carries no header row of its own. header=None keeps
# the first rating as data instead of letting pandas consume it as column
# labels (this matches how u.item is read in this notebook).
ratings_df = pd.read_csv('https://raw.githubusercontent.com/manaranjanp/ISB_MLUL/main/factorization/u.data',
                         sep = '\t',
                         header = None)

In [None]:
# Inspect the ratings table exactly as loaded (columns are renamed in the next cell).
ratings_df

In [None]:
# Give the four ratings columns descriptive names used throughout the notebook.
ratings_df.columns = ['userid', 'movieid', 'rating', 'timestamp']

In [None]:
# Same table, now with the userid / movieid / rating / timestamp column names.
ratings_df

In [None]:
# Number of distinct users; dropna=False keeps the count identical to
# len(unique()), which would count a missing id as its own value.
ratings_df.userid.nunique(dropna=False)

In [None]:
# Number of distinct rated movies; dropna=False keeps the count identical to
# len(unique()), which would count a missing id as its own value.
ratings_df.movieid.nunique(dropna=False)

## Reading the movies metadata

In [None]:
# Load the movie metadata: a '|'-separated, header-less, iso-8859-1 encoded
# file of which only the first two fields are kept.
movies_df = pd.read_csv(
    'https://raw.githubusercontent.com/manaranjanp/ISB_MLUL/main/factorization/u.item',
    sep='|',
    header=None,
    usecols=[0, 1],
    encoding='iso-8859-1',
)

In [None]:
# Inspect the two columns kept from u.item (selected with usecols=[0, 1]).
movies_df

In [None]:
# Name the two retained columns; 'movieid' is the key later matched against ratings_df.movieid values.
movies_df.columns = ['movieid', 'moviename']

## Creating user-movies ratings matrix

In [None]:
# Build the users x movies ratings matrix (NaN where a user has not rated a movie).
# pivot() already labels each row with its userid, in sorted order. The index
# must NOT be replaced with ratings_df.userid.unique(): that array is in order
# of first appearance, so it would attach the wrong userid to every row.
user_movies_df = ratings_df.pivot(index='userid',
                                  columns='movieid',
                                  values="rating")

In [None]:
# Display the user x movie ratings matrix; user/movie pairs without a rating are NaN.
user_movies_df

## Matrix Factorization Method: Alternating Least Squares (ALS)

In [None]:
import numpy as np

def als_matrix_factorization(R, num_features, lambda_reg, iterations, random_state=None):
    """
    Factorize an incomplete matrix as R ~ W @ H.T using Alternating Least
    Squares (ALS), fitting only the observed (non-NaN) entries of R.

    Each sweep solves one ridge-regularized least-squares problem per user
    (with the item factors fixed) and then one per item (with the user
    factors fixed).

    R: the input matrix (num_users x num_items) with NaN marking missing entries
    num_features: the number of latent features
    lambda_reg: the regularization parameter; with lambda_reg == 0 a row or
        column without any observed entry yields a singular system and
        np.linalg.solve raises LinAlgError
    iterations: the number of ALS sweeps to perform
    random_state: optional integer seed for the random initialization. None
        (the default) draws from NumPy's global random state, so a prior
        np.random.seed(...) call still controls the result.

    Returns a tuple (W, H, errors):
        W: user factors, shape (num_users, num_features)
        H: item factors, shape (num_items, num_features)
        errors: array with one value per sweep, the square root of the sum of
            squared residuals over the observed entries, rounded to 4 decimals
    """
    # Accept anything array-like and make sure NaN can be represented.
    R = np.asarray(R, dtype=float)
    num_users, num_items = R.shape

    # np.random exposes the same rand() as a RandomState instance, so the
    # unseeded path behaves exactly as a plain np.random.rand call.
    rng = np.random if random_state is None else np.random.RandomState(random_state)
    W = rng.rand(num_users, num_features)
    H = rng.rand(num_items, num_features).T  # stored as (num_features, num_items)

    mask = ~np.isnan(R)

    # The regularization term is identical for every solve; build it once.
    ridge = lambda_reg * np.eye(num_features)

    errors = []

    for _ in range(iterations):
        # Update W: one regularized least-squares solve per user, using only
        # the items that user has rated.
        for i in range(num_users):
            rated = mask[i, :]
            H_i = H[:, rated]
            R_i = R[i, rated]
            W[i, :] = np.linalg.solve(H_i @ H_i.T + ridge, H_i @ R_i)

        # Update H: one regularized least-squares solve per item, using only
        # the users who rated that item.
        for j in range(num_items):
            raters = mask[:, j]
            W_j = W[raters, :]
            R_j = R[raters, j]
            H[:, j] = np.linalg.solve(W_j.T @ W_j + ridge, W_j.T @ R_j)

        # Reconstruction error on the observed entries only.
        residual = (R - W @ H)[mask]
        errors.append(np.sqrt(np.sum(residual ** 2)))

    # The square root is taken exactly once (above); taking it again here
    # would report the fourth root of the squared error.
    return W, H.T, np.round(errors, 4)

## Factorizing User-Movies Ratings Matrix

In [None]:
num_features = 20
lambda_reg = 0.1
iterations = 200

# ALS starts from random factor matrices drawn from NumPy's global random
# state; seed it so that Restart & Run All reproduces the same W, H and errors.
np.random.seed(42)

W, H, errors = als_matrix_factorization(user_movies_df.to_numpy(), num_features, lambda_reg, iterations)

print("W (User Feature Matrix):")
print(W)
print("\nH (Item Feature Matrix):")
print(H)

In [None]:
# Per-iteration reconstruction error values returned by als_matrix_factorization.
errors

In [None]:
# User factor matrix: (number of users, num_features).
W.shape

In [None]:
# Item factor matrix: (number of movies, num_features).
H.shape

## Finding Similarity

In [None]:
from sklearn.metrics import pairwise_distances
from scipy.spatial.distance import cosine, correlation

# pairwise_distances returns the cosine *distance* between every pair of rows
# of H (one row of latent factors per movie); similarity is its complement.
movies_sim = np.subtract(1, pairwise_distances(H, metric="cosine"))
movies_sim_df = pd.DataFrame(data=movies_sim)

In [None]:
def get_similar_movies( movieid, topN = 5 ):
    """
    Return the topN rows of movies_df most similar to the given movie.

    Similarity is read from movies_sim_df, whose i-th row/column describes the
    i-th row of H. H was factorized from user_movies_df.to_numpy(), so that
    i-th row belongs to the movieid in user_movies_df.columns[i]. Scores are
    therefore matched to movies_df by movieid rather than by row position,
    which stays correct even when movies_df and the ratings matrix do not list
    exactly the same movies in the same order.

    movieid: id of the query movie (a value of movies_df.movieid)
    topN: number of rows to return (default 5); the query movie is itself
        part of the ranking

    Returns a new DataFrame: the movies_df columns plus a 'similarity' column,
    sorted by decreasing similarity. The global movies_df is left unmodified,
    so repeated calls do not leave a stale 'similarity' column behind.

    Raises IndexError if movieid is missing from movies_df or has no column in
    the ratings matrix.
    """
    factor_movieids = user_movies_df.columns

    known_in_metadata = (movies_df.movieid == movieid).any()
    if not known_in_metadata or movieid not in factor_movieids:
        raise IndexError(f"movieid {movieid} not found in movies_df / ratings matrix")

    # Position of the query movie inside the similarity matrix.
    position = factor_movieids.get_loc(movieid)

    # Re-label the positional similarity row with the movieids it refers to.
    similarity_by_movieid = pd.Series(movies_sim_df.iloc[position].to_numpy(),
                                      index=factor_movieids)

    # assign() returns a copy; movies without a factor row get NaN and sort last.
    scored = movies_df.assign(similarity=movies_df.movieid.map(similarity_by_movieid))
    top_n = scored.sort_values( ["similarity"], ascending = False )[0:topN]
    return top_n

In [None]:
# Square matrix of pairwise cosine similarities between the rows of H.
movies_sim_df

## Finding Similar Movies

In [None]:
# Look up the metadata row for movieid 127 before querying its neighbours.
movies_df[movies_df.movieid == 127]

In [None]:
# Highest-similarity movies for movieid 127 (default topN = 5).
get_similar_movies(127)

In [None]:
# Highest-similarity movies for movieid 222 (default topN = 5).
get_similar_movies(222)

In [None]:
# Highest-similarity movies for movieid 88 (default topN = 5).
get_similar_movies(88)

In [None]:
# Find movies whose name contains "Gump" (case-sensitive; str.contains treats the pattern as a regex by default).
movies_df[movies_df.moviename.str.contains("Gump")]

In [None]:
# Highest-similarity movies for movieid 82 (default topN = 5).
get_similar_movies(82)