# Assignment Part 2:  Question 1

In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
from sklearn.cluster import AgglomerativeClustering
from scipy.spatial.distance import cdist
from tqdm import tqdm

In [21]:
# Replace the URL below with the raw URL of your CSV file
url = 'https://github.com/shirleymsassignments/Unsupervisedlearning/blob/main/main_data.csv'
data = pd.read_csv(url, sep='\t', error_bad_lines=False)  # For TSV files

In [24]:
print(data)

                                        <!DOCTYPE html>
0                                                 <html
1                                             lang="en"
2       data-color-mode="auto" data-light-theme="lig...
3       data-a11y-animated-images="system" data-a11y...
4                                                     >
...                                                 ...
1601                                             </div>
1602      <div id="js-global-screen-reader-notice" c...
1603      <div id="js-global-screen-reader-notice-as...
1604                                            </body>
1605                                            </html>

[1606 rows x 1 columns]


In [10]:
class PCA:
    def __init__(self, target_explained_variance=None):
        self.target_explained_variance = target_explained_variance
        self.feature_size = -1

    def standardize(self, X):
        from sklearn.preprocessing import StandardScaler
        scaler = StandardScaler()
        return scaler.fit_transform(X)

    def compute_mean_vector(self, X_std):
        return np.mean(X_std, axis=0)

    def compute_cov(self, X_std, mean_vec):
        m = X_std.shape[0]
        X_centered = X_std - mean_vec
        return (X_centered.T @ X_centered) / (m - 1)

    def compute_eigen_vector(self, cov_mat):
        eigen_values, eigen_vectors = np.linalg.eig(cov_mat)
        return eigen_values, eigen_vectors

    def compute_explained_variance(self, eigen_vals):
        total = np.sum(eigen_vals)
        explained_variance = eigen_vals / total
        return explained_variance

    def cumulative_sum(self, var_exp):
        return np.cumsum(var_exp)

    def compute_weight_matrix(self, eig_pairs, cum_var_exp):
        cum_var_exp = np.array(cum_var_exp)
        num_components = np.argmax(cum_var_exp >= self.target_explained_variance)
        matrix_w = np.hstack(
            [eig_pairs[i][1].reshape(-1, 1) for i in range(num_components)]
        )
        return matrix_w

    def transform_data(self, X_std, matrix_w):
        return X_std.dot(matrix_w)

    def fit(self, X):
        self.feature_size = X.shape[1]
        X_std = self.standardize(X)
        mean_vec = self.compute_mean_vector(X_std)
        cov_mat = self.compute_cov(X_std, mean_vec)
        eigen_vals, eigen_vecs = self.compute_eigen_vector(cov_mat)
        explained_variance = self.compute_explained_variance(eigen_vals)
        cum_var_exp = self.cumulative_sum(explained_variance)
        eig_pairs = [(eigen_vals[i], eigen_vecs[:, i]) for i in range(len(eigen_vals))]
        eig_pairs.sort(key=lambda x: x[0], reverse=True)
        matrix_w = self.compute_weight_matrix(eig_pairs, cum_var_exp)
        return self.transform_data(X_std=X_std, matrix_w=matrix_w)


Unnamed: 0,mID,title,year,Doc,Com,Hor,Adv,Wes,Dra,Ani,...,Chi,Cri,Thr,Sci,Mys,Rom,Fil,Fan,Act,Mus
0,1,Toy Story,1995,0,1,0,0,0,0,1,...,1,0,0,0,0,0,0,0,0,0
1,2,Jumanji,1995,0,0,0,1,0,0,0,...,1,0,0,0,0,0,0,1,0,0
2,3,Grumpier Old Men,1995,0,1,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
3,4,Waiting to Exhale,1995,0,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Father of the Bride Part II,1995,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [18]:
MV_users.head()

Unnamed: 0,uID,gender,age,accupation,zip
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,2460
4,5,M,25,20,55455


Using matrix factorization technique , SVD Singular Value Decomposition to predict the missing ratings and evauluate model performance using RMSE

In [17]:
#Data Preprocessing

#We need to create a user-item matrix (matrix of ratings), where rows correspond to users, columns correspond to movies, and the values represent the ratings given by users to movies. 
#Missing ratings will be filled with zeros or a specific placeholder.

# Create user-item matrix for the training data
train_matrix = train.pivot(index='uID', columns='mID', values='rating').fillna(0)

# Perform matrix factorization using NMF
nmf = NMF(n_components=50, random_state=42)  # 50 latent features
W = nmf.fit_transform(train_matrix)  # User matrix
H = nmf.components_  # Item matrix

# Reconstruct the matrix (approximating the missing values)
reconstructed_matrix = np.dot(W, H)

# Convert the reconstructed matrix into a DataFrame
reconstructed_df = pd.DataFrame(reconstructed_matrix, columns=train_matrix.columns, index=train_matrix.index)

# Handle test data prediction with missing user-item pairs gracefully
predictions = []
for _, row in test.iterrows():
    uid, mid = row['uID'], row['mID']
    
    # Check if both uid and mid exist in the reconstructed matrix
    if uid in reconstructed_df.index and mid in reconstructed_df.columns:
        predicted_rating = reconstructed_df.loc[uid, mid]
    else:
        # If the pair is missing, use the average rating of all movies for this user
        predicted_rating = reconstructed_df.loc[uid].mean() if uid in reconstructed_df.index else reconstructed_df.mean().mean()
    
    predictions.append(predicted_rating)

# Calculate RMSE (Root Mean Squared Error) between predicted ratings and actual ratings
actual_ratings = test['rating'].values
rmse = sqrt(mean_squared_error(actual_ratings, predictions))

print(f'RMSE: {rmse}')

RMSE: 2.9135255789628096


# Question 2

# NMF performance compared to simple baseline or similarity-based methods

So , RMSE of non-negative matrix factorization :2.9135255789628096
and 
RMSE of Baseline model 1.2642784503423288 and Content-Based are 1.1962537249116723. The lower the RMSE, the better the model is at predicting ratings. 

This dataset contains sparse user-item ratings,meaning most users have rated only a small subset of available movies.

Moreover,with many unique users (uID) and movies (mID), the matrix can be very large, making it computationally challenging to handle with matrix factorization methods like NMF unless you have enough data for training. 

Baseline methods  and similarity-based methods worked better because they leverage the observed ratings more effectively and are less sensitive to sparsity