In [None]:
import os

import numpy as np
import pandas as pd
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import mean_squared_error

ModuleNotFoundError: No module named 'sklearn'

In [None]:
# -------------------------------
# Step 1: Load dataset
# -------------------------------
# Download: https://files.grouplens.org/datasets/movielens/ml-latest-small.zip
# After extraction, use 'ratings.csv'
#
# The file location is machine-specific. Set the MOVIELENS_RATINGS_CSV
# environment variable to point at your copy; when it is unset (or empty) the
# original hard-coded location is used as the fallback.
RATINGS_CSV = os.environ.get("MOVIELENS_RATINGS_CSV") or (
    r"C:\Users\Stevi\OneDrive\Documents\Projects\Projects\ML Projects\Data\ratings.csv"
)

ratings = pd.read_csv(RATINGS_CSV)

# Print the caption instead of passing it to display(): display() shows a str
# as its repr, i.e. with quotes and a literal "\n".
print("Ratings Data:")
display(ratings.head())

'Ratings Data:\n'

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [None]:
# Step 2: Pivot the long ratings table into a user x movie grid.
# Rows are labelled by userId and columns by movieId; cells the pivot leaves
# empty are filled with 0.
user_item_matrix = pd.pivot_table(
    ratings,
    values="rating",
    index="userId",
    columns="movieId",
).fillna(0)

matrix_shape = user_item_matrix.shape
print("\nUser-Item Matrix Shape:", matrix_shape)


User-Item Matrix Shape: (610, 9724)


In [None]:
# Extract the underlying NumPy array from the DataFrame. Labels are dropped,
# so row i of R corresponds to user_item_matrix.index[i] and column j to
# user_item_matrix.columns[j].
R = user_item_matrix.to_numpy()

In [None]:
# Step 3: Factorise the rating matrix with Truncated SVD
k = 50  # number of latent factors kept
svd = TruncatedSVD(
    n_components=k,
    random_state=42,  # fixed seed so re-runs give the same factors
)

# fit_transform yields the per-user factors (U * Sigma);
# components_ holds the per-movie factors (V^T).
R_reduced = svd.fit_transform(R)
VT = svd.components_

reduced_shape = R_reduced.shape
components_shape = VT.shape
print("\nReduced Representation Shape (U*Sigma):", reduced_shape)
print("Components Shape (VT):", components_shape)


Reduced Representation Shape (U*Sigma): (610, 50)
Components Shape (VT): (50, 9724)


In [None]:
# Step 4: Rebuild a low-rank approximation of the rating matrix
# by multiplying the user factors (U * Sigma) with the movie factors (V^T).
R_pred = R_reduced @ VT

# Compare the top-left 5 x 5 corner of the input with its reconstruction.
original_corner = R[:5, :5]
predicted_corner = np.round(R_pred[:5, :5], 2)
print("\nOriginal Ratings (first 5 users, 5 movies):\n", original_corner)
print("\nPredicted Ratings (first 5 users, 5 movies):\n", predicted_corner)


Original Ratings (first 5 users, 5 movies):
 [[4. 0. 4. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [4. 0. 0. 0. 0.]]

Predicted Ratings (first 5 users, 5 movies):
 [[ 2.49  1.08  0.69 -0.09 -0.51]
 [ 0.28 -0.08  0.04  0.01  0.18]
 [ 0.03  0.02  0.04 -0.   -0.02]
 [ 1.8  -0.06 -0.28  0.08  0.01]
 [ 1.4   0.79  0.04  0.11  0.27]]


In [None]:
# Step 5: Score the reconstruction against the known ratings only.
# Zero entries of R are skipped, so the error is measured just on the
# non-zero cells.
# NOTE(review): these are the same cells the SVD was fitted on, so this is an
# in-sample error rather than a held-out estimate.
nonzero_idx = np.nonzero(R)
known_actual = R[nonzero_idx]
known_predicted = R_pred[nonzero_idx]

mse = mean_squared_error(known_actual, known_predicted)
rmse = np.sqrt(mse)
print(f"\nRMSE on known ratings: {rmse:.4f}")


RMSE on known ratings: 1.9981


In [None]:
# Step 6: Predict ratings for a user
user_id = 0  # row position in R (0 = first user), not the userId value itself

# R was taken from user_item_matrix, so row positions line up with its index.
# Look the real userId up from there instead of hard-coding "User 1", so the
# headings stay correct when user_id is changed.
user_label = user_item_matrix.index[user_id]

actual_ratings = R[user_id, :]
predicted_ratings = R_pred[user_id, :]
print(f"\nUser {user_label} - Actual Ratings (first 20 movies):\n", actual_ratings[:20])
print(f"\nUser {user_label} - Predicted Ratings (first 20 movies):\n", np.round(predicted_ratings[:20], 2))


User 1 - Actual Ratings (first 20 movies):
 [4. 0. 4. 0. 0. 4. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]

User 1 - Predicted Ratings (first 20 movies):
 [ 2.49  1.08  0.69 -0.09 -0.51  2.44 -0.89 -0.07  0.19  1.64 -0.71  0.19
  0.26  0.01 -0.11  1.18 -0.5   0.02  0.34  0.33]
