In [1]:
# Mount Google Drive inside the Colab runtime; the data files read by the
# later cells live underneath this mount point.
from google.colab import drive

DRIVE_MOUNT_POINT = "/content/drive"
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [2]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import MultiLabelBinarizer, StandardScaler
from sklearn.neighbors import NearestNeighbors
import joblib
import os

In [3]:
# Directory on the mounted Drive that holds every input CSV. Keeping it in
# one place means the notebook can be pointed at another copy of the data by
# editing a single line instead of five path literals.
DATA_DIR = '/content/drive/MyDrive/DATA'

# Raw input tables; each is read from DATA_DIR without any transformation.
tmdb_df = pd.read_csv(os.path.join(DATA_DIR, 'tmdb.csv'))
links_df = pd.read_csv(os.path.join(DATA_DIR, 'links.csv'))
ratings_df = pd.read_csv(os.path.join(DATA_DIR, 'ratings.csv'))
movies_df = pd.read_csv(os.path.join(DATA_DIR, 'movies.csv'))
tags_df = pd.read_csv(os.path.join(DATA_DIR, 'tags.csv'))

In [4]:
# Make the TMDB id numeric on both tables so they can be joined on it.
# Values that cannot be parsed become NaN (errors='coerce').
links_df['tmdbId'] = pd.to_numeric(links_df['tmdbId'], errors='coerce')

# The TMDB table may expose its key as 'id'; rename it to 'tmdbId' only when
# a 'tmdbId' column is not already present.
tmdb_key_is_named_id = 'tmdbId' not in tmdb_df.columns and 'id' in tmdb_df.columns
if tmdb_key_is_named_id:
    tmdb_df = tmdb_df.rename(columns={'id': 'tmdbId'})

tmdb_df['tmdbId'] = pd.to_numeric(tmdb_df['tmdbId'], errors='coerce')

In [5]:
# Build the movie table: MovieLens metadata + TMDB id + poster path, limited
# to movies that have at least one rating, sorted by movieId.
#
# The lookup tables are reduced to one row per key before joining. pandas
# matches NaN join keys against each other and fans out on duplicate keys, so
# without this a movie lacking a tmdbId would be paired with every TMDB row
# whose id is NaN, and a tmdbId repeated in tmdb_df would duplicate the movie.
# Duplicated movies would break the "one feature row per movieId" assumption
# that movie_ids relies on.
link_lookup = links_df[['movieId', 'tmdbId']].drop_duplicates(subset='movieId', keep='first')
poster_lookup = (
    tmdb_df[['tmdbId', 'poster_path']]
    .dropna(subset=['tmdbId'])
    .drop_duplicates(subset='tmdbId', keep='first')
)

movies_with_links = movies_df.merge(link_lookup, on='movieId', how='left', validate='many_to_one')
movies_final_df = movies_with_links.merge(poster_lookup, on='tmdbId', how='left', validate='many_to_one')

# Keep only rated movies, guarantee one row per movieId, and fix the row order
# so that positional index i corresponds to movie_ids[i].
rated_movie_ids = ratings_df['movieId'].unique()
movies_final_df = (
    movies_final_df[movies_final_df['movieId'].isin(rated_movie_ids)]
    .drop_duplicates(subset='movieId', keep='first')
    .sort_values('movieId')
    .reset_index(drop=True)
)
assert movies_final_df['movieId'].is_unique, "expected exactly one row per movieId"
movie_ids = movies_final_df['movieId'].values

In [6]:
def _split_genres(value):
    """Return the genres of one movie as a list of strings.

    A pipe-separated string is split on '|'. A value that is already a list
    or tuple is passed through as a list, so re-running this cell does not
    wipe the genres that were split on the previous run. Anything else
    (e.g. NaN) yields an empty list.
    """
    if isinstance(value, str):
        return value.split('|')
    if isinstance(value, (list, tuple)):
        return list(value)
    return []


# Replace the raw genre strings with lists, then one-hot encode them: one
# column per distinct genre label, one row per movie in movies_final_df.
movies_final_df['genres'] = movies_final_df['genres'].apply(_split_genres)
mlb = MultiLabelBinarizer()
genre_features = mlb.fit_transform(movies_final_df['genres'])

In [7]:
# Number of most frequent tags to keep as features.
N_TAGS = 50

# Rank tags by how often they were applied and keep the N_TAGS most common.
tag_counts = tags_df['tag'].value_counts()
top_tags = tag_counts.head(N_TAGS).index

# Count, per movie, how many times each of the kept tags was applied
# (rows: movieId, columns: tag; absent combinations are 0).
uses_top_tag = tags_df['tag'].isin(top_tags)
tag_count_table = (
    tags_df[uses_top_tag]
    .groupby(['movieId', 'tag'])
    .size()
    .unstack(fill_value=0)
)

# Align the rows with movie_ids; movies without any kept tag get all zeros.
movie_tag_counts = tag_count_table.reindex(movie_ids).fillna(0).values

In [8]:
# Mean rating over all ratings, used as the fallback for a movie that has no
# entry in the per-movie means.
global_mean = ratings_df['rating'].mean()
movie_avg_ratings = ratings_df.groupby('movieId')['rating'].mean()

# One mean rating per movie, in movie_ids order, as a single-column matrix.
per_movie_means = [movie_avg_ratings.get(mid, global_mean) for mid in movie_ids]
avg_ratings = np.array(per_movie_means).reshape(-1, 1)

# Standardise the column; the fitted scaler is kept in scaler_avg.
scaler_avg = StandardScaler()
avg_ratings_scaled = scaler_avg.fit(avg_ratings).transform(avg_ratings)

In [9]:
# Content-based block: genre one-hots, top-tag counts and the scaled mean
# rating, placed side by side (one row per movie).
content_features = np.concatenate(
    [genre_features, movie_tag_counts, avg_ratings_scaled],
    axis=1,
)

In [10]:
# Positional indices for the sparse movie x user ratings matrix.
movie_id_to_idx = {mid: i for i, mid in enumerate(movie_ids)}
user_ids = ratings_df['userId'].unique()
user_id_to_idx = {uid: i for i, uid in enumerate(user_ids)}

# Translate every rating to (row, col). A rating whose movie is not in
# movie_ids maps to NaN; it must be dropped from rows, cols AND data together.
# Dropping NaNs from each array independently would leave them with different
# lengths (or silently pair ratings with the wrong movie/user).
row_idx = ratings_df['movieId'].map(movie_id_to_idx)
col_idx = ratings_df['userId'].map(user_id_to_idx)
is_mapped = (row_idx.notna() & col_idx.notna()).to_numpy()

rows = row_idx[is_mapped].astype(int)
cols = col_idx[is_mapped].astype(int)
data = ratings_df['rating'].values[is_mapped]
assert len(rows) == len(cols) == len(data)

# Rows are movies (movie_ids order), columns are users (user_ids order).
ratings_matrix = csr_matrix((data, (rows, cols)), shape=(len(movie_ids), len(user_ids)))

In [11]:
# Collaborative block: reduce each movie's row of the ratings matrix to 20
# components with a seeded truncated SVD.
svd = TruncatedSVD(n_components=20, random_state=42)
latent_factors = svd.fit_transform(ratings_matrix)

# Standardise each component; the fitted scaler is kept in scaler_cf.
scaler_cf = StandardScaler()
collab_features = scaler_cf.fit(latent_factors).transform(latent_factors)

In [12]:
# Final per-movie feature vectors: content features followed by the
# collaborative (SVD) features.
feature_matrix = np.concatenate([content_features, collab_features], axis=1)

In [13]:
# Brute-force nearest-neighbour index over the movie feature vectors, using
# cosine distance. fit() returns the estimator itself, so it can be chained.
knn_model = NearestNeighbors(metric='cosine', algorithm='brute').fit(feature_matrix)

In [28]:
# Columns of the movie table that are stored alongside the model.
export_columns = ['movieId', 'title', 'poster_path', 'genres']
final_df = movies_final_df[export_columns]

In [29]:
# Bundle the movie table, the feature matrix, the fitted neighbour index and
# the fitted transformers into one dict and write it to Drive (compressed).
model_path = '/content/drive/MyDrive/DATA/recommender_model.joblib'

save_dict = dict(
    df=final_df,
    feature_matrix=feature_matrix,
    knn_model=knn_model,
    movie_ids=movie_ids,
    svd=svd,
    scaler_cf=scaler_cf,
    scaler_avg=scaler_avg,
    mlb=mlb,
)
joblib.dump(save_dict, model_path, compress=3)

['/content/drive/MyDrive/DATA/recommender_model.joblib']

In [30]:
# Report the size of the saved model file in MB (1 MB = 1024 * 1024 bytes).
model_path = '/content/drive/MyDrive/DATA/recommender_model.joblib'
bytes_per_mb = 1024 * 1024
file_size = os.path.getsize(model_path) / bytes_per_mb
print(f"Saved model file size: {file_size:.2f} MB")

Saved model file size: 60.93 MB
