In [1]:
!pip install surprise

Collecting surprise
  Downloading surprise-0.1-py2.py3-none-any.whl.metadata (327 bytes)
Collecting scikit-surprise (from surprise)
  Downloading scikit_surprise-1.1.4.tar.gz (154 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/154.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m154.4/154.4 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Downloading surprise-0.1-py2.py3-none-any.whl (1.8 kB)
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (pyproject.toml) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.4-cp311-cp311-linux_x86_64.whl size=2469540 sha256=03e60a07ff3cb91be3b4ddcd1dd86fa2d7f160c011501a292ae77b3b8f252192
  Stored in directory: /root/.cache

In [8]:
import re
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from surprise import SVD, Dataset, Reader
from surprise.model_selection import train_test_split

# ---------- Utility Functions ----------
def clean_title(title):
    title = re.sub(r"\(\d{4}\)", "", title)
    title = re.sub(r"[^a-zA-Z0-9 ]", "", title)
    return title.lower().strip()

# ---------- Content-Based Components ----------
def load_tmdb_content(tmdb_movies_path, tmdb_credits_path):
    # Load and merge TMDB metadata
    movies = pd.read_csv(tmdb_movies_path)
    credits = pd.read_csv(tmdb_credits_path)
    movies = movies.merge(credits, left_on='id', right_on='movie_id')

    # Print columns to debug
    # print("Columns after merging movies and credits:")
    # print(movies.columns)

    # Extract relevant features
    def parse_list(x):
        try:
            items = eval(x)
            return " ".join([i['name'].replace(" ", "") for i in items])
        except:
            return ''

    movies['genres'] = movies['genres'].apply(parse_list)
    movies['keywords'] = movies['keywords'].apply(parse_list)
    movies['cast']     = movies['cast'].apply(lambda x: parse_list(x).split()[:5])
    movies['cast']     = movies['cast'].apply(lambda x: " ".join(x))
    # Extract director
    def get_director(x):
        try:
            crew = eval(x)
            for member in crew:
                if member['job'] == 'Director':
                    return member['name'].replace(" ", "")
        except:
            pass
        return ''
    movies['crew'] = movies['crew'].apply(get_director)

    # Combine features
    movies['combined_features'] = (
        movies['overview'].fillna('') + " " +
        movies['genres'] + " " +
        movies['keywords'] + " " +
        movies['cast'] + " " +
        movies['crew']
    )

    # Vectorize
    tfidf = TfidfVectorizer(stop_words='english')
    tfidf_matrix = tfidf.fit_transform(movies['combined_features'])
    cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

    # Prepare index mapping
    movies['clean_title'] = movies['title_x'].apply(clean_title)
    movies = movies.reset_index()
    return movies, cosine_sim

def content_recommend(movies_df, cosine_sim, title, top_n=10):
    title = clean_title(title)
    if title not in movies_df['clean_title'].values:
        raise ValueError(f"Movie '{title}' not found in TMDB metadata.")
    idx = movies_df[movies_df['clean_title'] == title].index[0]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)[1:top_n+1]
    recommended_ids = [i[0] for i in sim_scores]
    return movies_df.loc[recommended_ids, 'title_x'].tolist()

# ---------- Collaborative Filtering Components ----------
def load_movielens_ratings(ml_movies_path, ml_ratings_path):
    ml_movies = pd.read_csv(ml_movies_path)
    ml_ratings = pd.read_csv(ml_ratings_path)
    ml_movies['clean_title'] = ml_movies['title'].apply(clean_title)
    return ml_movies, ml_ratings

def merge_datasets(tmdb_movies, ml_movies):
    # Merge on clean_title
    merged = pd.merge(ml_movies, tmdb_movies[['clean_title', 'index']], on='clean_title', how='inner')
    merged.rename(columns={'index': 'tmdb_index'}, inplace=True)
    return merged

def train_cf_model(ratings_df):
    reader = Reader(rating_scale=(ratings_df['rating'].min(), ratings_df['rating'].max()))
    data = Dataset.load_from_df(ratings_df[['userId', 'movieId', 'rating']], reader)
    trainset = data.build_full_trainset()
    model = SVD()
    model.fit(trainset)
    return model

# ---------- Hybrid Recommender ----------
def hybrid_recommend(user_id, tmdb_movies, cosine_sim, cf_model,
                     ml_ratings, merged_df, alpha=0.7, top_n=10):
    # Get user's rated movies
    user_data = ml_ratings[ml_ratings['userId'] == user_id]
    # If user has no history, fallback to popular content-based
    if user_data.empty:
        print(f"No ratings for user {user_id}. Providing top content-based recommendations.")
        return tmdb_movies['title_x'].tolist()[:top_n]

    # Determine user's favorite movie by highest rating
    fav_movie = user_data.sort_values('rating', ascending=False).iloc[0]
    fav_movie_id = fav_movie['movieId']

    # Check if favorite movie is in the merged dataset (meaning it's in both ML and TMDB)
    if fav_movie_id not in merged_df['movieId'].values:
        print(f"Favorite movie (movieId {fav_movie_id}) not found in merged dataset. Falling back to pure collaborative filtering.")
        # Fallback to collaborative filtering
        all_movie_ids = ml_movies['movieId'].tolist()
        unrated_movie_ids = [mId for mId in all_movie_ids if mId not in user_data['movieId'].values]
        predictions = [cf_model.predict(user_id, mId).est for mId in unrated_movie_ids]
        top_indices = np.argsort(predictions)[-top_n:][::-1]
        top_movie_ids = [unrated_movie_ids[i] for i in top_indices]
        top_titles = ml_movies[ml_movies['movieId'].isin(top_movie_ids)]['title'].tolist()
        return top_titles

    # If favorite movie is in merged dataset, proceed with hybrid approach
    tmdb_idx = merged_df[merged_df['movieId'] == fav_movie_id]['tmdb_index'].values[0]

    # Candidate movies: those in TMDB
    all_indices = list(range(len(tmdb_movies)))
    # Compute content scores relative to favorite
    content_scores = np.array(cosine_sim[tmdb_idx])

    # Collaborative scores: predict for each movieId in ml_movies
    cf_scores = []
    movie_ids = []
    for _, row in merged_df.iterrows():
        mId = row['movieId']
        if mId in user_data['movieId'].values:
            continue  # skip already rated
        pred = cf_model.predict(user_id, mId).est
        cf_scores.append(pred)
        movie_ids.append(mId)

    # Align content scores
    cont_scores_aligned = []
    for mId in movie_ids:
        idx = merged_df[merged_df['movieId'] == mId]['tmdb_index'].values[0]
        cont_scores_aligned.append(content_scores[idx])

    # Normalize scores
    cf_arr = np.array(cf_scores)
    cf_norm = (cf_arr - cf_arr.min()) / (cf_arr.max() - cf_arr.min())
    cont_arr = np.array(cont_scores_aligned)
    cont_norm = (cont_arr - cont_arr.min()) / (cont_arr.max() - cont_arr.min())

    # Hybrid score
    hybrid_score = alpha * cf_norm + (1 - alpha) * cont_norm

    # Get top N
    top_idx = np.argsort(hybrid_score)[-top_n:][::-1]
    top_movie_ids = [movie_ids[i] for i in top_idx]
    top_titles = merged_df[merged_df['movieId'].isin(top_movie_ids)]['title'].tolist()
    return top_titles

# ---------- Main Script ----------
if __name__ == '__main__':
    # Paths
    tmdb_movies_path = 'tmdb_5000_movies.csv'
    tmdb_credits_path = 'tmdb_5000_credits.csv'
    ml_movies_path = 'ml-latest-small/movies.csv'
    ml_ratings_path = 'ml-latest-small/ratings.csv'

    # Load data
    print("Loading TMDB content data...")
    tmdb_movies, cosine_sim = load_tmdb_content(tmdb_movies_path, tmdb_credits_path)
    print("Loaded TMDB data with {} movies".format(len(tmdb_movies)))

    print("Loading MovieLens data...")
    ml_movies, ml_ratings = load_movielens_ratings(ml_movies_path, ml_ratings_path)
    print(f"Loaded MovieLens: {len(ml_movies)} movies, {len(ml_ratings)} ratings")

    print("Merging datasets...")
    merged = merge_datasets(tmdb_movies, ml_movies)
    print(f"Merged dataset size: {merged.shape}")

    print("Training collaborative filtering model...")
    cf_model = train_cf_model(ml_ratings)
    print("CF model trained.")



Loading TMDB content data...
Loaded TMDB data with 4803 movies
Loading MovieLens data...
Loaded MovieLens: 9742 movies, 100836 ratings
Merging datasets...
Merged dataset size: (2838, 5)
Training collaborative filtering model...
CF model trained.


In [14]:
# Example usage
user_id = 6
print(f"Hybrid recommendations for user {user_id}:")
recs = hybrid_recommend(user_id, tmdb_movies, cosine_sim, cf_model, ml_ratings, merged, alpha=0.7, top_n=10)
for i, title in enumerate(recs, 1):
    print(f"{i}. {title}")

Hybrid recommendations for user 6:
Favorite movie (movieId 318.0) not found in merged dataset. Falling back to pure collaborative filtering.
1. Star Wars: Episode IV - A New Hope (1977)
2. Star Wars: Episode V - The Empire Strikes Back (1980)
3. Amadeus (1984)
4. Indiana Jones and the Last Crusade (1989)
5. Guess Who's Coming to Dinner (1967)
6. Solaris (Solyaris) (1972)
7. Beautiful Mind, A (2001)
8. City of God (Cidade de Deus) (2002)
9. Prisoners (2013)
10. Guardians of the Galaxy (2014)


In [None]:
!pip uninstall numpy -y
!pip install numpy==1.26.4

In [4]:
# Check if the favorite movie of user 1 from MovieLens exists in the TMDB dataset
fav_movie_id_ml = ml_ratings[ml_ratings['userId'] == user_id].sort_values('rating', ascending=False).iloc[0]['movieId']
fav_movie_title_ml = ml_movies[ml_movies['movieId'] == fav_movie_id_ml]['title'].iloc[0]
fav_movie_clean_title_ml = clean_title(fav_movie_title_ml)

print(f"User 1's favorite movie (MovieLens): {fav_movie_title_ml} (clean: {fav_movie_clean_title_ml})")

# Check if this movie exists in tmdb_movies based on clean title
if fav_movie_clean_title_ml in tmdb_movies['clean_title'].values:
    print(f"'{fav_movie_title_ml}' found in TMDB movies with clean title.")
    # Check if it's in the merged dataset
    if fav_movie_id_ml in merged['movieId'].values:
        print(f"'{fav_movie_title_ml}' found in the merged dataset.")
    else:
        print(f"'{fav_movie_title_ml}' NOT found in the merged dataset by movieId.")
else:
    print(f"'{fav_movie_title_ml}' NOT found in TMDB movies by clean title.")

User 1's favorite movie (MovieLens): M*A*S*H (a.k.a. MASH) (1970) (clean: mash aka mash)
'M*A*S*H (a.k.a. MASH) (1970)' NOT found in TMDB movies by clean title.
