In [2]:
!pip install pandas numpy


Defaulting to user installation because normal site-packages is not writeable
Collecting pandas
  Using cached pandas-2.3.3-cp311-cp311-win_amd64.whl.metadata (19 kB)
Collecting pytz>=2020.1 (from pandas)
  Downloading pytz-2025.2-py2.py3-none-any.whl.metadata (22 kB)
Downloading pandas-2.3.3-cp311-cp311-win_amd64.whl (11.3 MB)
   ---------------------------------------- 0.0/11.3 MB ? eta -:--:--
   --- ------------------------------------ 1.0/11.3 MB 10.1 MB/s eta 0:00:02
   ------------- -------------------------- 3.9/11.3 MB 14.7 MB/s eta 0:00:01
   ----------------- ---------------------- 5.0/11.3 MB 10.1 MB/s eta 0:00:01
   ------------------- -------------------- 5.5/11.3 MB 8.6 MB/s eta 0:00:01
   ---------------------- ----------------- 6.3/11.3 MB 6.7 MB/s eta 0:00:01
   -------------------------- ------------- 7.6/11.3 MB 6.5 MB/s eta 0:00:01
   ------------------------------- -------- 8.9/11.3 MB 6.4 MB/s eta 0:00:01
   ---------------------------------- ----- 9.7/11.3 MB 6.

In [None]:
import pandas as pd
import numpy as np
import ast
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity


# Load the two TMDB CSV exports from the current working directory.
movies = pd.read_csv("tmdb_5000_movies.csv")
credits = pd.read_csv("tmdb_5000_credits.csv")

# Attach the credits columns to each movie row by matching movies.id with
# credits.movie_id (pandas' default inner join).
movies = movies.merge(
    right=credits,
    left_on='id',
    right_on='movie_id',
)

def convert(obj):
    """Extract the ``'name'`` of every entry in a stringified list of dicts.

    Args:
        obj: A string holding a Python literal such as
            ``'[{"id": 28, "name": "Action"}]'``. Non-string values
            (for example NaN from a missing cell) are tolerated.

    Returns:
        A list with the ``'name'`` value of each entry, in order. An empty
        list is returned when ``obj`` cannot be parsed, is not iterable, or
        any entry lacks a ``'name'`` key.
    """
    try:
        return [item['name'] for item in ast.literal_eval(obj)]
    except (ValueError, SyntaxError, TypeError, KeyError):
        # Only malformed or unexpected data is treated as "no names";
        # anything else (e.g. KeyboardInterrupt) is allowed to propagate.
        return []

def get_director(obj):
    """Extract the names of crew entries whose ``'job'`` is ``'Director'``.

    Args:
        obj: A string holding a Python literal list of dicts, each expected
            to carry ``'job'`` and ``'name'`` keys. Non-string values
            (for example NaN from a missing cell) are tolerated.

    Returns:
        A list of director names, in order of appearance. An empty list is
        returned when ``obj`` cannot be parsed, is not iterable, or any entry
        lacks a required key.
    """
    try:
        return [
            item['name']
            for item in ast.literal_eval(obj)
            if item['job'] == 'Director'
        ]
    except (ValueError, SyntaxError, TypeError, KeyError):
        # Only malformed or unexpected data is treated as "no director";
        # anything else (e.g. KeyboardInterrupt) is allowed to propagate.
        return []


# Decode the stringified list columns into plain lists of names
# (crew is reduced to the names whose job is 'Director').
for _col, _decode in (
    ('genres', convert),
    ('keywords', convert),
    ('cast', convert),
    ('crew', get_director),
):
    movies[_col] = movies[_col].apply(_decode)

# Keep only the first three cast names of each movie.
movies['cast'] = movies['cast'].apply(lambda names: names[:3])

# Remove the spaces inside every name so each name becomes a single token.
for _col in ('genres', 'keywords', 'cast', 'crew'):
    movies[_col] = movies[_col].apply(
        lambda names: [name.replace(" ", "") for name in names]
    )


# Missing overviews become "" so the concatenation below always yields text.
movies['overview'] = movies['overview'].fillna("")

# One text blob per movie: the overview followed by the space-joined
# genres, keywords, cast and crew names, in that order.
_tags = movies['overview']
for _col in ('genres', 'keywords', 'cast', 'crew'):
    _tags = _tags + " " + movies[_col].apply(lambda names: " ".join(names))
movies['tags'] = _tags

# Slim frame used by the recommenders; 'title_x' is exposed as 'title'.
new_df = movies[['id', 'title_x', 'tags']].rename(columns={"title_x": "title"})


# Bag-of-words counts over the tags (English stop words removed,
# vocabulary capped at 5000 terms), materialised as a dense array.
cv = CountVectorizer(stop_words='english', max_features=5000)
_tag_counts = cv.fit_transform(new_df['tags'])
vectors = _tag_counts.toarray()

# Pairwise cosine similarity between every pair of movie rows.
similarity = cosine_similarity(vectors)

def content_recommend(movie_name, top_n=5):
    """Print the titles whose tags are most similar to ``movie_name``.

    Reads the module-level ``new_df`` (titles) and ``similarity`` (pairwise
    similarity matrix); both are expected to share the same row order.

    Args:
        movie_name: Exact title to look up in ``new_df['title']``.
        top_n: Maximum number of recommendations to print (default 5).

    Returns:
        ``None`` after printing the recommendations, or the string
        ``"Movie '<movie_name>' not found."`` (which is also printed) when
        no title matches.
    """
    titles = new_df['title'].tolist()
    try:
        # Row position of the first match. `similarity` is addressed by
        # position, so an index label must not be used here.
        idx = titles.index(movie_name)
    except ValueError:
        message = f"Movie '{movie_name}' not found."
        # Printed as well as returned: callers such as hybrid_recommend
        # discard the return value, which would otherwise hide the message.
        print(message)
        return message

    ranked = sorted(
        enumerate(similarity[idx]),
        key=lambda pair: pair[1],
        reverse=True,
    )
    # Exclude the query movie explicitly; it is not guaranteed to sort first
    # (ties, or an all-zero tag vector whose self-similarity is 0).
    picks = [pos for pos, _score in ranked if pos != idx][:top_n]

    print(f"\nContent-Based Recommendations for '{movie_name}':\n")
    for pos in picks:
        print(titles[pos])


# Fixed seed so the synthetic ratings are reproducible.
np.random.seed(42)
user_ids = [f"User_{i}" for i in range(1, 51)]
movie_ids = new_df['id'].tolist()

# Each (user, movie) pair receives a rating with probability 0.03; the rating
# is an integer from 1 to 5. For every pair the `if` draw is evaluated first
# and the rating draw only happens when the pair is kept.
ratings_data = [
    [user, movie, np.random.randint(1, 6)]
    for user in user_ids
    for movie in movie_ids
    if np.random.rand() < 0.03
]

ratings = pd.DataFrame(ratings_data, columns=['user', 'movie_id', 'rating'])

# Users x movies table of ratings; cells without a rating are set to 0.
rating_matrix = ratings.pivot_table(
    index='user',
    columns='movie_id',
    values='rating',
).fillna(0)


# User-to-user cosine similarity over the rating vectors.
collab_sim = cosine_similarity(rating_matrix)

# Label rows/columns with rating_matrix's own index: pivot_table sorts the
# user labels (User_1, User_10, User_11, ...), which is NOT the order of
# `user_ids` (User_1, User_2, ...). Labelling with `user_ids` would attach
# each similarity row to the wrong user, and would fail outright if some
# user had no ratings and was therefore absent from rating_matrix.
collab_sim_df = pd.DataFrame(
    collab_sim,
    index=rating_matrix.index,
    columns=rating_matrix.index,
)


def collaborative_recommend(user, top_n=5, n_neighbours=5):
    """Print movies liked by the users most similar to ``user``.

    Reads the module-level ``rating_matrix`` (users x movie ids, 0 meaning
    "not rated"), ``collab_sim_df`` (user-to-user similarity) and ``new_df``
    (movie id to title lookup).

    Args:
        user: Label of the user to recommend for.
        top_n: Maximum number of movies to print (default 5).
        n_neighbours: Number of most-similar users consulted (default 5).

    Returns:
        ``None`` after printing the recommendations, or the string
        ``"Unknown user."`` (which is also printed) when ``user`` is not a
        row of ``rating_matrix``.
    """
    if user not in rating_matrix.index:
        message = "Unknown user."
        # Printed as well as returned: callers such as hybrid_recommend
        # discard the return value, which would otherwise hide the message.
        print(message)
        return message

    # Remove the user explicitly instead of assuming they sort first.
    scores = collab_sim_df[user]
    scores = scores[scores.index != user]
    similar_users = scores.sort_values(ascending=False).index[:n_neighbours]

    # Movies the user has already rated are never recommended back to them.
    own_ratings = rating_matrix.loc[user]
    seen = set(own_ratings[own_ratings > 0].index)

    # Collect liked movies (rating above 3) in a deterministic order:
    # most similar neighbour first, then rating_matrix column order.
    recommendations = []
    for neighbour in similar_users:
        neighbour_ratings = rating_matrix.loc[neighbour]
        for movie_id in neighbour_ratings[neighbour_ratings > 3].index:
            if movie_id not in seen:
                seen.add(movie_id)
                recommendations.append(movie_id)
    recommendations = recommendations[:top_n]

    print(f"\nCollaborative Filtering Recommendations for {user}:\n")
    for movie_id in recommendations:
        title = new_df.loc[new_df['id'] == movie_id, 'title'].iloc[0]
        print(title)


def hybrid_recommend(movie_name, user):
    """Print a banner, then the content-based and collaborative lists.

    Args:
        movie_name: Title passed to ``content_recommend``.
        user: User label passed to ``collaborative_recommend``.

    Returns:
        None. The return values of both recommenders are ignored.
    """
    banner = "\n===== HYBRID RECOMMENDATION ====="
    print(banner)
    # Content-based first, collaborative second.
    for recommender, subject in (
        (content_recommend, movie_name),
        (collaborative_recommend, user),
    ):
        recommender(subject)


# Demo run: print both recommendation lists for the movie "Avatar" and the
# synthetic user "User_10".
hybrid_recommend("Avatar", "User_10")
