In [1]:
import pandas as pd


df1 = pd.read_csv('keywords.csv')
df2 = pd.read_csv('credits.csv')
df3 = pd.read_csv('links.csv')
df4 = pd.read_csv('movies_metadata.csv', low_memory=False)

df4 = df4[df4['id'].apply(lambda x: x.isnumeric())]
df4['id'] = df4['id'].astype(int)

df3 = df3[df3['tmdbId'].notnull()]
links = df3['tmdbId'].astype(int)

df4 = df4[df4['id'].isin(links)]

merged = df4.merge(df1, on='id', how='left')
merged = merged.merge(df2, on='id', how='left')

master_dataset = merged

print(master_dataset.shape)
print(master_dataset[['id', 'genres', 'keywords', 'cast']].head())

(46629, 27)
      id                                             genres  \
0    862  [{'id': 16, 'name': 'Animation'}, {'id': 35, '...   
1   8844  [{'id': 12, 'name': 'Adventure'}, {'id': 14, '...   
2  15602  [{'id': 10749, 'name': 'Romance'}, {'id': 35, ...   
3  31357  [{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...   
4  11862                     [{'id': 35, 'name': 'Comedy'}]   

                                            keywords  \
0  [{'id': 931, 'name': 'jealousy'}, {'id': 4290,...   
1  [{'id': 10090, 'name': 'board game'}, {'id': 1...   
2  [{'id': 1495, 'name': 'fishing'}, {'id': 12392...   
3  [{'id': 818, 'name': 'based on novel'}, {'id':...   
4  [{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n...   

                                                cast  
0  [{'cast_id': 14, 'character': 'Woody (voice)',...  
1  [{'cast_id': 1, 'character': 'Alan Parrish', '...  
2  [{'cast_id': 2, 'character': 'Max Goldman', 'c...  
3  [{'cast_id': 1, 'character': "Savannah 'Vannah...

In [2]:
import ast

def get_director(crew_str):
    try:
        crew_list = ast.literal_eval(crew_str)
        for member in crew_list:
            if member.get('job') == 'Director':
                return member.get('name', '')
    except:
        return ''
    return ''

df2['director'] = df2['crew'].apply(get_director)


In [3]:
merged = df4.merge(df1, on='id', how='left')
merged = merged.merge(df2, on='id', how='left')
master_dataset = merged


In [4]:
for col in ['genres', 'keywords', 'cast', 'director']:
    master_dataset[col] = master_dataset[col].fillna("")


In [5]:
print(master_dataset.shape)


(46629, 28)


In [6]:
print(master_dataset.columns)


Index(['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'video',
       'vote_average', 'vote_count', 'keywords', 'cast', 'crew', 'director'],
      dtype='object')


In [7]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
from tabulate import tabulate

In [8]:
for col in ['genres', 'keywords', 'cast', 'director']:
    master_dataset[col] = master_dataset[col].fillna("")


In [9]:
def combine_features(row):
    return f"{row['genres']} {row['keywords']} {row['cast']} {row['director']}"

In [10]:
master_dataset['combined_features'] = master_dataset.apply(combine_features, axis=1)

In [11]:
cv = CountVectorizer()
count_matrix = cv.fit_transform(master_dataset['combined_features'])

In [12]:
cosine_sim = cosine_similarity(count_matrix)

# === STEP 5: Map titles to their index for lookup ===
master_dataset['title_cleaned'] = master_dataset['title'].str.lower().str.strip()
title_index = pd.Series(master_dataset.index, index=master_dataset['title_cleaned'])


In [17]:
def get_recommendations(movie_name, top_n=10):
    movie_name = movie_name.strip().lower()

    if movie_name not in title_index:
        return f"❌ '{movie_name}' not found in the dataset."

    idx = title_index[movie_name]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)[1:top_n+1]

    movie_indices = [i[0] for i in sim_scores]
    recommended = master_dataset.iloc[movie_indices][['title', 'director', 'release_date']].copy()
    recommended.columns = ['Movie Title', 'Director', 'Release Date']
    
    return tabulate(recommended.fillna("Unknown"), headers="keys", tablefmt="psql", showindex=False)

In [18]:
# Clean titles and map to index
master_dataset['title_cleaned'] = master_dataset['title'].str.lower().str.strip()
title_index = pd.Series(master_dataset.index, index=master_dataset['title_cleaned'])


In [27]:
from tabulate import tabulate

def get_recommendations(movie_name, top_n=10):
    movie_name_cleaned = movie_name.lower().strip()
    
    if movie_name_cleaned not in title_index:
        return f"❌ Movie '{movie_name}' not found in the dataset."

    # Step 1: Get index of the movie
    movie_idx = title_index[movie_name_cleaned]

    # Step 2: Get similarity scores for this movie
    sim_scores = list(enumerate(cosine_sim[movie_idx]))

    # Step 3: Sort movies by similarity (excluding itself)
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)[1:top_n+1]

    # Step 4: Get movie indices
    movie_indices = [i[0] for i in sim_scores]

    # Step 5: Fetch relevant movie info
    recommended = master_dataset.iloc[movie_indices][['title', 'director', 'release_date']]

    return tabulate(recommended, headers="keys", tablefmt="psql", showindex=False)


In [28]:
print(get_recommendations("Avatar"))


+--------------------------------+-------------------+----------------+
| title                          | director          | release_date   |
|--------------------------------+-------------------+----------------|
| Django Unchained               | Quentin Tarantino | 2012-12-25     |
| American Gangster              | Ridley Scott      | 2007-11-02     |
| Changeling                     | Clint Eastwood    | 2008-01-30     |
| Spider-Man                     | Sam Raimi         | 2002-05-01     |
| Assassin's Creed               | Justin Kurzel     | 2016-12-21     |
| Logan                          | James Mangold     | 2017-02-28     |
| The Social Network             | David Fincher     | 2010-09-30     |
| Guardians of the Galaxy Vol. 2 | James Gunn        | 2017-04-19     |
| Batman Begins                  | Christopher Nolan | 2005-06-10     |
| The Pianist                    | Roman Polanski    | 2002-09-24     |
+--------------------------------+-------------------+----------

In [19]:
from tabulate import tabulate

def get_recommendations(movie_name, top_n=10):
    movie_name_cleaned = movie_name.lower().strip()

    if movie_name_cleaned not in title_index:
        return f"❌ Movie '{movie_name}' not found in the dataset."

    # Step 1: Get index of the movie
    idx = title_index[movie_name_cleaned]

    # Step 2: Get pairwise similarity scores
    sim_scores = list(enumerate(cosine_sim[idx]))

    # Step 3: Sort movies based on similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Step 4: Get top_n most similar movie indices (skip the movie itself at index 0)
    top_indices = [i[0] for i in sim_scores[1:top_n + 1]]

    # Step 5: Prepare the data
    recommendations = master_dataset.iloc[top_indices][['title', 'director', 'release_date']]

    # Step 6: Format and return as a table
    return tabulate(recommendations, headers="keys", tablefmt="fancy_grid", showindex=False)
