In [4]:
import json
from ast import literal_eval

import pandas as pd

# Load the two CSV inputs; the paths are relative to the notebook's working directory.
movies = pd.read_csv("Data/tmdb_5000_movies.csv")
credits = pd.read_csv("Data/tmdb_5000_credits.csv")
# Sanity check: (rows, columns) of each frame.
print(movies.shape, credits.shape)
# Last expression of the cell, so the first rows of `movies` are rendered as output.
movies.head()


(4803, 20) (4803, 4)


Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800
1,300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",http://disney.go.com/disneypictures/pirates/,285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2007-05-19,961000000,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500
2,245000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.sonypictures.com/movies/spectre/,206647,"[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...",en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,"[{""name"": ""Columbia Pictures"", ""id"": 5}, {""nam...","[{""iso_3166_1"": ""GB"", ""name"": ""United Kingdom""...",2015-10-26,880674609,148.0,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,A Plan No One Escapes,Spectre,6.3,4466
3,250000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...",http://www.thedarkknightrises.com/,49026,"[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853,...",en,The Dark Knight Rises,Following the death of District Attorney Harve...,112.31295,"[{""name"": ""Legendary Pictures"", ""id"": 923}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2012-07-16,1084939099,165.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,The Legend Ends,The Dark Knight Rises,7.6,9106
4,260000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://movies.disney.com/john-carter,49529,"[{""id"": 818, ""name"": ""based on novel""}, {""id"":...",en,John Carter,"John Carter is a war-weary, former military ca...",43.926995,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}]","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2012-03-07,284139100,132.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"Lost in our world, found in another.",John Carter,6.1,2124


In [5]:
def parse_column(x):
    """Decode one stringified list-of-dicts cell into Python objects.

    Parameters
    ----------
    x : str or list
        The raw cell value. A value that is already a list is returned
        unchanged, so re-running the parsing cell does not wipe the column.

    Returns
    -------
    object
        The decoded value (normally a list of dicts), or ``[]`` when the
        value cannot be decoded either as a Python literal or as JSON.
    """
    # Already parsed (e.g. the cell was executed twice): keep the data.
    if isinstance(x, list):
        return x
    try:
        return literal_eval(x)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # Not a Python literal; JSON spellings such as null/true/false land here.
        pass
    try:
        return json.loads(x)
    except (ValueError, TypeError, RecursionError):
        # Best effort: an undecodable cell is treated as "no entries".
        return []

# Decode the stringified list columns. Values that are already lists are kept
# as they are, so executing this cell a second time does not erase the data.
for col in ['genres', 'keywords']:
    movies[col] = movies[col].fillna('[]').apply(
        lambda v: v if isinstance(v, list) else parse_column(v)
    )

for col in ['cast', 'crew']:
    credits[col] = credits[col].fillna('[]').apply(
        lambda v: v if isinstance(v, list) else parse_column(v)
    )

# Keep one credits row per movie id so the left merge cannot multiply movie rows.
credits_subset = (
    credits[['movie_id', 'cast', 'crew']]
    .rename(columns={'movie_id': 'id'})
    .drop_duplicates(subset='id')
)

# Drop any cast/crew columns left over from a previous run of this cell;
# otherwise the merge would produce cast_x/cast_y instead of cast/crew.
movies = (
    movies
    .drop(columns=['cast', 'crew'], errors='ignore')
    .merge(credits_subset, on='id', how='left')
)


In [6]:
def get_top_cast(cast_list, top_n=3):
    """Return the names found among the first ``top_n`` cast entries.

    Only the leading ``top_n`` entries are inspected; entries in that window
    without a 'name' key are skipped, so fewer than ``top_n`` names may be
    returned.
    """
    names = []
    for member in cast_list[:top_n]:
        if 'name' in member:
            names.append(member['name'])
    return names

def get_director(crew_list):
    """Return the name of the first credited director, or '' if there is none.

    Parameters
    ----------
    crew_list : list of dict
        Crew entries; each is read through its 'job' and 'name' keys.

    Returns
    -------
    str
        The first non-empty 'name' of an entry whose 'job' is 'Director'.
        Always a string: a director entry without a name is skipped rather
        than yielding ``None``, which would later be stringified into a
        bogus "none" token.
    """
    for member in crew_list:
        if member.get('job') == 'Director' and member.get('name'):
            return member['name']
    return ''

# Derive plain-name columns from the parsed metadata. A cell that is not a
# list (for example a missing value) falls back to an empty list / empty string.
movies['cast_names'] = movies['cast'].apply(
    lambda cast: [] if not isinstance(cast, list) else get_top_cast(cast, 3)
)
movies['director'] = movies['crew'].apply(
    lambda crew: '' if not isinstance(crew, list) else get_director(crew)
)
movies['genre_names'] = movies['genres'].apply(
    lambda genres: [] if not isinstance(genres, list) else [entry['name'] for entry in genres]
)
movies['keyword_names'] = movies['keywords'].apply(
    lambda keywords: [] if not isinstance(keywords, list) else [entry['name'] for entry in keywords]
)


In [7]:
def clean_list(x):
    """Lower-case a value and strip its spaces so each name becomes one token.

    A list yields a list of normalised strings (items are passed through
    ``str`` first), a string yields a single normalised string, and any
    other input yields an empty list.
    """
    def _squash(value):
        # Lower-case and remove every space character.
        return str(value).lower().replace(" ", "")

    if isinstance(x, str):
        return _squash(x)
    if isinstance(x, list):
        return [_squash(item) for item in x]
    return []

# Normalised copies of each feature: lower-cased, with spaces removed from
# names so that a multi-word name becomes a single token.
movies['cast_clean'] = movies['cast_names'].apply(clean_list)
movies['director_clean'] = movies['director'].apply(
    lambda name: str(name).lower().replace(" ", "")
)
movies['genres_clean'] = movies['genre_names'].apply(clean_list)
movies['keywords_clean'] = movies['keyword_names'].apply(clean_list)
# Overviews keep their spaces; a missing overview becomes the empty string.
movies['overview_clean'] = movies['overview'].fillna('').apply(
    lambda text: text.lower()
)

def make_soup(x):
    """Concatenate one row's cleaned features into a single text string.

    The parts are joined with single spaces in this order: overview (with
    its whitespace runs collapsed), genres, cast, director, keywords. An
    empty part still contributes its separator.
    """
    parts = [
        " ".join(x['overview_clean'].split()),
        " ".join(x['genres_clean']),
        " ".join(x['cast_clean']),
        x['director_clean'],
        " ".join(x['keywords_clean']),
    ]
    return " ".join(parts)

# Build the per-movie text document by applying make_soup to every row (axis=1).
movies['soup'] = movies.apply(make_soup, axis=1)


In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

# Vectorise each movie's soup with TF-IDF, dropping English stop words and
# keeping at most 10000 vocabulary terms.
tfidf = TfidfVectorizer(stop_words='english', max_features=10000)
tfidf_matrix = tfidf.fit_transform(movies['soup'])

# Pairwise dot products between all movie vectors. With the vectoriser's
# default L2 normalisation these are cosine similarities.
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

# Lower-cased title -> row position in `movies` / `cosine_sim`.
# Series.drop_duplicates() compares the *values* (unique row positions) and so
# never removed repeated titles; deduplicate on the index labels instead,
# keeping the first movie for each title, so a lookup always yields one row.
indices = pd.Series(range(len(movies)), index=movies['title'].str.lower())
indices = indices[~indices.index.duplicated(keep='first')]


In [9]:
def recommend(title, top_n=10):
    """Return the movies most similar to ``title``.

    Relies on the notebook-level ``indices`` (lower-cased title -> row
    position), ``cosine_sim`` (pairwise similarity matrix) and ``movies``.

    Parameters
    ----------
    title : str
        Movie title; matched case-insensitively.
    top_n : int, default 10
        Maximum number of recommendations; values below 1 give an empty frame.

    Returns
    -------
    pandas.DataFrame or str
        A frame with title, vote_average, vote_count, release_date and
        similarity, ordered from most to least similar, or the message
        "Movie '<title>' not found." when the title is unknown.
    """
    title_lower = title.lower()
    if title_lower not in indices:
        return f"Movie '{title}' not found."
    idx = indices[title_lower]
    # A title shared by several movies makes the lookup return a Series;
    # use the first match instead of failing further down.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]
    idx = int(idx)
    # Exclude the query movie explicitly. Dropping whatever sorts first is
    # wrong on ties (or an all-zero row), where the query itself may not
    # be in first place.
    sim_scores = [
        (i, score) for i, score in enumerate(cosine_sim[idx]) if i != idx
    ]
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)[:max(top_n, 0)]
    movie_indices = [i for i, _ in sim_scores]
    results = movies[['title','vote_average','vote_count','release_date']].iloc[movie_indices]
    results = results.assign(similarity=[score for _, score in sim_scores])
    return results.reset_index(drop=True)


In [10]:
# Example query: the five movies most similar to "Avatar".
recommend("Avatar", 5)


Unnamed: 0,title,vote_average,vote_count,release_date,similarity
0,Aliens,7.7,3220,1986-07-18,0.178458
1,Falcon Rising,5.5,71,2014-09-05,0.157927
2,Battle: Los Angeles,5.5,1448,2011-03-08,0.153057
3,Apollo 18,5.0,356,2011-07-20,0.144617
4,Titan A.E.,6.3,313,2000-06-16,0.13971
