In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import ast
from scipy import stats
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet
from surprise import Reader, Dataset, SVD

import warnings; warnings.simplefilter('ignore')

## Loading the dataset

In [2]:
movies_df=pd.read_csv('movies_metadata.csv')
credits_df=pd.read_csv('credits.csv')
ratings_df=pd.read_csv('ratings_small.csv')
keywords_df=pd.read_csv('keywords.csv')
links_df=pd.read_csv('links_small.csv')

In [3]:
movies_df.head()

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


## Cleaning Data

In [4]:
# Convert 'release_date' to datetime and extract the year
movies_df['release_date'] = pd.to_datetime(movies_df['release_date'], errors='coerce')
movies_df['year'] = movies_df['release_date'].dt.year

In [5]:
# Drop rows with missing 'year'
movies_df = movies_df.dropna(subset=['year'])

In [6]:
movies_df['year']=movies_df['year'].astype(int)

In [7]:
# Ensure the necessary columns are of the correct type
movies_df['vote_count'] = movies_df['vote_count'].astype(int)
movies_df['vote_average'] = movies_df['vote_average'].astype(float)
movies_df['popularity'] = movies_df['popularity'].astype(float)

## Bayesian Recommender System

The **Bayesian Recommender System** is an alternative to IMDB’s weighted rating system, providing a more robust ranking mechanism by factoring in the number of votes each movie receives. This system helps balance out movies with few votes and ensures that movies with a significant number of votes and a high average rating appear in the top recommendations.

### Bayesian Average Formula:

The formula for Bayesian Average (BA) is:

$ \text{Bayesian Average (BA)} = \frac{Rv + Cm}{v + m} $

Where:
- `R`: Average rating of the movie
- `v`: Number of votes for the movie
- `C`: Mean rating across all movies
- `m`: Minimum number of votes required to qualify (calculated dynamically)

In this system, `m` is calculated dynamically as the **95th percentile** of vote counts across all movies for the overall chart and as the **85th percentile** for genre-specific charts.

In [8]:
# Calculate the mean vote across all movies
C = movies_df['vote_average'].mean()

In [9]:
# Calculate the 95th percentile for vote_count
m = movies_df['vote_count'].quantile(0.95)

In [10]:
def bayesian_average(x, m=m, C=C):
    """Calculates bayesian average for each movie"""
    v = x['vote_count']
    R = x['vote_average']
    return (R * v + C * m) / (v + m)

In [11]:
# Filter movies that qualify for the top chart
qualified_movies = movies_df[movies_df['vote_count'] >= m].copy()

In [12]:
# Calculate the Bayesian average rating for each qualified movie
qualified_movies['bayesian_rating'] = qualified_movies.apply(bayesian_average, axis=1)

In [13]:
def top_n_movies(df,n):
    """Get the top 'n' movies that are based off the bayesian ratings"""
    return df.sort_values('bayesian_rating', ascending=False)[['title', 'year', 'vote_count', 'vote_average', 'popularity']].head(n)
    

In [14]:
top_15=top_n_movies(qualified_movies,15)

### Top 15 Movies According to Bayesian Ratings

In [15]:
top_15

Unnamed: 0,title,year,vote_count,vote_average,popularity
314,The Shawshank Redemption,1994,8358,8.5,51.645403
834,The Godfather,1972,6024,8.5,41.109264
12481,The Dark Knight,2008,12269,8.3,123.167259
2843,Fight Club,1999,9678,8.3,63.869599
292,Pulp Fiction,1994,8670,8.3,140.950236
351,Forrest Gump,1994,8147,8.2,48.307194
522,Schindler's List,1993,4436,8.3,41.725123
23673,Whiplash,2014,4376,8.3,64.29999
5481,Spirited Away,2001,3968,8.3,41.048867
1154,The Empire Strikes Back,1980,5998,8.2,19.470959


This matches up with common logic because shawshank is at the top. Understandable.

In [16]:
def convert_genres(genres):
    """Clean genres column from JSON like format to String"""
    try:
        genre_list = ast.literal_eval(genres)  # Safely evaluate string as a list
        genre_names = [genre['name'] for genre in genre_list]  # Extract genre names
        return ', '.join(genre_names)  # Join genre names into a single string
    except (ValueError, SyntaxError):
        return np.nan  # Return NaN if there's an issue with the format

In [17]:
# Clean 'genres' column
movies_df['genre'] = movies_df['genres'].apply(convert_genres)

# Drop rows where genre conversion failed
movies_df = movies_df.dropna(subset=['genre'])

In [18]:
movies_df[['title','genre']].head()

Unnamed: 0,title,genre
0,Toy Story,"Animation, Comedy, Family"
1,Jumanji,"Adventure, Fantasy, Family"
2,Grumpier Old Men,"Romance, Comedy"
3,Waiting to Exhale,"Comedy, Drama, Romance"
4,Father of the Bride Part II,Comedy


In [19]:
def genre_chart_bayesian(genre, percentile=0.85, n= 15):
    """Build Charts for a particular genre using Bayesian rating"""
    genre_movies = movies_df[movies_df['genre'].str.contains(genre, na=False)].copy()
    genre_m = genre_movies['vote_count'].quantile(percentile)
    qualified_genre_movies = genre_movies[genre_movies['vote_count'] >= genre_m]
    qualified_genre_movies['bayesian_rating'] = qualified_genre_movies.apply(bayesian_average, axis=1)
    
    # Sort by Bayesian rating and return top movies in this genre
    return qualified_genre_movies.sort_values('bayesian_rating', ascending=False)[['title', 'year', 'vote_count', 'vote_average', 'popularity']].head(n)

In [20]:
## Get the top 15 Action Movies according to Bayesian Rating Average
top_action_movies = genre_chart_bayesian('Action', percentile=0.85,n=15)
top_action_movies

Unnamed: 0,title,year,vote_count,vote_average,popularity
12481,The Dark Knight,2008,12269,8.3,123.167259
1154,The Empire Strikes Back,1980,5998,8.2,19.470959
15480,Inception,2010,14075,8.1,29.108149
7000,The Lord of the Rings: The Return of the King,2003,8226,8.1,29.324358
256,Star Wars,1977,6778,8.1,42.149697
4863,The Lord of the Rings: The Fellowship of the Ring,2001,8892,8.0,32.070725
5814,The Lord of the Rings: The Two Towers,2002,7641,8.0,29.423537
23753,Guardians of the Galaxy,2014,10014,7.9,53.291601
2458,The Matrix,1999,9079,7.9,33.366332
13605,Inglourious Basterds,2009,6598,7.9,16.89564


The Dark Knight. No surprises here!

## Content-Based Recommender

### Overview and Tagline into consideration

In [21]:
links_df = links_df[links_df['tmdbId'].notnull()]['tmdbId'].astype('int')

In [22]:
len(links_df)

9112

In [23]:
movies_df

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,year,genre
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415,1995,"Animation, Comedy, Family"
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413,1995,"Adventure, Fantasy, Family"
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92,1995,"Romance, Comedy"
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34,1995,"Comedy, Drama, Romance"
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173,1995,Comedy
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45460,False,,0,"[{'id': 18, 'name': 'Drama'}, {'id': 28, 'name...",,30840,tt0102797,en,Robin Hood,"Yet another version of the classic epic, with ...",...,104.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Robin Hood,False,5.7,26,1991,"Drama, Action, Romance"
45462,False,,0,"[{'id': 18, 'name': 'Drama'}]",,111109,tt2028550,tl,Siglo ng Pagluluwal,An artist struggles to finish his work while a...,...,360.0,"[{'iso_639_1': 'tl', 'name': ''}]",Released,,Century of Birthing,False,9.0,3,2011,Drama
45463,False,,0,"[{'id': 28, 'name': 'Action'}, {'id': 18, 'nam...",,67758,tt0303758,en,Betrayal,"When one of her hits goes wrong, a professiona...",...,90.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,A deadly game of wits.,Betrayal,False,3.8,6,2003,"Action, Drama, Thriller"
45464,False,,0,[],,227506,tt0008536,en,Satana likuyushchiy,"In a small town live two brothers, one a minis...",...,87.0,[],Released,,Satan Triumphant,False,0.0,0,1917,


In [24]:
movies_df['id']=movies_df['id'].astype(int)

In [25]:
small_movies_df=movies_df[movies_df['id'].isin(links_df)]

In [26]:
small_movies_df.shape

(9099, 26)

In [27]:
# Fill missing values in overview and tagline with empty strings
small_movies_df['overview'] = small_movies_df['overview'].fillna('')
small_movies_df['tagline'] = small_movies_df['tagline'].fillna('')

In [28]:
# Combine overview and tagline into one column
small_movies_df['overview_tagline'] = small_movies_df['overview'] + ' ' + small_movies_df['tagline']

In [29]:
# Create a TF-IDF Vectorizer to transform text into vectors
tfidf = TfidfVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0,stop_words='english')

In [30]:
# Fit and transform the combined overview and tagline column
tfidf_matrix = tfidf.fit_transform(small_movies_df['overview_tagline'])

In [31]:
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [32]:
small_movies_df = small_movies_df.reset_index()
titles = small_movies_df['title']
#indices = pd.Series(small_movies_df.index, index=small_movies_df['title'])

In [33]:
# Function to get movie recommendations based on overview and tagline similarity
def get_recommendations(title, cosine_sim=cosine_sim):
    # Get the index of the movie that matches the title
    idx = small_movies_df[small_movies_df['title'] == title].index[0]
    
    # Get the pairwise similarity scores of all movies with that movie
    sim_scores = list(enumerate(cosine_sim[idx]))
    
    # Sort the movies based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    
    # Get the scores of the 10 most similar movies
    sim_scores = sim_scores[1:11]  # Exclude the first movie as it will be itself
    
    # Get the movie indices
    movie_indices = [i[0] for i in sim_scores]
    
    # Return the top 10 most similar movies
    return small_movies_df[['title', 'overview', 'tagline']].iloc[movie_indices]

In [34]:
get_recommendations('Sherlock, Jr.')

Unnamed: 0,title,overview,tagline
284,The Shawshank Redemption,Framed in the 1940s for the double murder of h...,Fear can hold you prisoner. Hope can set you f...
8125,The Making of a Legend: Gone with the Wind,This is a documentary that revisits the making...,
2210,The Thomas Crown Affair,A very rich and successful playboy amuses hims...,How do you get the man who has everything?
2519,Stealing Home,"Billy Wyatt (Harmon), a former high school and...","Stealing hearts, stealing laughs, stealing mem..."
589,True Crime,"Mary Giordano is a bright, intelligent student...",Trust no one.
1093,The Mirror Has Two Faces,"Rose Morgan (Barbara Streisand), who still liv...",A story about just how wrong two people can be...
927,Cinema Paradiso,"A filmmaker recalls his childhood, when he fel...","A celebration of youth, friendship, and the ev..."
5808,Police Story,A virtuous Hong Kong police officer must clear...,"You may know the name, but the game has changed."
5656,"Steamboat Bill, Jr.",The just out of college effete son of a no-non...,The Laugh Special of the Age. See It.
8482,"I, Frankenstein","200 years after his shocking creation, Dr. Fra...","In the battle between good and evil, an immort..."


### Taking Cast and Crew into consideration as well

In [35]:
credits_df.head()

Unnamed: 0,cast,crew,id
0,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...",862
1,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...",8844
2,"[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de...",15602
3,"[{'cast_id': 1, 'character': ""Savannah 'Vannah...","[{'credit_id': '52fe44779251416c91011acb', 'de...",31357
4,"[{'cast_id': 1, 'character': 'George Banks', '...","[{'credit_id': '52fe44959251416c75039ed7', 'de...",11862


In [36]:
keywords_df.head()

Unnamed: 0,id,keywords
0,862,"[{'id': 931, 'name': 'jealousy'}, {'id': 4290,..."
1,8844,"[{'id': 10090, 'name': 'board game'}, {'id': 1..."
2,15602,"[{'id': 1495, 'name': 'fishing'}, {'id': 12392..."
3,31357,"[{'id': 818, 'name': 'based on novel'}, {'id':..."
4,11862,"[{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n..."


In [37]:
keywords_df['id'] = keywords_df['id'].astype('int')
credits_df['id'] = credits_df['id'].astype('int')
movies_df['id'] = movies_df['id'].astype('int')

Making the final df with merged crew and keyword information

In [38]:
movies_df=movies_df.merge(keywords_df,on='id')

In [39]:
movies_df=movies_df.merge(credits_df,on='id')

In [40]:
small_movies_df=movies_df[movies_df['id'].isin(links_df)]

In [41]:
small_movies_df.head()

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,tagline,title,video,vote_average,vote_count,year,genre,keywords,cast,crew
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,,Toy Story,False,7.7,5415,1995,"Animation, Comedy, Family","[{'id': 931, 'name': 'jealousy'}, {'id': 4290,...","[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de..."
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413,1995,"Adventure, Fantasy, Family","[{'id': 10090, 'name': 'board game'}, {'id': 1...","[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de..."
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92,1995,"Romance, Comedy","[{'id': 1495, 'name': 'fishing'}, {'id': 12392...","[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de..."
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34,1995,"Comedy, Drama, Romance","[{'id': 818, 'name': 'based on novel'}, {'id':...","[{'cast_id': 1, 'character': ""Savannah 'Vannah...","[{'credit_id': '52fe44779251416c91011acb', 'de..."
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173,1995,Comedy,"[{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n...","[{'cast_id': 1, 'character': 'George Banks', '...","[{'credit_id': '52fe44959251416c75039ed7', 'de..."


In [42]:
small_movies_df['cast'] = small_movies_df['cast'].apply(ast.literal_eval)
small_movies_df['crew'] = small_movies_df['crew'].apply(ast.literal_eval)
small_movies_df['keywords'] = small_movies_df['keywords'].apply(ast.literal_eval)
small_movies_df['cast_size'] = small_movies_df['cast'].apply(lambda x: len(x))
small_movies_df['crew_size'] = small_movies_df['crew'].apply(lambda x: len(x))

In [43]:
small_movies_df.head()

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,video,vote_average,vote_count,year,genre,keywords,cast,crew,cast_size,crew_size
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,False,7.7,5415,1995,"Animation, Comedy, Family","[{'id': 931, 'name': 'jealousy'}, {'id': 4290,...","[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...",13,106
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,False,6.9,2413,1995,"Adventure, Fantasy, Family","[{'id': 10090, 'name': 'board game'}, {'id': 1...","[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...",26,16
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,False,6.5,92,1995,"Romance, Comedy","[{'id': 1495, 'name': 'fishing'}, {'id': 12392...","[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de...",7,4
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,False,6.1,34,1995,"Comedy, Drama, Romance","[{'id': 818, 'name': 'based on novel'}, {'id':...","[{'cast_id': 1, 'character': 'Savannah 'Vannah...","[{'credit_id': '52fe44779251416c91011acb', 'de...",10,10
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,False,5.7,173,1995,Comedy,"[{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n...","[{'cast_id': 1, 'character': 'George Banks', '...","[{'credit_id': '52fe44959251416c75039ed7', 'de...",12,7


In [44]:
def get_director(x):
    """Get Director Info from crew column"""
    for i in x:
        if i['job'] == 'Director':
            return i['name']
    return ''

In [45]:
small_movies_df['director']=small_movies_df['crew'].apply(get_director)

In [46]:
def get_cast(x):
    """Get the top 3 starring cast 
    cast=[]
    for i in x:
        cast.append(i['name'])
    if len(cast)>=3:
        cast=cast[:3]
    return cast

In [47]:
small_movies_df['cast_names'] = small_movies_df['cast'].apply(get_cast)

In [48]:
def get_keywords(x):
    keywords=[]
    for i in x:
        keywords.append(i['name'])
    return keywords

In [49]:
small_movies_df['keywords_info']=small_movies_df['keywords'].apply(get_keywords)

In [50]:
# Function to convert list of dictionaries to string of names (for cast, keywords)
def convert_list_to_string(data_list):
    """Convert List of Objects to String"""
    return ' ,'.join([item for item in data_list]) if isinstance(data_list, list) else ''

In [51]:
# Apply the function to convert lists in 'cast', 'keywords', 'crew'
small_movies_df['cast_names'] = small_movies_df['cast_names'].apply(lambda x: convert_list_to_string(x))  
small_movies_df['keywords_info'] = small_movies_df['keywords_info'].apply(lambda x: convert_list_to_string(x))

In [52]:
small_movies_df.reset_index(inplace=True)

In [53]:
small_movies_df

Unnamed: 0,index,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,...,year,genre,keywords,cast,crew,cast_size,crew_size,director,cast_names,keywords_info
0,0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,...,1995,"Animation, Comedy, Family","[{'id': 931, 'name': 'jealousy'}, {'id': 4290,...","[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...",13,106,John Lasseter,"Tom Hanks ,Tim Allen ,Don Rickles","jealousy ,toy ,boy ,friendship ,friends ,rival..."
1,1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,...,1995,"Adventure, Fantasy, Family","[{'id': 10090, 'name': 'board game'}, {'id': 1...","[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...",26,16,Joe Johnston,"Robin Williams ,Jonathan Hyde ,Kirsten Dunst","board game ,disappearance ,based on children's..."
2,2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,...,1995,"Romance, Comedy","[{'id': 1495, 'name': 'fishing'}, {'id': 12392...","[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de...",7,4,Howard Deutch,"Walter Matthau ,Jack Lemmon ,Ann-Margret","fishing ,best friend ,duringcreditsstinger ,ol..."
3,3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,...,1995,"Comedy, Drama, Romance","[{'id': 818, 'name': 'based on novel'}, {'id':...","[{'cast_id': 1, 'character': 'Savannah 'Vannah...","[{'credit_id': '52fe44779251416c91011acb', 'de...",10,10,Forest Whitaker,"Whitney Houston ,Angela Bassett ,Loretta Devine","based on novel ,interracial relationship ,sing..."
4,4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,...,1995,Comedy,"[{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n...","[{'cast_id': 1, 'character': 'George Banks', '...","[{'credit_id': '52fe44959251416c75039ed7', 'de...",12,7,Charles Shyer,"Steve Martin ,Diane Keaton ,Martin Short","baby ,midlife crisis ,confidence ,aging ,daugh..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9214,40894,False,,8000000,"[{'id': 18, 'name': 'Drama'}]",,159550,tt0255313,en,The Last Brickmaker in America,...,2001,Drama,"[{'id': 6054, 'name': 'friendship'}, {'id': 20...","[{'cast_id': 1, 'character': 'Henry Cobb', 'cr...","[{'credit_id': '544475aac3a36819fb000578', 'de...",7,2,Gregg Champion,"Sidney Poitier ,Wendy Crewson ,Jay O. Sanders","friendship ,brick making"
9215,41114,False,,1000000,"[{'id': 53, 'name': 'Thriller'}, {'id': 10749,...",,392572,tt5165344,hi,रुस्तम,...,2016,"Thriller, Romance","[{'id': 10540, 'name': 'bollywood'}]","[{'cast_id': 0, 'character': 'Rustom Pavri', '...","[{'credit_id': '5951baf692514129c4016600', 'de...",14,16,Tinu Suresh Desai,"Akshay Kumar ,Ileana D'Cruz ,Esha Gupta",bollywood
9216,41167,False,,15050000,"[{'id': 12, 'name': 'Adventure'}, {'id': 18, '...",,402672,tt3859980,hi,Mohenjo Daro,...,2016,"Adventure, Drama, History, Romance","[{'id': 10540, 'name': 'bollywood'}]","[{'cast_id': 0, 'character': 'Sarman', 'credit...","[{'credit_id': '57cd5d3592514179d50018e8', 'de...",12,16,Ashutosh Gowariker,"Hrithik Roshan ,Pooja Hegde ,Kabir Bedi",bollywood
9217,41330,False,,15000000,"[{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...",,315011,tt4262980,ja,シン・ゴジラ,...,2016,"Action, Adventure, Drama, Horror, Science Fiction","[{'id': 1299, 'name': 'monster'}, {'id': 7671,...","[{'cast_id': 4, 'character': 'Rando Yaguchi : ...","[{'credit_id': '560892fa92514177550018b2', 'de...",49,27,Hideaki Anno,"Hiroki Hasegawa ,Yutaka Takenouchi ,Satomi Ish...","monster ,godzilla ,giant monster ,destruction ..."


In [54]:
# Define a function to compute similarity for each feature using CountVectorizer
def get_similarity_matrix(column):
    count_vectorizer = CountVectorizer(stop_words='english')
    count_matrix = count_vectorizer.fit_transform(small_movies_df[column])
    return cosine_similarity(count_matrix, count_matrix)

In [55]:
# Calculate similarity matrices for cast, director, keywords, and genre
cast_sim = get_similarity_matrix('cast_names')
director_sim = get_similarity_matrix('director')
keywords_sim = get_similarity_matrix('keywords_info')
genre_sim = get_similarity_matrix('genre')

In [56]:
# Weights for each feature 
weights = {
    'cast': 0.9,  
    'director': 0.1,  
    'keywords': 0.0,  
    'genre': 0.0  
}


In [57]:
# Compute the final weighted similarity matrix
final_similarity = (weights['cast'] * cast_sim + 
                    weights['director'] * director_sim + 
                    weights['keywords'] * keywords_sim + 
                    weights['genre'] * genre_sim)

In [58]:
def get_weighted_recommendations(title, similarity_matrix=final_similarity):
    """Function to get movie recommendations based on weighted similarity"""
    
    idx = small_movies_df[small_movies_df['title'] == title].index[0]   
    sim_scores = list(enumerate(similarity_matrix[idx]))  
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True) 
    sim_scores = sim_scores[1:11]  
    movie_indices = [i[0] for i in sim_scores]
    return small_movies_df[['title', 'cast_names', 'director', 'keywords_info', 'genre']].iloc[movie_indices]



In [59]:
get_weighted_recommendations('The Dark Knight')

Unnamed: 0,title,cast_names,director,keywords_info,genre
6218,Batman Begins,"Christian Bale ,Michael Caine ,Liam Neeson",Christopher Nolan,"himalaya ,martial arts ,dc comics ,crime fight...","Action, Crime, Drama"
6623,The Prestige,"Hugh Jackman ,Christian Bale ,Michael Caine",Christopher Nolan,"competition ,secret ,obsession ,magic ,dying a...","Drama, Mystery, Thriller"
8031,The Dark Knight Rises,"Christian Bale ,Michael Caine ,Gary Oldman",Christopher Nolan,"dc comics ,crime fighter ,terrorist ,secret id...","Action, Crime, Drama, Thriller"
5405,Educating Rita,"Michael Caine ,Julie Walters ,Michael Williams",Lewis Gilbert,,"Drama, Comedy, Romance"
1661,Newsies,"Christian Bale ,Bill Pullman ,Ann-Margret",Kenny Ortega,"juvenile crime ,child empowerment ,brawl ,boy ...","Drama, Music, Family"
498,True Romance,"Christian Slater ,Patricia Arquette ,Michael R...",Tony Scott,"father son relationship ,film producer ,mexica...","Action, Thriller, Crime, Romance"
1073,The Man Who Would Be King,"Sean Connery ,Michael Caine ,Christopher Plummer",John Huston,"robbery ,journalist ,gold ,cheating ,treasure ...","Adventure, Drama"
1660,The Muppet Christmas Carol,"Michael Caine ,Don Austen ,Meredith Braun",Brian Henson,"holiday ,future ,musical ,past ,scrooge ,chris...","Comedy, Family, Fantasy, Drama"
1802,Blame It on Rio,"Michael Caine ,Michelle Johnson ,Joseph Bologna",Stanley Donen,"female nudity ,infidelity ,rio de janeiro ,sed...","Comedy, Romance"
1877,Mona Lisa,"Bob Hoskins ,Cathy Tyson ,Michael Caine",Neil Jordan,"london england ,prostitute ,ex-detainee ,chauf...","Drama, Crime, Romance"
