In [1]:
import ast
import nltk
import pandas as pd
import numpy as np
from nltk.corpus import stopwords

In [2]:
# Load Data as Data-Frames
credits_df = pd.read_csv('data/tmdb_5000_credits.csv')
movies_df = pd.read_csv('data/tmdb_5000_movies.csv')

In [3]:
# CREDITS DATA-FRAME.
credits_df.head()

Unnamed: 0,movie_id,title,cast,crew
0,19995,Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,Pirates of the Caribbean: At World's End,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."
2,206647,Spectre,"[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de..."
3,49026,The Dark Knight Rises,"[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de..."
4,49529,John Carter,"[{""cast_id"": 5, ""character"": ""John Carter"", ""c...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de..."


In [4]:
# MOVIES DATA-FRAME
movies_df.head(1)

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800


In [5]:
# Merge Credits and Movies Data-Frames.
df = movies_df.merge(credits_df, left_on = 'id', right_on = 'movie_id')
df = df[['id', 'original_title', 'genres', 'overview', 'keywords', 'cast', 'crew']]

#### Data Cleaning

In [6]:
# FUNCTION TO EXTRACT GENRES AND KEYWORD NAMES
def extract_name(obj):
    # Convert stringified list to Python object
    if isinstance(obj, str):  # Check if obj is a string
        obj = ast.literal_eval(obj)  # Safely convert string to Python list of dictionaries
    
    # Handle empty lists or None values
    if not obj:  # Check if obj is empty
        return []
    
    # Extract 'name' from each dictionary in the list
    names = [i['name'] for i in obj]
    return names

# NOTE: 
# ast.literal_eval - CONVERTS A STRINGS INTO ACTUAL LIST OF DICTS.

In [7]:
df['genres'] = df['genres'].apply(extract_name) 
df['keywords'] = df['keywords'].apply(extract_name)

# Extract 5 Actors from a movie.
df['cast'] = df['cast'].apply(lambda x: [i['name'] for i in ast.literal_eval(x)[:5]])

# Extract only Director-Name in Crew.
df['crew'] = df['crew'].apply(lambda x: [i['name'] for i in  ast.literal_eval(x)[:5] if i['job'] == 'Director'])

In [8]:
df.head()

Unnamed: 0,id,original_title,genres,overview,keywords,cast,crew
0,19995,Avatar,"[Action, Adventure, Fantasy, Science Fiction]","In the 22nd century, a paraplegic Marine is di...","[culture clash, future, space war, space colon...","[Sam Worthington, Zoe Saldana, Sigourney Weave...",[]
1,285,Pirates of the Caribbean: At World's End,"[Adventure, Fantasy, Action]","Captain Barbossa, long believed to be dead, ha...","[ocean, drug abuse, exotic island, east india ...","[Johnny Depp, Orlando Bloom, Keira Knightley, ...",[Gore Verbinski]
2,206647,Spectre,"[Action, Adventure, Crime]",A cryptic message from Bond’s past sends him o...,"[spy, based on novel, secret agent, sequel, mi...","[Daniel Craig, Christoph Waltz, Léa Seydoux, R...",[Sam Mendes]
3,49026,The Dark Knight Rises,"[Action, Crime, Drama, Thriller]",Following the death of District Attorney Harve...,"[dc comics, crime fighter, terrorist, secret i...","[Christian Bale, Michael Caine, Gary Oldman, A...",[Christopher Nolan]
4,49529,John Carter,"[Action, Adventure, Science Fiction]","John Carter is a war-weary, former military ca...","[based on novel, mars, medallion, space travel...","[Taylor Kitsch, Lynn Collins, Samantha Morton,...",[Andrew Stanton]


In [9]:
# COMBINE ALL THE FEATURES, WE NEED FOR CONENT RECOMMENDATION.
df['facts'] = df['genres'] + df['keywords'] + df['cast'] + df['crew']

In [10]:
df.head()

Unnamed: 0,id,original_title,genres,overview,keywords,cast,crew,facts
0,19995,Avatar,"[Action, Adventure, Fantasy, Science Fiction]","In the 22nd century, a paraplegic Marine is di...","[culture clash, future, space war, space colon...","[Sam Worthington, Zoe Saldana, Sigourney Weave...",[],"[Action, Adventure, Fantasy, Science Fiction, ..."
1,285,Pirates of the Caribbean: At World's End,"[Adventure, Fantasy, Action]","Captain Barbossa, long believed to be dead, ha...","[ocean, drug abuse, exotic island, east india ...","[Johnny Depp, Orlando Bloom, Keira Knightley, ...",[Gore Verbinski],"[Adventure, Fantasy, Action, ocean, drug abuse..."
2,206647,Spectre,"[Action, Adventure, Crime]",A cryptic message from Bond’s past sends him o...,"[spy, based on novel, secret agent, sequel, mi...","[Daniel Craig, Christoph Waltz, Léa Seydoux, R...",[Sam Mendes],"[Action, Adventure, Crime, spy, based on novel..."
3,49026,The Dark Knight Rises,"[Action, Crime, Drama, Thriller]",Following the death of District Attorney Harve...,"[dc comics, crime fighter, terrorist, secret i...","[Christian Bale, Michael Caine, Gary Oldman, A...",[Christopher Nolan],"[Action, Crime, Drama, Thriller, dc comics, cr..."
4,49529,John Carter,"[Action, Adventure, Science Fiction]","John Carter is a war-weary, former military ca...","[based on novel, mars, medallion, space travel...","[Taylor Kitsch, Lynn Collins, Samantha Morton,...",[Andrew Stanton],"[Action, Adventure, Science Fiction, based on ..."


In [11]:
# CONSIDER NOT REQUIRED FEATURES.
df = df[['id', 'original_title', 'overview', 'facts']]

In [12]:
df.head(3)

Unnamed: 0,id,original_title,overview,facts
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[Action, Adventure, Fantasy, Science Fiction, ..."
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...","[Adventure, Fantasy, Action, ocean, drug abuse..."
2,206647,Spectre,A cryptic message from Bond’s past sends him o...,"[Action, Adventure, Crime, spy, based on novel..."


In [13]:
# REMOVE COMMAS(,) FOR DATA PREPROCESSING(ENCODING)
df['facts'] = df['facts'].apply(lambda x: ' '.join(x))

# LOWER-CASE ALL THE FEATURES.
df['facts'] = df['facts'].apply(lambda x: x.lower())
df['overview'] = df['overview'].apply(lambda x: str(x).lower() if isinstance(x, str) else '')

# Download stopwords if not already done
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
def remove_stopwords(text):
    # Split the text into words, filter out stopwords, and join back into a string
    return ' '.join([word for word in text.split() if word.lower() not in stop_words])

[nltk_data] Downloading package stopwords to C:\Users\Sohail
[nltk_data]     Mohammed\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [14]:
df.head()

Unnamed: 0,id,original_title,overview,facts
0,19995,Avatar,"in the 22nd century, a paraplegic marine is di...",action adventure fantasy science fiction cultu...
1,285,Pirates of the Caribbean: At World's End,"captain barbossa, long believed to be dead, ha...",adventure fantasy action ocean drug abuse exot...
2,206647,Spectre,a cryptic message from bond’s past sends him o...,action adventure crime spy based on novel secr...
3,49026,The Dark Knight Rises,following the death of district attorney harve...,action crime drama thriller dc comics crime fi...
4,49529,John Carter,"john carter is a war-weary, former military ca...",action adventure science fiction based on nove...


In [15]:
# VECTORIZE THE FACTS, FOR SIMILARITY SEARCH.
from sklearn.feature_extraction.text import TfidfVectorizer # CONVERTS TEXT IN VECTORS BASED ON IMPORTANCE(FREQUENCY) OF THE WORDS(TF-ID) - NEEDED FOR SIMILARY-SEARCH.

tfidf = TfidfVectorizer(stop_words = 'english') # Initialize and remove stop-words.
tfidf_matrix = tfidf.fit_transform(df['facts']) # Learns vocabulary, returns a matrix.

In [16]:
from sklearn.metrics.pairwise import cosine_similarity # TO CALCULATE SIMILARITY BETWEEN TWO VECTORS BASED ON THE ANGLE BETWEEN THEM.

cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix) # CALCULATES SIMILARITY BETWEEN EVERY MOVIE WITH EVERY OTHER MOVIE, AND RETURNS SIMILARITY-MATRIX( BETWEEN ALL MOVIES1 > 0).

In [17]:
def get_recommendations(title, cosine_sim = cosine_sim):
    
    idx = df[df['original_title'].str.lower() == title.lower()].index[0] # GET INDEX(ROW-NUMBER) OF THE MOVIE.
    sim_scores = list(enumerate(cosine_sim[idx])) # GET ALL THE SIMILARITY-SCORES OF A GIVEN MOVIE WITH OTHER MOVIES.
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse = True) # SORT SIMILARITY SCORES. 
    sim_scores = sim_scores[1:11]  # GET TOP 10 MOVIES
    movie_indices = [i[0] for i in sim_scores]
    
    return df['original_title'].iloc[movie_indices] # RETRIVES TITLES OF MOVIES FROM THE DATA-FRAME USING PROVIDED INDEICES OF MOVIES.


In [18]:
# EVALUATION STEP.
print(get_recommendations('avatar'))

838                      Alien³
2403                     Aliens
47      Star Trek Into Darkness
3158                      Alien
278          Planet of the Apes
373             Mission to Mars
4332             Silent Running
1531                  Moonraker
2198                    Lockout
2015                 Spaceballs
Name: original_title, dtype: object


In [19]:
# STORE MODEL(COSINE-SIM) AND DAFAFRAME. 
import pickle # PICKEL IS A WAY TO SAVE DATA OR OBJECTA, SO WE CAN USE THEM LATER WIOTHOUT RECREATING THEM.
with open('artifacts/content_recommendation_data.pkl', 'wb') as f: # OPEN THE FILE FILE IN WRITE-BINARY MODE.
    pickle.dump((df, cosine_sim), f) # DUMP DATAFRAME AND COSINE-SIM IN A SINGLE FILE.