## 1. Exploratory Data Analysis

In [1]:
import ast
import nltk
import pandas as pd
import numpy as np
from nltk.corpus import stopwords

In [2]:
movie_df = pd.read_csv('data/movie.csv')

In [3]:
movie_df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [4]:
rating_df = pd.read_csv('data/rating.csv')

In [5]:
rating_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,2005-04-02 23:53:47
1,1,29,3.5,2005-04-02 23:31:16
2,1,32,3.5,2005-04-02 23:33:39
3,1,47,3.5,2005-04-02 23:32:07
4,1,50,3.5,2005-04-02 23:29:40


In [6]:
tags_df = pd.read_csv('data/tag.csv')

In [7]:
tags_df.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,18,4141,Mark Waters,2009-04-24 18:19:40
1,65,208,dark hero,2013-05-10 01:41:18
2,65,353,dark hero,2013-05-10 01:41:19
3,65,521,noir thriller,2013-05-10 01:39:43
4,65,592,dark hero,2013-05-10 01:41:18


In [8]:
link_df = pd.read_csv('data/link.csv')

In [9]:
link_df.head()

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0


In [10]:
genome_tags_df = pd.read_csv('data/genome_tags.csv')

In [11]:
genome_tags_df.head(10)

Unnamed: 0,tagId,tag
0,1,007
1,2,007 (series)
2,3,18th century
3,4,1920s
4,5,1930s
5,6,1950s
6,7,1960s
7,8,1970s
8,9,1980s
9,10,19th century


In [12]:
genome_tags_df = pd.read_csv('data/genome_scores.csv')

In [13]:
genome_tags_df.head(10)

Unnamed: 0,movieId,tagId,relevance
0,1,1,0.025
1,1,2,0.025
2,1,3,0.05775
3,1,4,0.09675
4,1,5,0.14675
5,1,6,0.217
6,1,7,0.067
7,1,8,0.26275
8,1,9,0.262
9,1,10,0.032


In [14]:
# TITLE ANALYSIS.
movie_df['title'].unique()[:10]

array(['Toy Story (1995)', 'Jumanji (1995)', 'Grumpier Old Men (1995)',
       'Waiting to Exhale (1995)', 'Father of the Bride Part II (1995)',
       'Heat (1995)', 'Sabrina (1995)', 'Tom and Huck (1995)',
       'Sudden Death (1995)', 'GoldenEye (1995)'], dtype=object)

## 2. Data Ingestion And Transformation

In [15]:
df = pd.merge(movie_df, rating_df, on = 'movieId') 

In [16]:
df.head()

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,3,4.0,1999-12-11 13:36:47
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,6,5.0,1997-03-13 17:50:52
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,8,4.0,1996-06-05 13:37:51
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,10,4.0,1999-11-25 02:44:47
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,11,4.5,2009-01-02 01:13:41


In [17]:
df = pd.merge(df, link_df, on = 'movieId')

In [18]:
df = df.dropna(subset=['tmdbId'])  # Remove rows where tmdbId is NaN
df['tmdbId'] = df['tmdbId'].astype(int)

In [19]:
df.head()

Unnamed: 0,movieId,title,genres,userId,rating,timestamp,imdbId,tmdbId
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,3,4.0,1999-12-11 13:36:47,114709,862
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,6,5.0,1997-03-13 17:50:52,114709,862
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,8,4.0,1996-06-05 13:37:51,114709,862
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,10,4.0,1999-11-25 02:44:47,114709,862
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,11,4.5,2009-01-02 01:13:41,114709,862


In [20]:
# DATA CLEANING
df = df.drop('timestamp', axis = 1)
df = df.drop('imdbId', axis = 1)

In [21]:
df.head()

Unnamed: 0,movieId,title,genres,userId,rating,tmdbId
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,3,4.0,862
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,6,5.0,862
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,8,4.0,862
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,10,4.0,862
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,11,4.5,862


In [22]:
# REPLACE '|' WITH ' '
df['genres'] =  df['genres'].apply(lambda x: x.replace("|", " "))

In [23]:
df.head()

Unnamed: 0,movieId,title,genres,userId,rating,tmdbId
0,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy,3,4.0,862
1,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy,6,5.0,862
2,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy,8,4.0,862
3,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy,10,4.0,862
4,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy,11,4.5,862


In [24]:
df['genres'].unique()[:50]

array(['Adventure Animation Children Comedy Fantasy',
       'Adventure Children Fantasy', 'Comedy Romance',
       'Comedy Drama Romance', 'Comedy', 'Action Crime Thriller',
       'Adventure Children', 'Action', 'Action Adventure Thriller',
       'Comedy Horror', 'Adventure Animation Children', 'Drama',
       'Action Adventure Romance', 'Crime Drama', 'Drama Romance',
       'Action Comedy Crime Drama Thriller', 'Comedy Crime Thriller',
       'Crime Drama Horror Mystery Thriller', 'Drama Sci-Fi',
       'Children Drama', 'Adventure Drama Fantasy Mystery Sci-Fi',
       'Mystery Sci-Fi Thriller', 'Adventure Romance IMAX',
       'Documentary IMAX', 'Children Comedy', 'Drama War',
       'Action Crime Drama', 'Action Adventure Fantasy',
       'Comedy Drama Thriller', 'Mystery Thriller',
       'Animation Children Drama Musical Romance',
       'Crime Mystery Thriller', 'Action Drama Thriller',
       'Adventure Drama', 'Drama Mystery', 'Drama Thriller',
       'Comedy Crime', 'Acti

In [25]:
# STORE DATA(AS DATA-FRAME) IN A PICKLE-FILE.
import pickle

# OPEN THE FILE IN WRITE-BINARY MODE.
with open('artifacts/rating_data.pkl', 'wb') as f:
    pickle.dump(df, f)

In [26]:
import ast
# FUNCTION TO EXTRACT GENRES AND KEYWORD NAMES
def extract_name(obj):
    # Convert stringified list to Python object
    if isinstance(obj, str):  # Check if obj is a string
        obj = ast.literal_eval(obj)  # Safely convert string to Python list of dictionaries
    
    # Handle empty lists or None values
    if not obj:  # Check if obj is empty
        return []
    
    # Extract 'name' from each dictionary in the list
    names = [i['name'] for i in obj]
    return names


In [27]:
# PREPARE DATA FOR GENRES.
movies_df = pd.read_csv('data/tmdb_5000_movies.csv') 

movies_df['genres'] = movies_df['genres'].apply(extract_name) 
movies_df = movies_df[['id', 'original_title', 'genres']]

In [28]:
new_df = df[['tmdbId', 'title']]

In [29]:
genres_df = pd.merge(new_df, movies_df, left_on = 'tmdbId', right_on = 'id')

In [30]:
genres_df.head()

Unnamed: 0,tmdbId,title,id,original_title,genres
0,862,Toy Story (1995),862,Toy Story,"[Animation, Comedy, Family]"
1,862,Toy Story (1995),862,Toy Story,"[Animation, Comedy, Family]"
2,862,Toy Story (1995),862,Toy Story,"[Animation, Comedy, Family]"
3,862,Toy Story (1995),862,Toy Story,"[Animation, Comedy, Family]"
4,862,Toy Story (1995),862,Toy Story,"[Animation, Comedy, Family]"


In [31]:
# CONSIDER ONE GENRE FROM LIST OF GENERES.
genres_df['genres'] = genres_df['genres'].apply(lambda x: x[0] if isinstance(x, list) and x else None)

In [32]:
genres_df = genres_df[['id', 'genres', 'original_title']]

In [33]:
genres_df.head()

Unnamed: 0,id,genres,original_title
0,862,Animation,Toy Story
1,862,Animation,Toy Story
2,862,Animation,Toy Story
3,862,Animation,Toy Story
4,862,Animation,Toy Story


In [34]:
genres_df['genres'].unique()

array(['Animation', 'Adventure', 'Comedy', 'History', 'Action', 'Drama',
       'Crime', 'Fantasy', 'Music', 'Horror', 'Thriller', 'Romance',
       'Science Fiction', 'Documentary', 'Family', 'War', 'Western',
       'Mystery', None, 'TV Movie', 'Foreign'], dtype=object)

In [35]:
genres_df['original_title'].unique()

array(['Toy Story', 'GoldenEye', 'The American President', ...,
       'The Gunman', 'Escobar: Paradise Lost', 'Get Hard'],
      shape=(4219,), dtype=object)

In [36]:
# STORE GENRES-DATA
with open('artifacts/genres_data.pkl', 'wb') as f:
    pickle.dump(genres_df, f)

In [37]:
# Load Data as Data-Frames
credits_df = pd.read_csv('data/tmdb_5000_credits.csv')
movies_df = pd.read_csv('data/tmdb_5000_movies.csv')

In [38]:
# CREDITS DATA-FRAME.
credits_df.head()

Unnamed: 0,movie_id,title,cast,crew
0,19995,Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,Pirates of the Caribbean: At World's End,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."
2,206647,Spectre,"[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de..."
3,49026,The Dark Knight Rises,"[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de..."
4,49529,John Carter,"[{""cast_id"": 5, ""character"": ""John Carter"", ""c...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de..."


In [39]:
# MOVIES DATA-FRAME
movies_df.head(1)

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800


In [40]:
# Merge Credits and Movies Data-Frames.
df = movies_df.merge(credits_df, left_on = 'id', right_on = 'movie_id')
df = df[['id', 'original_title', 'genres', 'overview', 'keywords', 'cast', 'crew']]

In [41]:
df.head(2)

Unnamed: 0,id,original_title,genres,overview,keywords,cast,crew
0,19995,Avatar,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","In the 22nd century, a paraplegic Marine is di...","[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...","[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,Pirates of the Caribbean: At World's End,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...","Captain Barbossa, long believed to be dead, ha...","[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...","[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."


In [42]:
df['genres'] = df['genres'].apply(extract_name) 
df['keywords'] = df['keywords'].apply(extract_name)

# Extract 5 Actors from a movie.
df['cast'] = df['cast'].apply(lambda x: [i['name'] for i in ast.literal_eval(x)[:5]])

# Extract only Director-Name in Crew.
df['crew'] = df['crew'].apply(lambda x: [i['name'] for i in  ast.literal_eval(x)[:5] if i['job'] == 'Director'])


In [43]:
df.head()

Unnamed: 0,id,original_title,genres,overview,keywords,cast,crew
0,19995,Avatar,"[Action, Adventure, Fantasy, Science Fiction]","In the 22nd century, a paraplegic Marine is di...","[culture clash, future, space war, space colon...","[Sam Worthington, Zoe Saldana, Sigourney Weave...",[]
1,285,Pirates of the Caribbean: At World's End,"[Adventure, Fantasy, Action]","Captain Barbossa, long believed to be dead, ha...","[ocean, drug abuse, exotic island, east india ...","[Johnny Depp, Orlando Bloom, Keira Knightley, ...",[Gore Verbinski]
2,206647,Spectre,"[Action, Adventure, Crime]",A cryptic message from Bond’s past sends him o...,"[spy, based on novel, secret agent, sequel, mi...","[Daniel Craig, Christoph Waltz, Léa Seydoux, R...",[Sam Mendes]
3,49026,The Dark Knight Rises,"[Action, Crime, Drama, Thriller]",Following the death of District Attorney Harve...,"[dc comics, crime fighter, terrorist, secret i...","[Christian Bale, Michael Caine, Gary Oldman, A...",[Christopher Nolan]
4,49529,John Carter,"[Action, Adventure, Science Fiction]","John Carter is a war-weary, former military ca...","[based on novel, mars, medallion, space travel...","[Taylor Kitsch, Lynn Collins, Samantha Morton,...",[Andrew Stanton]


In [44]:
# COMBINE ALL THE FEATURES, WE NEED FOR CONENT RECOMMENDATION.
df['facts'] = df['genres'] + df['keywords'] + df['cast'] + df['crew']

In [45]:
df.head()

Unnamed: 0,id,original_title,genres,overview,keywords,cast,crew,facts
0,19995,Avatar,"[Action, Adventure, Fantasy, Science Fiction]","In the 22nd century, a paraplegic Marine is di...","[culture clash, future, space war, space colon...","[Sam Worthington, Zoe Saldana, Sigourney Weave...",[],"[Action, Adventure, Fantasy, Science Fiction, ..."
1,285,Pirates of the Caribbean: At World's End,"[Adventure, Fantasy, Action]","Captain Barbossa, long believed to be dead, ha...","[ocean, drug abuse, exotic island, east india ...","[Johnny Depp, Orlando Bloom, Keira Knightley, ...",[Gore Verbinski],"[Adventure, Fantasy, Action, ocean, drug abuse..."
2,206647,Spectre,"[Action, Adventure, Crime]",A cryptic message from Bond’s past sends him o...,"[spy, based on novel, secret agent, sequel, mi...","[Daniel Craig, Christoph Waltz, Léa Seydoux, R...",[Sam Mendes],"[Action, Adventure, Crime, spy, based on novel..."
3,49026,The Dark Knight Rises,"[Action, Crime, Drama, Thriller]",Following the death of District Attorney Harve...,"[dc comics, crime fighter, terrorist, secret i...","[Christian Bale, Michael Caine, Gary Oldman, A...",[Christopher Nolan],"[Action, Crime, Drama, Thriller, dc comics, cr..."
4,49529,John Carter,"[Action, Adventure, Science Fiction]","John Carter is a war-weary, former military ca...","[based on novel, mars, medallion, space travel...","[Taylor Kitsch, Lynn Collins, Samantha Morton,...",[Andrew Stanton],"[Action, Adventure, Science Fiction, based on ..."


In [46]:
# CONSIDER NOT REQUIRED FEATURES.
df = df[['id', 'original_title', 'overview', 'facts']]

In [47]:
df.head(3)

Unnamed: 0,id,original_title,overview,facts
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[Action, Adventure, Fantasy, Science Fiction, ..."
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...","[Adventure, Fantasy, Action, ocean, drug abuse..."
2,206647,Spectre,A cryptic message from Bond’s past sends him o...,"[Action, Adventure, Crime, spy, based on novel..."


In [48]:
# REMOVE COMMAS(,) FOR DATA PREPROCESSING(ENCODING)
df['facts'] = df['facts'].apply(lambda x: ' '.join(x))

# LOWER-CASE ALL THE FEATURES.
df['facts'] = df['facts'].apply(lambda x: x.lower())
df['overview'] = df['overview'].apply(lambda x: str(x).lower() if isinstance(x, str) else '')

# Download stopwords if not already done
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
def remove_stopwords(text):
    # Split the text into words, filter out stopwords, and join back into a string
    return ' '.join([word for word in text.split() if word.lower() not in stop_words])


[nltk_data] Downloading package stopwords to C:\Users\Sohail
[nltk_data]     Mohammed\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [49]:
df.head()

Unnamed: 0,id,original_title,overview,facts
0,19995,Avatar,"in the 22nd century, a paraplegic marine is di...",action adventure fantasy science fiction cultu...
1,285,Pirates of the Caribbean: At World's End,"captain barbossa, long believed to be dead, ha...",adventure fantasy action ocean drug abuse exot...
2,206647,Spectre,a cryptic message from bond’s past sends him o...,action adventure crime spy based on novel secr...
3,49026,The Dark Knight Rises,following the death of district attorney harve...,action crime drama thriller dc comics crime fi...
4,49529,John Carter,"john carter is a war-weary, former military ca...",action adventure science fiction based on nove...


In [50]:
# VECTORIZE THE FACTS, FOR SIMILARITY SEARCH.
from sklearn.feature_extraction.text import TfidfVectorizer # CONVERTS TEXT IN VECTORS BASED ON IMPORTANCE(FREQUENCY) OF THE WORDS(TF-ID) - NEEDED FOR SIMILARY-SEARCH.

tfidf = TfidfVectorizer(stop_words = 'english') # Initialize and remove stop-words.
tfidf_matrix = tfidf.fit_transform(df['facts']) # Learns vocabulary, returns a matrix.

In [51]:
from sklearn.metrics.pairwise import cosine_similarity # TO CALCULATE SIMILARITY BETWEEN TWO VECTORS BASED ON THE ANGLE BETWEEN THEM.

cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix) # CALCULATES SIMILARITY BETWEEN EVERY MOVIE WITH EVERY OTHER MOVIE, AND RETURNS SIMILARITY-MATRIX( BETWEEN ALL MOVIES1 > 0).

In [52]:
def get_recommendations(title, cosine_sim = cosine_sim):
    
    idx = df[df['original_title'].str.lower() == title.lower()].index[0] # GET INDEX(ROW-NUMBER) OF THE MOVIE.
    sim_scores = list(enumerate(cosine_sim[idx])) # GET ALL THE SIMILARITY-SCORES OF A GIVEN MOVIE WITH OTHER MOVIES.
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse = True) # SORT SIMILARITY SCORES. 
    sim_scores = sim_scores[1:11]  # GET TOP 10 MOVIES
    movie_indices = [i[0] for i in sim_scores]
    
    return df['original_title'].iloc[movie_indices] # RETRIVES TITLES OF MOVIES FROM THE DATA-FRAME USING PROVIDED INDEICES OF MOVIES.

In [53]:
# EVALUATION STEP.
print(get_recommendations('avatar'))

838                      Alien³
2403                     Aliens
47      Star Trek Into Darkness
3158                      Alien
278          Planet of the Apes
373             Mission to Mars
4332             Silent Running
1531                  Moonraker
2198                    Lockout
2015                 Spaceballs
Name: original_title, dtype: object


In [54]:
# STORE MODEL(COSINE-SIM) AND DAFAFRAME. 
import pickle # PICKEL IS A WAY TO SAVE DATA OR OBJECTA, SO WE CAN USE THEM LATER WIOTHOUT RECREATING THEM.
with open('artifacts/movie_data.pkl', 'wb') as f: # OPEN THE FILE FILE IN WRITE-BINARY MODE.
    pickle.dump((df, cosine_sim), f) # DUMP DATAFRAME AND COSINE-SIM IN A SINGLE FILE.

In [55]:
history_df = pd.DataFrame(columns=['userId', 'movieId', 'rating', 'timestamp'])

In [56]:
with open('artifacts/history_data.pkl', 'wb') as f:
    pickle.dump(history_df, f)

In [57]:
input_movie = 109451
title = genres_df[genres_df['id'] == input_movie]['original_title'].drop_duplicates().iloc[0]
print(title)

Cloudy with a Chance of Meatballs 2


In [66]:
merge_df = pd.merge(movie_df, rating_df, on='movieId')

In [67]:
merge_df.head()

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,3,4.0,1999-12-11 13:36:47
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,6,5.0,1997-03-13 17:50:52
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,8,4.0,1996-06-05 13:37:51
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,10,4.0,1999-11-25 02:44:47
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,11,4.5,2009-01-02 01:13:41


In [68]:
merge_df = merge_df.drop(['genres', 'timestamp', 'title'], axis=1)

In [69]:
merge_df.head()

Unnamed: 0,movieId,userId,rating
0,1,3,4.0
1,1,6,5.0
2,1,8,4.0
3,1,10,4.0
4,1,11,4.5


In [70]:
pivot_df = df.pivot(index='userId', columns='movieId', values='rating')

KeyError: 'userId'