In [35]:
from google.colab import drive
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [36]:
import numpy as np
import pandas as pd
import ast

import warnings
warnings.filterwarnings('ignore')

In [72]:
from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem.porter import PorterStemmer
from sklearn.metrics.pairwise import cosine_similarity

In [38]:
movies = pd.read_csv("/content/drive/MyDrive/Database/tmdb_5000_movies.csv")
credits = pd.read_csv("/content/drive/MyDrive/Database/tmdb_5000_credits.csv")

In [39]:
movies.head(1)

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800


In [40]:
credits.head()

Unnamed: 0,movie_id,title,cast,crew
0,19995,Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,Pirates of the Caribbean: At World's End,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."
2,206647,Spectre,"[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de..."
3,49026,The Dark Knight Rises,"[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de..."
4,49529,John Carter,"[{""cast_id"": 5, ""character"": ""John Carter"", ""c...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de..."


In [41]:
# merging data frames on the basis of titles
movies = movies.merge(credits, on='title')

In [42]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4809 entries, 0 to 4808
Data columns (total 23 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   budget                4809 non-null   int64  
 1   genres                4809 non-null   object 
 2   homepage              1713 non-null   object 
 3   id                    4809 non-null   int64  
 4   keywords              4809 non-null   object 
 5   original_language     4809 non-null   object 
 6   original_title        4809 non-null   object 
 7   overview              4806 non-null   object 
 8   popularity            4809 non-null   float64
 9   production_companies  4809 non-null   object 
 10  production_countries  4809 non-null   object 
 11  release_date          4808 non-null   object 
 12  revenue               4809 non-null   int64  
 13  runtime               4807 non-null   float64
 14  spoken_languages      4809 non-null   object 
 15  status               

In [43]:
# feature selection for content-based : genres, id, keywords, title, overview, cast, crew
movies = movies[['id', 'title', 'genres', 'keywords', 'overview', 'cast', 'crew']]

In [44]:
movies.isnull().sum()

id          0
title       0
genres      0
keywords    0
overview    3
cast        0
crew        0
dtype: int64

In [45]:
# since overview is a big data & null values are insignificant
movies.dropna(inplace=True)

In [46]:
movies.duplicated().sum()

0

In [47]:
movies.iloc[0].crew

'[{"credit_id": "52fe48009251416c750aca23", "department": "Editing", "gender": 0, "id": 1721, "job": "Editor", "name": "Stephen E. Rivkin"}, {"credit_id": "539c47ecc3a36810e3001f87", "department": "Art", "gender": 2, "id": 496, "job": "Production Design", "name": "Rick Carter"}, {"credit_id": "54491c89c3a3680fb4001cf7", "department": "Sound", "gender": 0, "id": 900, "job": "Sound Designer", "name": "Christopher Boyes"}, {"credit_id": "54491cb70e0a267480001bd0", "department": "Sound", "gender": 0, "id": 900, "job": "Supervising Sound Editor", "name": "Christopher Boyes"}, {"credit_id": "539c4a4cc3a36810c9002101", "department": "Production", "gender": 1, "id": 1262, "job": "Casting", "name": "Mali Finn"}, {"credit_id": "5544ee3b925141499f0008fc", "department": "Sound", "gender": 2, "id": 1729, "job": "Original Music Composer", "name": "James Horner"}, {"credit_id": "52fe48009251416c750ac9c3", "department": "Directing", "gender": 2, "id": 2710, "job": "Director", "name": "James Cameron"},

In [48]:
#helper functions
def fetch_name(obj) :
  li = []
  for i in ast.literal_eval(obj):
    li.append(i['name'])
  return li

def fetch_top5(obj) :
  li = []
  count = 0
  for i in ast.literal_eval(obj):
    if(count == 5): break
    li.append(i['name'])
    count += 1
  return li

def fetch_dir(obj) :
  li = []
  for i in ast.literal_eval(obj):
    if(i['job'] == 'Director'):
      li.append(i['name'])
  return li

In [49]:
movies['genres'] = movies['genres'].apply(fetch_name)
movies['keywords'] = movies['keywords'].apply(fetch_name)
movies['cast'] = movies['cast'].apply(fetch_top5)
movies['crew'] = movies['crew'].apply(fetch_dir)

In [50]:
movies['overview'] = movies['overview'].apply(lambda x:x.split())

In [51]:
movies['genres'] = movies['genres'].apply(lambda x:[i.replace(" ", "") for i in x])
movies['keywords'] = movies['keywords'].apply(lambda x:[i.replace(" ", "") for i in x])
movies['cast'] = movies['cast'].apply(lambda x:[i.replace(" ", "") for i in x])
movies['crew'] = movies['crew'].apply(lambda x:[i.replace(" ", "") for i in x])

In [52]:
movies

Unnamed: 0,id,title,genres,keywords,overview,cast,crew
0,19995,Avatar,"[Action, Adventure, Fantasy, ScienceFiction]","[cultureclash, future, spacewar, spacecolony, ...","[In, the, 22nd, century,, a, paraplegic, Marin...","[SamWorthington, ZoeSaldana, SigourneyWeaver, ...",[JamesCameron]
1,285,Pirates of the Caribbean: At World's End,"[Adventure, Fantasy, Action]","[ocean, drugabuse, exoticisland, eastindiatrad...","[Captain, Barbossa,, long, believed, to, be, d...","[JohnnyDepp, OrlandoBloom, KeiraKnightley, Ste...",[GoreVerbinski]
2,206647,Spectre,"[Action, Adventure, Crime]","[spy, basedonnovel, secretagent, sequel, mi6, ...","[A, cryptic, message, from, Bond’s, past, send...","[DanielCraig, ChristophWaltz, LéaSeydoux, Ralp...",[SamMendes]
3,49026,The Dark Knight Rises,"[Action, Crime, Drama, Thriller]","[dccomics, crimefighter, terrorist, secretiden...","[Following, the, death, of, District, Attorney...","[ChristianBale, MichaelCaine, GaryOldman, Anne...",[ChristopherNolan]
4,49529,John Carter,"[Action, Adventure, ScienceFiction]","[basedonnovel, mars, medallion, spacetravel, p...","[John, Carter, is, a, war-weary,, former, mili...","[TaylorKitsch, LynnCollins, SamanthaMorton, Wi...",[AndrewStanton]
...,...,...,...,...,...,...,...
4804,9367,El Mariachi,"[Action, Crime, Thriller]","[unitedstates–mexicobarrier, legs, arms, paper...","[El, Mariachi, just, wants, to, play, his, gui...","[CarlosGallardo, JaimedeHoyos, PeterMarquardt,...",[RobertRodriguez]
4805,72766,Newlyweds,"[Comedy, Romance]",[],"[A, newlywed, couple's, honeymoon, is, upended...","[EdwardBurns, KerryBishé, MarshaDietlein, Cait...",[EdwardBurns]
4806,231617,"Signed, Sealed, Delivered","[Comedy, Drama, Romance, TVMovie]","[date, loveatfirstsight, narration, investigat...","[""Signed,, Sealed,, Delivered"", introduces, a,...","[EricMabius, KristinBooth, CrystalLowe, GeoffG...",[ScottSmith]
4807,126186,Shanghai Calling,[],[],"[When, ambitious, New, York, attorney, Sam, is...","[DanielHenney, ElizaCoupe, BillPaxton, AlanRuc...",[DanielHsia]


In [53]:
movies['tags'] = movies['genres'] + movies['cast'] + movies['crew'] + movies['overview'] + movies['keywords']

In [54]:
movies = movies[['id', 'title', 'tags']]

In [57]:
ps = PorterStemmer()
def stemming(text):
  li = []
  for i in text:
    li.append(ps.stem(i))
  return li

In [58]:
movies['tags'] = movies['tags'].apply(stemming).apply(lambda x: " ".join(x)).apply(lambda x: x.lower())

In [61]:
cv = CountVectorizer(max_features=5000, stop_words='english')
tag_vector = cv.fit_transform(movies['tags']).toarray()

In [67]:
cv.get_feature_names_out()

array(['000', '007', '10', ..., 'zone', 'zoo', 'zooeydeschanel'],
      dtype=object)

In [73]:
cos_similarity = cosine_similarity(tag_vector)

In [91]:
cos_similarity.shape

(4806, 4806)

In [82]:
def recommendation(movie):
  idx = movies[movies['title'] == movie].index[0]
  dist = enumerate(cos_similarity[idx])
  recc_list = sorted(list(dist), reverse=True, key=lambda x:x[1])[1:6]
  for i in recc_list:
    print(movies.iloc[i[0]].title)

In [86]:
recommendation('The Matrix')

The Matrix Revolutions
The Matrix Reloaded
Hackers
Sleep Dealer
WarGames


In [89]:
# import pickle

# pickle.dump(movies, open('movies.pkl', 'wb'))
# pickle.dump(cos_similarity, open('cos_similarity.pkl', 'wb'))