In [26]:
from google.colab import drive
import os
import pandas as pd
import numpy as np
import ast
import gensim
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [27]:
from sklearn.metrics.pairwise import cosine_similarity

In [28]:
ll = os.listdir('/content/drive/My Drive/datasets/')
movies = pd.read_csv('/content/drive/My Drive/datasets/tmdb_5000_movies.csv')
credits = pd.read_csv('/content/drive/My Drive/datasets/tmdb_5000_credits.csv')

In [29]:
movies.shape

(4803, 20)

In [30]:
movies.columns

Index(['budget', 'genres', 'homepage', 'id', 'keywords', 'original_language',
       'original_title', 'overview', 'popularity', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'vote_average',
       'vote_count'],
      dtype='object')

In [31]:
np.intersect1d(movies.columns, credits.columns)


array(['title'], dtype=object)

In [32]:
movies = pd.merge(movies, credits, on = "title")

In [33]:
movies.shape

(4809, 23)

In [34]:
def convert(obj):
  tempList = []
  for item in ast.literal_eval(obj):
    tempList.append(item['name'])
  return tempList

In [35]:

movies = movies.reindex(columns = ['genres', 'id', 'overview', 'title','cast', 'crew', 'keywords'])

In [36]:
movies.dropna(inplace = True)

In [37]:
movies['genres'] = movies['genres'].apply(convert)

In [38]:
movies.columns

Index(['genres', 'id', 'overview', 'title', 'cast', 'crew', 'keywords'], dtype='object')

In [39]:
def convertCast(obj, columnName):
  tempList = []
  for item in ast.literal_eval(obj):
    tempList.append(item[columnName])
    if len(tempList) == 3:
      break
  return tempList

In [40]:
def getDirector(obj):
  for item in ast.literal_eval(obj):
    if(item['job'] == 'Director'):
      return item['name']

In [41]:
movies['cast'] = movies['cast'].apply(convertCast,args = ('name',))

In [42]:
movies.head(1)

Unnamed: 0,genres,id,overview,title,cast,crew,keywords
0,"[Action, Adventure, Fantasy, Science Fiction]",19995,"In the 22nd century, a paraplegic Marine is di...",Avatar,"[Sam Worthington, Zoe Saldana, Sigourney Weaver]","[{""credit_id"": ""52fe48009251416c750aca23"", ""de...","[{""id"": 1463, ""name"": ""culture clash""}, {""id"":..."


In [43]:
movies['crew'] = movies['crew'].apply(getDirector)

In [44]:

movies['keywords'] = movies['keywords'].apply(convertCast, args = ('name',))

In [45]:
movies.head(3)

Unnamed: 0,genres,id,overview,title,cast,crew,keywords
0,"[Action, Adventure, Fantasy, Science Fiction]",19995,"In the 22nd century, a paraplegic Marine is di...",Avatar,"[Sam Worthington, Zoe Saldana, Sigourney Weaver]",James Cameron,"[culture clash, future, space war]"
1,"[Adventure, Fantasy, Action]",285,"Captain Barbossa, long believed to be dead, ha...",Pirates of the Caribbean: At World's End,"[Johnny Depp, Orlando Bloom, Keira Knightley]",Gore Verbinski,"[ocean, drug abuse, exotic island]"
2,"[Action, Adventure, Crime]",206647,A cryptic message from Bond’s past sends him o...,Spectre,"[Daniel Craig, Christoph Waltz, Léa Seydoux]",Sam Mendes,"[spy, based on novel, secret agent]"


In [46]:
def removeSpace(obj):
  tempList = []
  for item in obj:
    tempList.append(item.replace(" ",""))
  return tempList


In [47]:

movies['cast'] = movies['cast'].apply(removeSpace)
movies['crew'] = movies['crew'].str.replace(" ","")
movies.head(2)

Unnamed: 0,genres,id,overview,title,cast,crew,keywords
0,"[Action, Adventure, Fantasy, Science Fiction]",19995,"In the 22nd century, a paraplegic Marine is di...",Avatar,"[SamWorthington, ZoeSaldana, SigourneyWeaver]",JamesCameron,"[culture clash, future, space war]"
1,"[Adventure, Fantasy, Action]",285,"Captain Barbossa, long believed to be dead, ha...",Pirates of the Caribbean: At World's End,"[JohnnyDepp, OrlandoBloom, KeiraKnightley]",GoreVerbinski,"[ocean, drug abuse, exotic island]"


In [48]:
movies['genres'] = movies['genres'].apply(lambda x : ' '.join(x))
movies['cast'] = movies['cast'].apply(lambda x : ' '.join(x))
movies['keywords'] = movies['keywords'].apply(lambda x : ' '.join(x))

In [49]:
movies['tags'] = movies['overview'] + movies['genres'] + movies['keywords'] + movies['cast'] + movies['crew']

In [50]:
movies.head(1)

Unnamed: 0,genres,id,overview,title,cast,crew,keywords,tags
0,Action Adventure Fantasy Science Fiction,19995,"In the 22nd century, a paraplegic Marine is di...",Avatar,SamWorthington ZoeSaldana SigourneyWeaver,JamesCameron,culture clash future space war,"In the 22nd century, a paraplegic Marine is di..."


In [51]:
movies = movies.drop(columns = ['genres', 'overview', 'cast', 'crew', 'keywords'] , axis = 1)

In [52]:
movies.drop_duplicates(inplace = True)

In [53]:
for x in movies.index:
  if type(movies.loc[x , 'tags']) != str:
    movies.drop(x, inplace = True)

In [54]:
movies['tags'] = movies['tags'].apply(lambda x : x.lower())
movies['tags'].head(3)

0    in the 22nd century, a paraplegic marine is di...
1    captain barbossa, long believed to be dead, ha...
2    a cryptic message from bond’s past sends him o...
Name: tags, dtype: object

In [55]:
# movies['tags'] = movies['tags'].apply(lambda x : word_tokenize(x))
# movies.head(2)

In [56]:

def stem(text):
  tokens = text.split()
  stemmed_tokens = [ps.stem(tk) for tk in tokens]
  return ' '.join(stemmed_tokens)

In [57]:
ps = PorterStemmer()
movies['tags'] = movies['tags'].apply(stem)
movies.head(2)

Unnamed: 0,id,title,tags
0,19995,Avatar,"in the 22nd century, a parapleg marin is dispa..."
1,285,Pirates of the Caribbean: At World's End,"captain barbossa, long believ to be dead, ha c..."


In [58]:
movies.head()

Unnamed: 0,id,title,tags
0,19995,Avatar,"in the 22nd century, a parapleg marin is dispa..."
1,285,Pirates of the Caribbean: At World's End,"captain barbossa, long believ to be dead, ha c..."
2,206647,Spectre,a cryptic messag from bond’ past send him on a...
3,49026,The Dark Knight Rises,follow the death of district attorney harvey d...
4,49529,John Carter,"john carter is a war-weary, former militari ca..."


In [59]:
cv = CountVectorizer(max_features = 5000, stop_words = 'english')
vector = cv.fit_transform(movies['tags']).toarray()

In [60]:
cv.get_feature_names_out()
vector.shape

(4776, 5000)

In [61]:
similiarity = cosine_similarity(vector)

In [62]:
similiarity.shape

(4776, 4776)

In [63]:
similiarity[1]

array([0.0978232 , 1.        , 0.0433555 , ..., 0.03346372, 0.        ,
       0.03771571])

In [64]:
movies.head(4)

Unnamed: 0,id,title,tags
0,19995,Avatar,"in the 22nd century, a parapleg marin is dispa..."
1,285,Pirates of the Caribbean: At World's End,"captain barbossa, long believ to be dead, ha c..."
2,206647,Spectre,a cryptic messag from bond’ past send him on a...
3,49026,The Dark Knight Rises,follow the death of district attorney harvey d...


In [65]:
index = movies.index[movies['title'] == 'Batman Begins'][0]
enum = list(enumerate(similiarity[index]))
enum = sorted(enum , key = lambda x : x[1] , reverse = True)
enum = enum[1:7]
for i in enum:
  print(movies['title'].loc[i[0]])

Synecdoche, New York
Batman
Batman
Romance & Cigarettes
Wuthering Heights
