Import libraries

In [2]:
import numpy as np
import pandas as pd
import ast

Load the datasets

In [3]:
movies= pd.read_csv('tmdb_5000_movies.csv') 
credits= pd.read_csv('tmdb_5000_credits.csv') 

Merge two datasets

In [4]:
movies= movies.merge(credits,on='title')

Keep only the necessary columns

In [5]:
movies= movies[['id','genres','overview','title','keywords','cast','crew']]

Preprocessiong:

- Remove missing data
- Remove duplicate data
- Take only the name of genres, keywords from the dictionary of genres, keywords
- Take top 5 casts' names from the dictionary of cast
- Take director names from crew
- Make overviews a list for the ease of concatenation later
- Remove the spaces between names of genres, casts, crew to avoid duplication


In [6]:
#Remove missing data
movies.isnull().sum()

id          0
genres      0
overview    3
title       0
keywords    0
cast        0
crew        0
dtype: int64

In [7]:
movies.dropna(inplace=True)
#3 movies are dropped here 

In [8]:
#Remove duplicate data
movies.duplicated().sum()
#no duplicate data is found here

0

In [9]:
#Take only the name of genres from the dictionary of genres
def pick_names(obj):
  names=[]
  for i in ast.literal_eval(obj):
    names.append(i['name'])
  return names

In [10]:
movies['genres']= movies['genres'].apply(pick_names)

In [11]:
movies.head()

Unnamed: 0,id,genres,overview,title,keywords,cast,crew
0,19995,"[Action, Adventure, Fantasy, Science Fiction]","In the 22nd century, a paraplegic Marine is di...",Avatar,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...","[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,"[Adventure, Fantasy, Action]","Captain Barbossa, long believed to be dead, ha...",Pirates of the Caribbean: At World's End,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...","[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."
2,206647,"[Action, Adventure, Crime]",A cryptic message from Bond’s past sends him o...,Spectre,"[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...","[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de..."
3,49026,"[Action, Crime, Drama, Thriller]",Following the death of District Attorney Harve...,The Dark Knight Rises,"[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853,...","[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de..."
4,49529,"[Action, Adventure, Science Fiction]","John Carter is a war-weary, former military ca...",John Carter,"[{""id"": 818, ""name"": ""based on novel""}, {""id"":...","[{""cast_id"": 5, ""character"": ""John Carter"", ""c...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de..."


In [12]:
#Take only the name of keywords from the dictionary of genres
movies['keywords']= movies['keywords'].apply(pick_names)
movies.head()

Unnamed: 0,id,genres,overview,title,keywords,cast,crew
0,19995,"[Action, Adventure, Fantasy, Science Fiction]","In the 22nd century, a paraplegic Marine is di...",Avatar,"[culture clash, future, space war, space colon...","[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,"[Adventure, Fantasy, Action]","Captain Barbossa, long believed to be dead, ha...",Pirates of the Caribbean: At World's End,"[ocean, drug abuse, exotic island, east india ...","[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."
2,206647,"[Action, Adventure, Crime]",A cryptic message from Bond’s past sends him o...,Spectre,"[spy, based on novel, secret agent, sequel, mi...","[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de..."
3,49026,"[Action, Crime, Drama, Thriller]",Following the death of District Attorney Harve...,The Dark Knight Rises,"[dc comics, crime fighter, terrorist, secret i...","[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de..."
4,49529,"[Action, Adventure, Science Fiction]","John Carter is a war-weary, former military ca...",John Carter,"[based on novel, mars, medallion, space travel...","[{""cast_id"": 5, ""character"": ""John Carter"", ""c...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de..."


In [13]:
#Take top 5 casts' names from the dictionary of cast
def pick_cast(obj):
  names=[]
  count=0
  for i in ast.literal_eval(obj):
    if count!=5:
      names.append(i['name'])
      count+=1
    else:
      break
  return names

In [14]:
movies['cast']= movies['cast'].apply(pick_cast)

In [15]:
movies.head()

Unnamed: 0,id,genres,overview,title,keywords,cast,crew
0,19995,"[Action, Adventure, Fantasy, Science Fiction]","In the 22nd century, a paraplegic Marine is di...",Avatar,"[culture clash, future, space war, space colon...","[Sam Worthington, Zoe Saldana, Sigourney Weave...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,"[Adventure, Fantasy, Action]","Captain Barbossa, long believed to be dead, ha...",Pirates of the Caribbean: At World's End,"[ocean, drug abuse, exotic island, east india ...","[Johnny Depp, Orlando Bloom, Keira Knightley, ...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."
2,206647,"[Action, Adventure, Crime]",A cryptic message from Bond’s past sends him o...,Spectre,"[spy, based on novel, secret agent, sequel, mi...","[Daniel Craig, Christoph Waltz, Léa Seydoux, R...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de..."
3,49026,"[Action, Crime, Drama, Thriller]",Following the death of District Attorney Harve...,The Dark Knight Rises,"[dc comics, crime fighter, terrorist, secret i...","[Christian Bale, Michael Caine, Gary Oldman, A...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de..."
4,49529,"[Action, Adventure, Science Fiction]","John Carter is a war-weary, former military ca...",John Carter,"[based on novel, mars, medallion, space travel...","[Taylor Kitsch, Lynn Collins, Samantha Morton,...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de..."


In [16]:
movies['crew'][0]

'[{"credit_id": "52fe48009251416c750aca23", "department": "Editing", "gender": 0, "id": 1721, "job": "Editor", "name": "Stephen E. Rivkin"}, {"credit_id": "539c47ecc3a36810e3001f87", "department": "Art", "gender": 2, "id": 496, "job": "Production Design", "name": "Rick Carter"}, {"credit_id": "54491c89c3a3680fb4001cf7", "department": "Sound", "gender": 0, "id": 900, "job": "Sound Designer", "name": "Christopher Boyes"}, {"credit_id": "54491cb70e0a267480001bd0", "department": "Sound", "gender": 0, "id": 900, "job": "Supervising Sound Editor", "name": "Christopher Boyes"}, {"credit_id": "539c4a4cc3a36810c9002101", "department": "Production", "gender": 1, "id": 1262, "job": "Casting", "name": "Mali Finn"}, {"credit_id": "5544ee3b925141499f0008fc", "department": "Sound", "gender": 2, "id": 1729, "job": "Original Music Composer", "name": "James Horner"}, {"credit_id": "52fe48009251416c750ac9c3", "department": "Directing", "gender": 2, "id": 2710, "job": "Director", "name": "James Cameron"},

In [17]:
#Take director name
def pick_director(obj):
  names=[]
  for i in ast.literal_eval(obj):
    if i['job']=='Director':
      names.append(i['name'])
      break
  return names

In [18]:
movies['crew']=movies['crew'].apply(pick_director)

In [19]:
movies.head()

Unnamed: 0,id,genres,overview,title,keywords,cast,crew
0,19995,"[Action, Adventure, Fantasy, Science Fiction]","In the 22nd century, a paraplegic Marine is di...",Avatar,"[culture clash, future, space war, space colon...","[Sam Worthington, Zoe Saldana, Sigourney Weave...",[James Cameron]
1,285,"[Adventure, Fantasy, Action]","Captain Barbossa, long believed to be dead, ha...",Pirates of the Caribbean: At World's End,"[ocean, drug abuse, exotic island, east india ...","[Johnny Depp, Orlando Bloom, Keira Knightley, ...",[Gore Verbinski]
2,206647,"[Action, Adventure, Crime]",A cryptic message from Bond’s past sends him o...,Spectre,"[spy, based on novel, secret agent, sequel, mi...","[Daniel Craig, Christoph Waltz, Léa Seydoux, R...",[Sam Mendes]
3,49026,"[Action, Crime, Drama, Thriller]",Following the death of District Attorney Harve...,The Dark Knight Rises,"[dc comics, crime fighter, terrorist, secret i...","[Christian Bale, Michael Caine, Gary Oldman, A...",[Christopher Nolan]
4,49529,"[Action, Adventure, Science Fiction]","John Carter is a war-weary, former military ca...",John Carter,"[based on novel, mars, medallion, space travel...","[Taylor Kitsch, Lynn Collins, Samantha Morton,...",[Andrew Stanton]


In [20]:
#Make overview a list
movies['overview']=movies['overview'].apply(lambda x:x.split())

In [21]:
movies.head()

Unnamed: 0,id,genres,overview,title,keywords,cast,crew
0,19995,"[Action, Adventure, Fantasy, Science Fiction]","[In, the, 22nd, century,, a, paraplegic, Marin...",Avatar,"[culture clash, future, space war, space colon...","[Sam Worthington, Zoe Saldana, Sigourney Weave...",[James Cameron]
1,285,"[Adventure, Fantasy, Action]","[Captain, Barbossa,, long, believed, to, be, d...",Pirates of the Caribbean: At World's End,"[ocean, drug abuse, exotic island, east india ...","[Johnny Depp, Orlando Bloom, Keira Knightley, ...",[Gore Verbinski]
2,206647,"[Action, Adventure, Crime]","[A, cryptic, message, from, Bond’s, past, send...",Spectre,"[spy, based on novel, secret agent, sequel, mi...","[Daniel Craig, Christoph Waltz, Léa Seydoux, R...",[Sam Mendes]
3,49026,"[Action, Crime, Drama, Thriller]","[Following, the, death, of, District, Attorney...",The Dark Knight Rises,"[dc comics, crime fighter, terrorist, secret i...","[Christian Bale, Michael Caine, Gary Oldman, A...",[Christopher Nolan]
4,49529,"[Action, Adventure, Science Fiction]","[John, Carter, is, a, war-weary,, former, mili...",John Carter,"[based on novel, mars, medallion, space travel...","[Taylor Kitsch, Lynn Collins, Samantha Morton,...",[Andrew Stanton]


In [22]:
#Remove the spaces between names of genres, casts, crew
movies['genres']= movies['genres'].apply(lambda x:[i.replace(" ","") for i in x])
movies['keywords']= movies['keywords'].apply(lambda x:[i.replace(" ","") for i in x])
movies['cast']= movies['cast'].apply(lambda x:[i.replace(" ","") for i in x])
movies['crew']= movies['crew'].apply(lambda x:[i.replace(" ","") for i in x])
movies.head()

Unnamed: 0,id,genres,overview,title,keywords,cast,crew
0,19995,"[Action, Adventure, Fantasy, ScienceFiction]","[In, the, 22nd, century,, a, paraplegic, Marin...",Avatar,"[cultureclash, future, spacewar, spacecolony, ...","[SamWorthington, ZoeSaldana, SigourneyWeaver, ...",[JamesCameron]
1,285,"[Adventure, Fantasy, Action]","[Captain, Barbossa,, long, believed, to, be, d...",Pirates of the Caribbean: At World's End,"[ocean, drugabuse, exoticisland, eastindiatrad...","[JohnnyDepp, OrlandoBloom, KeiraKnightley, Ste...",[GoreVerbinski]
2,206647,"[Action, Adventure, Crime]","[A, cryptic, message, from, Bond’s, past, send...",Spectre,"[spy, basedonnovel, secretagent, sequel, mi6, ...","[DanielCraig, ChristophWaltz, LéaSeydoux, Ralp...",[SamMendes]
3,49026,"[Action, Crime, Drama, Thriller]","[Following, the, death, of, District, Attorney...",The Dark Knight Rises,"[dccomics, crimefighter, terrorist, secretiden...","[ChristianBale, MichaelCaine, GaryOldman, Anne...",[ChristopherNolan]
4,49529,"[Action, Adventure, ScienceFiction]","[John, Carter, is, a, war-weary,, former, mili...",John Carter,"[basedonnovel, mars, medallion, spacetravel, p...","[TaylorKitsch, LynnCollins, SamanthaMorton, Wi...",[AndrewStanton]


In [23]:
#make a new column tags 
movies['tags']=movies['overview']+movies['genres']+movies['keywords']+movies['cast']+movies['crew']
movies.head()

Unnamed: 0,id,genres,overview,title,keywords,cast,crew,tags
0,19995,"[Action, Adventure, Fantasy, ScienceFiction]","[In, the, 22nd, century,, a, paraplegic, Marin...",Avatar,"[cultureclash, future, spacewar, spacecolony, ...","[SamWorthington, ZoeSaldana, SigourneyWeaver, ...",[JamesCameron],"[In, the, 22nd, century,, a, paraplegic, Marin..."
1,285,"[Adventure, Fantasy, Action]","[Captain, Barbossa,, long, believed, to, be, d...",Pirates of the Caribbean: At World's End,"[ocean, drugabuse, exoticisland, eastindiatrad...","[JohnnyDepp, OrlandoBloom, KeiraKnightley, Ste...",[GoreVerbinski],"[Captain, Barbossa,, long, believed, to, be, d..."
2,206647,"[Action, Adventure, Crime]","[A, cryptic, message, from, Bond’s, past, send...",Spectre,"[spy, basedonnovel, secretagent, sequel, mi6, ...","[DanielCraig, ChristophWaltz, LéaSeydoux, Ralp...",[SamMendes],"[A, cryptic, message, from, Bond’s, past, send..."
3,49026,"[Action, Crime, Drama, Thriller]","[Following, the, death, of, District, Attorney...",The Dark Knight Rises,"[dccomics, crimefighter, terrorist, secretiden...","[ChristianBale, MichaelCaine, GaryOldman, Anne...",[ChristopherNolan],"[Following, the, death, of, District, Attorney..."
4,49529,"[Action, Adventure, ScienceFiction]","[John, Carter, is, a, war-weary,, former, mili...",John Carter,"[basedonnovel, mars, medallion, spacetravel, p...","[TaylorKitsch, LynnCollins, SamanthaMorton, Wi...",[AndrewStanton],"[John, Carter, is, a, war-weary,, former, mili..."


In [24]:
#make new data frame with necessary columns
new_movies= movies[['id','title','tags']]
new_movies.head()

Unnamed: 0,id,title,tags
0,19995,Avatar,"[In, the, 22nd, century,, a, paraplegic, Marin..."
1,285,Pirates of the Caribbean: At World's End,"[Captain, Barbossa,, long, believed, to, be, d..."
2,206647,Spectre,"[A, cryptic, message, from, Bond’s, past, send..."
3,49026,The Dark Knight Rises,"[Following, the, death, of, District, Attorney..."
4,49529,John Carter,"[John, Carter, is, a, war-weary,, former, mili..."


In [25]:
#make tags a string
new_movies['tags']=new_movies['tags'].apply(lambda x:" " .join(x))
new_movies.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_movies['tags']=new_movies['tags'].apply(lambda x:" " .join(x))


Unnamed: 0,id,title,tags
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di..."
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha..."
2,206647,Spectre,A cryptic message from Bond’s past sends him o...
3,49026,The Dark Knight Rises,Following the death of District Attorney Harve...
4,49529,John Carter,"John Carter is a war-weary, former military ca..."


In [26]:
new_movies['tags'][0]

'In the 22nd century, a paraplegic Marine is dispatched to the moon Pandora on a unique mission, but becomes torn between following orders and protecting an alien civilization. Action Adventure Fantasy ScienceFiction cultureclash future spacewar spacecolony society spacetravel futuristic romance space alien tribe alienplanet cgi marine soldier battle loveaffair antiwar powerrelations mindandsoul 3d SamWorthington ZoeSaldana SigourneyWeaver StephenLang MichelleRodriguez JamesCameron'

In [27]:
new_movies['tags']=new_movies['tags'].apply(lambda x:x.lower())
new_movies['tags'][0]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_movies['tags']=new_movies['tags'].apply(lambda x:x.lower())


'in the 22nd century, a paraplegic marine is dispatched to the moon pandora on a unique mission, but becomes torn between following orders and protecting an alien civilization. action adventure fantasy sciencefiction cultureclash future spacewar spacecolony society spacetravel futuristic romance space alien tribe alienplanet cgi marine soldier battle loveaffair antiwar powerrelations mindandsoul 3d samworthington zoesaldana sigourneyweaver stephenlang michellerodriguez jamescameron'

Making each movie a vector

In [None]:
#Vectorizing words
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=5000,stop_words='english')

In [29]:
vector = cv.fit_transform(new_movies['tags']).toarray()
vector.shape

(4806, 5000)

In [30]:
cv.get_feature_names()



['000',
 '007',
 '10',
 '100',
 '11',
 '12',
 '13',
 '14',
 '15',
 '16',
 '17',
 '18',
 '18th',
 '19',
 '1930s',
 '1940s',
 '1950s',
 '1960s',
 '1970s',
 '1980',
 '1980s',
 '1985',
 '1990s',
 '19th',
 '19thcentury',
 '20',
 '200',
 '2009',
 '20th',
 '24',
 '25',
 '30',
 '300',
 '3d',
 '40',
 '50',
 '500',
 '60',
 '60s',
 '70',
 'aaron',
 'aaroneckhart',
 'aarontaylor',
 'abandoned',
 'abducted',
 'abigailbreslin',
 'abilities',
 'ability',
 'able',
 'aboard',
 'abuse',
 'abusive',
 'academy',
 'accept',
 'accepted',
 'accepts',
 'access',
 'accident',
 'accidental',
 'accidentally',
 'accompanied',
 'accomplish',
 'account',
 'accountant',
 'accused',
 'ace',
 'achieve',
 'act',
 'acting',
 'action',
 'actionhero',
 'actions',
 'activist',
 'activities',
 'activity',
 'actor',
 'actors',
 'actress',
 'acts',
 'actual',
 'actually',
 'adam',
 'adambrody',
 'adams',
 'adamsandler',
 'adamscott',
 'adaptation',
 'adapted',
 'addict',
 'addicted',
 'addiction',
 'adolescence',
 'adopt',
 '

In [32]:
!pip install nltk



In [31]:
import nltk

In [32]:
from nltk.stem.porter import PorterStemmer
ps =PorterStemmer()

In [33]:
def stem(text):
    y=[]
    for i in text.split():
        y.append(ps.stem(i))
    return " ".join(y)

In [34]:
new_movies['tags']=new_movies['tags'].apply(stem)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_movies['tags']=new_movies['tags'].apply(stem)


In [43]:
vectors = cv.fit_transform(new_movies['tags']).toarray()
cv.get_feature_names()

['000',
 '007',
 '10',
 '100',
 '11',
 '12',
 '13',
 '14',
 '15',
 '16',
 '17',
 '18',
 '18th',
 '18thcenturi',
 '19',
 '1910',
 '1920',
 '1930',
 '1940',
 '1950',
 '1950s',
 '1960',
 '1960s',
 '1970',
 '1970s',
 '1980',
 '1985',
 '1990',
 '19th',
 '19thcenturi',
 '20',
 '200',
 '2009',
 '20th',
 '24',
 '25',
 '30',
 '300',
 '3d',
 '40',
 '50',
 '500',
 '60',
 '70',
 '80',
 'aaron',
 'aaroneckhart',
 'aarontaylor',
 'abandon',
 'abduct',
 'abigailbreslin',
 'abil',
 'abl',
 'aboard',
 'abov',
 'abus',
 'academi',
 'accept',
 'access',
 'accid',
 'accident',
 'acclaim',
 'accompani',
 'accomplish',
 'account',
 'accus',
 'ace',
 'achiev',
 'act',
 'action',
 'actionhero',
 'activ',
 'activist',
 'activities',
 'actor',
 'actress',
 'actual',
 'adam',
 'adambrodi',
 'adamsandl',
 'adamscott',
 'adamshankman',
 'adapt',
 'add',
 'addict',
 'adjust',
 'admir',
 'admit',
 'adolesc',
 'adopt',
 'ador',
 'adrienbrodi',
 'adult',
 'adultanim',
 'adulteri',
 'adulthood',
 'advanc',
 'adventur',

Measure Cosine distance. The more the distance the less similar the movies are.

In [39]:
import sklearn

In [41]:
from sklearn.metrics.pairwise import cosine_similarity

In [44]:
similarity= cosine_similarity(vectors)

In [46]:
similarity[0]

array([1.        , 0.08006408, 0.08492078, ..., 0.04441156, 0.        ,
       0.        ])

Take top 10 similar movies

In [57]:
def recommend(movie):
    movie_index= new_movies[new_movies['title']== movie].index[0] 
    distances = similarity[movie_index]
    rec_movies= sorted(list(enumerate(distances)),reverse=True,key=lambda x:x[1])[1:11]
    
    for i in rec_movies:
        print(new_movies.iloc[i[0]].title)

In [60]:
recommend('The Hunger Games')

The Hunger Games: Catching Fire
The Hunger Games: Mockingjay - Part 2
The Hunger Games: Mockingjay - Part 1
Indie Game: The Movie
The Indian in the Cupboard
Wreck-It Ralph
Circle
Epic
Nerve
Stay Alive


Get existing movie names

In [82]:
def name_suggestion(name_part):
    existing_movies=[]
    for i in new_movies['title']:
        if name_part in i:
            existing_movies.append(i)
    return existing_movies

In [86]:
name_suggestion("Bat")

['Batman v Superman: Dawn of Justice',
 'The Hobbit: The Battle of the Five Armies',
 'Battleship',
 'Batman Begins',
 'Night at the Museum: Battle of the Smithsonian',
 'Batman & Robin',
 'Batman Forever',
 'Batman Returns',
 'Battlefield Earth',
 'Battle: Los Angeles',
 'Gremlins 2: The New Batch',
 'Batman',
 'Batman',
 'Batman',
 'Batman',
 'Battle of the Year',
 'Bathory: Countess of Blood',
 'Space Battleship Yamato',
 'Cinco de Mayo: La Batalla',
 'Bats',
 'Batman: The Dark Knight Returns, Part 2',
 'Bathing Beauty',
 'Battle for the Planet of the Apes',
 'The Battle of Shaker Heights']