In [1]:
#importing all libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import ast
import warnings # current version of seaborn generates a bunch of warnings that we'll ignore
warnings.filterwarnings("ignore")

In [2]:
#importing and reading the data
credits=pd.read_csv("tmdb_5000_credits.csv")
movies=pd.read_csv("tmdb_5000_movies.csv")

In [3]:
#preview of first few attributes of the dataset
movies.head(1).transpose()

Unnamed: 0,0
budget,237000000
genres,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam..."
homepage,http://www.avatarmovie.com/
id,19995
keywords,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":..."
original_language,en
original_title,Avatar
overview,"In the 22nd century, a paraplegic Marine is di..."
popularity,150.437577
production_companies,"[{""name"": ""Ingenious Film Partners"", ""id"": 289..."


In [4]:
credits.head(1)['crew'].values

array(['[{"credit_id": "52fe48009251416c750aca23", "department": "Editing", "gender": 0, "id": 1721, "job": "Editor", "name": "Stephen E. Rivkin"}, {"credit_id": "539c47ecc3a36810e3001f87", "department": "Art", "gender": 2, "id": 496, "job": "Production Design", "name": "Rick Carter"}, {"credit_id": "54491c89c3a3680fb4001cf7", "department": "Sound", "gender": 0, "id": 900, "job": "Sound Designer", "name": "Christopher Boyes"}, {"credit_id": "54491cb70e0a267480001bd0", "department": "Sound", "gender": 0, "id": 900, "job": "Supervising Sound Editor", "name": "Christopher Boyes"}, {"credit_id": "539c4a4cc3a36810c9002101", "department": "Production", "gender": 1, "id": 1262, "job": "Casting", "name": "Mali Finn"}, {"credit_id": "5544ee3b925141499f0008fc", "department": "Sound", "gender": 2, "id": 1729, "job": "Original Music Composer", "name": "James Horner"}, {"credit_id": "52fe48009251416c750ac9c3", "department": "Directing", "gender": 2, "id": 2710, "job": "Director", "name": "James Cam

In [5]:
#by default takes one attribute for summary
movies.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
budget,4803.0,29045040.0,40722390.0,0.0,790000.0,15000000.0,40000000.0,380000000.0
id,4803.0,57165.48,88694.61,5.0,9014.5,14629.0,58610.5,459488.0
popularity,4803.0,21.4923,31.81665,0.0,4.66807,12.92159,28.3135,875.5813
revenue,4803.0,82260640.0,162857100.0,0.0,0.0,19170000.0,92917190.0,2787965000.0
runtime,4801.0,106.8759,22.61193,0.0,94.0,103.0,118.0,338.0
vote_average,4803.0,6.092172,1.194612,0.0,5.6,6.2,6.8,10.0
vote_count,4803.0,690.218,1234.586,0.0,54.0,235.0,737.0,13752.0


In [6]:
# merge the datasets on the basis of either movieid(key) or title of the movie
movies = movies.merge(credits,on="title")


In [7]:
movies.shape

(4809, 23)

In [8]:
movies.head(1).transpose()

Unnamed: 0,0
budget,237000000
genres,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam..."
homepage,http://www.avatarmovie.com/
id,19995
keywords,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":..."
original_language,en
original_title,Avatar
overview,"In the 22nd century, a paraplegic Marine is di..."
popularity,150.437577
production_companies,"[{""name"": ""Ingenious Film Partners"", ""id"": 289..."


In [9]:
#dropping all unnecessary columns
#important columns are genres,id,keywords,title(not original title),overview,cast,crew

movies= movies[['movie_id','genres','title','overview','cast','crew','keywords']]

In [10]:
movies.head().transpose()

Unnamed: 0,0,1,2,3,4
movie_id,19995,285,206647,49026,49529
genres,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...","[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...","[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam..."
title,Avatar,Pirates of the Caribbean: At World's End,Spectre,The Dark Knight Rises,John Carter
overview,"In the 22nd century, a paraplegic Marine is di...","Captain Barbossa, long believed to be dead, ha...",A cryptic message from Bond’s past sends him o...,Following the death of District Attorney Harve...,"John Carter is a war-weary, former military ca..."
cast,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba...","[{""cast_id"": 5, ""character"": ""John Carter"", ""c..."
crew,"[{""credit_id"": ""52fe48009251416c750aca23"", ""de...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de..."
keywords,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...","[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...","[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...","[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853,...","[{""id"": 818, ""name"": ""based on novel""}, {""id"":..."


In [11]:
#in order to make a much simpler dataframe we will keep only three columns
# (movie_id,title,tags)
#tags will be created by merging genres,overview, cast and crew
#filtering the columns of genres, keywords to take out important features of the movie
#will include top 3 cast for recommendation
#hence preprocessing of the data will be required along with removal of missing and duplicate values

movies.isnull().sum() #it came 3 for overview which is not a big number hence dropped them

movie_id    0
genres      0
title       0
overview    3
cast        0
crew        0
keywords    0
dtype: int64

In [12]:
movies.dropna(inplace=True)

In [13]:
movies.duplicated().sum()

0

In [14]:
movies.iloc[0].genres

'[{"id": 28, "name": "Action"}, {"id": 12, "name": "Adventure"}, {"id": 14, "name": "Fantasy"}, {"id": 878, "name": "Science Fiction"}]'

In [15]:
movies.head(1)['genres'].values

array(['[{"id": 28, "name": "Action"}, {"id": 12, "name": "Adventure"}, {"id": 14, "name": "Fantasy"}, {"id": 878, "name": "Science Fiction"}]'],
      dtype=object)

In [16]:
def convert(obj):
    l=[]
    for i in ast.literal_eval(obj):
        l.append(i['name'])
    return l
        

In [17]:
movies['genres']= movies['genres'].apply(convert)

In [18]:
movies.head()

Unnamed: 0,movie_id,genres,title,overview,cast,crew,keywords
0,19995,"[Action, Adventure, Fantasy, Science Fiction]",Avatar,"In the 22nd century, a paraplegic Marine is di...","[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de...","[{""id"": 1463, ""name"": ""culture clash""}, {""id"":..."
1,285,"[Adventure, Fantasy, Action]",Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...","[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de...","[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na..."
2,206647,"[Action, Adventure, Crime]",Spectre,A cryptic message from Bond’s past sends him o...,"[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de...","[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name..."
3,49026,"[Action, Crime, Drama, Thriller]",The Dark Knight Rises,Following the death of District Attorney Harve...,"[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de...","[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853,..."
4,49529,"[Action, Adventure, Science Fiction]",John Carter,"John Carter is a war-weary, former military ca...","[{""cast_id"": 5, ""character"": ""John Carter"", ""c...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de...","[{""id"": 818, ""name"": ""based on novel""}, {""id"":..."


In [19]:
movies.iloc[0].keywords

'[{"id": 1463, "name": "culture clash"}, {"id": 2964, "name": "future"}, {"id": 3386, "name": "space war"}, {"id": 3388, "name": "space colony"}, {"id": 3679, "name": "society"}, {"id": 3801, "name": "space travel"}, {"id": 9685, "name": "futuristic"}, {"id": 9840, "name": "romance"}, {"id": 9882, "name": "space"}, {"id": 9951, "name": "alien"}, {"id": 10148, "name": "tribe"}, {"id": 10158, "name": "alien planet"}, {"id": 10987, "name": "cgi"}, {"id": 11399, "name": "marine"}, {"id": 13065, "name": "soldier"}, {"id": 14643, "name": "battle"}, {"id": 14720, "name": "love affair"}, {"id": 165431, "name": "anti war"}, {"id": 193554, "name": "power relations"}, {"id": 206690, "name": "mind and soul"}, {"id": 209714, "name": "3d"}]'

In [20]:
movies['keywords']=movies['keywords'].apply(convert)

In [21]:
movies.head()

Unnamed: 0,movie_id,genres,title,overview,cast,crew,keywords
0,19995,"[Action, Adventure, Fantasy, Science Fiction]",Avatar,"In the 22nd century, a paraplegic Marine is di...","[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de...","[culture clash, future, space war, space colon..."
1,285,"[Adventure, Fantasy, Action]",Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...","[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de...","[ocean, drug abuse, exotic island, east india ..."
2,206647,"[Action, Adventure, Crime]",Spectre,A cryptic message from Bond’s past sends him o...,"[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de...","[spy, based on novel, secret agent, sequel, mi..."
3,49026,"[Action, Crime, Drama, Thriller]",The Dark Knight Rises,Following the death of District Attorney Harve...,"[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de...","[dc comics, crime fighter, terrorist, secret i..."
4,49529,"[Action, Adventure, Science Fiction]",John Carter,"John Carter is a war-weary, former military ca...","[{""cast_id"": 5, ""character"": ""John Carter"", ""c...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de...","[based on novel, mars, medallion, space travel..."


In [22]:
movies.iloc[0].cast

'[{"cast_id": 242, "character": "Jake Sully", "credit_id": "5602a8a7c3a3685532001c9a", "gender": 2, "id": 65731, "name": "Sam Worthington", "order": 0}, {"cast_id": 3, "character": "Neytiri", "credit_id": "52fe48009251416c750ac9cb", "gender": 1, "id": 8691, "name": "Zoe Saldana", "order": 1}, {"cast_id": 25, "character": "Dr. Grace Augustine", "credit_id": "52fe48009251416c750aca39", "gender": 1, "id": 10205, "name": "Sigourney Weaver", "order": 2}, {"cast_id": 4, "character": "Col. Quaritch", "credit_id": "52fe48009251416c750ac9cf", "gender": 2, "id": 32747, "name": "Stephen Lang", "order": 3}, {"cast_id": 5, "character": "Trudy Chacon", "credit_id": "52fe48009251416c750ac9d3", "gender": 1, "id": 17647, "name": "Michelle Rodriguez", "order": 4}, {"cast_id": 8, "character": "Selfridge", "credit_id": "52fe48009251416c750ac9e1", "gender": 2, "id": 1771, "name": "Giovanni Ribisi", "order": 5}, {"cast_id": 7, "character": "Norm Spellman", "credit_id": "52fe48009251416c750ac9dd", "gender": 

In [23]:
def convertto3(obj):
    l=[]
    count=0
    for i in ast.literal_eval(obj):
        if count!=3:
            l.append(i['name'])
            count+=1
        else:
            break
    return l

In [24]:
movies['cast']=movies['cast'].apply(convertto3)

In [25]:
movies.head()

Unnamed: 0,movie_id,genres,title,overview,cast,crew,keywords
0,19995,"[Action, Adventure, Fantasy, Science Fiction]",Avatar,"In the 22nd century, a paraplegic Marine is di...","[Sam Worthington, Zoe Saldana, Sigourney Weaver]","[{""credit_id"": ""52fe48009251416c750aca23"", ""de...","[culture clash, future, space war, space colon..."
1,285,"[Adventure, Fantasy, Action]",Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...","[Johnny Depp, Orlando Bloom, Keira Knightley]","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de...","[ocean, drug abuse, exotic island, east india ..."
2,206647,"[Action, Adventure, Crime]",Spectre,A cryptic message from Bond’s past sends him o...,"[Daniel Craig, Christoph Waltz, Léa Seydoux]","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de...","[spy, based on novel, secret agent, sequel, mi..."
3,49026,"[Action, Crime, Drama, Thriller]",The Dark Knight Rises,Following the death of District Attorney Harve...,"[Christian Bale, Michael Caine, Gary Oldman]","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de...","[dc comics, crime fighter, terrorist, secret i..."
4,49529,"[Action, Adventure, Science Fiction]",John Carter,"John Carter is a war-weary, former military ca...","[Taylor Kitsch, Lynn Collins, Samantha Morton]","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de...","[based on novel, mars, medallion, space travel..."


In [26]:
movies.iloc[0].crew

'[{"credit_id": "52fe48009251416c750aca23", "department": "Editing", "gender": 0, "id": 1721, "job": "Editor", "name": "Stephen E. Rivkin"}, {"credit_id": "539c47ecc3a36810e3001f87", "department": "Art", "gender": 2, "id": 496, "job": "Production Design", "name": "Rick Carter"}, {"credit_id": "54491c89c3a3680fb4001cf7", "department": "Sound", "gender": 0, "id": 900, "job": "Sound Designer", "name": "Christopher Boyes"}, {"credit_id": "54491cb70e0a267480001bd0", "department": "Sound", "gender": 0, "id": 900, "job": "Supervising Sound Editor", "name": "Christopher Boyes"}, {"credit_id": "539c4a4cc3a36810c9002101", "department": "Production", "gender": 1, "id": 1262, "job": "Casting", "name": "Mali Finn"}, {"credit_id": "5544ee3b925141499f0008fc", "department": "Sound", "gender": 2, "id": 1729, "job": "Original Music Composer", "name": "James Horner"}, {"credit_id": "52fe48009251416c750ac9c3", "department": "Directing", "gender": 2, "id": 2710, "job": "Director", "name": "James Cameron"},

In [27]:
def getdirector(obj):
    l=[]
    for i in ast.literal_eval(obj):
        if i['job']=="Director":
            l.append(i['name'])
            break
    return l

In [28]:
movies['crew']=movies['crew'].apply(getdirector)

In [29]:
movies.head()

Unnamed: 0,movie_id,genres,title,overview,cast,crew,keywords
0,19995,"[Action, Adventure, Fantasy, Science Fiction]",Avatar,"In the 22nd century, a paraplegic Marine is di...","[Sam Worthington, Zoe Saldana, Sigourney Weaver]",[James Cameron],"[culture clash, future, space war, space colon..."
1,285,"[Adventure, Fantasy, Action]",Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...","[Johnny Depp, Orlando Bloom, Keira Knightley]",[Gore Verbinski],"[ocean, drug abuse, exotic island, east india ..."
2,206647,"[Action, Adventure, Crime]",Spectre,A cryptic message from Bond’s past sends him o...,"[Daniel Craig, Christoph Waltz, Léa Seydoux]",[Sam Mendes],"[spy, based on novel, secret agent, sequel, mi..."
3,49026,"[Action, Crime, Drama, Thriller]",The Dark Knight Rises,Following the death of District Attorney Harve...,"[Christian Bale, Michael Caine, Gary Oldman]",[Christopher Nolan],"[dc comics, crime fighter, terrorist, secret i..."
4,49529,"[Action, Adventure, Science Fiction]",John Carter,"John Carter is a war-weary, former military ca...","[Taylor Kitsch, Lynn Collins, Samantha Morton]",[Andrew Stanton],"[based on novel, mars, medallion, space travel..."


In [30]:
movies= movies[['movie_id','title','genres','cast','crew','overview','keywords']]

In [31]:
movies.head() #after all the preprocessing

Unnamed: 0,movie_id,title,genres,cast,crew,overview,keywords
0,19995,Avatar,"[Action, Adventure, Fantasy, Science Fiction]","[Sam Worthington, Zoe Saldana, Sigourney Weaver]",[James Cameron],"In the 22nd century, a paraplegic Marine is di...","[culture clash, future, space war, space colon..."
1,285,Pirates of the Caribbean: At World's End,"[Adventure, Fantasy, Action]","[Johnny Depp, Orlando Bloom, Keira Knightley]",[Gore Verbinski],"Captain Barbossa, long believed to be dead, ha...","[ocean, drug abuse, exotic island, east india ..."
2,206647,Spectre,"[Action, Adventure, Crime]","[Daniel Craig, Christoph Waltz, Léa Seydoux]",[Sam Mendes],A cryptic message from Bond’s past sends him o...,"[spy, based on novel, secret agent, sequel, mi..."
3,49026,The Dark Knight Rises,"[Action, Crime, Drama, Thriller]","[Christian Bale, Michael Caine, Gary Oldman]",[Christopher Nolan],Following the death of District Attorney Harve...,"[dc comics, crime fighter, terrorist, secret i..."
4,49529,John Carter,"[Action, Adventure, Science Fiction]","[Taylor Kitsch, Lynn Collins, Samantha Morton]",[Andrew Stanton],"John Carter is a war-weary, former military ca...","[based on novel, mars, medallion, space travel..."


In [32]:
movies['overview']=movies['overview'].apply(lambda x:x.split())

In [33]:
movies.head()

Unnamed: 0,movie_id,title,genres,cast,crew,overview,keywords
0,19995,Avatar,"[Action, Adventure, Fantasy, Science Fiction]","[Sam Worthington, Zoe Saldana, Sigourney Weaver]",[James Cameron],"[In, the, 22nd, century,, a, paraplegic, Marin...","[culture clash, future, space war, space colon..."
1,285,Pirates of the Caribbean: At World's End,"[Adventure, Fantasy, Action]","[Johnny Depp, Orlando Bloom, Keira Knightley]",[Gore Verbinski],"[Captain, Barbossa,, long, believed, to, be, d...","[ocean, drug abuse, exotic island, east india ..."
2,206647,Spectre,"[Action, Adventure, Crime]","[Daniel Craig, Christoph Waltz, Léa Seydoux]",[Sam Mendes],"[A, cryptic, message, from, Bond’s, past, send...","[spy, based on novel, secret agent, sequel, mi..."
3,49026,The Dark Knight Rises,"[Action, Crime, Drama, Thriller]","[Christian Bale, Michael Caine, Gary Oldman]",[Christopher Nolan],"[Following, the, death, of, District, Attorney...","[dc comics, crime fighter, terrorist, secret i..."
4,49529,John Carter,"[Action, Adventure, Science Fiction]","[Taylor Kitsch, Lynn Collins, Samantha Morton]",[Andrew Stanton],"[John, Carter, is, a, war-weary,, former, mili...","[based on novel, mars, medallion, space travel..."


In [34]:
#transforming the data of attributes into one to form a tag
movies['genres']=movies['genres'].apply(lambda x:[i.replace(" ","") for i in x])


In [35]:
movies['keywords']=movies['keywords'].apply(lambda x:[i.replace(" ","") for i in x])
movies['cast']=movies['cast'].apply(lambda x:[i.replace(" ","") for i in x])
movies['crew']=movies['crew'].apply(lambda x:[i.replace(" ","") for i in x])

In [36]:
movies.head()

Unnamed: 0,movie_id,title,genres,cast,crew,overview,keywords
0,19995,Avatar,"[Action, Adventure, Fantasy, ScienceFiction]","[SamWorthington, ZoeSaldana, SigourneyWeaver]",[JamesCameron],"[In, the, 22nd, century,, a, paraplegic, Marin...","[cultureclash, future, spacewar, spacecolony, ..."
1,285,Pirates of the Caribbean: At World's End,"[Adventure, Fantasy, Action]","[JohnnyDepp, OrlandoBloom, KeiraKnightley]",[GoreVerbinski],"[Captain, Barbossa,, long, believed, to, be, d...","[ocean, drugabuse, exoticisland, eastindiatrad..."
2,206647,Spectre,"[Action, Adventure, Crime]","[DanielCraig, ChristophWaltz, LéaSeydoux]",[SamMendes],"[A, cryptic, message, from, Bond’s, past, send...","[spy, basedonnovel, secretagent, sequel, mi6, ..."
3,49026,The Dark Knight Rises,"[Action, Crime, Drama, Thriller]","[ChristianBale, MichaelCaine, GaryOldman]",[ChristopherNolan],"[Following, the, death, of, District, Attorney...","[dccomics, crimefighter, terrorist, secretiden..."
4,49529,John Carter,"[Action, Adventure, ScienceFiction]","[TaylorKitsch, LynnCollins, SamanthaMorton]",[AndrewStanton],"[John, Carter, is, a, war-weary,, former, mili...","[basedonnovel, mars, medallion, spacetravel, p..."


In [37]:
movies['tags']=movies['genres']+movies['cast']+movies['crew']+movies['overview']+movies['keywords']

In [38]:
movies.head()

Unnamed: 0,movie_id,title,genres,cast,crew,overview,keywords,tags
0,19995,Avatar,"[Action, Adventure, Fantasy, ScienceFiction]","[SamWorthington, ZoeSaldana, SigourneyWeaver]",[JamesCameron],"[In, the, 22nd, century,, a, paraplegic, Marin...","[cultureclash, future, spacewar, spacecolony, ...","[Action, Adventure, Fantasy, ScienceFiction, S..."
1,285,Pirates of the Caribbean: At World's End,"[Adventure, Fantasy, Action]","[JohnnyDepp, OrlandoBloom, KeiraKnightley]",[GoreVerbinski],"[Captain, Barbossa,, long, believed, to, be, d...","[ocean, drugabuse, exoticisland, eastindiatrad...","[Adventure, Fantasy, Action, JohnnyDepp, Orlan..."
2,206647,Spectre,"[Action, Adventure, Crime]","[DanielCraig, ChristophWaltz, LéaSeydoux]",[SamMendes],"[A, cryptic, message, from, Bond’s, past, send...","[spy, basedonnovel, secretagent, sequel, mi6, ...","[Action, Adventure, Crime, DanielCraig, Christ..."
3,49026,The Dark Knight Rises,"[Action, Crime, Drama, Thriller]","[ChristianBale, MichaelCaine, GaryOldman]",[ChristopherNolan],"[Following, the, death, of, District, Attorney...","[dccomics, crimefighter, terrorist, secretiden...","[Action, Crime, Drama, Thriller, ChristianBale..."
4,49529,John Carter,"[Action, Adventure, ScienceFiction]","[TaylorKitsch, LynnCollins, SamanthaMorton]",[AndrewStanton],"[John, Carter, is, a, war-weary,, former, mili...","[basedonnovel, mars, medallion, spacetravel, p...","[Action, Adventure, ScienceFiction, TaylorKits..."


In [39]:
#the final dataframe 
new_movies=movies[['movie_id','title','tags']]

In [40]:
new_movies.head()

Unnamed: 0,movie_id,title,tags
0,19995,Avatar,"[Action, Adventure, Fantasy, ScienceFiction, S..."
1,285,Pirates of the Caribbean: At World's End,"[Adventure, Fantasy, Action, JohnnyDepp, Orlan..."
2,206647,Spectre,"[Action, Adventure, Crime, DanielCraig, Christ..."
3,49026,The Dark Knight Rises,"[Action, Crime, Drama, Thriller, ChristianBale..."
4,49529,John Carter,"[Action, Adventure, ScienceFiction, TaylorKits..."


In [41]:
#converting list into strings
new_movies['tags']=new_movies['tags'].apply(lambda x:" ".join(x))

In [42]:
new_movies.head()

Unnamed: 0,movie_id,title,tags
0,19995,Avatar,Action Adventure Fantasy ScienceFiction SamWor...
1,285,Pirates of the Caribbean: At World's End,Adventure Fantasy Action JohnnyDepp OrlandoBlo...
2,206647,Spectre,Action Adventure Crime DanielCraig ChristophWa...
3,49026,The Dark Knight Rises,Action Crime Drama Thriller ChristianBale Mich...
4,49529,John Carter,Action Adventure ScienceFiction TaylorKitsch L...


In [43]:
#converting to lowercase (Recommended)
new_movies['tags']=new_movies['tags'].apply(lambda x:x.lower())

In [44]:
new_movies.head().transpose()

Unnamed: 0,0,1,2,3,4
movie_id,19995,285,206647,49026,49529
title,Avatar,Pirates of the Caribbean: At World's End,Spectre,The Dark Knight Rises,John Carter
tags,action adventure fantasy sciencefiction samwor...,adventure fantasy action johnnydepp orlandoblo...,action adventure crime danielcraig christophwa...,action crime drama thriller christianbale mich...,action adventure sciencefiction taylorkitsch l...


In [45]:
#for same words but in different forms like love,loved,loving --> love
import nltk
from nltk.stem.porter import PorterStemmer
ps=PorterStemmer()

In [46]:
def stem(text):
    y=[]
    for i in text.split():
        y.append(ps.stem(i))
    return " ".join(y)

In [47]:
new_movies['tags']=new_movies['tags'].apply(stem)

In [48]:
# to map words or phrases from vocabulary to a corresponding 
#vector of real numbers which used to find word predictions, 
#word similarities/semantics. 

In [49]:
#vectorization of textual data to find similar movies to recommend
#bag of words technique
#go to scikit learn CountVectorizer website and implement all the functions
from sklearn.feature_extraction.text import CountVectorizer
cv=CountVectorizer(max_features=5000,stop_words='english')

In [50]:
vectors=cv.fit_transform(new_movies['tags']).toarray()

In [51]:
vectors #all movies have been now converted to vectors

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [52]:
cv.get_feature_names_out() #it returns the features after stemming from the tags
#function name has changed from get_feature_names

array(['000', '007', '10', ..., 'zone', 'zoo', 'zooeydeschanel'],
      dtype=object)

In [53]:
#Now in order to recommend similar movies we will find distance of all the movie vectors with respect to each other 
#close distance movies (which has most number of similar words in their tags) will be used for recommendations
#cosine distance will be used instead of euclidean distance
#Euclidean distance fails to give appropriate and accurate results when dealing with high dimentional data

In [54]:
from sklearn.metrics.pairwise import cosine_similarity

In [55]:
similarity=cosine_similarity(vectors)

In [56]:
cosine_similarity(vectors).shape #all movie similarity/distance is calculated with all movies

(4806, 4806)

In [57]:
similarity[0]  #diagonal will always be one

array([1.        , 0.08346223, 0.0860309 , ..., 0.04499213, 0.        ,
       0.        ])

In [58]:
#to recommend first/top 5 movies almost similar 
# 1) take in the movie and find the index of that movie in the datafram
# 2) look up the similarity matrix of that movie using that index
# store in the index value of all the similarity matrix elements to help relate with the movie that it gets measured with
#for this we use enumerate function
# 3) sort the similarity matrix in terms of ascending order of their similarities or distances

def recommend(movie):
    mindex= new_movies[new_movies['title']==movie].index[0]
    distance= similarity[mindex]
    mlist= sorted(list(enumerate(distance)),reverse=True, key=lambda x:x[1])[1:6]
    for i in mlist:
        print (new_movies.iloc[i[0]].title)

In [59]:
#an example recommending 5 movies according to closest distance
recommend('Batman Begins')

The Dark Knight
Batman
Batman
The Dark Knight Rises
10th & Wolf
