In [1]:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import ast
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

### Importing Data

##### The link for data  (https://www.kaggle.com/rounakbanik/the-movies-dataset?select=ratings.csv). Data is same that is used in Analytics and Visualisation

In [2]:
pd.set_option("display.max_columns", None)
df_ratings = pd.read_csv("ratings_small.csv",low_memory=False)
df_links = pd.read_csv("links_small.csv",low_memory=False)
df_keywords = pd.read_csv("keywords.csv",low_memory=False)
df_movies = pd.read_csv("movies_metadata.csv",low_memory=False)
df_credits = pd.read_csv("credits.csv",low_memory=False)

### Understanding data and preprocessing it

In [3]:
df_ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205
...,...,...,...,...
99999,671,6268,2.5,1065579370
100000,671,6269,4.0,1065149201
100001,671,6365,4.0,1070940363
100002,671,6385,2.5,1070979663


In [4]:
df_ratings = df_ratings[["movieId","rating"]]

In [5]:
movieId_one = pd.DataFrame(df_ratings.loc[df_ratings["movieId"]==1])
movieId_one.sort_values("rating")

Unnamed: 0,movieId,rating
45185,1,1.0
70496,1,1.0
73210,1,1.0
14886,1,1.0
62983,1,1.5
...,...,...
18928,1,5.0
56955,1,5.0
56904,1,5.0
76593,1,5.0


In [6]:
df_ratings = df_ratings.groupby(df_ratings.columns.to_list(),as_index=False).size()
df_ratings

Unnamed: 0,movieId,rating,size
0,1,1.0,4
1,1,1.5,3
2,1,2.0,13
3,1,2.5,4
4,1,3.0,41
...,...,...,...
28670,161944,5.0,1
28671,162376,4.5,1
28672,162542,5.0,1
28673,162672,3.0,1


In [7]:
m = df_ratings.loc[df_ratings["movieId"]==1]
m

Unnamed: 0,movieId,rating,size
0,1,1.0,4
1,1,1.5,3
2,1,2.0,13
3,1,2.5,4
4,1,3.0,41
5,1,3.5,23
6,1,4.0,77
7,1,4.5,19
8,1,5.0,63


In [8]:
total_num = m["size"].sum()

In [9]:
p=0
for _, row in m.iterrows():
    x = row["rating"]*row["size"]
    p = p+x
z = p/total_num
print(z)
m["rating_wt_avg"] = z

3.8724696356275303


In [10]:
m

Unnamed: 0,movieId,rating,size,rating_wt_avg
0,1,1.0,4,3.87247
1,1,1.5,3,3.87247
2,1,2.0,13,3.87247
3,1,2.5,4,3.87247
4,1,3.0,41,3.87247
5,1,3.5,23,3.87247
6,1,4.0,77,3.87247
7,1,4.5,19,3.87247
8,1,5.0,63,3.87247


In [11]:
m.drop_duplicates("movieId", inplace=True)

In [12]:
m.drop(labels=["rating","size"], axis=1)

Unnamed: 0,movieId,rating_wt_avg
0,1,3.87247


In [13]:
total_sum=0
p=0
for _, row in df_ratings.iterrows():
    if (row["movieId"])==(row["movieId"]+1):
        total_sum = total_sum + row["size"]
        x = row["rating"]*row["size"]
        p = p+x
        z = p/total_sum
        df_ratings["rating_wt_avg"] = z
    else:
        p=0
        total_sum = 0

In [14]:
for i in df_ratings["movieId"]:
    if i==i+1:
        print(i)
    else:
        break

In [15]:
df_ratings

Unnamed: 0,movieId,rating,size
0,1,1.0,4
1,1,1.5,3
2,1,2.0,13
3,1,2.5,4
4,1,3.0,41
...,...,...,...
28670,161944,5.0,1
28671,162376,4.5,1
28672,162542,5.0,1
28673,162672,3.0,1


In [16]:
df_links.head()

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0


In [17]:
df_links.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9125 entries, 0 to 9124
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   movieId  9125 non-null   int64  
 1   imdbId   9125 non-null   int64  
 2   tmdbId   9112 non-null   float64
dtypes: float64(1), int64(2)
memory usage: 214.0 KB


In [18]:
df_keywords.head()

Unnamed: 0,id,keywords
0,862,"[{'id': 931, 'name': 'jealousy'}, {'id': 4290,..."
1,8844,"[{'id': 10090, 'name': 'board game'}, {'id': 1..."
2,15602,"[{'id': 1495, 'name': 'fishing'}, {'id': 12392..."
3,31357,"[{'id': 818, 'name': 'based on novel'}, {'id':..."
4,11862,"[{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n..."


In [19]:
df_keywords.rename(columns={"id":"tmdbId"}, inplace=True)
df_keywords

Unnamed: 0,tmdbId,keywords
0,862,"[{'id': 931, 'name': 'jealousy'}, {'id': 4290,..."
1,8844,"[{'id': 10090, 'name': 'board game'}, {'id': 1..."
2,15602,"[{'id': 1495, 'name': 'fishing'}, {'id': 12392..."
3,31357,"[{'id': 818, 'name': 'based on novel'}, {'id':..."
4,11862,"[{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n..."
...,...,...
46414,439050,"[{'id': 10703, 'name': 'tragic love'}]"
46415,111109,"[{'id': 2679, 'name': 'artist'}, {'id': 14531,..."
46416,67758,[]
46417,227506,[]


In [20]:
df_keywords.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 46419 entries, 0 to 46418
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   tmdbId    46419 non-null  int64 
 1   keywords  46419 non-null  object
dtypes: int64(1), object(1)
memory usage: 725.4+ KB


In [21]:
df_movies.head()

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,popularity,poster_path,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",21.946943,/rhIRbceoE9lR4veEXuwCC2wARtG.jpg,"[{'name': 'Pixar Animation Studios', 'id': 3}]","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,17.015539,/vzmL6fP7aPKNKPRTFnZmiUfciyV.jpg,"[{'name': 'TriStar Pictures', 'id': 559}, {'na...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,11.7129,/6ksm1sjKMFLbO7UY2i6G1ju9SML.jpg,"[{'name': 'Warner Bros.', 'id': 6194}, {'name'...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",3.859495,/16XOMpEaLWkrcPqSQqhTmeJuqQl.jpg,[{'name': 'Twentieth Century Fox Film Corporat...,"[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,8.387519,/e64sOI48hQXyru7naBFyssKFxVd.jpg,"[{'name': 'Sandollar Productions', 'id': 5842}...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


In [22]:
df_movies.rename(columns={"id":"tmdbId","imdb_id":"imdbId"}, inplace=True)
df_movies.head()

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,tmdbId,imdbId,original_language,original_title,overview,popularity,poster_path,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",21.946943,/rhIRbceoE9lR4veEXuwCC2wARtG.jpg,"[{'name': 'Pixar Animation Studios', 'id': 3}]","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,17.015539,/vzmL6fP7aPKNKPRTFnZmiUfciyV.jpg,"[{'name': 'TriStar Pictures', 'id': 559}, {'na...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,11.7129,/6ksm1sjKMFLbO7UY2i6G1ju9SML.jpg,"[{'name': 'Warner Bros.', 'id': 6194}, {'name'...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",3.859495,/16XOMpEaLWkrcPqSQqhTmeJuqQl.jpg,[{'name': 'Twentieth Century Fox Film Corporat...,"[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,8.387519,/e64sOI48hQXyru7naBFyssKFxVd.jpg,"[{'name': 'Sandollar Productions', 'id': 5842}...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


In [23]:
df_movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45466 entries, 0 to 45465
Data columns (total 24 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   adult                  45466 non-null  object 
 1   belongs_to_collection  4494 non-null   object 
 2   budget                 45466 non-null  object 
 3   genres                 45466 non-null  object 
 4   homepage               7782 non-null   object 
 5   tmdbId                 45466 non-null  object 
 6   imdbId                 45449 non-null  object 
 7   original_language      45455 non-null  object 
 8   original_title         45466 non-null  object 
 9   overview               44512 non-null  object 
 10  popularity             45461 non-null  object 
 11  poster_path            45080 non-null  object 
 12  production_companies   45463 non-null  object 
 13  production_countries   45463 non-null  object 
 14  release_date           45379 non-null  object 
 15  re

In [24]:
df_movies.columns

Index(['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage',
       'tmdbId', 'imdbId', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'video',
       'vote_average', 'vote_count'],
      dtype='object')

In [25]:
df_movies = df_movies[["genres","overview","tagline","title","tmdbId"]]

In [26]:
df_movies.head()

Unnamed: 0,genres,overview,tagline,title,tmdbId
0,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...","Led by Woody, Andy's toys live happily in his ...",,Toy Story,862
1,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",When siblings Judy and Peter discover an encha...,Roll the dice and unleash the excitement!,Jumanji,8844
2,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",A family wedding reignites the ancient feud be...,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,15602
3,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...","Cheated on, mistreated and stepped on, the wom...",Friends are the people who let you be yourself...,Waiting to Exhale,31357
4,"[{'id': 35, 'name': 'Comedy'}]",Just when George Banks has recovered from his ...,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,11862


In [27]:
df_movies.fillna("", inplace=True)
df_movies.head()

Unnamed: 0,genres,overview,tagline,title,tmdbId
0,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...","Led by Woody, Andy's toys live happily in his ...",,Toy Story,862
1,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",When siblings Judy and Peter discover an encha...,Roll the dice and unleash the excitement!,Jumanji,8844
2,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",A family wedding reignites the ancient feud be...,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,15602
3,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...","Cheated on, mistreated and stepped on, the wom...",Friends are the people who let you be yourself...,Waiting to Exhale,31357
4,"[{'id': 35, 'name': 'Comedy'}]",Just when George Banks has recovered from his ...,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,11862


In [28]:
#pd.to_numeric(df_movies["tmdbId"])

In [29]:
df_movies.drop(df_movies[df_movies["tmdbId"]=="1997-08-20"].index,axis=0, inplace= True)
df_movies.drop(df_movies[df_movies["tmdbId"]=="2012-09-29"].index,axis=0, inplace= True)
df_movies.drop(df_movies[df_movies["tmdbId"]=="2014-01-01"].index,axis=0, inplace= True)
df_movies["tmdbId"] = pd.to_numeric(df_movies["tmdbId"])

In [30]:
df_movies["tmdbId"] = pd.to_numeric(df_movies["tmdbId"])

In [31]:
df_movies.head()

Unnamed: 0,genres,overview,tagline,title,tmdbId
0,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...","Led by Woody, Andy's toys live happily in his ...",,Toy Story,862
1,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",When siblings Judy and Peter discover an encha...,Roll the dice and unleash the excitement!,Jumanji,8844
2,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",A family wedding reignites the ancient feud be...,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,15602
3,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...","Cheated on, mistreated and stepped on, the wom...",Friends are the people who let you be yourself...,Waiting to Exhale,31357
4,"[{'id': 35, 'name': 'Comedy'}]",Just when George Banks has recovered from his ...,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,11862


In [32]:
df_movies.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 45463 entries, 0 to 45465
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   genres    45463 non-null  object
 1   overview  45463 non-null  object
 2   tagline   45463 non-null  object
 3   title     45463 non-null  object
 4   tmdbId    45463 non-null  int64 
dtypes: int64(1), object(4)
memory usage: 2.1+ MB


In [33]:
df_credits.head()

Unnamed: 0,cast,crew,id
0,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...",862
1,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...",8844
2,"[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de...",15602
3,"[{'cast_id': 1, 'character': ""Savannah 'Vannah...","[{'credit_id': '52fe44779251416c91011acb', 'de...",31357
4,"[{'cast_id': 1, 'character': 'George Banks', '...","[{'credit_id': '52fe44959251416c75039ed7', 'de...",11862


In [34]:
df_credits.rename(columns={"id":"tmdbId"}, inplace=True)
df_credits 

Unnamed: 0,cast,crew,tmdbId
0,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...",862
1,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...",8844
2,"[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de...",15602
3,"[{'cast_id': 1, 'character': ""Savannah 'Vannah...","[{'credit_id': '52fe44779251416c91011acb', 'de...",31357
4,"[{'cast_id': 1, 'character': 'George Banks', '...","[{'credit_id': '52fe44959251416c75039ed7', 'de...",11862
...,...,...,...
45471,"[{'cast_id': 0, 'character': '', 'credit_id': ...","[{'credit_id': '5894a97d925141426c00818c', 'de...",439050
45472,"[{'cast_id': 1002, 'character': 'Sister Angela...","[{'credit_id': '52fe4af1c3a36847f81e9b15', 'de...",111109
45473,"[{'cast_id': 6, 'character': 'Emily Shaw', 'cr...","[{'credit_id': '52fe4776c3a368484e0c8387', 'de...",67758
45474,"[{'cast_id': 2, 'character': '', 'credit_id': ...","[{'credit_id': '533bccebc3a36844cf0011a7', 'de...",227506


In [35]:
df = pd.merge(df_movies, df_keywords, on="tmdbId")

In [36]:
df.head()

Unnamed: 0,genres,overview,tagline,title,tmdbId,keywords
0,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...","Led by Woody, Andy's toys live happily in his ...",,Toy Story,862,"[{'id': 931, 'name': 'jealousy'}, {'id': 4290,..."
1,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",When siblings Judy and Peter discover an encha...,Roll the dice and unleash the excitement!,Jumanji,8844,"[{'id': 10090, 'name': 'board game'}, {'id': 1..."
2,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",A family wedding reignites the ancient feud be...,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,15602,"[{'id': 1495, 'name': 'fishing'}, {'id': 12392..."
3,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...","Cheated on, mistreated and stepped on, the wom...",Friends are the people who let you be yourself...,Waiting to Exhale,31357,"[{'id': 818, 'name': 'based on novel'}, {'id':..."
4,"[{'id': 35, 'name': 'Comedy'}]",Just when George Banks has recovered from his ...,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,11862,"[{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n..."


In [37]:
df = pd.merge(df, df_credits, on="tmdbId")
df.head()

Unnamed: 0,genres,overview,tagline,title,tmdbId,keywords,cast,crew
0,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...","Led by Woody, Andy's toys live happily in his ...",,Toy Story,862,"[{'id': 931, 'name': 'jealousy'}, {'id': 4290,...","[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de..."
1,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",When siblings Judy and Peter discover an encha...,Roll the dice and unleash the excitement!,Jumanji,8844,"[{'id': 10090, 'name': 'board game'}, {'id': 1...","[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de..."
2,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",A family wedding reignites the ancient feud be...,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,15602,"[{'id': 1495, 'name': 'fishing'}, {'id': 12392...","[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de..."
3,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...","Cheated on, mistreated and stepped on, the wom...",Friends are the people who let you be yourself...,Waiting to Exhale,31357,"[{'id': 818, 'name': 'based on novel'}, {'id':...","[{'cast_id': 1, 'character': ""Savannah 'Vannah...","[{'credit_id': '52fe44779251416c91011acb', 'de..."
4,"[{'id': 35, 'name': 'Comedy'}]",Just when George Banks has recovered from his ...,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,11862,"[{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n...","[{'cast_id': 1, 'character': 'George Banks', '...","[{'credit_id': '52fe44959251416c75039ed7', 'de..."


In [38]:
df =pd.merge(df,df_links,on="tmdbId")
df.head()

Unnamed: 0,genres,overview,tagline,title,tmdbId,keywords,cast,crew,movieId,imdbId
0,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...","Led by Woody, Andy's toys live happily in his ...",,Toy Story,862,"[{'id': 931, 'name': 'jealousy'}, {'id': 4290,...","[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...",1,114709
1,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",When siblings Judy and Peter discover an encha...,Roll the dice and unleash the excitement!,Jumanji,8844,"[{'id': 10090, 'name': 'board game'}, {'id': 1...","[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...",2,113497
2,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",A family wedding reignites the ancient feud be...,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,15602,"[{'id': 1495, 'name': 'fishing'}, {'id': 12392...","[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de...",3,113228
3,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...","Cheated on, mistreated and stepped on, the wom...",Friends are the people who let you be yourself...,Waiting to Exhale,31357,"[{'id': 818, 'name': 'based on novel'}, {'id':...","[{'cast_id': 1, 'character': ""Savannah 'Vannah...","[{'credit_id': '52fe44779251416c91011acb', 'de...",4,114885
4,"[{'id': 35, 'name': 'Comedy'}]",Just when George Banks has recovered from his ...,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,11862,"[{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n...","[{'cast_id': 1, 'character': 'George Banks', '...","[{'credit_id': '52fe44959251416c75039ed7', 'de...",5,113041


In [39]:
df["genres"][0]

"[{'id': 16, 'name': 'Animation'}, {'id': 35, 'name': 'Comedy'}, {'id': 10751, 'name': 'Family'}]"

In [40]:
df["keywords"][0]

"[{'id': 931, 'name': 'jealousy'}, {'id': 4290, 'name': 'toy'}, {'id': 5202, 'name': 'boy'}, {'id': 6054, 'name': 'friendship'}, {'id': 9713, 'name': 'friends'}, {'id': 9823, 'name': 'rivalry'}, {'id': 165503, 'name': 'boy next door'}, {'id': 170722, 'name': 'new toy'}, {'id': 187065, 'name': 'toy comes to life'}]"

In [41]:
df["cast"][0]

"[{'cast_id': 14, 'character': 'Woody (voice)', 'credit_id': '52fe4284c3a36847f8024f95', 'gender': 2, 'id': 31, 'name': 'Tom Hanks', 'order': 0, 'profile_path': '/pQFoyx7rp09CJTAb932F2g8Nlho.jpg'}, {'cast_id': 15, 'character': 'Buzz Lightyear (voice)', 'credit_id': '52fe4284c3a36847f8024f99', 'gender': 2, 'id': 12898, 'name': 'Tim Allen', 'order': 1, 'profile_path': '/uX2xVf6pMmPepxnvFWyBtjexzgY.jpg'}, {'cast_id': 16, 'character': 'Mr. Potato Head (voice)', 'credit_id': '52fe4284c3a36847f8024f9d', 'gender': 2, 'id': 7167, 'name': 'Don Rickles', 'order': 2, 'profile_path': '/h5BcaDMPRVLHLDzbQavec4xfSdt.jpg'}, {'cast_id': 17, 'character': 'Slinky Dog (voice)', 'credit_id': '52fe4284c3a36847f8024fa1', 'gender': 2, 'id': 12899, 'name': 'Jim Varney', 'order': 3, 'profile_path': '/eIo2jVVXYgjDtaHoF19Ll9vtW7h.jpg'}, {'cast_id': 18, 'character': 'Rex (voice)', 'credit_id': '52fe4284c3a36847f8024fa5', 'gender': 2, 'id': 12900, 'name': 'Wallace Shawn', 'order': 4, 'profile_path': '/oGE6JqPP2xH4t

In [42]:
df["crew"][0]

'[{\'credit_id\': \'52fe4284c3a36847f8024f49\', \'department\': \'Directing\', \'gender\': 2, \'id\': 7879, \'job\': \'Director\', \'name\': \'John Lasseter\', \'profile_path\': \'/7EdqiNbr4FRjIhKHyPPdFfEEEFG.jpg\'}, {\'credit_id\': \'52fe4284c3a36847f8024f4f\', \'department\': \'Writing\', \'gender\': 2, \'id\': 12891, \'job\': \'Screenplay\', \'name\': \'Joss Whedon\', \'profile_path\': \'/dTiVsuaTVTeGmvkhcyJvKp2A5kr.jpg\'}, {\'credit_id\': \'52fe4284c3a36847f8024f55\', \'department\': \'Writing\', \'gender\': 2, \'id\': 7, \'job\': \'Screenplay\', \'name\': \'Andrew Stanton\', \'profile_path\': \'/pvQWsu0qc8JFQhMVJkTHuexUAa1.jpg\'}, {\'credit_id\': \'52fe4284c3a36847f8024f5b\', \'department\': \'Writing\', \'gender\': 2, \'id\': 12892, \'job\': \'Screenplay\', \'name\': \'Joel Cohen\', \'profile_path\': \'/dAubAiZcvKFbboWlj7oXOkZnTSu.jpg\'}, {\'credit_id\': \'52fe4284c3a36847f8024f61\', \'department\': \'Writing\', \'gender\': 0, \'id\': 12893, \'job\': \'Screenplay\', \'name\': \'A

In [43]:
ast.literal_eval(df["crew"][0])

[{'credit_id': '52fe4284c3a36847f8024f49',
  'department': 'Directing',
  'gender': 2,
  'id': 7879,
  'job': 'Director',
  'name': 'John Lasseter',
  'profile_path': '/7EdqiNbr4FRjIhKHyPPdFfEEEFG.jpg'},
 {'credit_id': '52fe4284c3a36847f8024f4f',
  'department': 'Writing',
  'gender': 2,
  'id': 12891,
  'job': 'Screenplay',
  'name': 'Joss Whedon',
  'profile_path': '/dTiVsuaTVTeGmvkhcyJvKp2A5kr.jpg'},
 {'credit_id': '52fe4284c3a36847f8024f55',
  'department': 'Writing',
  'gender': 2,
  'id': 7,
  'job': 'Screenplay',
  'name': 'Andrew Stanton',
  'profile_path': '/pvQWsu0qc8JFQhMVJkTHuexUAa1.jpg'},
 {'credit_id': '52fe4284c3a36847f8024f5b',
  'department': 'Writing',
  'gender': 2,
  'id': 12892,
  'job': 'Screenplay',
  'name': 'Joel Cohen',
  'profile_path': '/dAubAiZcvKFbboWlj7oXOkZnTSu.jpg'},
 {'credit_id': '52fe4284c3a36847f8024f61',
  'department': 'Writing',
  'gender': 0,
  'id': 12893,
  'job': 'Screenplay',
  'name': 'Alec Sokolow',
  'profile_path': '/v79vlRYi94BZUQnkkyzn

In [44]:
def extract_genres_name(column_name):
    name = []
    for row in ast.literal_eval(column_name):
        name.append(row["name"])
        break
    return(name)    
df["genre"] = df["genres"].apply(extract_genres_name)
df["genre"] = df["genre"].apply(lambda x : " ".join(x))

In [45]:
def name_extract_genres(column_name):
    name = ["genre","of","movie","are"]
    for row in ast.literal_eval(column_name):
        name.append(row["name"])
    return(name)

In [46]:
df["genres"] = df["genres"].apply(name_extract_genres)
df.head()

Unnamed: 0,genres,overview,tagline,title,tmdbId,keywords,cast,crew,movieId,imdbId,genre
0,"[genre, of, movie, are, Animation, Comedy, Fam...","Led by Woody, Andy's toys live happily in his ...",,Toy Story,862,"[{'id': 931, 'name': 'jealousy'}, {'id': 4290,...","[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...",1,114709,Animation
1,"[genre, of, movie, are, Adventure, Fantasy, Fa...",When siblings Judy and Peter discover an encha...,Roll the dice and unleash the excitement!,Jumanji,8844,"[{'id': 10090, 'name': 'board game'}, {'id': 1...","[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...",2,113497,Adventure
2,"[genre, of, movie, are, Romance, Comedy]",A family wedding reignites the ancient feud be...,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,15602,"[{'id': 1495, 'name': 'fishing'}, {'id': 12392...","[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de...",3,113228,Romance
3,"[genre, of, movie, are, Comedy, Drama, Romance]","Cheated on, mistreated and stepped on, the wom...",Friends are the people who let you be yourself...,Waiting to Exhale,31357,"[{'id': 818, 'name': 'based on novel'}, {'id':...","[{'cast_id': 1, 'character': ""Savannah 'Vannah...","[{'credit_id': '52fe44779251416c91011acb', 'de...",4,114885,Comedy
4,"[genre, of, movie, are, Comedy]",Just when George Banks has recovered from his ...,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,11862,"[{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n...","[{'cast_id': 1, 'character': 'George Banks', '...","[{'credit_id': '52fe44959251416c75039ed7', 'de...",5,113041,Comedy


In [47]:
def name_extract_keywords(column_name):
    name = []
    for row in ast.literal_eval(column_name):
        name.append(row["name"])
    return(name)

In [48]:
df["keywords"] = df["keywords"].apply(name_extract_keywords)
df.head()

Unnamed: 0,genres,overview,tagline,title,tmdbId,keywords,cast,crew,movieId,imdbId,genre
0,"[genre, of, movie, are, Animation, Comedy, Fam...","Led by Woody, Andy's toys live happily in his ...",,Toy Story,862,"[jealousy, toy, boy, friendship, friends, riva...","[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...",1,114709,Animation
1,"[genre, of, movie, are, Adventure, Fantasy, Fa...",When siblings Judy and Peter discover an encha...,Roll the dice and unleash the excitement!,Jumanji,8844,"[board game, disappearance, based on children'...","[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...",2,113497,Adventure
2,"[genre, of, movie, are, Romance, Comedy]",A family wedding reignites the ancient feud be...,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,15602,"[fishing, best friend, duringcreditsstinger, o...","[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de...",3,113228,Romance
3,"[genre, of, movie, are, Comedy, Drama, Romance]","Cheated on, mistreated and stepped on, the wom...",Friends are the people who let you be yourself...,Waiting to Exhale,31357,"[based on novel, interracial relationship, sin...","[{'cast_id': 1, 'character': ""Savannah 'Vannah...","[{'credit_id': '52fe44779251416c91011acb', 'de...",4,114885,Comedy
4,"[genre, of, movie, are, Comedy]",Just when George Banks has recovered from his ...,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,11862,"[baby, midlife crisis, confidence, aging, daug...","[{'cast_id': 1, 'character': 'George Banks', '...","[{'credit_id': '52fe44959251416c75039ed7', 'de...",5,113041,Comedy


In [49]:
def extract_director_name(column_name):
    name = []
    for row in ast.literal_eval(column_name):
        if row["job"]=="Director":
            name.append(row["name"])
    return(name)
df["director"] = df["crew"].apply(extract_director_name)
df["director"] = df["director"].apply(lambda x : " ".join(x))

In [50]:
def name_extract_director(column_name):
    name = ["movie","directed","by"]
    for row in ast.literal_eval(column_name):
        if row["job"]=="Director":
            name.append(row["name"])
    return(name)

In [51]:
df["crew"] = df["crew"].apply(name_extract_director)
df.head()

Unnamed: 0,genres,overview,tagline,title,tmdbId,keywords,cast,crew,movieId,imdbId,genre,director
0,"[genre, of, movie, are, Animation, Comedy, Fam...","Led by Woody, Andy's toys live happily in his ...",,Toy Story,862,"[jealousy, toy, boy, friendship, friends, riva...","[{'cast_id': 14, 'character': 'Woody (voice)',...","[movie, directed, by, John Lasseter]",1,114709,Animation,John Lasseter
1,"[genre, of, movie, are, Adventure, Fantasy, Fa...",When siblings Judy and Peter discover an encha...,Roll the dice and unleash the excitement!,Jumanji,8844,"[board game, disappearance, based on children'...","[{'cast_id': 1, 'character': 'Alan Parrish', '...","[movie, directed, by, Joe Johnston]",2,113497,Adventure,Joe Johnston
2,"[genre, of, movie, are, Romance, Comedy]",A family wedding reignites the ancient feud be...,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,15602,"[fishing, best friend, duringcreditsstinger, o...","[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[movie, directed, by, Howard Deutch]",3,113228,Romance,Howard Deutch
3,"[genre, of, movie, are, Comedy, Drama, Romance]","Cheated on, mistreated and stepped on, the wom...",Friends are the people who let you be yourself...,Waiting to Exhale,31357,"[based on novel, interracial relationship, sin...","[{'cast_id': 1, 'character': ""Savannah 'Vannah...","[movie, directed, by, Forest Whitaker]",4,114885,Comedy,Forest Whitaker
4,"[genre, of, movie, are, Comedy]",Just when George Banks has recovered from his ...,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,11862,"[baby, midlife crisis, confidence, aging, daug...","[{'cast_id': 1, 'character': 'George Banks', '...","[movie, directed, by, Charles Shyer]",5,113041,Comedy,Charles Shyer


In [52]:
def extract_name_top_5_actor(column_name):
    name = []
    for row in ast.literal_eval(column_name):
            name.append(row["name"])
            break
    return(name)
df["lead_role"] = df["cast"].apply(extract_name_top_5_actor)
df["lead_role"] = df["lead_role"].apply(lambda x : " ".join(x))

In [53]:
def name_extract_top_5_actor(column_name):
    name = ["top","five","actors","and","actresses","in","movie","are"]
    count = 0
    for row in ast.literal_eval(column_name):
        if count<5:
            name.append(row["name"])
            count += 1
    return(name)

In [54]:
name_extract_top_5_actor(df["cast"][0])

['top',
 'five',
 'actors',
 'and',
 'actresses',
 'in',
 'movie',
 'are',
 'Tom Hanks',
 'Tim Allen',
 'Don Rickles',
 'Jim Varney',
 'Wallace Shawn']

In [55]:
df["cast"] = df["cast"].apply(name_extract_top_5_actor)
df.head()

Unnamed: 0,genres,overview,tagline,title,tmdbId,keywords,cast,crew,movieId,imdbId,genre,director,lead_role
0,"[genre, of, movie, are, Animation, Comedy, Fam...","Led by Woody, Andy's toys live happily in his ...",,Toy Story,862,"[jealousy, toy, boy, friendship, friends, riva...","[top, five, actors, and, actresses, in, movie,...","[movie, directed, by, John Lasseter]",1,114709,Animation,John Lasseter,Tom Hanks
1,"[genre, of, movie, are, Adventure, Fantasy, Fa...",When siblings Judy and Peter discover an encha...,Roll the dice and unleash the excitement!,Jumanji,8844,"[board game, disappearance, based on children'...","[top, five, actors, and, actresses, in, movie,...","[movie, directed, by, Joe Johnston]",2,113497,Adventure,Joe Johnston,Robin Williams
2,"[genre, of, movie, are, Romance, Comedy]",A family wedding reignites the ancient feud be...,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,15602,"[fishing, best friend, duringcreditsstinger, o...","[top, five, actors, and, actresses, in, movie,...","[movie, directed, by, Howard Deutch]",3,113228,Romance,Howard Deutch,Walter Matthau
3,"[genre, of, movie, are, Comedy, Drama, Romance]","Cheated on, mistreated and stepped on, the wom...",Friends are the people who let you be yourself...,Waiting to Exhale,31357,"[based on novel, interracial relationship, sin...","[top, five, actors, and, actresses, in, movie,...","[movie, directed, by, Forest Whitaker]",4,114885,Comedy,Forest Whitaker,Whitney Houston
4,"[genre, of, movie, are, Comedy]",Just when George Banks has recovered from his ...,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,11862,"[baby, midlife crisis, confidence, aging, daug...","[top, five, actors, and, actresses, in, movie,...","[movie, directed, by, Charles Shyer]",5,113041,Comedy,Charles Shyer,Steve Martin


In [56]:
df.drop(columns=["tmdbId","imdbId"], inplace=True)
df.head()

Unnamed: 0,genres,overview,tagline,title,keywords,cast,crew,movieId,genre,director,lead_role
0,"[genre, of, movie, are, Animation, Comedy, Fam...","Led by Woody, Andy's toys live happily in his ...",,Toy Story,"[jealousy, toy, boy, friendship, friends, riva...","[top, five, actors, and, actresses, in, movie,...","[movie, directed, by, John Lasseter]",1,Animation,John Lasseter,Tom Hanks
1,"[genre, of, movie, are, Adventure, Fantasy, Fa...",When siblings Judy and Peter discover an encha...,Roll the dice and unleash the excitement!,Jumanji,"[board game, disappearance, based on children'...","[top, five, actors, and, actresses, in, movie,...","[movie, directed, by, Joe Johnston]",2,Adventure,Joe Johnston,Robin Williams
2,"[genre, of, movie, are, Romance, Comedy]",A family wedding reignites the ancient feud be...,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,"[fishing, best friend, duringcreditsstinger, o...","[top, five, actors, and, actresses, in, movie,...","[movie, directed, by, Howard Deutch]",3,Romance,Howard Deutch,Walter Matthau
3,"[genre, of, movie, are, Comedy, Drama, Romance]","Cheated on, mistreated and stepped on, the wom...",Friends are the people who let you be yourself...,Waiting to Exhale,"[based on novel, interracial relationship, sin...","[top, five, actors, and, actresses, in, movie,...","[movie, directed, by, Forest Whitaker]",4,Comedy,Forest Whitaker,Whitney Houston
4,"[genre, of, movie, are, Comedy]",Just when George Banks has recovered from his ...,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,"[baby, midlife crisis, confidence, aging, daug...","[top, five, actors, and, actresses, in, movie,...","[movie, directed, by, Charles Shyer]",5,Comedy,Charles Shyer,Steve Martin


In [57]:
df["genres"] = df["genres"].apply(lambda join_spaces :[spaces.replace(" ","") for spaces in join_spaces])
df["keywords"] = df["keywords"].apply(lambda join_spaces :[spaces.replace(" ","") for spaces in join_spaces])
df["cast"] = df["cast"].apply(lambda join_spaces :[spaces.replace(" ","") for spaces in join_spaces])
df["crew"] = df["crew"].apply(lambda join_spaces :[spaces.replace(" ","") for spaces in join_spaces])
df.head()

Unnamed: 0,genres,overview,tagline,title,keywords,cast,crew,movieId,genre,director,lead_role
0,"[genre, of, movie, are, Animation, Comedy, Fam...","Led by Woody, Andy's toys live happily in his ...",,Toy Story,"[jealousy, toy, boy, friendship, friends, riva...","[top, five, actors, and, actresses, in, movie,...","[movie, directed, by, JohnLasseter]",1,Animation,John Lasseter,Tom Hanks
1,"[genre, of, movie, are, Adventure, Fantasy, Fa...",When siblings Judy and Peter discover an encha...,Roll the dice and unleash the excitement!,Jumanji,"[boardgame, disappearance, basedonchildren'sbo...","[top, five, actors, and, actresses, in, movie,...","[movie, directed, by, JoeJohnston]",2,Adventure,Joe Johnston,Robin Williams
2,"[genre, of, movie, are, Romance, Comedy]",A family wedding reignites the ancient feud be...,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,"[fishing, bestfriend, duringcreditsstinger, ol...","[top, five, actors, and, actresses, in, movie,...","[movie, directed, by, HowardDeutch]",3,Romance,Howard Deutch,Walter Matthau
3,"[genre, of, movie, are, Comedy, Drama, Romance]","Cheated on, mistreated and stepped on, the wom...",Friends are the people who let you be yourself...,Waiting to Exhale,"[basedonnovel, interracialrelationship, single...","[top, five, actors, and, actresses, in, movie,...","[movie, directed, by, ForestWhitaker]",4,Comedy,Forest Whitaker,Whitney Houston
4,"[genre, of, movie, are, Comedy]",Just when George Banks has recovered from his ...,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,"[baby, midlifecrisis, confidence, aging, daugh...","[top, five, actors, and, actresses, in, movie,...","[movie, directed, by, CharlesShyer]",5,Comedy,Charles Shyer,Steve Martin


In [58]:
df["overview"]=df["overview"].apply(lambda x: x.strip('()').split(','))
df["tagline"]=df["tagline"].apply(lambda x: x.strip('()').split(','))
df.head()

Unnamed: 0,genres,overview,tagline,title,keywords,cast,crew,movieId,genre,director,lead_role
0,"[genre, of, movie, are, Animation, Comedy, Fam...","[Led by Woody, Andy's toys live happily in hi...",[],Toy Story,"[jealousy, toy, boy, friendship, friends, riva...","[top, five, actors, and, actresses, in, movie,...","[movie, directed, by, JohnLasseter]",1,Animation,John Lasseter,Tom Hanks
1,"[genre, of, movie, are, Adventure, Fantasy, Fa...",[When siblings Judy and Peter discover an ench...,[Roll the dice and unleash the excitement!],Jumanji,"[boardgame, disappearance, basedonchildren'sbo...","[top, five, actors, and, actresses, in, movie,...","[movie, directed, by, JoeJohnston]",2,Adventure,Joe Johnston,Robin Williams
2,"[genre, of, movie, are, Romance, Comedy]",[A family wedding reignites the ancient feud b...,[Still Yelling. Still Fighting. Still Ready fo...,Grumpier Old Men,"[fishing, bestfriend, duringcreditsstinger, ol...","[top, five, actors, and, actresses, in, movie,...","[movie, directed, by, HowardDeutch]",3,Romance,Howard Deutch,Walter Matthau
3,"[genre, of, movie, are, Comedy, Drama, Romance]","[Cheated on, mistreated and stepped on, the ...",[Friends are the people who let you be yoursel...,Waiting to Exhale,"[basedonnovel, interracialrelationship, single...","[top, five, actors, and, actresses, in, movie,...","[movie, directed, by, ForestWhitaker]",4,Comedy,Forest Whitaker,Whitney Houston
4,"[genre, of, movie, are, Comedy]",[Just when George Banks has recovered from his...,[Just When His World Is Back To Normal... He's...,Father of the Bride Part II,"[baby, midlifecrisis, confidence, aging, daugh...","[top, five, actors, and, actresses, in, movie,...","[movie, directed, by, CharlesShyer]",5,Comedy,Charles Shyer,Steve Martin


In [59]:
df["information"] = df["cast"] + df["overview"] + df["tagline"] + df["genres"] + df["crew"] + df["keywords"]
df.head()

Unnamed: 0,genres,overview,tagline,title,keywords,cast,crew,movieId,genre,director,lead_role,information
0,"[genre, of, movie, are, Animation, Comedy, Fam...","[Led by Woody, Andy's toys live happily in hi...",[],Toy Story,"[jealousy, toy, boy, friendship, friends, riva...","[top, five, actors, and, actresses, in, movie,...","[movie, directed, by, JohnLasseter]",1,Animation,John Lasseter,Tom Hanks,"[top, five, actors, and, actresses, in, movie,..."
1,"[genre, of, movie, are, Adventure, Fantasy, Fa...",[When siblings Judy and Peter discover an ench...,[Roll the dice and unleash the excitement!],Jumanji,"[boardgame, disappearance, basedonchildren'sbo...","[top, five, actors, and, actresses, in, movie,...","[movie, directed, by, JoeJohnston]",2,Adventure,Joe Johnston,Robin Williams,"[top, five, actors, and, actresses, in, movie,..."
2,"[genre, of, movie, are, Romance, Comedy]",[A family wedding reignites the ancient feud b...,[Still Yelling. Still Fighting. Still Ready fo...,Grumpier Old Men,"[fishing, bestfriend, duringcreditsstinger, ol...","[top, five, actors, and, actresses, in, movie,...","[movie, directed, by, HowardDeutch]",3,Romance,Howard Deutch,Walter Matthau,"[top, five, actors, and, actresses, in, movie,..."
3,"[genre, of, movie, are, Comedy, Drama, Romance]","[Cheated on, mistreated and stepped on, the ...",[Friends are the people who let you be yoursel...,Waiting to Exhale,"[basedonnovel, interracialrelationship, single...","[top, five, actors, and, actresses, in, movie,...","[movie, directed, by, ForestWhitaker]",4,Comedy,Forest Whitaker,Whitney Houston,"[top, five, actors, and, actresses, in, movie,..."
4,"[genre, of, movie, are, Comedy]",[Just when George Banks has recovered from his...,[Just When His World Is Back To Normal... He's...,Father of the Bride Part II,"[baby, midlifecrisis, confidence, aging, daugh...","[top, five, actors, and, actresses, in, movie,...","[movie, directed, by, CharlesShyer]",5,Comedy,Charles Shyer,Steve Martin,"[top, five, actors, and, actresses, in, movie,..."


In [60]:
df_new = df[["movieId","title","lead_role","director","genre","information"]] 

In [61]:
df_new

Unnamed: 0,movieId,title,lead_role,director,genre,information
0,1,Toy Story,Tom Hanks,John Lasseter,Animation,"[top, five, actors, and, actresses, in, movie,..."
1,2,Jumanji,Robin Williams,Joe Johnston,Adventure,"[top, five, actors, and, actresses, in, movie,..."
2,3,Grumpier Old Men,Walter Matthau,Howard Deutch,Romance,"[top, five, actors, and, actresses, in, movie,..."
3,4,Waiting to Exhale,Whitney Houston,Forest Whitaker,Comedy,"[top, five, actors, and, actresses, in, movie,..."
4,5,Father of the Bride Part II,Steve Martin,Charles Shyer,Comedy,"[top, five, actors, and, actresses, in, movie,..."
...,...,...,...,...,...,...
9214,161944,The Last Brickmaker in America,Sidney Poitier,Gregg Champion,Drama,"[top, five, actors, and, actresses, in, movie,..."
9215,162542,Rustom,Akshay Kumar,Tinu Suresh Desai,Thriller,"[top, five, actors, and, actresses, in, movie,..."
9216,162672,Mohenjo Daro,Hrithik Roshan,Ashutosh Gowariker,Adventure,"[top, five, actors, and, actresses, in, movie,..."
9217,163056,Shin Godzilla,Hiroki Hasegawa,Hideaki Anno Shinji Higuchi,Action,"[top, five, actors, and, actresses, in, movie,..."


In [62]:
df_new["information"] = df_new["information"].apply(lambda x : " ".join(x))
df_new.head()

Unnamed: 0,movieId,title,lead_role,director,genre,information
0,1,Toy Story,Tom Hanks,John Lasseter,Animation,top five actors and actresses in movie are Tom...
1,2,Jumanji,Robin Williams,Joe Johnston,Adventure,top five actors and actresses in movie are Rob...
2,3,Grumpier Old Men,Walter Matthau,Howard Deutch,Romance,top five actors and actresses in movie are Wal...
3,4,Waiting to Exhale,Whitney Houston,Forest Whitaker,Comedy,top five actors and actresses in movie are Whi...
4,5,Father of the Bride Part II,Steve Martin,Charles Shyer,Comedy,top five actors and actresses in movie are Ste...


In [63]:
df_new["information"][0]

"top five actors and actresses in movie are TomHanks TimAllen DonRickles JimVarney WallaceShawn Led by Woody  Andy's toys live happily in his room until Andy's birthday brings Buzz Lightyear onto the scene. Afraid of losing his place in Andy's heart  Woody plots against Buzz. But when circumstances separate Buzz and Woody from their owner  the duo eventually learns to put aside their differences.  genre of movie are Animation Comedy Family movie directed by JohnLasseter jealousy toy boy friendship friends rivalry boynextdoor newtoy toycomestolife"

In [64]:
df_new["information"] = df_new["information"].apply(lambda x : x.lower())

In [65]:
df_new["information"][0]

"top five actors and actresses in movie are tomhanks timallen donrickles jimvarney wallaceshawn led by woody  andy's toys live happily in his room until andy's birthday brings buzz lightyear onto the scene. afraid of losing his place in andy's heart  woody plots against buzz. but when circumstances separate buzz and woody from their owner  the duo eventually learns to put aside their differences.  genre of movie are animation comedy family movie directed by johnlasseter jealousy toy boy friendship friends rivalry boynextdoor newtoy toycomestolife"

In [66]:
stemmer = PorterStemmer()
def root_word(doc):
    l = []
    for word in doc.split():
        l.append(stemmer.stem(word))
    stemmed_doc = " ".join(l)
    return(stemmed_doc)

In [67]:
df_new["information"] = df_new["information"].apply(root_word)

In [68]:
df_new.head()

Unnamed: 0,movieId,title,lead_role,director,genre,information
0,1,Toy Story,Tom Hanks,John Lasseter,Animation,top five actor and actress in movi are tomhank...
1,2,Jumanji,Robin Williams,Joe Johnston,Adventure,top five actor and actress in movi are robinwi...
2,3,Grumpier Old Men,Walter Matthau,Howard Deutch,Romance,top five actor and actress in movi are walterm...
3,4,Waiting to Exhale,Whitney Houston,Forest Whitaker,Comedy,top five actor and actress in movi are whitney...
4,5,Father of the Bride Part II,Steve Martin,Charles Shyer,Comedy,top five actor and actress in movi are stevema...


### Model Building

In [69]:
cv = CountVectorizer(max_features=10000, stop_words="english")
vector = cv.fit_transform(df_new["information"]).toarray()

In [70]:
vector

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [71]:
len(df_new["title"])

9219

In [72]:
similar_movies =  cosine_similarity(vector)

In [73]:
similar_movies

array([[1.        , 0.21081851, 0.20643071, ..., 0.27467513, 0.17011277,
        0.16230861],
       [0.21081851, 1.        , 0.23210354, ..., 0.31180478, 0.20690147,
        0.18330889],
       [0.20643071, 0.23210354, 1.        , ..., 0.37219368, 0.20169477,
        0.16593191],
       ...,
       [0.27467513, 0.31180478, 0.37219368, ..., 1.        , 0.33178047,
        0.25475509],
       [0.17011277, 0.20690147, 0.20169477, ..., 0.33178047, 1.        ,
        0.16991242],
       [0.16230861, 0.18330889, 0.16593191, ..., 0.25475509, 0.16991242,
        1.        ]])

In [74]:
similar_movies.shape

(9219, 9219)

In [75]:
similar_movies[0]

array([1.        , 0.21081851, 0.20643071, ..., 0.27467513, 0.17011277,
       0.16230861])

In [76]:
for i in similar_movies:
    list(enumerate(i))

In [77]:
x = enumerate(similar_movies[0])
p = list(x)

In [78]:
p

[(0, 1.0),
 (1, 0.21081851067789198),
 (2, 0.20643070596119262),
 (3, 0.2457180467335805),
 (4, 0.2785033259480083),
 (5, 0.1532064692570853),
 (6, 0.27279773578818944),
 (7, 0.2355407651655962),
 (8, 0.16940773179473462),
 (9, 0.21429865385363078),
 (10, 0.22019275302527214),
 (11, 0.30338993810845893),
 (12, 0.2661453237111885),
 (13, 0.2521038239062807),
 (14, 0.16672148221383754),
 (15, 0.264575131106459),
 (16, 0.28653494154687786),
 (17, 0.2691909510290827),
 (18, 0.21833344115234327),
 (19, 0.26967994498529685),
 (20, 0.23138673905141918),
 (21, 0.18763883748662838),
 (22, 0.20830127958541944),
 (23, 0.23904572186687872),
 (24, 0.2138089935299395),
 (25, 0.32024493467004),
 (26, 0.2253744679276044),
 (27, 0.1936491673103708),
 (28, 0.24956859329374897),
 (29, 0.23395480008935163),
 (30, 0.1926686588971131),
 (31, 0.1499113213046938),
 (32, 0.22697729311273637),
 (33, 0.17954621161490195),
 (34, 0.22786635759382495),
 (35, 0.2734570552425863),
 (36, 0.23488808780588138),
 (37, 0.

In [79]:
sorted(p,key = lambda x:x[1], reverse=True)

[(0, 1.0),
 (2522, 0.5448919310238),
 (7629, 0.5096471914376255),
 (3797, 0.41714884381325673),
 (1609, 0.41504894289709976),
 (8056, 0.40414518843273806),
 (9118, 0.40414518843273806),
 (8921, 0.4),
 (8245, 0.3952847075210474),
 (691, 0.3884492980336779),
 (8861, 0.3884492980336779),
 (9036, 0.3884492980336779),
 (2174, 0.3834824944236852),
 (8707, 0.38087503130775613),
 (7792, 0.3803194146278325),
 (1186, 0.3796283011826483),
 (8463, 0.37527767497325676),
 (6161, 0.3730019232961255),
 (7080, 0.37274965820548744),
 (7617, 0.3726779962499649),
 (1676, 0.37116090401154417),
 (8758, 0.3711609040115441),
 (848, 0.37080992435478316),
 (8519, 0.3694942137922785),
 (1430, 0.3689323936863109),
 (845, 0.36742346141747667),
 (8566, 0.36742346141747667),
 (1757, 0.3633610463437158),
 (5225, 0.3633610463437158),
 (7392, 0.3633610463437158),
 (9121, 0.3633610463437158),
 (721, 0.3618136134933163),
 (2210, 0.3614784456460255),
 (5949, 0.36144869800612456),
 (7587, 0.3590924232298039),
 (3092, 0.359

In [80]:
q = enumerate(similar_movies)
list(q)

[(0,
  array([1.        , 0.21081851, 0.20643071, ..., 0.27467513, 0.17011277,
         0.16230861])),
 (1,
  array([0.21081851, 1.        , 0.23210354, ..., 0.31180478, 0.20690147,
         0.18330889])),
 (2,
  array([0.20643071, 0.23210354, 1.        , ..., 0.37219368, 0.20169477,
         0.16593191])),
 (3,
  array([0.24571805, 0.21044527, 0.25361875, ..., 0.3893806 , 0.22507573,
         0.18516715])),
 (4,
  array([0.27850333, 0.18681618, 0.2090605 , ..., 0.29957234, 0.1987845 ,
         0.18785865])),
 (5,
  array([0.15320647, 0.1987616 , 0.16867477, ..., 0.27888668, 0.1850583 ,
         0.1420956 ])),
 (6,
  array([0.27279774, 0.26958193, 0.28156915, ..., 0.40347329, 0.24988051,
         0.25301384])),
 (7,
  array([0.23554077, 0.2206949 , 0.20169477, ..., 0.37601786, 0.23287671,
         0.15777582])),
 (8,
  array([0.16940773, 0.27380952, 0.16164354, ..., 0.26726124, 0.21281294,
         0.13617232])),
 (9,
  array([0.21429865, 0.24326682, 0.23593505, ..., 0.39009475, 0.2761

### Recommendation (Output of model) 

In [81]:
def movie_recommend_on_movie_name(movie_name):
    movieid = df_new[df_new["title"] == movie_name].index[0] 
    distance = similar_movies[movieid]
    sorted_distance = sorted(list(enumerate(distance)), reverse=True, key = lambda x:x[1])[1:26]
    
    for i in sorted_distance:
        print(df_new.iloc[i[0]].title)

In [89]:
movie_recommend_on_movie_name("Iron Man")

Iron Man 3
The Three Musketeers
Iron Man 2
İtirazım Var
The Day the Sun Turned Cold
Off Beat
El vals de los inútiles
The Mark of Zorro
Descongelate!
The Forsyte Saga
Bana Masal Anlatma
28 Up
Avengers: Age of Ultron
Kaspar Hauser
Guantanamera
Pek Yakında
Disaster Movie
Storefront Hitchcock
Slaves of New York
Guardians of the Galaxy
Superhero Movie
Babylon 5: A Call to Arms
Hamlet
Steam of Life
Jazz


In [83]:
def movie_recommend_on_director_name(director_name):
    movieid = df_new[df_new["director"] == director_name].index[0] 
    distance = similar_movies[movieid]
    sorted_distance = sorted(list(enumerate(distance)), reverse=True, key = lambda x:x[1])[1:26]
    
    for i in sorted_distance:
        print(df_new.iloc[i[0]].title)

In [84]:
movie_recommend_on_director_name("Ashutosh Gowariker")

Mohenjo Daro
Off Beat
Guantanamera
The Indian in the Cupboard
Descongelate!
Bana Masal Anlatma
Ismael
Samurai III: Duel at Ganryu Island
The Last Detail
The Three Musketeers
İtirazım Var
The Day the Sun Turned Cold
El vals de los inútiles
Steam of Life
Kirikou and the Sorceress
Certified Copy
Fados
Can't Stop the Music
The Wind Will Carry Us
The Mark of Zorro
Kaspar Hauser
Disaster Movie
The Forsyte Saga
Mississippi Masala
Paul Williams Still Alive


In [85]:
def movie_recommend_on_actor_name(actor_name):
    movieid = df_new[df_new["lead_role"] == actor_name].index[0] 
    distance = similar_movies[movieid]
    sorted_distance = sorted(list(enumerate(distance)), reverse=True, key = lambda x:x[1])[1:26]
    
    for i in sorted_distance:
        print(df_new.iloc[i[0]].title)

In [86]:
movie_recommend_on_actor_name("Tom Cruise")

The Limits of Control
The Inheritance
Wrong
The Truman Show
Colonel Chabert
I Origins
Fullmetal Alchemist the Movie: Conqueror of Shamballa
Honeysuckle Rose
Premature
Stardust Memories
Greenberg
How I Killed My Father
Song of the South
How to Get Ahead in Advertising
Nothing in Common
Easy Money
The Closet
Synecdoche, New York
Love and a Bullet
Hesher
A Fantastic Fear of Everything
Sexy Beast
Interview with the Assassin
Mirror
J. Edgar


In [87]:
def movie_recommend_on_genre_name(genre_name):
    movieid = df_new[df_new["genre"] == genre_name].index[0] 
    distance = similar_movies[movieid]
    sorted_distance = sorted(list(enumerate(distance)), reverse=True, key = lambda x:x[1])[1:26]
    
    for i in sorted_distance:
        print(df_new.iloc[i[0]].title)

In [88]:
movie_recommend_on_genre_name("Action")

İtirazım Var
Night Moves
Kiss Kiss Bang Bang
Insomnia
Number Seventeen
Le Cercle Rouge
The Getaway
The Score
Off Beat
Bob le Flambeur
Read My Lips
Nightfall
Blood and Wine
The Late Show
The Dancer Upstairs
Descongelate!
Bana Masal Anlatma
The Long Goodbye
Matador
Little Caesar
Possessed
The Three Musketeers
Drive Hard
T-Men
The Day the Sun Turned Cold
