# Movie Recommendation Project

## Data Exploration

In [1]:
import pandas as pd
from pprint import PrettyPrinter

In [2]:
pp = PrettyPrinter(indent=4)

## Loading the Data

In [3]:
#Checking if I have the files I need in my folder
!ls -l 'tmdb_5000_movies.csv' 'tmdb_5000_credits.csv'

-rw-r--r--@ 1 subhantariq  staff  40044293 Apr 25 15:40 tmdb_5000_credits.csv
-rw-r--r--@ 1 subhantariq  staff   5722964 Apr 25 15:40 tmdb_5000_movies.csv


In [4]:
# Load data files
movies_df_all = pd.read_csv('tmdb_5000_movies.csv')
credits_df_all = pd.read_csv('tmdb_5000_credits.csv')

## Exploring the Data

In [5]:
movies_df_all.shape

(4803, 20)

In [6]:
movies_df_all.head()

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800
1,300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",http://disney.go.com/disneypictures/pirates/,285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2007-05-19,961000000,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500
2,245000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.sonypictures.com/movies/spectre/,206647,"[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...",en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,"[{""name"": ""Columbia Pictures"", ""id"": 5}, {""nam...","[{""iso_3166_1"": ""GB"", ""name"": ""United Kingdom""...",2015-10-26,880674609,148.0,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,A Plan No One Escapes,Spectre,6.3,4466
3,250000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...",http://www.thedarkknightrises.com/,49026,"[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853,...",en,The Dark Knight Rises,Following the death of District Attorney Harve...,112.31295,"[{""name"": ""Legendary Pictures"", ""id"": 923}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2012-07-16,1084939099,165.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,The Legend Ends,The Dark Knight Rises,7.6,9106
4,260000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://movies.disney.com/john-carter,49529,"[{""id"": 818, ""name"": ""based on novel""}, {""id"":...",en,John Carter,"John Carter is a war-weary, former military ca...",43.926995,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}]","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2012-03-07,284139100,132.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"Lost in our world, found in another.",John Carter,6.1,2124


In [7]:
movies_df_all.columns

Index(['budget', 'genres', 'homepage', 'id', 'keywords', 'original_language',
       'original_title', 'overview', 'popularity', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'vote_average',
       'vote_count'],
      dtype='object')

In [8]:
#Now we will explore the other datafile (credits file)
credits_df_all.shape

(4803, 4)

In [9]:
credits_df_all.head()

Unnamed: 0,movie_id,title,cast,crew
0,19995,Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,Pirates of the Caribbean: At World's End,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."
2,206647,Spectre,"[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de..."
3,49026,The Dark Knight Rises,"[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de..."
4,49529,John Carter,"[{""cast_id"": 5, ""character"": ""John Carter"", ""c...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de..."


In [10]:
credits_df_all.columns

Index(['movie_id', 'title', 'cast', 'crew'], dtype='object')

## Feature Selection

In [11]:
movies_df = movies_df_all[['title','overview','genres','keywords','production_companies',\
                           'release_date','original_language']]

In [12]:
movies_df.shape

(4803, 7)

In [13]:
movies_df.head()

Unnamed: 0,title,overview,genres,keywords,production_companies,release_date,original_language
0,Avatar,"In the 22nd century, a paraplegic Marine is di...","[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...","[{""name"": ""Ingenious Film Partners"", ""id"": 289...",2009-12-10,en
1,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...","[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...","[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...","[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...",2007-05-19,en
2,Spectre,A cryptic message from Bond’s past sends him o...,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...","[{""name"": ""Columbia Pictures"", ""id"": 5}, {""nam...",2015-10-26,en
3,The Dark Knight Rises,Following the death of District Attorney Harve...,"[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...","[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853,...","[{""name"": ""Legendary Pictures"", ""id"": 923}, {""...",2012-07-16,en
4,John Carter,"John Carter is a war-weary, former military ca...","[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 818, ""name"": ""based on novel""}, {""id"":...","[{""name"": ""Walt Disney Pictures"", ""id"": 2}]",2012-03-07,en


In [14]:
credits_df = credits_df_all[['title','cast','crew']]

In [15]:
credits_df.head()

Unnamed: 0,title,cast,crew
0,Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,Pirates of the Caribbean: At World's End,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."
2,Spectre,"[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de..."
3,The Dark Knight Rises,"[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de..."
4,John Carter,"[{""cast_id"": 5, ""character"": ""John Carter"", ""c...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de..."


## Preprocessing

In [16]:
# Checking for Missing Values in movies data
movies_df.isnull().sum()
# Note: overview has 31 missing values, release_date has 1 missing value

title                    0
overview                31
genres                   0
keywords                 0
production_companies     0
release_date             1
original_language        0
dtype: int64

In [17]:
# Displaying rows where overview is null
movies_df[movies_df['overview'].isnull()]

Unnamed: 0,title,overview,genres,keywords,production_companies,release_date,original_language
65,The Dark Knight,,"[{""id"": 18, ""name"": ""Drama""}, {""id"": 28, ""name...","[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853,...","[{""name"": ""DC Comics"", ""id"": 429}, {""name"": ""L...",2008-07-16,en
77,Inside Out,,"[{""id"": 18, ""name"": ""Drama""}, {""id"": 35, ""name...","[{""id"": 1566, ""name"": ""dream""}, {""id"": 6513, ""...","[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...",2015-06-09,en
94,Guardians of the Galaxy,,"[{""id"": 28, ""name"": ""Action""}, {""id"": 878, ""na...","[{""id"": 8828, ""name"": ""marvel comic""}, {""id"": ...","[{""name"": ""Marvel Studios"", ""id"": 420}, {""name...",2014-07-30,en
95,Interstellar,,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 18, ""...","[{""id"": 83, ""name"": ""saving the world""}, {""id""...","[{""name"": ""Paramount Pictures"", ""id"": 4}, {""na...",2014-11-05,en
96,Inception,,"[{""id"": 28, ""name"": ""Action""}, {""id"": 53, ""nam...","[{""id"": 1014, ""name"": ""loss of lover""}, {""id"":...","[{""name"": ""Legendary Pictures"", ""id"": 923}, {""...",2010-07-14,en
262,The Lord of the Rings: The Fellowship of the Ring,,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...","[{""id"": 603, ""name"": ""elves""}, {""id"": 604, ""na...","[{""name"": ""WingNut Films"", ""id"": 11}, {""name"":...",2001-12-18,en
287,Django Unchained,,"[{""id"": 18, ""name"": ""Drama""}, {""id"": 37, ""name...","[{""id"": 801, ""name"": ""bounty hunter""}, {""id"": ...","[{""name"": ""Columbia Pictures"", ""id"": 5}, {""nam...",2012-12-25,en
298,The Wolf of Wall Street,,"[{""id"": 80, ""name"": ""Crime""}, {""id"": 18, ""name...","[{""id"": 417, ""name"": ""corruption""}, {""id"": 572...","[{""name"": ""Paramount Pictures"", ""id"": 4}, {""na...",2013-12-25,en
329,The Lord of the Rings: The Return of the King,,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...","[{""id"": 603, ""name"": ""elves""}, {""id"": 606, ""na...","[{""name"": ""WingNut Films"", ""id"": 11}, {""name"":...",2003-12-01,en
330,The Lord of the Rings: The Two Towers,,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...","[{""id"": 603, ""name"": ""elves""}, {""id"": 606, ""na...","[{""name"": ""WingNut Films"", ""id"": 11}, {""name"":...",2002-12-18,en


In [18]:
# to fill these values we download a new kaggle dataset 
outsource = pd.read_csv('imdb_top_1000.csv')
outsource.columns

Index(['Poster_Link', 'Series_Title', 'Released_Year', 'Certificate',
       'Runtime', 'Genre', 'IMDB_Rating', 'Overview', 'Meta_score', 'Director',
       'Star1', 'Star2', 'Star3', 'Star4', 'No_of_Votes', 'Gross'],
      dtype='object')

In [19]:
# Only using relevent columns
outsource = outsource[['Series_Title','Overview']]

In [20]:
# Renaming the outside_source to correspond to our dataset
outsource.rename(columns = {'Series_Title':'title'}, inplace = True)
outsource.rename(columns = {'Overview':'overview'}, inplace = True)

In [21]:
# Using Pandas merge feature to add the new values
movies_df = pd.merge(movies_df, outsource, on='title', how='left', suffixes=('_main', '_missing'))
movies_df['overview_main'].fillna(movies_df['overview_missing'], inplace=True)

In [22]:
#Drop merged column
movies_df.drop(['overview_missing'], axis=1, inplace = True)
#renaming it make to 'overview'
movies_df.rename(columns = {'overview_main':'overview'}, inplace = True)

In [23]:
# Checking what titles are still left as null
movies_df[movies_df['overview'].isnull()]

Unnamed: 0,title,overview,genres,keywords,production_companies,release_date,original_language
1990,The Empire Strikes Back,,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 28, ""...","[{""id"": 526, ""name"": ""rebel""}, {""id"": 803, ""na...","[{""name"": ""Lucasfilm"", ""id"": 1}, {""name"": ""Twe...",1980-05-17,en
2294,Spirited Away,,"[{""id"": 14, ""name"": ""Fantasy""}, {""id"": 12, ""na...","[{""id"": 616, ""name"": ""witch""}, {""id"": 970, ""na...","[{""name"": ""Studio Ghibli"", ""id"": 10342}]",2001-07-20,ja
2656,Chiamatemi Francesco - Il Papa della gente,,"[{""id"": 18, ""name"": ""Drama""}]","[{""id"": 717, ""name"": ""pope""}, {""id"": 5565, ""na...","[{""name"": ""Taodue Film"", ""id"": 45724}]",2015-12-03,it
4140,"To Be Frank, Sinatra at 100",,"[{""id"": 99, ""name"": ""Documentary""}]","[{""id"": 6027, ""name"": ""music""}, {""id"": 225822,...","[{""name"": ""Eyeline Entertainment"", ""id"": 60343}]",2015-12-12,en
4431,Food Chains,,"[{""id"": 99, ""name"": ""Documentary""}]",[],[],2014-04-26,de


In [24]:
# For these 5 overviews I decided to use brute force
# Note these overview are from
# https://www.imdb.com/title/tt0080684/
# https://www.imdb.com/title/tt0245429/
# https://www.imdb.com/title/tt3856124/plotsummary/?ref_=tt_stry_pl
# https://www.imdb.com/title/tt4704314/
# https://www.imdb.com/title/tt2141739/?ref_=ttpl_ov
movies_df['overview'][1990] = 'After the Rebels are overpowered by the Empire, Luke Skywalker begins his Jedi training with Yoda, while his friends are pursued across the galaxy by Darth Vader and bounty hunter Boba Fett.'
movies_df['overview'][2294] = "During her family's move to the suburbs, a sullen 10-year-old girl wanders into a world ruled by gods, witches and spirits, a world where humans are changed into beasts."
movies_df['overview'][2656] = 'A biopic of the rise of father Jorge Mario Bergoglio SJ from a teacher in a Jesuit High School in Argentina to archbishop and cardinal of Buenos Aires to Pope of the Roman Catholic Church. The story touches on his relation with his fellow Jesuits in Argentina and Europe, to his relation with laureate writer Jorge Luis Borges, Argentine dictator Jorge Rafael Videla, and archbishops Laghi (nuncio to Argentina) and Quarracino (cardinal of Buenos Aires), up to the moment where he is elected Pope in 2013.'
movies_df['overview'][4140] = 'The life of Frank Sinatra, as an actor and singer and the steps along the way that led him to become such an icon.'
movies_df['overview'][4431] = "There is so much interest in food these days yet there is almost no interest in the hands that pick that food. In the US, farm labor has always been one of the most difficult and poorly paid jobs and has relied on some of the nation's most vulnerable people. While the legal restrictions which kept people bound to farms, like slavery, have been abolished, exploitation still exists, ranging from wage theft to modern-day slavery. These days, this exploitation is perpetuated by the corporations at the top of the food chain: supermarkets. Their buying power has kept wages pitifully low and has created a scenario where desperately poor people are willing to put up with anything to keep their jobs."


In [25]:
# Display rows where release_date is null
movies_df[movies_df['release_date'].isnull()]

Unnamed: 0,title,overview,genres,keywords,production_companies,release_date,original_language
4553,America Is Still the Place,1971 post civil rights San Francisco seemed li...,[],[],[],,en


In [26]:
## Release date obtained from IMDB to replace NaN value
movies_df.loc[movies_df.title == 'America Is Still the Place','release_date'] = '2022-06-10'
movies_df.loc[movies_df.title == 'America Is Still the Place']

Unnamed: 0,title,overview,genres,keywords,production_companies,release_date,original_language
4553,America Is Still the Place,1971 post civil rights San Francisco seemed li...,[],[],[],2022-06-10,en


In [27]:
#Successful removed null values from movies data
movies_df.isnull().sum()

title                   0
overview                0
genres                  0
keywords                0
production_companies    0
release_date            0
original_language       0
dtype: int64

Note: I will get rid of the empty lists after we unpack the columns

In [28]:
#Checking for null values in credits data
credits_df.isnull().sum()

title    0
cast     0
crew     0
dtype: int64

## Data Extraction

In [29]:
# Importing Relevant Libraries
import ast
import nltk
import collections
import operator
from rake_nltk import Rake

### Extract values from a dict for a given key

In [30]:
# Creating a function to extract values from a dict - will be used for desired features 
def get_names(strlist):
    Name = []
    for i in ast.literal_eval(strlist):
        Name.append(i['name'])          # Get the value associated with the key 'name'
    return Name

In [31]:
# The 'crew' feature is formatted differently from the other desired features
# Create unique function to extract director's name from 'crew' feature
def director(strlist):
    director = []
    for i in ast.literal_eval(strlist):
        if i['job'] == 'Director':
            director.append(i['name'])          # Get directors' names
    return director

In [32]:
# create new dataframe to store features during initial cleaning
clean1_df = pd.DataFrame(movies_df['title'])
clean1_df.head()

Unnamed: 0,title
0,Avatar
1,Pirates of the Caribbean: At World's End
2,Spectre
3,The Dark Knight Rises
4,John Carter


In [33]:
# We will be putting features from 'movies' and 'credits' dataframes together. For this to be valid, we would
# initially check if our reference point, i.e. the feature 'title', is the same for both dataframes.

if movies_df['title'].equals(credits_df['title']):
    print("The 'title' columns are the same.")
else:
    print("The 'title' columns are different.")

The 'title' columns are the same.


In [34]:
# applying function to each feature that needs to be unpacked
clean1_df['genres_list'] = movies_df['genres'].apply(get_names)
clean1_df['keywords_list'] = movies_df['keywords'].apply(get_names)
clean1_df['prod_companies_list'] = movies_df['production_companies'].apply(get_names)
clean1_df['cast_list'] = credits_df['cast'].apply(get_names)
clean1_df['director_list'] = credits_df['crew'].apply(director)

In [35]:
pd.set_option('max_colwidth', None)
clean1_df.head()

Unnamed: 0,title,genres_list,keywords_list,prod_companies_list,cast_list,director_list
0,Avatar,"[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colony, society, space travel, futuristic, romance, space, alien, tribe, alien planet, cgi, marine, soldier, battle, love affair, anti war, power relations, mind and soul, 3d]","[Ingenious Film Partners, Twentieth Century Fox Film Corporation, Dune Entertainment, Lightstorm Entertainment]","[Sam Worthington, Zoe Saldana, Sigourney Weaver, Stephen Lang, Michelle Rodriguez, Giovanni Ribisi, Joel David Moore, CCH Pounder, Wes Studi, Laz Alonso, Dileep Rao, Matt Gerald, Sean Anthony Moran, Jason Whyte, Scott Lawrence, Kelly Kilgour, James Patrick Pitt, Sean Patrick Murphy, Peter Dillon, Kevin Dorman, Kelson Henderson, David Van Horn, Jacob Tomuri, Michael Blain-Rozgay, Jon Curry, Luke Hawker, Woody Schultz, Peter Mensah, Sonia Yee, Jahnel Curfman, Ilram Choi, Kyla Warren, Lisa Roumain, Debra Wilson, Chris Mala, Taylor Kibby, Jodie Landau, Julie Lamm, Cullen B. Madden, Joseph Brady Madden, Frankie Torres, Austin Wilson, Sara Wilson, Tamica Washington-Miller, Lucy Briant, Nathan Meister, Gerry Blair, Matthew Chamberlain, Paul Yates, Wray Wilson, James Gaylyn, Melvin Leno Clark III, Carvon Futrell, Brandon Jelkes, Micah Moch, Hanniyah Muhammad, Christopher Nolen, Christa Oliver, April Marie Thomas, Bravita A. Threatt, Colin Bleasdale, Mike Bodnar, Matt Clayton, Nicole Dionne, Jamie Harrison, Allan Henry, Anthony Ingruber, Ashley Jeffery, Dean Knowsley, Joseph Mika-Hunt, Terry Notary, Kai Pantano, Logan Pithyou, Stuart Pollock, Raja, Gareth Ruck, Rhian Sheehan, T. J. Storm, Jodie Taylor, Alicia Vela-Bailey, Richard Whiteside, Nikie Zambo, Julene Renee]",[James Cameron]
1,Pirates of the Caribbean: At World's End,"[Adventure, Fantasy, Action]","[ocean, drug abuse, exotic island, east india trading company, love of one's life, traitor, shipwreck, strong woman, ship, alliance, calypso, afterlife, fighter, pirate, swashbuckler, aftercreditsstinger]","[Walt Disney Pictures, Jerry Bruckheimer Films, Second Mate Productions]","[Johnny Depp, Orlando Bloom, Keira Knightley, Stellan Skarsgård, Chow Yun-fat, Bill Nighy, Geoffrey Rush, Jack Davenport, Kevin McNally, Tom Hollander, Naomie Harris, Jonathan Pryce, Keith Richards, Lee Arenberg, Mackenzie Crook, Greg Ellis, David Bailie, Martin Klebba, David Schofield, Lauren Maher, Vanessa Branch, Angus Barnett, Giles New, Reggie Lee, Dominic Scott Kay, Takayo Fischer, David Meunier, Ho-Kwan Tse, Andy Beckwith, Peter Donald Badalamenti II, Christopher S. Capp, Keith Richards, Hakeem Kae-Kazim, Ghassan Massoud]",[Gore Verbinski]
2,Spectre,"[Action, Adventure, Crime]","[spy, based on novel, secret agent, sequel, mi6, british secret service, united kingdom]","[Columbia Pictures, Danjaq, B24]","[Daniel Craig, Christoph Waltz, Léa Seydoux, Ralph Fiennes, Monica Bellucci, Ben Whishaw, Naomie Harris, Dave Bautista, Andrew Scott, Rory Kinnear, Jesper Christensen, Alessandro Cremona, Stephanie Sigman, Tenoch Huerta, Adriana Paz, Domenico Fortunato, Marco Zingaro, Stefano Elfi DiClaudia, Ian Bonar, Tam Williams, Richard Banham, Pip Carter, Simon Lenagan, Alessandro Bressanello, Marc Zinga, Brigitte Millar, Adel Bencherif, Gediminas Adomaitis, Peppe Lanzetta, Francesco Arca, Matteo Taranto, Emilio Aniba, Benito Sagredo, Dai Tabuchi, George Lasha, Sargon Yelda, Andy Cheung, Erick Hayden, Oleg Mirochnikov, Antonio Salines, Miloud Mourad Benamara, Gido Schimanski, Nigel Barber, Patrice Naiambana, Stephane Cornicard, Gary Fannin, Sadao Ueda, Phillip Law, Wai Wong, Joseph Balderrama, Eiji Mihara, Junichi Kajioka, Victor Schefé, Harald Windisch, Tristan Matthiae, Detlef Bothe, Bodo Friesecke, Wilhem Iben, Noemi Krausz, Noah Saavedra, Francis Attakpah, Michael Glantschnig, Marlon Boess, Marie Wohlmuth, Lili Epply, Konstantin Gerlach, Lara Parmiani, Umit Ulgen, Amra Mallassi, Ziad Abaza, Walid Mumuni, Derek Horsham, Nari Blair-Mangat, Michael White, Adam McGrady, Nader Dernaika, Pezhmaan Alinia, Judi Dench, Neve Gachev, Karl Farrer, Kim Adis, Maurisa Selene Coleman, Matija Mondi Matović]",[Sam Mendes]
3,The Dark Knight Rises,"[Action, Crime, Drama, Thriller]","[dc comics, crime fighter, terrorist, secret identity, burglar, hostage drama, time bomb, gotham city, vigilante, cover-up, superhero, villainess, tragic hero, terrorism, destruction, catwoman, cat burglar, imax, flood, criminal underworld, batman]","[Legendary Pictures, Warner Bros., DC Entertainment, Syncopy]","[Christian Bale, Michael Caine, Gary Oldman, Anne Hathaway, Tom Hardy, Marion Cotillard, Joseph Gordon-Levitt, Morgan Freeman, Cillian Murphy, Juno Temple, Liam Neeson, Matthew Modine, Alon Aboutboul, Ben Mendelsohn, Nestor Carbonell, Josh Pence, Tom Conti, Joey King, Warren Brown, Daniel Sunjata, Sam Kennard, Aliash Tepina, Nick Julian, Miranda Nolan, Claire Julien, Aidan Gillen, Burn Gorman, Brett Cullen, Reggie Lee, Joseph Lyle Taylor, Chris Ellis, Duane Henry, James Harvey Ward, Gonzalo Menendez, Cameron Jack, Lex Daniel, Tyler Dean Flores, Thomas Lennon, Trevor White, Rob Brown, Fredric Lehne, Courtney Munch, Chris Hill, Travis Guba, Jay Benedict, Will Estes, David Dayan Fisher, Glen Powell, Russ Fega, Andres Perez-Molina, Brent Briscoe, John Nolan, Oliver Cotton, Mark Killeen, Sarah Goldberg, John MacMillan, Robert Wisdom, Ronnie Gene Blevins, John Hollingworth, Ian Bohen, Uri Gavriel, Noel Gugliemi, Hector Atreyu Ruiz, Patrick Cox, Aramis Knight, Josh Stewart, William Devane, David Gyasi, Patrick Jordan, Joshua Elijah Reese, Desmond Harrington, Mychael Bates, Tomas Arana, Peter Holden, David Monahan, Jillian Armenante, Aja Evans, Aldous Davidson, Wade Williams, Jake Canuso, Massi Furlan, Christopher Judge, Patrick Leahy, Todd Gearhart, Marc Abbink, Isiah Adams, Charlie Alejandro, Robert Arensen, Grant Babbitt, Fileena Bahris, Rick Bolander, Kyle Patrick Brennan, Scott Churchson, Bill Cowher, Graham Curry, Stephanie Domini Ehlert, John Farrer, Frank Fata, Christopher Bryan Gomez, Vito Grassi, ...]",[Christopher Nolan]
4,John Carter,"[Action, Adventure, Science Fiction]","[based on novel, mars, medallion, space travel, princess, alien, steampunk, martian, escape, edgar rice burroughs, alien race, superhuman strength, mars civilization, sword and planet, 19th century, 3d]",[Walt Disney Pictures],"[Taylor Kitsch, Lynn Collins, Samantha Morton, Willem Dafoe, Thomas Haden Church, Mark Strong, Ciarán Hinds, Dominic West, James Purefoy, Bryan Cranston, Polly Walker, Daryl Sabara, Arkie Reece, Davood Ghadami, Pippa Nixon, James Embree, Philip Philmar, Emily Tierney, Edmund Kente, Nicholas Woodeson, Kyle Agnew, Don Stark, Josh Daugherty, Jared Cyr, Christopher Goodman, Amanda Clayton, Joseph Billingiere]",[Andrew Stanton]


In [36]:
clean1_df['genres_list'].value_counts()

genres_list
[Drama]                                         370
[Comedy]                                        282
[Drama, Romance]                                164
[Comedy, Romance]                               144
[Comedy, Drama]                                 142
                                               ... 
[Adventure, Action, Comedy, Romance]              1
[Action, Fantasy, Science Fiction, Thriller]      1
[Science Fiction, Comedy, Adventure]              1
[Drama, Thriller, Horror]                         1
[Comedy, Drama, Romance, TV Movie]                1
Name: count, Length: 1175, dtype: int64

In [37]:
# Finding Empty Values in the previously packed columns
def count_empty_lists(val):
    if isinstance(val, list):
        return 1 if not val else 0
    else:
        return 0

empty_counts = clean1_df.applymap(count_empty_lists)

# sum the empty_counts column-wise to get the total empty lists in each column
total_empty_lists = empty_counts.sum(axis=0)

# print the result
print(total_empty_lists)

title                    0
genres_list             28
keywords_list          412
prod_companies_list    351
cast_list               43
director_list           30
dtype: int64


In [38]:
# Replacing empty values with values that wont interfere with the similarity matrix
def replace_empty_values(row):
    for col in clean1_df.columns:
        if len(row[col]) == 0:
            for other_col in clean1_df.columns:
                if col != other_col and len(row[other_col]) > 0:
                    row[col] = row[other_col]
                    break
    return row

clean1_df = clean1_df.apply(replace_empty_values, axis=1)

empty_counts = clean1_df.applymap(count_empty_lists)
total_empty_lists = empty_counts.sum(axis=0)
print(total_empty_lists)

title                  0
genres_list            0
keywords_list          0
prod_companies_list    0
cast_list              0
director_list          0
dtype: int64


### Extracting Important Words

In [39]:
# function to extract the N most important keywords
# N is set by parameter top_n_keywords (default = 5, but we'll take top 30)
# lowercase, nremove stopwords
def extract_key_words(input_str, top_n_keywords=30):
    r = Rake()
    r.extract_keywords_from_text(input_str.lower())
    key_words_dict_scores = r.get_word_degrees()
    sorted_key_words_dict_scores = sorted(key_words_dict_scores.items(), key=operator.itemgetter(1), reverse=True)
    sorted_dict = collections.OrderedDict(sorted_key_words_dict_scores)
    
    # Change top_n_keywords as appropriate
    return sorted_dict, list(sorted_dict.keys())[:top_n_keywords]

In [40]:
clean1_df['overview_key_list'] = movies_df['overview'].apply(lambda x: extract_key_words(x)[1])

In [41]:
clean1_df

Unnamed: 0,title,genres_list,keywords_list,prod_companies_list,cast_list,director_list,overview_key_list
0,Avatar,"[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colony, society, space travel, futuristic, romance, space, alien, tribe, alien planet, cgi, marine, soldier, battle, love affair, anti war, power relations, mind and soul, 3d]","[Ingenious Film Partners, Twentieth Century Fox Film Corporation, Dune Entertainment, Lightstorm Entertainment]","[Sam Worthington, Zoe Saldana, Sigourney Weaver, Stephen Lang, Michelle Rodriguez, Giovanni Ribisi, Joel David Moore, CCH Pounder, Wes Studi, Laz Alonso, Dileep Rao, Matt Gerald, Sean Anthony Moran, Jason Whyte, Scott Lawrence, Kelly Kilgour, James Patrick Pitt, Sean Patrick Murphy, Peter Dillon, Kevin Dorman, Kelson Henderson, David Van Horn, Jacob Tomuri, Michael Blain-Rozgay, Jon Curry, Luke Hawker, Woody Schultz, Peter Mensah, Sonia Yee, Jahnel Curfman, Ilram Choi, Kyla Warren, Lisa Roumain, Debra Wilson, Chris Mala, Taylor Kibby, Jodie Landau, Julie Lamm, Cullen B. Madden, Joseph Brady Madden, Frankie Torres, Austin Wilson, Sara Wilson, Tamica Washington-Miller, Lucy Briant, Nathan Meister, Gerry Blair, Matthew Chamberlain, Paul Yates, Wray Wilson, James Gaylyn, Melvin Leno Clark III, Carvon Futrell, Brandon Jelkes, Micah Moch, Hanniyah Muhammad, Christopher Nolen, Christa Oliver, April Marie Thomas, Bravita A. Threatt, Colin Bleasdale, Mike Bodnar, Matt Clayton, Nicole Dionne, Jamie Harrison, Allan Henry, Anthony Ingruber, Ashley Jeffery, Dean Knowsley, Joseph Mika-Hunt, Terry Notary, Kai Pantano, Logan Pithyou, Stuart Pollock, Raja, Gareth Ruck, Rhian Sheehan, T. J. Storm, Jodie Taylor, Alicia Vela-Bailey, Richard Whiteside, Nikie Zambo, Julene Renee]",[James Cameron],"[22nd, century, paraplegic, marine, moon, pandora, unique, mission, becomes, torn, following, orders, alien, civilization, dispatched, protecting]"
1,Pirates of the Caribbean: At World's End,"[Adventure, Fantasy, Action]","[ocean, drug abuse, exotic island, east india trading company, love of one's life, traitor, shipwreck, strong woman, ship, alliance, calypso, afterlife, fighter, pirate, swashbuckler, aftercreditsstinger]","[Walt Disney Pictures, Jerry Bruckheimer Films, Second Mate Productions]","[Johnny Depp, Orlando Bloom, Keira Knightley, Stellan Skarsgård, Chow Yun-fat, Bill Nighy, Geoffrey Rush, Jack Davenport, Kevin McNally, Tom Hollander, Naomie Harris, Jonathan Pryce, Keith Richards, Lee Arenberg, Mackenzie Crook, Greg Ellis, David Bailie, Martin Klebba, David Schofield, Lauren Maher, Vanessa Branch, Angus Barnett, Giles New, Reggie Lee, Dominic Scott Kay, Takayo Fischer, David Meunier, Ho-Kwan Tse, Andy Beckwith, Peter Donald Badalamenti II, Christopher S. Capp, Keith Richards, Hakeem Kae-Kazim, Ghassan Massoud]",[Gore Verbinski],"[captain, barbossa, long, believed, come, back, elizabeth, swann, dead, life, headed, edge, earth, turner, nothing, quite, seems]"
2,Spectre,"[Action, Adventure, Crime]","[spy, based on novel, secret agent, sequel, mi6, british secret service, united kingdom]","[Columbia Pictures, Danjaq, B24]","[Daniel Craig, Christoph Waltz, Léa Seydoux, Ralph Fiennes, Monica Bellucci, Ben Whishaw, Naomie Harris, Dave Bautista, Andrew Scott, Rory Kinnear, Jesper Christensen, Alessandro Cremona, Stephanie Sigman, Tenoch Huerta, Adriana Paz, Domenico Fortunato, Marco Zingaro, Stefano Elfi DiClaudia, Ian Bonar, Tam Williams, Richard Banham, Pip Carter, Simon Lenagan, Alessandro Bressanello, Marc Zinga, Brigitte Millar, Adel Bencherif, Gediminas Adomaitis, Peppe Lanzetta, Francesco Arca, Matteo Taranto, Emilio Aniba, Benito Sagredo, Dai Tabuchi, George Lasha, Sargon Yelda, Andy Cheung, Erick Hayden, Oleg Mirochnikov, Antonio Salines, Miloud Mourad Benamara, Gido Schimanski, Nigel Barber, Patrice Naiambana, Stephane Cornicard, Gary Fannin, Sadao Ueda, Phillip Law, Wai Wong, Joseph Balderrama, Eiji Mihara, Junichi Kajioka, Victor Schefé, Harald Windisch, Tristan Matthiae, Detlef Bothe, Bodo Friesecke, Wilhem Iben, Noemi Krausz, Noah Saavedra, Francis Attakpah, Michael Glantschnig, Marlon Boess, Marie Wohlmuth, Lili Epply, Konstantin Gerlach, Lara Parmiani, Umit Ulgen, Amra Mallassi, Ziad Abaza, Walid Mumuni, Derek Horsham, Nari Blair-Mangat, Michael White, Adam McGrady, Nader Dernaika, Pezhmaan Alinia, Judi Dench, Neve Gachev, Karl Farrer, Kim Adis, Maurisa Selene Coleman, Matija Mondi Matović]",[Sam Mendes],"[bond, terrible, truth, behind, spectre, battles, political, forces, secret, service, alive, peels, back, cryptic, message, ’, past, sends, sinister, organization, trail, uncover, keep, layers, deceit, reveal]"
3,The Dark Knight Rises,"[Action, Crime, Drama, Thriller]","[dc comics, crime fighter, terrorist, secret identity, burglar, hostage drama, time bomb, gotham city, vigilante, cover-up, superhero, villainess, tragic hero, terrorism, destruction, catwoman, cat burglar, imax, flood, criminal underworld, batman]","[Legendary Pictures, Warner Bros., DC Entertainment, Syncopy]","[Christian Bale, Michael Caine, Gary Oldman, Anne Hathaway, Tom Hardy, Marion Cotillard, Joseph Gordon-Levitt, Morgan Freeman, Cillian Murphy, Juno Temple, Liam Neeson, Matthew Modine, Alon Aboutboul, Ben Mendelsohn, Nestor Carbonell, Josh Pence, Tom Conti, Joey King, Warren Brown, Daniel Sunjata, Sam Kennard, Aliash Tepina, Nick Julian, Miranda Nolan, Claire Julien, Aidan Gillen, Burn Gorman, Brett Cullen, Reggie Lee, Joseph Lyle Taylor, Chris Ellis, Duane Henry, James Harvey Ward, Gonzalo Menendez, Cameron Jack, Lex Daniel, Tyler Dean Flores, Thomas Lennon, Trevor White, Rob Brown, Fredric Lehne, Courtney Munch, Chris Hill, Travis Guba, Jay Benedict, Will Estes, David Dayan Fisher, Glen Powell, Russ Fega, Andres Perez-Molina, Brent Briscoe, John Nolan, Oliver Cotton, Mark Killeen, Sarah Goldberg, John MacMillan, Robert Wisdom, Ronnie Gene Blevins, John Hollingworth, Ian Bohen, Uri Gavriel, Noel Gugliemi, Hector Atreyu Ruiz, Patrick Cox, Aramis Knight, Josh Stewart, William Devane, David Gyasi, Patrick Jordan, Joshua Elijah Reese, Desmond Harrington, Mychael Bates, Tomas Arana, Peter Holden, David Monahan, Jillian Armenante, Aja Evans, Aldous Davidson, Wade Williams, Jake Canuso, Massi Furlan, Christopher Judge, Patrick Leahy, Todd Gearhart, Marc Abbink, Isiah Adams, Charlie Alejandro, Robert Arensen, Grant Babbitt, Fileena Bahris, Rick Bolander, Kyle Patrick Brennan, Scott Churchson, Bill Cowher, Graham Curry, Stephanie Domini Ehlert, John Farrer, Frank Fata, Christopher Bryan Gomez, Vito Grassi, ...]",[Christopher Nolan],"[attorney, gotham, dent, batman, city, district, harvey, police, department, assumes, responsibility, eight, years, later, mysterious, selina, kyle, new, terrorist, leader, dark, knight, resurfaces, protect, late, subsequently, hunted, encounters, villainous, bane]"
4,John Carter,"[Action, Adventure, Science Fiction]","[based on novel, mars, medallion, space travel, princess, alien, steampunk, martian, escape, edgar rice burroughs, alien race, superhuman strength, mars civilization, sword and planet, 19th century, 3d]",[Walt Disney Pictures],"[Taylor Kitsch, Lynn Collins, Samantha Morton, Willem Dafoe, Thomas Haden Church, Mark Strong, Ciarán Hinds, Dominic West, James Purefoy, Bryan Cranston, Polly Walker, Daryl Sabara, Arkie Reece, Davood Ghadami, Pippa Nixon, James Embree, Philip Philmar, Emily Tierney, Edmund Kente, Nicholas Woodeson, Kyle Agnew, Don Stark, Josh Daugherty, Jared Cyr, Christopher Goodman, Amanda Clayton, Joseph Billingiere]",[Andrew Stanton],"[carter, former, military, captain, reluctantly, becomes, embroiled, john, inexplicably, transported, exotic, planet, barsoom, epic, conflict, rediscovers, people, rests, war, weary, mysterious, mars, world, brink, collapse, humanity, realizes, survival, hands]"
...,...,...,...,...,...,...,...
4798,El Mariachi,"[Action, Crime, Thriller]","[united states–mexico barrier, legs, arms, paper knife, guitar case]",[Columbia Pictures],"[Carlos Gallardo, Jaime de Hoyos, Peter Marquardt, Reinol Martinez, Ramiro Gomez, Consuelo Gómez, Juan García]",[Robert Rodriguez],"[el, mariachi, guitar, town, case, henchmen, mistake, another, visitor, ..., around, trying, family, tradition, find, work, killer, drug, lord, wants, play, carry, unfortunately, tries, carries, guns, azul, chase, kill, get]"
4799,Newlyweds,"[Comedy, Romance]",Newlyweds,Newlyweds,"[Edward Burns, Kerry Bishé, Marsha Dietlein, Caitlin Fitzgerald, Daniella Pineda]",[Edward Burns],"[newlywed, couple, respective, sisters, honeymoon, upended, arrivals]"
4800,"Signed, Sealed, Delivered","[Comedy, Drama, Romance, TV Movie]","[date, love at first sight, narration, investigation, team, postal worker]","[Front Street Pictures, Muse Entertainment Enterprises]","[Eric Mabius, Kristin Booth, Crystal Lowe, Geoff Gustafson, Benjamin Hollingsworth, Laci J Mailey, Daphne Zuniga]",[Scott Smith],"[office, dead, letter, seemingly, undeliverable, takes, past, save, lives, reunite, old, loves, dedicated, quartet, civil, servants, postal, system, elite, team, mail, detectives, post, unpredictable, world, solve, crimes, change, futures, arriving]"
4801,Shanghai Calling,Shanghai Calling,Shanghai Calling,Shanghai Calling,"[Daniel Henney, Eliza Coupe, Bill Paxton, Alan Ruck, Zhu Shimao]",[Daniel Hsia],"[sam, ambitious, new, york, attorney, legal, beautiful, relocation, specialist, smart, assistant, shanghai, immediately, stumbles, mess, could, end, connected, old, clever, journalist, might, find, romance, com, )., sent, assignment, career, help]"


In [42]:
# Create function to remove punctuation
import string

def no_punc(mylist):
    newlist = []
    for element in mylist:
        translator = str.maketrans('','',string.punctuation)
        newelement = element.translate(translator)
        newlist.append(newelement)
    return newlist

In [43]:
# Apply function to remove punctuation, for every value in every column except the 'title' column

clean1_df.iloc[:,1:] = clean1_df.iloc[:,1:].applymap(no_punc)
clean1_df

Unnamed: 0,title,genres_list,keywords_list,prod_companies_list,cast_list,director_list,overview_key_list
0,Avatar,"[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colony, society, space travel, futuristic, romance, space, alien, tribe, alien planet, cgi, marine, soldier, battle, love affair, anti war, power relations, mind and soul, 3d]","[Ingenious Film Partners, Twentieth Century Fox Film Corporation, Dune Entertainment, Lightstorm Entertainment]","[Sam Worthington, Zoe Saldana, Sigourney Weaver, Stephen Lang, Michelle Rodriguez, Giovanni Ribisi, Joel David Moore, CCH Pounder, Wes Studi, Laz Alonso, Dileep Rao, Matt Gerald, Sean Anthony Moran, Jason Whyte, Scott Lawrence, Kelly Kilgour, James Patrick Pitt, Sean Patrick Murphy, Peter Dillon, Kevin Dorman, Kelson Henderson, David Van Horn, Jacob Tomuri, Michael BlainRozgay, Jon Curry, Luke Hawker, Woody Schultz, Peter Mensah, Sonia Yee, Jahnel Curfman, Ilram Choi, Kyla Warren, Lisa Roumain, Debra Wilson, Chris Mala, Taylor Kibby, Jodie Landau, Julie Lamm, Cullen B Madden, Joseph Brady Madden, Frankie Torres, Austin Wilson, Sara Wilson, Tamica WashingtonMiller, Lucy Briant, Nathan Meister, Gerry Blair, Matthew Chamberlain, Paul Yates, Wray Wilson, James Gaylyn, Melvin Leno Clark III, Carvon Futrell, Brandon Jelkes, Micah Moch, Hanniyah Muhammad, Christopher Nolen, Christa Oliver, April Marie Thomas, Bravita A Threatt, Colin Bleasdale, Mike Bodnar, Matt Clayton, Nicole Dionne, Jamie Harrison, Allan Henry, Anthony Ingruber, Ashley Jeffery, Dean Knowsley, Joseph MikaHunt, Terry Notary, Kai Pantano, Logan Pithyou, Stuart Pollock, Raja, Gareth Ruck, Rhian Sheehan, T J Storm, Jodie Taylor, Alicia VelaBailey, Richard Whiteside, Nikie Zambo, Julene Renee]",[James Cameron],"[22nd, century, paraplegic, marine, moon, pandora, unique, mission, becomes, torn, following, orders, alien, civilization, dispatched, protecting]"
1,Pirates of the Caribbean: At World's End,"[Adventure, Fantasy, Action]","[ocean, drug abuse, exotic island, east india trading company, love of ones life, traitor, shipwreck, strong woman, ship, alliance, calypso, afterlife, fighter, pirate, swashbuckler, aftercreditsstinger]","[Walt Disney Pictures, Jerry Bruckheimer Films, Second Mate Productions]","[Johnny Depp, Orlando Bloom, Keira Knightley, Stellan Skarsgård, Chow Yunfat, Bill Nighy, Geoffrey Rush, Jack Davenport, Kevin McNally, Tom Hollander, Naomie Harris, Jonathan Pryce, Keith Richards, Lee Arenberg, Mackenzie Crook, Greg Ellis, David Bailie, Martin Klebba, David Schofield, Lauren Maher, Vanessa Branch, Angus Barnett, Giles New, Reggie Lee, Dominic Scott Kay, Takayo Fischer, David Meunier, HoKwan Tse, Andy Beckwith, Peter Donald Badalamenti II, Christopher S Capp, Keith Richards, Hakeem KaeKazim, Ghassan Massoud]",[Gore Verbinski],"[captain, barbossa, long, believed, come, back, elizabeth, swann, dead, life, headed, edge, earth, turner, nothing, quite, seems]"
2,Spectre,"[Action, Adventure, Crime]","[spy, based on novel, secret agent, sequel, mi6, british secret service, united kingdom]","[Columbia Pictures, Danjaq, B24]","[Daniel Craig, Christoph Waltz, Léa Seydoux, Ralph Fiennes, Monica Bellucci, Ben Whishaw, Naomie Harris, Dave Bautista, Andrew Scott, Rory Kinnear, Jesper Christensen, Alessandro Cremona, Stephanie Sigman, Tenoch Huerta, Adriana Paz, Domenico Fortunato, Marco Zingaro, Stefano Elfi DiClaudia, Ian Bonar, Tam Williams, Richard Banham, Pip Carter, Simon Lenagan, Alessandro Bressanello, Marc Zinga, Brigitte Millar, Adel Bencherif, Gediminas Adomaitis, Peppe Lanzetta, Francesco Arca, Matteo Taranto, Emilio Aniba, Benito Sagredo, Dai Tabuchi, George Lasha, Sargon Yelda, Andy Cheung, Erick Hayden, Oleg Mirochnikov, Antonio Salines, Miloud Mourad Benamara, Gido Schimanski, Nigel Barber, Patrice Naiambana, Stephane Cornicard, Gary Fannin, Sadao Ueda, Phillip Law, Wai Wong, Joseph Balderrama, Eiji Mihara, Junichi Kajioka, Victor Schefé, Harald Windisch, Tristan Matthiae, Detlef Bothe, Bodo Friesecke, Wilhem Iben, Noemi Krausz, Noah Saavedra, Francis Attakpah, Michael Glantschnig, Marlon Boess, Marie Wohlmuth, Lili Epply, Konstantin Gerlach, Lara Parmiani, Umit Ulgen, Amra Mallassi, Ziad Abaza, Walid Mumuni, Derek Horsham, Nari BlairMangat, Michael White, Adam McGrady, Nader Dernaika, Pezhmaan Alinia, Judi Dench, Neve Gachev, Karl Farrer, Kim Adis, Maurisa Selene Coleman, Matija Mondi Matović]",[Sam Mendes],"[bond, terrible, truth, behind, spectre, battles, political, forces, secret, service, alive, peels, back, cryptic, message, ’, past, sends, sinister, organization, trail, uncover, keep, layers, deceit, reveal]"
3,The Dark Knight Rises,"[Action, Crime, Drama, Thriller]","[dc comics, crime fighter, terrorist, secret identity, burglar, hostage drama, time bomb, gotham city, vigilante, coverup, superhero, villainess, tragic hero, terrorism, destruction, catwoman, cat burglar, imax, flood, criminal underworld, batman]","[Legendary Pictures, Warner Bros, DC Entertainment, Syncopy]","[Christian Bale, Michael Caine, Gary Oldman, Anne Hathaway, Tom Hardy, Marion Cotillard, Joseph GordonLevitt, Morgan Freeman, Cillian Murphy, Juno Temple, Liam Neeson, Matthew Modine, Alon Aboutboul, Ben Mendelsohn, Nestor Carbonell, Josh Pence, Tom Conti, Joey King, Warren Brown, Daniel Sunjata, Sam Kennard, Aliash Tepina, Nick Julian, Miranda Nolan, Claire Julien, Aidan Gillen, Burn Gorman, Brett Cullen, Reggie Lee, Joseph Lyle Taylor, Chris Ellis, Duane Henry, James Harvey Ward, Gonzalo Menendez, Cameron Jack, Lex Daniel, Tyler Dean Flores, Thomas Lennon, Trevor White, Rob Brown, Fredric Lehne, Courtney Munch, Chris Hill, Travis Guba, Jay Benedict, Will Estes, David Dayan Fisher, Glen Powell, Russ Fega, Andres PerezMolina, Brent Briscoe, John Nolan, Oliver Cotton, Mark Killeen, Sarah Goldberg, John MacMillan, Robert Wisdom, Ronnie Gene Blevins, John Hollingworth, Ian Bohen, Uri Gavriel, Noel Gugliemi, Hector Atreyu Ruiz, Patrick Cox, Aramis Knight, Josh Stewart, William Devane, David Gyasi, Patrick Jordan, Joshua Elijah Reese, Desmond Harrington, Mychael Bates, Tomas Arana, Peter Holden, David Monahan, Jillian Armenante, Aja Evans, Aldous Davidson, Wade Williams, Jake Canuso, Massi Furlan, Christopher Judge, Patrick Leahy, Todd Gearhart, Marc Abbink, Isiah Adams, Charlie Alejandro, Robert Arensen, Grant Babbitt, Fileena Bahris, Rick Bolander, Kyle Patrick Brennan, Scott Churchson, Bill Cowher, Graham Curry, Stephanie Domini Ehlert, John Farrer, Frank Fata, Christopher Bryan Gomez, Vito Grassi, ...]",[Christopher Nolan],"[attorney, gotham, dent, batman, city, district, harvey, police, department, assumes, responsibility, eight, years, later, mysterious, selina, kyle, new, terrorist, leader, dark, knight, resurfaces, protect, late, subsequently, hunted, encounters, villainous, bane]"
4,John Carter,"[Action, Adventure, Science Fiction]","[based on novel, mars, medallion, space travel, princess, alien, steampunk, martian, escape, edgar rice burroughs, alien race, superhuman strength, mars civilization, sword and planet, 19th century, 3d]",[Walt Disney Pictures],"[Taylor Kitsch, Lynn Collins, Samantha Morton, Willem Dafoe, Thomas Haden Church, Mark Strong, Ciarán Hinds, Dominic West, James Purefoy, Bryan Cranston, Polly Walker, Daryl Sabara, Arkie Reece, Davood Ghadami, Pippa Nixon, James Embree, Philip Philmar, Emily Tierney, Edmund Kente, Nicholas Woodeson, Kyle Agnew, Don Stark, Josh Daugherty, Jared Cyr, Christopher Goodman, Amanda Clayton, Joseph Billingiere]",[Andrew Stanton],"[carter, former, military, captain, reluctantly, becomes, embroiled, john, inexplicably, transported, exotic, planet, barsoom, epic, conflict, rediscovers, people, rests, war, weary, mysterious, mars, world, brink, collapse, humanity, realizes, survival, hands]"
...,...,...,...,...,...,...,...
4798,El Mariachi,"[Action, Crime, Thriller]","[united states–mexico barrier, legs, arms, paper knife, guitar case]",[Columbia Pictures],"[Carlos Gallardo, Jaime de Hoyos, Peter Marquardt, Reinol Martinez, Ramiro Gomez, Consuelo Gómez, Juan García]",[Robert Rodriguez],"[el, mariachi, guitar, town, case, henchmen, mistake, another, visitor, , around, trying, family, tradition, find, work, killer, drug, lord, wants, play, carry, unfortunately, tries, carries, guns, azul, chase, kill, get]"
4799,Newlyweds,"[Comedy, Romance]","[N, e, w, l, y, w, e, d, s]","[N, e, w, l, y, w, e, d, s]","[Edward Burns, Kerry Bishé, Marsha Dietlein, Caitlin Fitzgerald, Daniella Pineda]",[Edward Burns],"[newlywed, couple, respective, sisters, honeymoon, upended, arrivals]"
4800,"Signed, Sealed, Delivered","[Comedy, Drama, Romance, TV Movie]","[date, love at first sight, narration, investigation, team, postal worker]","[Front Street Pictures, Muse Entertainment Enterprises]","[Eric Mabius, Kristin Booth, Crystal Lowe, Geoff Gustafson, Benjamin Hollingsworth, Laci J Mailey, Daphne Zuniga]",[Scott Smith],"[office, dead, letter, seemingly, undeliverable, takes, past, save, lives, reunite, old, loves, dedicated, quartet, civil, servants, postal, system, elite, team, mail, detectives, post, unpredictable, world, solve, crimes, change, futures, arriving]"
4801,Shanghai Calling,"[S, h, a, n, g, h, a, i, , C, a, l, l, i, n, g]","[S, h, a, n, g, h, a, i, , C, a, l, l, i, n, g]","[S, h, a, n, g, h, a, i, , C, a, l, l, i, n, g]","[Daniel Henney, Eliza Coupe, Bill Paxton, Alan Ruck, Zhu Shimao]",[Daniel Hsia],"[sam, ambitious, new, york, attorney, legal, beautiful, relocation, specialist, smart, assistant, shanghai, immediately, stumbles, mess, could, end, connected, old, clever, journalist, might, find, romance, com, , sent, assignment, career, help]"


In [44]:
# Creating function to lowercase all the letters in the list

def lowercase(mylist):
    newlist = []
    for element in mylist:
        newelement = element.lower()
        newlist.append(newelement)
    return newlist

In [45]:
# Apply function to lower case all elements, for every value in every column except the 'title' column
# since will be redundant for 'overview' feature

clean1_df.iloc[:,1:] = clean1_df.iloc[:,1:].applymap(lowercase)
clean1_df.head()

Unnamed: 0,title,genres_list,keywords_list,prod_companies_list,cast_list,director_list,overview_key_list
0,Avatar,"[action, adventure, fantasy, science fiction]","[culture clash, future, space war, space colony, society, space travel, futuristic, romance, space, alien, tribe, alien planet, cgi, marine, soldier, battle, love affair, anti war, power relations, mind and soul, 3d]","[ingenious film partners, twentieth century fox film corporation, dune entertainment, lightstorm entertainment]","[sam worthington, zoe saldana, sigourney weaver, stephen lang, michelle rodriguez, giovanni ribisi, joel david moore, cch pounder, wes studi, laz alonso, dileep rao, matt gerald, sean anthony moran, jason whyte, scott lawrence, kelly kilgour, james patrick pitt, sean patrick murphy, peter dillon, kevin dorman, kelson henderson, david van horn, jacob tomuri, michael blainrozgay, jon curry, luke hawker, woody schultz, peter mensah, sonia yee, jahnel curfman, ilram choi, kyla warren, lisa roumain, debra wilson, chris mala, taylor kibby, jodie landau, julie lamm, cullen b madden, joseph brady madden, frankie torres, austin wilson, sara wilson, tamica washingtonmiller, lucy briant, nathan meister, gerry blair, matthew chamberlain, paul yates, wray wilson, james gaylyn, melvin leno clark iii, carvon futrell, brandon jelkes, micah moch, hanniyah muhammad, christopher nolen, christa oliver, april marie thomas, bravita a threatt, colin bleasdale, mike bodnar, matt clayton, nicole dionne, jamie harrison, allan henry, anthony ingruber, ashley jeffery, dean knowsley, joseph mikahunt, terry notary, kai pantano, logan pithyou, stuart pollock, raja, gareth ruck, rhian sheehan, t j storm, jodie taylor, alicia velabailey, richard whiteside, nikie zambo, julene renee]",[james cameron],"[22nd, century, paraplegic, marine, moon, pandora, unique, mission, becomes, torn, following, orders, alien, civilization, dispatched, protecting]"
1,Pirates of the Caribbean: At World's End,"[adventure, fantasy, action]","[ocean, drug abuse, exotic island, east india trading company, love of ones life, traitor, shipwreck, strong woman, ship, alliance, calypso, afterlife, fighter, pirate, swashbuckler, aftercreditsstinger]","[walt disney pictures, jerry bruckheimer films, second mate productions]","[johnny depp, orlando bloom, keira knightley, stellan skarsgård, chow yunfat, bill nighy, geoffrey rush, jack davenport, kevin mcnally, tom hollander, naomie harris, jonathan pryce, keith richards, lee arenberg, mackenzie crook, greg ellis, david bailie, martin klebba, david schofield, lauren maher, vanessa branch, angus barnett, giles new, reggie lee, dominic scott kay, takayo fischer, david meunier, hokwan tse, andy beckwith, peter donald badalamenti ii, christopher s capp, keith richards, hakeem kaekazim, ghassan massoud]",[gore verbinski],"[captain, barbossa, long, believed, come, back, elizabeth, swann, dead, life, headed, edge, earth, turner, nothing, quite, seems]"
2,Spectre,"[action, adventure, crime]","[spy, based on novel, secret agent, sequel, mi6, british secret service, united kingdom]","[columbia pictures, danjaq, b24]","[daniel craig, christoph waltz, léa seydoux, ralph fiennes, monica bellucci, ben whishaw, naomie harris, dave bautista, andrew scott, rory kinnear, jesper christensen, alessandro cremona, stephanie sigman, tenoch huerta, adriana paz, domenico fortunato, marco zingaro, stefano elfi diclaudia, ian bonar, tam williams, richard banham, pip carter, simon lenagan, alessandro bressanello, marc zinga, brigitte millar, adel bencherif, gediminas adomaitis, peppe lanzetta, francesco arca, matteo taranto, emilio aniba, benito sagredo, dai tabuchi, george lasha, sargon yelda, andy cheung, erick hayden, oleg mirochnikov, antonio salines, miloud mourad benamara, gido schimanski, nigel barber, patrice naiambana, stephane cornicard, gary fannin, sadao ueda, phillip law, wai wong, joseph balderrama, eiji mihara, junichi kajioka, victor schefé, harald windisch, tristan matthiae, detlef bothe, bodo friesecke, wilhem iben, noemi krausz, noah saavedra, francis attakpah, michael glantschnig, marlon boess, marie wohlmuth, lili epply, konstantin gerlach, lara parmiani, umit ulgen, amra mallassi, ziad abaza, walid mumuni, derek horsham, nari blairmangat, michael white, adam mcgrady, nader dernaika, pezhmaan alinia, judi dench, neve gachev, karl farrer, kim adis, maurisa selene coleman, matija mondi matović]",[sam mendes],"[bond, terrible, truth, behind, spectre, battles, political, forces, secret, service, alive, peels, back, cryptic, message, ’, past, sends, sinister, organization, trail, uncover, keep, layers, deceit, reveal]"
3,The Dark Knight Rises,"[action, crime, drama, thriller]","[dc comics, crime fighter, terrorist, secret identity, burglar, hostage drama, time bomb, gotham city, vigilante, coverup, superhero, villainess, tragic hero, terrorism, destruction, catwoman, cat burglar, imax, flood, criminal underworld, batman]","[legendary pictures, warner bros, dc entertainment, syncopy]","[christian bale, michael caine, gary oldman, anne hathaway, tom hardy, marion cotillard, joseph gordonlevitt, morgan freeman, cillian murphy, juno temple, liam neeson, matthew modine, alon aboutboul, ben mendelsohn, nestor carbonell, josh pence, tom conti, joey king, warren brown, daniel sunjata, sam kennard, aliash tepina, nick julian, miranda nolan, claire julien, aidan gillen, burn gorman, brett cullen, reggie lee, joseph lyle taylor, chris ellis, duane henry, james harvey ward, gonzalo menendez, cameron jack, lex daniel, tyler dean flores, thomas lennon, trevor white, rob brown, fredric lehne, courtney munch, chris hill, travis guba, jay benedict, will estes, david dayan fisher, glen powell, russ fega, andres perezmolina, brent briscoe, john nolan, oliver cotton, mark killeen, sarah goldberg, john macmillan, robert wisdom, ronnie gene blevins, john hollingworth, ian bohen, uri gavriel, noel gugliemi, hector atreyu ruiz, patrick cox, aramis knight, josh stewart, william devane, david gyasi, patrick jordan, joshua elijah reese, desmond harrington, mychael bates, tomas arana, peter holden, david monahan, jillian armenante, aja evans, aldous davidson, wade williams, jake canuso, massi furlan, christopher judge, patrick leahy, todd gearhart, marc abbink, isiah adams, charlie alejandro, robert arensen, grant babbitt, fileena bahris, rick bolander, kyle patrick brennan, scott churchson, bill cowher, graham curry, stephanie domini ehlert, john farrer, frank fata, christopher bryan gomez, vito grassi, ...]",[christopher nolan],"[attorney, gotham, dent, batman, city, district, harvey, police, department, assumes, responsibility, eight, years, later, mysterious, selina, kyle, new, terrorist, leader, dark, knight, resurfaces, protect, late, subsequently, hunted, encounters, villainous, bane]"
4,John Carter,"[action, adventure, science fiction]","[based on novel, mars, medallion, space travel, princess, alien, steampunk, martian, escape, edgar rice burroughs, alien race, superhuman strength, mars civilization, sword and planet, 19th century, 3d]",[walt disney pictures],"[taylor kitsch, lynn collins, samantha morton, willem dafoe, thomas haden church, mark strong, ciarán hinds, dominic west, james purefoy, bryan cranston, polly walker, daryl sabara, arkie reece, davood ghadami, pippa nixon, james embree, philip philmar, emily tierney, edmund kente, nicholas woodeson, kyle agnew, don stark, josh daugherty, jared cyr, christopher goodman, amanda clayton, joseph billingiere]",[andrew stanton],"[carter, former, military, captain, reluctantly, becomes, embroiled, john, inexplicably, transported, exotic, planet, barsoom, epic, conflict, rediscovers, people, rests, war, weary, mysterious, mars, world, brink, collapse, humanity, realizes, survival, hands]"


In [46]:
# The feature 'keywords' may contain stopwords, since the elements are phrases instead of names
# I need to tokenize each element of each list, then I will remove stopwords

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

def remove_stopwords(mylist):
    stop_words = set(stopwords.words('english'))
    newlist = []
    for element in mylist:
        words = word_tokenize(element)
        filtered_words = [word for word in words if word not in stop_words]
        newlist.append(' '.join(filtered_words))
    return newlist

In [47]:
clean1_df['keywords_list'] = clean1_df['keywords_list'].apply(remove_stopwords)
clean1_df.head()

Unnamed: 0,title,genres_list,keywords_list,prod_companies_list,cast_list,director_list,overview_key_list
0,Avatar,"[action, adventure, fantasy, science fiction]","[culture clash, future, space war, space colony, society, space travel, futuristic, romance, space, alien, tribe, alien planet, cgi, marine, soldier, battle, love affair, anti war, power relations, mind soul, 3d]","[ingenious film partners, twentieth century fox film corporation, dune entertainment, lightstorm entertainment]","[sam worthington, zoe saldana, sigourney weaver, stephen lang, michelle rodriguez, giovanni ribisi, joel david moore, cch pounder, wes studi, laz alonso, dileep rao, matt gerald, sean anthony moran, jason whyte, scott lawrence, kelly kilgour, james patrick pitt, sean patrick murphy, peter dillon, kevin dorman, kelson henderson, david van horn, jacob tomuri, michael blainrozgay, jon curry, luke hawker, woody schultz, peter mensah, sonia yee, jahnel curfman, ilram choi, kyla warren, lisa roumain, debra wilson, chris mala, taylor kibby, jodie landau, julie lamm, cullen b madden, joseph brady madden, frankie torres, austin wilson, sara wilson, tamica washingtonmiller, lucy briant, nathan meister, gerry blair, matthew chamberlain, paul yates, wray wilson, james gaylyn, melvin leno clark iii, carvon futrell, brandon jelkes, micah moch, hanniyah muhammad, christopher nolen, christa oliver, april marie thomas, bravita a threatt, colin bleasdale, mike bodnar, matt clayton, nicole dionne, jamie harrison, allan henry, anthony ingruber, ashley jeffery, dean knowsley, joseph mikahunt, terry notary, kai pantano, logan pithyou, stuart pollock, raja, gareth ruck, rhian sheehan, t j storm, jodie taylor, alicia velabailey, richard whiteside, nikie zambo, julene renee]",[james cameron],"[22nd, century, paraplegic, marine, moon, pandora, unique, mission, becomes, torn, following, orders, alien, civilization, dispatched, protecting]"
1,Pirates of the Caribbean: At World's End,"[adventure, fantasy, action]","[ocean, drug abuse, exotic island, east india trading company, love ones life, traitor, shipwreck, strong woman, ship, alliance, calypso, afterlife, fighter, pirate, swashbuckler, aftercreditsstinger]","[walt disney pictures, jerry bruckheimer films, second mate productions]","[johnny depp, orlando bloom, keira knightley, stellan skarsgård, chow yunfat, bill nighy, geoffrey rush, jack davenport, kevin mcnally, tom hollander, naomie harris, jonathan pryce, keith richards, lee arenberg, mackenzie crook, greg ellis, david bailie, martin klebba, david schofield, lauren maher, vanessa branch, angus barnett, giles new, reggie lee, dominic scott kay, takayo fischer, david meunier, hokwan tse, andy beckwith, peter donald badalamenti ii, christopher s capp, keith richards, hakeem kaekazim, ghassan massoud]",[gore verbinski],"[captain, barbossa, long, believed, come, back, elizabeth, swann, dead, life, headed, edge, earth, turner, nothing, quite, seems]"
2,Spectre,"[action, adventure, crime]","[spy, based novel, secret agent, sequel, mi6, british secret service, united kingdom]","[columbia pictures, danjaq, b24]","[daniel craig, christoph waltz, léa seydoux, ralph fiennes, monica bellucci, ben whishaw, naomie harris, dave bautista, andrew scott, rory kinnear, jesper christensen, alessandro cremona, stephanie sigman, tenoch huerta, adriana paz, domenico fortunato, marco zingaro, stefano elfi diclaudia, ian bonar, tam williams, richard banham, pip carter, simon lenagan, alessandro bressanello, marc zinga, brigitte millar, adel bencherif, gediminas adomaitis, peppe lanzetta, francesco arca, matteo taranto, emilio aniba, benito sagredo, dai tabuchi, george lasha, sargon yelda, andy cheung, erick hayden, oleg mirochnikov, antonio salines, miloud mourad benamara, gido schimanski, nigel barber, patrice naiambana, stephane cornicard, gary fannin, sadao ueda, phillip law, wai wong, joseph balderrama, eiji mihara, junichi kajioka, victor schefé, harald windisch, tristan matthiae, detlef bothe, bodo friesecke, wilhem iben, noemi krausz, noah saavedra, francis attakpah, michael glantschnig, marlon boess, marie wohlmuth, lili epply, konstantin gerlach, lara parmiani, umit ulgen, amra mallassi, ziad abaza, walid mumuni, derek horsham, nari blairmangat, michael white, adam mcgrady, nader dernaika, pezhmaan alinia, judi dench, neve gachev, karl farrer, kim adis, maurisa selene coleman, matija mondi matović]",[sam mendes],"[bond, terrible, truth, behind, spectre, battles, political, forces, secret, service, alive, peels, back, cryptic, message, ’, past, sends, sinister, organization, trail, uncover, keep, layers, deceit, reveal]"
3,The Dark Knight Rises,"[action, crime, drama, thriller]","[dc comics, crime fighter, terrorist, secret identity, burglar, hostage drama, time bomb, gotham city, vigilante, coverup, superhero, villainess, tragic hero, terrorism, destruction, catwoman, cat burglar, imax, flood, criminal underworld, batman]","[legendary pictures, warner bros, dc entertainment, syncopy]","[christian bale, michael caine, gary oldman, anne hathaway, tom hardy, marion cotillard, joseph gordonlevitt, morgan freeman, cillian murphy, juno temple, liam neeson, matthew modine, alon aboutboul, ben mendelsohn, nestor carbonell, josh pence, tom conti, joey king, warren brown, daniel sunjata, sam kennard, aliash tepina, nick julian, miranda nolan, claire julien, aidan gillen, burn gorman, brett cullen, reggie lee, joseph lyle taylor, chris ellis, duane henry, james harvey ward, gonzalo menendez, cameron jack, lex daniel, tyler dean flores, thomas lennon, trevor white, rob brown, fredric lehne, courtney munch, chris hill, travis guba, jay benedict, will estes, david dayan fisher, glen powell, russ fega, andres perezmolina, brent briscoe, john nolan, oliver cotton, mark killeen, sarah goldberg, john macmillan, robert wisdom, ronnie gene blevins, john hollingworth, ian bohen, uri gavriel, noel gugliemi, hector atreyu ruiz, patrick cox, aramis knight, josh stewart, william devane, david gyasi, patrick jordan, joshua elijah reese, desmond harrington, mychael bates, tomas arana, peter holden, david monahan, jillian armenante, aja evans, aldous davidson, wade williams, jake canuso, massi furlan, christopher judge, patrick leahy, todd gearhart, marc abbink, isiah adams, charlie alejandro, robert arensen, grant babbitt, fileena bahris, rick bolander, kyle patrick brennan, scott churchson, bill cowher, graham curry, stephanie domini ehlert, john farrer, frank fata, christopher bryan gomez, vito grassi, ...]",[christopher nolan],"[attorney, gotham, dent, batman, city, district, harvey, police, department, assumes, responsibility, eight, years, later, mysterious, selina, kyle, new, terrorist, leader, dark, knight, resurfaces, protect, late, subsequently, hunted, encounters, villainous, bane]"
4,John Carter,"[action, adventure, science fiction]","[based novel, mars, medallion, space travel, princess, alien, steampunk, martian, escape, edgar rice burroughs, alien race, superhuman strength, mars civilization, sword planet, 19th century, 3d]",[walt disney pictures],"[taylor kitsch, lynn collins, samantha morton, willem dafoe, thomas haden church, mark strong, ciarán hinds, dominic west, james purefoy, bryan cranston, polly walker, daryl sabara, arkie reece, davood ghadami, pippa nixon, james embree, philip philmar, emily tierney, edmund kente, nicholas woodeson, kyle agnew, don stark, josh daugherty, jared cyr, christopher goodman, amanda clayton, joseph billingiere]",[andrew stanton],"[carter, former, military, captain, reluctantly, becomes, embroiled, john, inexplicably, transported, exotic, planet, barsoom, epic, conflict, rediscovers, people, rests, war, weary, mysterious, mars, world, brink, collapse, humanity, realizes, survival, hands]"


In [48]:
## Each element of each lists are strings. We will remove spaces to create a unique string.
## e.g. 'tom sawyer' and 'tom kitsch' have similarities, but 'tomsawyer' and 'tomkitsch' are distinct.

def remove_space(mylist):
    newlist = []
    newlist = [element.replace(' ', '') for element in mylist]
    return newlist

In [49]:
# Apply function to remove space from all elements, for 'prod_companies_list' , 'cast_list' , 'director_list' features
# unnecessary for 'overview' feature

clean1_df[['genres_list','keywords_list','prod_companies_list','cast_list','director_list']] = clean1_df[[ \
            'genres_list','keywords_list','prod_companies_list','cast_list','director_list']].applymap(remove_space)
clean1_df.head()

Unnamed: 0,title,genres_list,keywords_list,prod_companies_list,cast_list,director_list,overview_key_list
0,Avatar,"[action, adventure, fantasy, sciencefiction]","[cultureclash, future, spacewar, spacecolony, society, spacetravel, futuristic, romance, space, alien, tribe, alienplanet, cgi, marine, soldier, battle, loveaffair, antiwar, powerrelations, mindsoul, 3d]","[ingeniousfilmpartners, twentiethcenturyfoxfilmcorporation, duneentertainment, lightstormentertainment]","[samworthington, zoesaldana, sigourneyweaver, stephenlang, michellerodriguez, giovanniribisi, joeldavidmoore, cchpounder, wesstudi, lazalonso, dileeprao, mattgerald, seananthonymoran, jasonwhyte, scottlawrence, kellykilgour, jamespatrickpitt, seanpatrickmurphy, peterdillon, kevindorman, kelsonhenderson, davidvanhorn, jacobtomuri, michaelblainrozgay, joncurry, lukehawker, woodyschultz, petermensah, soniayee, jahnelcurfman, ilramchoi, kylawarren, lisaroumain, debrawilson, chrismala, taylorkibby, jodielandau, julielamm, cullenbmadden, josephbradymadden, frankietorres, austinwilson, sarawilson, tamicawashingtonmiller, lucybriant, nathanmeister, gerryblair, matthewchamberlain, paulyates, wraywilson, jamesgaylyn, melvinlenoclarkiii, carvonfutrell, brandonjelkes, micahmoch, hanniyahmuhammad, christophernolen, christaoliver, aprilmariethomas, bravitaathreatt, colinbleasdale, mikebodnar, mattclayton, nicoledionne, jamieharrison, allanhenry, anthonyingruber, ashleyjeffery, deanknowsley, josephmikahunt, terrynotary, kaipantano, loganpithyou, stuartpollock, raja, garethruck, rhiansheehan, tjstorm, jodietaylor, aliciavelabailey, richardwhiteside, nikiezambo, julenerenee]",[jamescameron],"[22nd, century, paraplegic, marine, moon, pandora, unique, mission, becomes, torn, following, orders, alien, civilization, dispatched, protecting]"
1,Pirates of the Caribbean: At World's End,"[adventure, fantasy, action]","[ocean, drugabuse, exoticisland, eastindiatradingcompany, loveoneslife, traitor, shipwreck, strongwoman, ship, alliance, calypso, afterlife, fighter, pirate, swashbuckler, aftercreditsstinger]","[waltdisneypictures, jerrybruckheimerfilms, secondmateproductions]","[johnnydepp, orlandobloom, keiraknightley, stellanskarsgård, chowyunfat, billnighy, geoffreyrush, jackdavenport, kevinmcnally, tomhollander, naomieharris, jonathanpryce, keithrichards, leearenberg, mackenziecrook, gregellis, davidbailie, martinklebba, davidschofield, laurenmaher, vanessabranch, angusbarnett, gilesnew, reggielee, dominicscottkay, takayofischer, davidmeunier, hokwantse, andybeckwith, peterdonaldbadalamentiii, christopherscapp, keithrichards, hakeemkaekazim, ghassanmassoud]",[goreverbinski],"[captain, barbossa, long, believed, come, back, elizabeth, swann, dead, life, headed, edge, earth, turner, nothing, quite, seems]"
2,Spectre,"[action, adventure, crime]","[spy, basednovel, secretagent, sequel, mi6, britishsecretservice, unitedkingdom]","[columbiapictures, danjaq, b24]","[danielcraig, christophwaltz, léaseydoux, ralphfiennes, monicabellucci, benwhishaw, naomieharris, davebautista, andrewscott, rorykinnear, jesperchristensen, alessandrocremona, stephaniesigman, tenochhuerta, adrianapaz, domenicofortunato, marcozingaro, stefanoelfidiclaudia, ianbonar, tamwilliams, richardbanham, pipcarter, simonlenagan, alessandrobressanello, marczinga, brigittemillar, adelbencherif, gediminasadomaitis, peppelanzetta, francescoarca, matteotaranto, emilioaniba, benitosagredo, daitabuchi, georgelasha, sargonyelda, andycheung, erickhayden, olegmirochnikov, antoniosalines, miloudmouradbenamara, gidoschimanski, nigelbarber, patricenaiambana, stephanecornicard, garyfannin, sadaoueda, philliplaw, waiwong, josephbalderrama, eijimihara, junichikajioka, victorschefé, haraldwindisch, tristanmatthiae, detlefbothe, bodofriesecke, wilhemiben, noemikrausz, noahsaavedra, francisattakpah, michaelglantschnig, marlonboess, mariewohlmuth, liliepply, konstantingerlach, laraparmiani, umitulgen, amramallassi, ziadabaza, walidmumuni, derekhorsham, nariblairmangat, michaelwhite, adammcgrady, naderdernaika, pezhmaanalinia, judidench, nevegachev, karlfarrer, kimadis, maurisaselenecoleman, matijamondimatović]",[sammendes],"[bond, terrible, truth, behind, spectre, battles, political, forces, secret, service, alive, peels, back, cryptic, message, ’, past, sends, sinister, organization, trail, uncover, keep, layers, deceit, reveal]"
3,The Dark Knight Rises,"[action, crime, drama, thriller]","[dccomics, crimefighter, terrorist, secretidentity, burglar, hostagedrama, timebomb, gothamcity, vigilante, coverup, superhero, villainess, tragichero, terrorism, destruction, catwoman, catburglar, imax, flood, criminalunderworld, batman]","[legendarypictures, warnerbros, dcentertainment, syncopy]","[christianbale, michaelcaine, garyoldman, annehathaway, tomhardy, marioncotillard, josephgordonlevitt, morganfreeman, cillianmurphy, junotemple, liamneeson, matthewmodine, alonaboutboul, benmendelsohn, nestorcarbonell, joshpence, tomconti, joeyking, warrenbrown, danielsunjata, samkennard, aliashtepina, nickjulian, mirandanolan, clairejulien, aidangillen, burngorman, brettcullen, reggielee, josephlyletaylor, chrisellis, duanehenry, jamesharveyward, gonzalomenendez, cameronjack, lexdaniel, tylerdeanflores, thomaslennon, trevorwhite, robbrown, fredriclehne, courtneymunch, chrishill, travisguba, jaybenedict, willestes, daviddayanfisher, glenpowell, russfega, andresperezmolina, brentbriscoe, johnnolan, olivercotton, markkilleen, sarahgoldberg, johnmacmillan, robertwisdom, ronniegeneblevins, johnhollingworth, ianbohen, urigavriel, noelgugliemi, hectoratreyuruiz, patrickcox, aramisknight, joshstewart, williamdevane, davidgyasi, patrickjordan, joshuaelijahreese, desmondharrington, mychaelbates, tomasarana, peterholden, davidmonahan, jillianarmenante, ajaevans, aldousdavidson, wadewilliams, jakecanuso, massifurlan, christopherjudge, patrickleahy, toddgearhart, marcabbink, isiahadams, charliealejandro, robertarensen, grantbabbitt, fileenabahris, rickbolander, kylepatrickbrennan, scottchurchson, billcowher, grahamcurry, stephaniedominiehlert, johnfarrer, frankfata, christopherbryangomez, vitograssi, ...]",[christophernolan],"[attorney, gotham, dent, batman, city, district, harvey, police, department, assumes, responsibility, eight, years, later, mysterious, selina, kyle, new, terrorist, leader, dark, knight, resurfaces, protect, late, subsequently, hunted, encounters, villainous, bane]"
4,John Carter,"[action, adventure, sciencefiction]","[basednovel, mars, medallion, spacetravel, princess, alien, steampunk, martian, escape, edgarriceburroughs, alienrace, superhumanstrength, marscivilization, swordplanet, 19thcentury, 3d]",[waltdisneypictures],"[taylorkitsch, lynncollins, samanthamorton, willemdafoe, thomashadenchurch, markstrong, ciaránhinds, dominicwest, jamespurefoy, bryancranston, pollywalker, darylsabara, arkiereece, davoodghadami, pippanixon, jamesembree, philipphilmar, emilytierney, edmundkente, nicholaswoodeson, kyleagnew, donstark, joshdaugherty, jaredcyr, christophergoodman, amandaclayton, josephbillingiere]",[andrewstanton],"[carter, former, military, captain, reluctantly, becomes, embroiled, john, inexplicably, transported, exotic, planet, barsoom, epic, conflict, rediscovers, people, rests, war, weary, mysterious, mars, world, brink, collapse, humanity, realizes, survival, hands]"


In [50]:
# Now that unique values are merged into one string each, I will be using only unigrams for the TF-IDF vectorizer.

### Tokenization

In [51]:
# The elements of the feature 'release_date' is in yyyy-mm-dd format. We will isolate yyyy.

from nltk.tokenize import RegexpTokenizer
tokenizer = RegexpTokenizer(r'\w+')

In [52]:
## first, tokenize
clean1_df['year'] = movies_df['release_date'].apply(lambda x: tokenizer.tokenize(x) )
clean1_df[['year']].head()

Unnamed: 0,year
0,"[2009, 12, 10]"
1,"[2007, 05, 19]"
2,"[2015, 10, 26]"
3,"[2012, 07, 16]"
4,"[2012, 03, 07]"


In [53]:
## delete 2nd and 3rd element from each list, to retain only year
for i in range(0,len(clean1_df)):
    del clean1_df['year'][i][1:3]
    
clean1_df[['year']].head()

Unnamed: 0,year
0,[2009]
1,[2007]
2,[2015]
3,[2012]
4,[2012]


### Lemmatization

In [54]:
wn = nltk.WordNetLemmatizer()

# Creating function to apply lematizer
def lemmatize(tokenized_text):
    text = [wn.lemmatize(word) for word in tokenized_text]
    return text


In [55]:
# Applying the Lemmatizer
clean1_df['overview_key_list'] = clean1_df['overview_key_list'].apply(lemmatize)

In [56]:
clean1_df[['overview_key_list']].head()

Unnamed: 0,overview_key_list
0,"[22nd, century, paraplegic, marine, moon, pandora, unique, mission, becomes, torn, following, order, alien, civilization, dispatched, protecting]"
1,"[captain, barbossa, long, believed, come, back, elizabeth, swann, dead, life, headed, edge, earth, turner, nothing, quite, seems]"
2,"[bond, terrible, truth, behind, spectre, battle, political, force, secret, service, alive, peel, back, cryptic, message, ’, past, sends, sinister, organization, trail, uncover, keep, layer, deceit, reveal]"
3,"[attorney, gotham, dent, batman, city, district, harvey, police, department, assumes, responsibility, eight, year, later, mysterious, selina, kyle, new, terrorist, leader, dark, knight, resurfaces, protect, late, subsequently, hunted, encounter, villainous, bane]"
4,"[carter, former, military, captain, reluctantly, becomes, embroiled, john, inexplicably, transported, exotic, planet, barsoom, epic, conflict, rediscovers, people, rest, war, weary, mysterious, mar, world, brink, collapse, humanity, realizes, survival, hand]"


In [57]:
# Add 'original_language' feature to clean dataframe while maintaining elements in list format for consistency
clean1_df['language'] = movies_df['original_language'].apply(lambda x: [x])
clean1_df.head()

Unnamed: 0,title,genres_list,keywords_list,prod_companies_list,cast_list,director_list,overview_key_list,year,language
0,Avatar,"[action, adventure, fantasy, sciencefiction]","[cultureclash, future, spacewar, spacecolony, society, spacetravel, futuristic, romance, space, alien, tribe, alienplanet, cgi, marine, soldier, battle, loveaffair, antiwar, powerrelations, mindsoul, 3d]","[ingeniousfilmpartners, twentiethcenturyfoxfilmcorporation, duneentertainment, lightstormentertainment]","[samworthington, zoesaldana, sigourneyweaver, stephenlang, michellerodriguez, giovanniribisi, joeldavidmoore, cchpounder, wesstudi, lazalonso, dileeprao, mattgerald, seananthonymoran, jasonwhyte, scottlawrence, kellykilgour, jamespatrickpitt, seanpatrickmurphy, peterdillon, kevindorman, kelsonhenderson, davidvanhorn, jacobtomuri, michaelblainrozgay, joncurry, lukehawker, woodyschultz, petermensah, soniayee, jahnelcurfman, ilramchoi, kylawarren, lisaroumain, debrawilson, chrismala, taylorkibby, jodielandau, julielamm, cullenbmadden, josephbradymadden, frankietorres, austinwilson, sarawilson, tamicawashingtonmiller, lucybriant, nathanmeister, gerryblair, matthewchamberlain, paulyates, wraywilson, jamesgaylyn, melvinlenoclarkiii, carvonfutrell, brandonjelkes, micahmoch, hanniyahmuhammad, christophernolen, christaoliver, aprilmariethomas, bravitaathreatt, colinbleasdale, mikebodnar, mattclayton, nicoledionne, jamieharrison, allanhenry, anthonyingruber, ashleyjeffery, deanknowsley, josephmikahunt, terrynotary, kaipantano, loganpithyou, stuartpollock, raja, garethruck, rhiansheehan, tjstorm, jodietaylor, aliciavelabailey, richardwhiteside, nikiezambo, julenerenee]",[jamescameron],"[22nd, century, paraplegic, marine, moon, pandora, unique, mission, becomes, torn, following, order, alien, civilization, dispatched, protecting]",[2009],[en]
1,Pirates of the Caribbean: At World's End,"[adventure, fantasy, action]","[ocean, drugabuse, exoticisland, eastindiatradingcompany, loveoneslife, traitor, shipwreck, strongwoman, ship, alliance, calypso, afterlife, fighter, pirate, swashbuckler, aftercreditsstinger]","[waltdisneypictures, jerrybruckheimerfilms, secondmateproductions]","[johnnydepp, orlandobloom, keiraknightley, stellanskarsgård, chowyunfat, billnighy, geoffreyrush, jackdavenport, kevinmcnally, tomhollander, naomieharris, jonathanpryce, keithrichards, leearenberg, mackenziecrook, gregellis, davidbailie, martinklebba, davidschofield, laurenmaher, vanessabranch, angusbarnett, gilesnew, reggielee, dominicscottkay, takayofischer, davidmeunier, hokwantse, andybeckwith, peterdonaldbadalamentiii, christopherscapp, keithrichards, hakeemkaekazim, ghassanmassoud]",[goreverbinski],"[captain, barbossa, long, believed, come, back, elizabeth, swann, dead, life, headed, edge, earth, turner, nothing, quite, seems]",[2007],[en]
2,Spectre,"[action, adventure, crime]","[spy, basednovel, secretagent, sequel, mi6, britishsecretservice, unitedkingdom]","[columbiapictures, danjaq, b24]","[danielcraig, christophwaltz, léaseydoux, ralphfiennes, monicabellucci, benwhishaw, naomieharris, davebautista, andrewscott, rorykinnear, jesperchristensen, alessandrocremona, stephaniesigman, tenochhuerta, adrianapaz, domenicofortunato, marcozingaro, stefanoelfidiclaudia, ianbonar, tamwilliams, richardbanham, pipcarter, simonlenagan, alessandrobressanello, marczinga, brigittemillar, adelbencherif, gediminasadomaitis, peppelanzetta, francescoarca, matteotaranto, emilioaniba, benitosagredo, daitabuchi, georgelasha, sargonyelda, andycheung, erickhayden, olegmirochnikov, antoniosalines, miloudmouradbenamara, gidoschimanski, nigelbarber, patricenaiambana, stephanecornicard, garyfannin, sadaoueda, philliplaw, waiwong, josephbalderrama, eijimihara, junichikajioka, victorschefé, haraldwindisch, tristanmatthiae, detlefbothe, bodofriesecke, wilhemiben, noemikrausz, noahsaavedra, francisattakpah, michaelglantschnig, marlonboess, mariewohlmuth, liliepply, konstantingerlach, laraparmiani, umitulgen, amramallassi, ziadabaza, walidmumuni, derekhorsham, nariblairmangat, michaelwhite, adammcgrady, naderdernaika, pezhmaanalinia, judidench, nevegachev, karlfarrer, kimadis, maurisaselenecoleman, matijamondimatović]",[sammendes],"[bond, terrible, truth, behind, spectre, battle, political, force, secret, service, alive, peel, back, cryptic, message, ’, past, sends, sinister, organization, trail, uncover, keep, layer, deceit, reveal]",[2015],[en]
3,The Dark Knight Rises,"[action, crime, drama, thriller]","[dccomics, crimefighter, terrorist, secretidentity, burglar, hostagedrama, timebomb, gothamcity, vigilante, coverup, superhero, villainess, tragichero, terrorism, destruction, catwoman, catburglar, imax, flood, criminalunderworld, batman]","[legendarypictures, warnerbros, dcentertainment, syncopy]","[christianbale, michaelcaine, garyoldman, annehathaway, tomhardy, marioncotillard, josephgordonlevitt, morganfreeman, cillianmurphy, junotemple, liamneeson, matthewmodine, alonaboutboul, benmendelsohn, nestorcarbonell, joshpence, tomconti, joeyking, warrenbrown, danielsunjata, samkennard, aliashtepina, nickjulian, mirandanolan, clairejulien, aidangillen, burngorman, brettcullen, reggielee, josephlyletaylor, chrisellis, duanehenry, jamesharveyward, gonzalomenendez, cameronjack, lexdaniel, tylerdeanflores, thomaslennon, trevorwhite, robbrown, fredriclehne, courtneymunch, chrishill, travisguba, jaybenedict, willestes, daviddayanfisher, glenpowell, russfega, andresperezmolina, brentbriscoe, johnnolan, olivercotton, markkilleen, sarahgoldberg, johnmacmillan, robertwisdom, ronniegeneblevins, johnhollingworth, ianbohen, urigavriel, noelgugliemi, hectoratreyuruiz, patrickcox, aramisknight, joshstewart, williamdevane, davidgyasi, patrickjordan, joshuaelijahreese, desmondharrington, mychaelbates, tomasarana, peterholden, davidmonahan, jillianarmenante, ajaevans, aldousdavidson, wadewilliams, jakecanuso, massifurlan, christopherjudge, patrickleahy, toddgearhart, marcabbink, isiahadams, charliealejandro, robertarensen, grantbabbitt, fileenabahris, rickbolander, kylepatrickbrennan, scottchurchson, billcowher, grahamcurry, stephaniedominiehlert, johnfarrer, frankfata, christopherbryangomez, vitograssi, ...]",[christophernolan],"[attorney, gotham, dent, batman, city, district, harvey, police, department, assumes, responsibility, eight, year, later, mysterious, selina, kyle, new, terrorist, leader, dark, knight, resurfaces, protect, late, subsequently, hunted, encounter, villainous, bane]",[2012],[en]
4,John Carter,"[action, adventure, sciencefiction]","[basednovel, mars, medallion, spacetravel, princess, alien, steampunk, martian, escape, edgarriceburroughs, alienrace, superhumanstrength, marscivilization, swordplanet, 19thcentury, 3d]",[waltdisneypictures],"[taylorkitsch, lynncollins, samanthamorton, willemdafoe, thomashadenchurch, markstrong, ciaránhinds, dominicwest, jamespurefoy, bryancranston, pollywalker, darylsabara, arkiereece, davoodghadami, pippanixon, jamesembree, philipphilmar, emilytierney, edmundkente, nicholaswoodeson, kyleagnew, donstark, joshdaugherty, jaredcyr, christophergoodman, amandaclayton, josephbillingiere]",[andrewstanton],"[carter, former, military, captain, reluctantly, becomes, embroiled, john, inexplicably, transported, exotic, planet, barsoom, epic, conflict, rediscovers, people, rest, war, weary, mysterious, mar, world, brink, collapse, humanity, realizes, survival, hand]",[2012],[en]


## Vectorizer: TF-IDF (Word Embeddings)

In [58]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Instantiate TF-IDF vectorizer
# ngram_range: default to unigrams
tf = TfidfVectorizer()

In [59]:
# create dataframe without the movie titles
df = clean1_df.drop('title', axis=1)

# convert list elements to strings
df = df.applymap(lambda x: ' '.join(x))
df.head()

Unnamed: 0,genres_list,keywords_list,prod_companies_list,cast_list,director_list,overview_key_list,year,language
0,action adventure fantasy sciencefiction,cultureclash future spacewar spacecolony society spacetravel futuristic romance space alien tribe alienplanet cgi marine soldier battle loveaffair antiwar powerrelations mindsoul 3d,ingeniousfilmpartners twentiethcenturyfoxfilmcorporation duneentertainment lightstormentertainment,samworthington zoesaldana sigourneyweaver stephenlang michellerodriguez giovanniribisi joeldavidmoore cchpounder wesstudi lazalonso dileeprao mattgerald seananthonymoran jasonwhyte scottlawrence kellykilgour jamespatrickpitt seanpatrickmurphy peterdillon kevindorman kelsonhenderson davidvanhorn jacobtomuri michaelblainrozgay joncurry lukehawker woodyschultz petermensah soniayee jahnelcurfman ilramchoi kylawarren lisaroumain debrawilson chrismala taylorkibby jodielandau julielamm cullenbmadden josephbradymadden frankietorres austinwilson sarawilson tamicawashingtonmiller lucybriant nathanmeister gerryblair matthewchamberlain paulyates wraywilson jamesgaylyn melvinlenoclarkiii carvonfutrell brandonjelkes micahmoch hanniyahmuhammad christophernolen christaoliver aprilmariethomas bravitaathreatt colinbleasdale mikebodnar mattclayton nicoledionne jamieharrison allanhenry anthonyingruber ashleyjeffery deanknowsley josephmikahunt terrynotary kaipantano loganpithyou stuartpollock raja garethruck rhiansheehan tjstorm jodietaylor aliciavelabailey richardwhiteside nikiezambo julenerenee,jamescameron,22nd century paraplegic marine moon pandora unique mission becomes torn following order alien civilization dispatched protecting,2009,en
1,adventure fantasy action,ocean drugabuse exoticisland eastindiatradingcompany loveoneslife traitor shipwreck strongwoman ship alliance calypso afterlife fighter pirate swashbuckler aftercreditsstinger,waltdisneypictures jerrybruckheimerfilms secondmateproductions,johnnydepp orlandobloom keiraknightley stellanskarsgård chowyunfat billnighy geoffreyrush jackdavenport kevinmcnally tomhollander naomieharris jonathanpryce keithrichards leearenberg mackenziecrook gregellis davidbailie martinklebba davidschofield laurenmaher vanessabranch angusbarnett gilesnew reggielee dominicscottkay takayofischer davidmeunier hokwantse andybeckwith peterdonaldbadalamentiii christopherscapp keithrichards hakeemkaekazim ghassanmassoud,goreverbinski,captain barbossa long believed come back elizabeth swann dead life headed edge earth turner nothing quite seems,2007,en
2,action adventure crime,spy basednovel secretagent sequel mi6 britishsecretservice unitedkingdom,columbiapictures danjaq b24,danielcraig christophwaltz léaseydoux ralphfiennes monicabellucci benwhishaw naomieharris davebautista andrewscott rorykinnear jesperchristensen alessandrocremona stephaniesigman tenochhuerta adrianapaz domenicofortunato marcozingaro stefanoelfidiclaudia ianbonar tamwilliams richardbanham pipcarter simonlenagan alessandrobressanello marczinga brigittemillar adelbencherif gediminasadomaitis peppelanzetta francescoarca matteotaranto emilioaniba benitosagredo daitabuchi georgelasha sargonyelda andycheung erickhayden olegmirochnikov antoniosalines miloudmouradbenamara gidoschimanski nigelbarber patricenaiambana stephanecornicard garyfannin sadaoueda philliplaw waiwong josephbalderrama eijimihara junichikajioka victorschefé haraldwindisch tristanmatthiae detlefbothe bodofriesecke wilhemiben noemikrausz noahsaavedra francisattakpah michaelglantschnig marlonboess mariewohlmuth liliepply konstantingerlach laraparmiani umitulgen amramallassi ziadabaza walidmumuni derekhorsham nariblairmangat michaelwhite adammcgrady naderdernaika pezhmaanalinia judidench nevegachev karlfarrer kimadis maurisaselenecoleman matijamondimatović,sammendes,bond terrible truth behind spectre battle political force secret service alive peel back cryptic message ’ past sends sinister organization trail uncover keep layer deceit reveal,2015,en
3,action crime drama thriller,dccomics crimefighter terrorist secretidentity burglar hostagedrama timebomb gothamcity vigilante coverup superhero villainess tragichero terrorism destruction catwoman catburglar imax flood criminalunderworld batman,legendarypictures warnerbros dcentertainment syncopy,christianbale michaelcaine garyoldman annehathaway tomhardy marioncotillard josephgordonlevitt morganfreeman cillianmurphy junotemple liamneeson matthewmodine alonaboutboul benmendelsohn nestorcarbonell joshpence tomconti joeyking warrenbrown danielsunjata samkennard aliashtepina nickjulian mirandanolan clairejulien aidangillen burngorman brettcullen reggielee josephlyletaylor chrisellis duanehenry jamesharveyward gonzalomenendez cameronjack lexdaniel tylerdeanflores thomaslennon trevorwhite robbrown fredriclehne courtneymunch chrishill travisguba jaybenedict willestes daviddayanfisher glenpowell russfega andresperezmolina brentbriscoe johnnolan olivercotton markkilleen sarahgoldberg johnmacmillan robertwisdom ronniegeneblevins johnhollingworth ianbohen urigavriel noelgugliemi hectoratreyuruiz patrickcox aramisknight joshstewart williamdevane davidgyasi patrickjordan joshuaelijahreese desmondharrington mychaelbates tomasarana peterholden davidmonahan jillianarmenante ajaevans aldousdavidson wadewilliams jakecanuso massifurlan christopherjudge patrickleahy toddgearhart marcabbink isiahadams charliealejandro robertarensen grantbabbitt fileenabahris rickbolander kylepatrickbrennan scottchurchson billcowher grahamcurry stephaniedominiehlert johnfarrer frankfata christopherbryangomez vitograssi michaelwrengucciardo samharris edheavey johnwiwanonkiw cindyjackson danieljordano williamkania tiffanykemp sunjaekim hrvojeklecz alexkruz tylerlamarr lejon pauljudeletersky joelipari silvialombardo jorgemardel calemcconnell daviddalemccue tiffanysandermckenzie allenmerritt olanmontgomery alexmoore shanenolan jacksonnunn josephobrien michaelpapajohn salomonpassariello kyledavidpierce troypolamalu michaelpower jamesrawlings kirstenroeters benroethlisberger markroman anthonyjsacco ericsalazar emilyschooley thomastull chrisvaina vincentvanommen michellevezzani barbaravincent justinmichaelwoods jasonyee mariazambrana johnzion alexziwak tommybayiokos matthewgooley jeffmoffitt markfalvo diogohausen garysievers orionmccabe londonmay jamesquinn danielnewman,christophernolan,attorney gotham dent batman city district harvey police department assumes responsibility eight year later mysterious selina kyle new terrorist leader dark knight resurfaces protect late subsequently hunted encounter villainous bane,2012,en
4,action adventure sciencefiction,basednovel mars medallion spacetravel princess alien steampunk martian escape edgarriceburroughs alienrace superhumanstrength marscivilization swordplanet 19thcentury 3d,waltdisneypictures,taylorkitsch lynncollins samanthamorton willemdafoe thomashadenchurch markstrong ciaránhinds dominicwest jamespurefoy bryancranston pollywalker darylsabara arkiereece davoodghadami pippanixon jamesembree philipphilmar emilytierney edmundkente nicholaswoodeson kyleagnew donstark joshdaugherty jaredcyr christophergoodman amandaclayton josephbillingiere,andrewstanton,carter former military captain reluctantly becomes embroiled john inexplicably transported exotic planet barsoom epic conflict rediscovers people rest war weary mysterious mar world brink collapse humanity realizes survival hand,2012,en


In [60]:
# merge the strings into a single string for each row
merge = df.apply(lambda x: ' '.join(x), axis=1)
merge.head()

0                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       

In [61]:
# feed into the vectorizer

tfidf_matrix = tf.fit_transform(merge)
tfidf_matrix

<4803x84893 sparse matrix of type '<class 'numpy.float64'>'
	with 288922 stored elements in Compressed Sparse Row format>

## Distance Function

In [62]:
from sklearn.metrics.pairwise import cosine_similarity


cosine_similarities = cosine_similarity(tfidf_matrix)


In [63]:
cosine_similarities.shape

(4803, 4803)

In [64]:
clean1_df.set_index('title', inplace = True)

In [65]:
movie_indices = pd.Series(clean1_df.index)

## Creating the Movie Recommender

In [66]:
def get_movie_recommendation(name, cosine_similarities=cosine_similarities):
    
    recommended_movies = []
    
    # Get index of the movie that matches the name
    movie_index = movie_indices[movie_indices == name].index[0]

    # Create a Series with the similarity scores in descending order
    score_series = pd.Series(cosine_similarities[movie_index]).sort_values(ascending = False)

    # Get the indexes of the 5 most similar movies except itself
    top_indexes = list(score_series.iloc[1:6].index)
    
    # Populate the list with the names of the top 5 matching hotels
    for i in top_indexes:
        recommended_movies.append(list(clean1_df.index)[i])

    return recommended_movies

## Testing the Recommender

In [67]:
get_movie_recommendation('The Dark Knight')

['Batman Returns',
 'Batman Begins',
 'The Dark Knight Rises',
 'Batman',
 'Batman: The Dark Knight Returns, Part 2']

In [68]:
get_movie_recommendation('The Shawshank Redemption')

['The Green Mile', 'The Majestic', 'The Mist', 'Pulp Fiction', 'Mean Streets']

In [69]:
get_movie_recommendation('Frozen')

['Wreck-It Ralph',
 'Big Hero 6',
 'Enchanted',
 'The Princess and the Frog',
 'Mr. Peabody & Sherman']