In [20]:
import numpy as np
import pandas as pd

In [21]:
# importing movie-metadata
data = pd.read_csv('movie_metadata.csv')

In [22]:
# taking a look at data
data.head()

Unnamed: 0,color,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,...,num_user_for_reviews,language,country,content_rating,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
0,Color,James Cameron,723.0,178.0,0.0,855.0,Joel David Moore,1000.0,760505847.0,Action|Adventure|Fantasy|Sci-Fi,...,3054.0,English,USA,PG-13,237000000.0,2009.0,936.0,7.9,1.78,33000
1,Color,Gore Verbinski,302.0,169.0,563.0,1000.0,Orlando Bloom,40000.0,309404152.0,Action|Adventure|Fantasy,...,1238.0,English,USA,PG-13,300000000.0,2007.0,5000.0,7.1,2.35,0
2,Color,Sam Mendes,602.0,148.0,0.0,161.0,Rory Kinnear,11000.0,200074175.0,Action|Adventure|Thriller,...,994.0,English,UK,PG-13,245000000.0,2015.0,393.0,6.8,2.35,85000
3,Color,Christopher Nolan,813.0,164.0,22000.0,23000.0,Christian Bale,27000.0,448130642.0,Action|Thriller,...,2701.0,English,USA,PG-13,250000000.0,2012.0,23000.0,8.5,2.35,164000
4,,Doug Walker,,,131.0,,Rob Walker,131.0,,Documentary,...,,,,,,,12.0,7.1,,0


In [23]:
data.columns

Index(['color', 'director_name', 'num_critic_for_reviews', 'duration',
       'director_facebook_likes', 'actor_3_facebook_likes', 'actor_2_name',
       'actor_1_facebook_likes', 'gross', 'genres', 'actor_1_name',
       'movie_title', 'num_voted_users', 'cast_total_facebook_likes',
       'actor_3_name', 'facenumber_in_poster', 'plot_keywords',
       'movie_imdb_link', 'num_user_for_reviews', 'language', 'country',
       'content_rating', 'budget', 'title_year', 'actor_2_facebook_likes',
       'imdb_score', 'aspect_ratio', 'movie_facebook_likes'],
      dtype='object')

In [24]:
#Selecting features suitable for recommender system
data = data.loc[:, ['color', 'director_name', 'movie_facebook_likes', 'plot_keywords', 'genres', 'actor_1_name', 'imdb_score', 'movie_title']]

In [25]:
data.head()

Unnamed: 0,color,director_name,movie_facebook_likes,plot_keywords,genres,actor_1_name,imdb_score,movie_title
0,Color,James Cameron,33000,avatar|future|marine|native|paraplegic,Action|Adventure|Fantasy|Sci-Fi,CCH Pounder,7.9,Avatar
1,Color,Gore Verbinski,0,goddess|marriage ceremony|marriage proposal|pi...,Action|Adventure|Fantasy,Johnny Depp,7.1,Pirates of the Caribbean: At World's End
2,Color,Sam Mendes,85000,bomb|espionage|sequel|spy|terrorist,Action|Adventure|Thriller,Christoph Waltz,6.8,Spectre
3,Color,Christopher Nolan,164000,deception|imprisonment|lawlessness|police offi...,Action|Thriller,Tom Hardy,8.5,The Dark Knight Rises
4,,Doug Walker,0,,Documentary,Doug Walker,7.1,Star Wars: Episode VII - The Force Awakens ...


In [26]:
# Checking for null values
data.isnull().sum(axis = 0)

color                    19
director_name           104
movie_facebook_likes      0
plot_keywords           153
genres                    0
actor_1_name              7
imdb_score                0
movie_title               0
dtype: int64

In [27]:
# Filling all null values with default value for string columns and mean(column) for numerical columns
data.color = data.color.replace(np.nan, 'unknown')
data.director_name = data.director_name.replace(np.nan, 'unknown')
data.actor_1_name = data.actor_1_name.replace(np.nan, 'unknown')
data.plot_keywords = data.plot_keywords.replace(np.nan, 'unknown')

In [31]:
# replacing '|' with space so that tokenizing would be easy
data.genres = data.genres.str.replace('|', ' ')
data['plot_keywords'] = data['plot_keywords'].str.replace('|', ' ')

In [32]:
data['genres'][:7]

0    Action Adventure Fantasy Sci-Fi
1           Action Adventure Fantasy
2          Action Adventure Thriller
3                    Action Thriller
4                        Documentary
5            Action Adventure Sci-Fi
6           Action Adventure Romance
Name: genres, dtype: object

In [33]:
data['plot_keywords'][:7]

0               avatar future marine native paraplegic
1    goddess marriage ceremony marriage proposal pi...
2                  bomb espionage sequel spy terrorist
3    deception imprisonment lawlessness police offi...
4                                              unknown
5    alien american civil war male nipple mars prin...
6            sandman spider man symbiote venom villain
Name: plot_keywords, dtype: object

In [34]:
data.movie_title = data.movie_title.str.lower()

In [35]:
data.movie_title[1], data.movie_title[2], data.movie_title[3]

("pirates of the caribbean: at world's end",
 'spectre\xa0',
 'the dark knight rises\xa0')

In [36]:
# Removing special character added at end of movei_title
# Since i manually removed for 1st two movie_title while understaing data, so
data.movie_title[2:] = data.movie_title[2:].str[:-1]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [37]:
data.movie_title[1], data.movie_title[2], data.movie_title[3]

("pirates of the caribbean: at world's end",
 'spectre',
 'the dark knight rises')

In [38]:
# saving the extracted data into a csv file without index column
data.to_csv('movie_data.csv', index=False)