Import the necessary Python libraries

In [2]:
import json
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [3]:
%matplotlib inline
sns.set_style('whitegrid')

Load the movies CSV file and perform two conversions while loading:
1. Convert the release date field to `datetime.date` values
2. Parse every column that holds JSON text into Python objects (lists of dicts)

In [7]:
def load_movies(file_path):
    """Read the movies CSV and normalise its date and JSON columns.

    The ``release_date`` column is parsed and reduced to ``datetime.date``
    values, and each JSON-encoded text column is decoded with ``json.loads``.

    Parameters
    ----------
    file_path
        Location of the CSV file; handed unchanged to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The loaded frame with the converted columns.
    """
    frame = pd.read_csv(file_path)

    # Parse the raw date strings, then keep only the calendar date of each entry.
    parsed_dates = pd.to_datetime(frame['release_date'])
    frame['release_date'] = parsed_dates.dt.date

    # These columns arrive as JSON text; decode every cell into Python objects.
    for name in ('genres', 'keywords', 'production_countries',
                 'production_companies', 'spoken_languages'):
        frame[name] = frame[name].map(json.loads)

    return frame

In [133]:
movies = load_movies("../tmdb_5000_movies.csv")

See number of rows and columns in the loaded data

In [33]:
movies.shape

(4803, 20)

Print column names

In [34]:
movies.columns

Index(['budget', 'genres', 'homepage', 'id', 'keywords', 'original_language',
       'original_title', 'overview', 'popularity', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'vote_average',
       'vote_count'],
      dtype='object')

In [14]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4803 entries, 0 to 4802
Data columns (total 20 columns):
budget                  4803 non-null int64
genres                  4803 non-null object
homepage                1712 non-null object
id                      4803 non-null int64
keywords                4803 non-null object
original_language       4803 non-null object
original_title          4803 non-null object
overview                4800 non-null object
popularity              4803 non-null float64
production_companies    4803 non-null object
production_countries    4803 non-null object
release_date            4802 non-null object
revenue                 4803 non-null int64
runtime                 4801 non-null float64
spoken_languages        4803 non-null object
status                  4803 non-null object
tagline                 3959 non-null object
title                   4803 non-null object
vote_average            4803 non-null float64
vote_count              4803 non-null int64

Draw three random sample rows from the data

In [12]:
movies.sample(3)

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
2526,15000000,"[{'id': 27, 'name': 'Horror'}]",http://www.mamamovie.com/,132232,"[{'id': 185408, 'name': 'supernatural horror'}]",en,Mama,"Guillermo del Toro presents Mama, a supernatur...",47.455148,"[{'name': 'Universal Pictures', 'id': 33}, {'n...","[{'iso_3166_1': 'ES', 'name': 'Spain'}, {'iso_...",2013-01-17,146497771,100.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,A Mother's Love is Forever,Mama,6.0,1401
302,80000000,"[{'id': 16, 'name': 'Animation'}, {'id': 12, '...",http://legendoftheguardians.warnerbros.com/,41216,"[{'id': 3905, 'name': 'owl'}]",en,Legend of the Guardians: The Owls of Ga'Hoole,"Soren, a young barn owl, is kidnapped by owls ...",37.321848,"[{'name': 'Village Roadshow Pictures', 'id': 7...","[{'iso_3166_1': 'AU', 'name': 'Australia'}, {'...",2010-07-10,140073390,97.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,On his way to finding a legend...he will becom...,Legend of the Guardians: The Owls of Ga'Hoole,6.5,703
3024,10000000,"[{'id': 878, 'name': 'Science Fiction'}]",,42684,"[{'id': 2534, 'name': 'missile'}, {'id': 3760,...",en,Skyline,When strange lights descend on the city of Los...,26.268209,"[{'name': 'Rogue Pictures', 'id': 134}, {'name...","[{'iso_3166_1': 'US', 'name': 'United States o...",2010-11-11,66821036,100.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Don't look up,Skyline,4.7,507


Let's see how many observations are null for each column

In [10]:
movies.isnull().sum()

budget                     0
genres                     0
homepage                3091
id                         0
keywords                   0
original_language          0
original_title             0
overview                   3
popularity                 0
production_companies       0
production_countries       0
release_date               1
revenue                    0
runtime                    2
spoken_languages           0
status                     0
tagline                  844
title                      0
vote_average               0
vote_count                 0
dtype: int64

`homepage` and `tagline` have the most missing entries. This should not be much of a problem, as these fields are unlikely to be needed for a predictive model.

3 values in overview column are missing, let's print out those 3 observations

In [54]:
movies[movies['overview'].isnull()]

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
2656,15000000,"[{'id': 18, 'name': 'Drama'}]",,370980,"[{'id': 717, 'name': 'pope'}, {'id': 5565, 'na...",it,Chiamatemi Francesco - Il Papa della gente,,0.738646,"[{'name': 'Taodue Film', 'id': 45724}]","[{'iso_3166_1': 'IT', 'name': 'Italy'}]",2015-12-03,0,,"[{'iso_639_1': 'es', 'name': 'Español'}]",Released,,Chiamatemi Francesco - Il Papa della gente,7.3,12
4140,2,"[{'id': 99, 'name': 'Documentary'}]",,459488,"[{'id': 6027, 'name': 'music'}, {'id': 225822,...",en,"To Be Frank, Sinatra at 100",,0.050625,"[{'name': 'Eyeline Entertainment', 'id': 60343}]","[{'iso_3166_1': 'GB', 'name': 'United Kingdom'}]",2015-12-12,0,,[],Released,,"To Be Frank, Sinatra at 100",0.0,0
4431,913000,"[{'id': 99, 'name': 'Documentary'}]",,292539,[],de,Food Chains,,0.795698,[],[],2014-04-26,0,83.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Food Chains,7.4,8


Let's fill in those overview fields with the corresponding movie plot fetched from IMDb

In [59]:
from imdb import IMDb
ia = IMDb()

In [98]:
# Fill every missing overview with the first plot entry IMDb returns for the
# movie's original title.
# Columns are addressed by name and rows by index label (.loc), so the cell no
# longer depends on the column order or on the frame having a default RangeIndex.
for idx in movies[movies['overview'].isnull()].index:
    title = movies.loc[idx, 'original_title']
    matches = ia.search_movie(title)
    if not matches:
        # Nothing found on IMDb: leave the overview empty instead of crashing.
        print('No IMDb match for {!r}; overview left empty'.format(title))
        continue
    # The first search hit is assumed to be the right movie -- TODO confirm
    # for ambiguous titles.
    plot = ia.get_movie(matches[0].getID()).get('plot')
    if not plot:
        print('No plot on IMDb for {!r}; overview left empty'.format(title))
        continue
    movies.loc[idx, 'overview'] = plot[0]

Using a similar approach, we will fill in the missing release date and runtime values by fetching the corresponding content from IMDb

In [106]:
# Fill every missing release date from the release year IMDb reports for the
# movie's original title.
# The release_date column holds datetime.date objects, so the year is stored as
# a date rather than as a bare number. Only the year is fetched here, so
# January 1st of that year is used as an approximation.
for idx in movies[movies['release_date'].isnull()].index:
    title = movies.loc[idx, 'original_title']
    matches = ia.search_movie(title)
    if not matches:
        # Nothing found on IMDb: leave the date empty instead of crashing.
        print('No IMDb match for {!r}; release_date left empty'.format(title))
        continue
    # The first search hit is assumed to be the right movie -- TODO confirm.
    year = ia.get_movie(matches[0].getID()).get('year')
    if year is None:
        print('No year on IMDb for {!r}; release_date left empty'.format(title))
        continue
    movies.loc[idx, 'release_date'] = pd.Timestamp(year=int(year), month=1, day=1).date()

In [115]:
# Fill every missing runtime with the first runtime IMDb lists for the movie's
# original title.
# The runtime column is float64, so the fetched value is converted to float
# before being stored; writing it unconverted could turn the column into object.
for idx in movies[movies['runtime'].isnull()].index:
    title = movies.loc[idx, 'original_title']
    matches = ia.search_movie(title)
    if not matches:
        # Nothing found on IMDb: leave the runtime empty instead of crashing.
        print('No IMDb match for {!r}; runtime left empty'.format(title))
        continue
    # The first search hit is assumed to be the right movie -- TODO confirm.
    runtimes = ia.get_movie(matches[0].getID()).get('runtimes')
    if not runtimes:
        print('No runtime on IMDb for {!r}; runtime left empty'.format(title))
        continue
    try:
        # NOTE(review): presumably each entry is a plain number of minutes
        # (e.g. '83'); verify against the IMDb client in use.
        movies.loc[idx, 'runtime'] = float(runtimes[0])
    except (TypeError, ValueError):
        print('Unparseable runtime {!r} for {!r}; runtime left empty'.format(runtimes[0], title))

In [None]:
# Best-effort fill of missing taglines with the plot outline IMDb holds for the
# movie's original title.
# The value is written to the 'tagline' column. The earlier version wrote it to
# positional column 13, which is 'runtime', and so overwrote runtime values.
unfilled_taglines = []  # original titles for which no outline could be fetched
for idx in movies[movies['tagline'].isnull()].index:
    title = movies.loc[idx, 'original_title']
    try:
        # The first search hit is assumed to be the right movie -- TODO confirm.
        outline = ia.get_movie(ia.search_movie(title)[0].getID())['plot outline']
        # NOTE(review): 'plot outline' is presumably a single string; indexing
        # a string with [0] would keep only its first character, so take the
        # first element only when the value is not a string.
        if not isinstance(outline, str):
            outline = outline[0]
    except Exception:
        # Stay best-effort (no match, missing key, lookup failure) but remember
        # the miss. Unlike a bare `except:`, this still lets KeyboardInterrupt
        # stop the long loop.
        unfilled_taglines.append(title)
        continue
    movies.loc[idx, 'tagline'] = outline

print('Taglines that could not be filled: {}'.format(len(unfilled_taglines)))

In [None]:
movies.isnull().sum()