In [None]:
#trainset = data.build_full_trainset()

# Data Dictionary

|Feature|Type|Dataset|Description|
|---|---|---|---|
|belongs_to_collection|object|movies_metadata.csv|Movie Series a movie belongs to|
|budget|float|movies_metadata.csv|Budget of the movie in dollars|
|genres|object|movies_metadata.csv|Genres associated with the movie|
|id|integer|movies_metadata.csv|ID of movie|
|original_language|object|movies_metadata.csv|Language which the movie was shot in|
|overview|object|movies_metadata.csv|Brief Description of the movie|
|popularity|float|movies_metadata.csv|Popularity score assigned by TMDB|
|production_companies|object|movies_metadata.csv|Production Companies involved with making of the movie|
|production_countries|object|movies_metadata.csv|Countries where the movie was shot/produced in|
|release_date|object|movies_metadata.csv|Theatrical Release Date of the movie|
|revenue|float|movies_metadata.csv|Total Revenue of the movie in dollars|
|runtime|float|movies_metadata.csv|Runtime of the movie in minutes|
|spoken_languages|object|movies_metadata.csv|Spoken Languages in the movie|
|status|object|movies_metadata.csv|Status of the movie|
|tagline|object|movies_metadata.csv|Tagline of the movie|
|title|object|movies_metadata.csv|Official Title of the movie|
|vote_average|float|movies_metadata.csv|Average Rating of the movie|
|vote_count|float|movies_metadata.csv|Number of votes by users, as counted by TMDB|
|release_year|object|movies_metadata.csv|Theatrical Release Year of the movie|
|release_month|object|movies_metadata.csv|Theatrical Release Month of the movie|
|release_weekday|object|movies_metadata.csv|Theatrical Release Weekday of the movie|
|profit_loss|float|movies_metadata.csv|Revenue/Budget|

In [2]:
#Import libraries
import pandas as pd
import numpy as np
import ast
import datetime
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt
import plotly.offline as py
import seaborn as sns

#Suppress Warnings
import warnings
warnings.filterwarnings('ignore')

#Adjust the max columns displayed in the df
pd.set_option("display.max_columns", 25)

In [3]:
#Load data
df = pd.read_csv('movies_metadata_clean.csv')

# EDA

## title, overview and tagline wordcloud

In [4]:
# Normalise the free-text columns to plain strings for the word clouds below.
# Missing values are replaced with '' first: astype('str') on its own turns
# NaN into the literal string 'nan', which would then be counted as a word.
for text_column in ('title', 'overview', 'tagline'):
    df[text_column] = df[text_column].fillna('').astype('str')

In [None]:
# Concatenate every row of each text column into one space-separated string
# that can be handed to WordCloud.generate().
title_corpus, overview_corpus, tagline_corpus = (
    ' '.join(df[text_column])
    for text_column in ('title', 'overview', 'tagline')
)

The most common words in movie titles are Love, Girl, and Man, which suggests that romance is a popular genre. Words like Death, Night, Dead, and Blood also appear frequently, probably reflecting the horror/thriller genres.

In [None]:
# Word cloud of movie titles; STOPWORDS filters out common filler words.
title_wordcloud = WordCloud(
    width=4000,
    height=2000,
    background_color='white',
    stopwords=STOPWORDS,
).generate(title_corpus)

# Show the rendered cloud as an image without axis ticks or frame.
plt.figure(figsize=(16, 8))
plt.imshow(title_wordcloud)
plt.axis('off')
plt.show()

The themes and genres of movies show up more clearly in the overview word cloud, with themes revolving around life, love, family, and the world.

In [None]:
# Word cloud of movie overviews; STOPWORDS filters out common filler words.
overview_wordcloud = WordCloud(
    width=4000,
    height=2000,
    background_color='white',
    stopwords=STOPWORDS,
).generate(overview_corpus)

# Show the rendered cloud as an image without axis ticks or frame.
plt.figure(figsize=(16, 8))
plt.imshow(overview_wordcloud)
plt.axis('off')
plt.show()

tagline wordcloud - 

In [None]:
# Word cloud of movie taglines; STOPWORDS filters out common filler words.
tagline_wordcloud = WordCloud(
    width=4000,
    height=2000,
    background_color='white',
    stopwords=STOPWORDS,
).generate(tagline_corpus)

# Show the rendered cloud as an image without axis ticks or frame.
plt.figure(figsize=(16, 8))
plt.imshow(tagline_wordcloud)
plt.axis('off')
plt.show()

Production countries - most popular locations to shoot movies

In [None]:
#to extract the production countries from list of dictionaries to a readable column in the df

df["production_countries"]

In [None]:
# Work on a copy: plain `countries_df = df` only creates a second name for the
# same frame, so the parsing below would overwrite df['production_countries']
# and a re-run of this cell would hand already-parsed lists to ast.literal_eval.
countries_df = df.copy()

# The column holds stringified lists of dicts. Missing values are treated as
# an empty list, then every string is parsed into real Python objects.
countries_df['production_countries'] = (
    countries_df['production_countries'].fillna('[]').apply(ast.literal_eval)
)

# Keep only the country names. The comprehension has to be bracketed (the
# unbracketed form is a SyntaxError); anything that is not a list becomes an
# empty list so the column always holds lists.
countries_df['production_countries'] = countries_df['production_countries'].apply(
    lambda entries: [entry['name'] for entry in entries] if isinstance(entries, list) else []
)

In [None]:
countries_df['production_countries']

In [None]:
# One row per (movie, country) pair, still indexed by the movie's row label so
# it can be joined back onto countries_df. explode() avoids constructing a
# pd.Series for every row; dropna() removes the NaN placeholder that explode()
# emits for movies with an empty country list.
countries = countries_df['production_countries'].explode().dropna()
countries.name = 'countries'

In [None]:
countries

In [None]:
# Attach the flattened country names to the movie rows (one row per pair).
movie_countries = countries_df.drop('production_countries', axis=1).join(countries)

# Count movies per country. The axis and value column are named explicitly
# instead of relying on reset_index() producing a column called 'index',
# which is not the case on every pandas version and made the old
# .drop('index', axis=1) raise KeyError.
countries_df = (
    movie_countries['countries']
    .value_counts()
    .rename_axis('country')
    .reset_index(name='num_movies')
    [['num_movies', 'country']]
)
countries_df.head(10)

Popular filming locations include the US, Europe, and Canada. Japan and India are the Asian countries that fall in the top 10.

In [None]:
# World map shaded by the number of movies produced in each country.
# Countries are matched by name ('country names' location mode) and coloured
# on a white-to-blue scale according to countries_df['num_movies'].
data = [ dict(
        type = 'choropleth',
        locations = countries_df['country'],
        locationmode = 'country names',
        z = countries_df['num_movies'],
        text = countries_df['country'],
        colorscale = [[0,'rgb(255, 255, 255)'],[1,'rgb(0, 0, 255)']],
        autocolorscale = False,
        reversescale = False,
        marker = dict(
            line = dict (
                color = 'rgb(180,180,180)',
                width = 0.5
            ) ),
        colorbar = dict(
            autotick = False,
            tickprefix = '',
            title = 'Count'),
      ) ]

layout = dict(
    title = 'Production Countries for the TMDB Movies',
    geo = dict(
        showframe = False,
        showcoastlines = False,
        projection = dict(
            # Plotly projection identifiers are lowercase. The figure is sent
            # with validate=False, so the capitalised 'Mercator' was not
            # rejected on the Python side.
            type = 'mercator'
        )
    )
)

fig = dict( data=data, layout=layout )
py.iplot( fig, validate=False, filename='d3-world-map' )

franchise movie

In [None]:
df['belongs_to_collection']

In [None]:
# Keep only movies that list a collection. The explicit copy makes
# franchise_df an independent frame, so the column assignment below does not
# write into a slice of df (chained assignment).
franchise_df = df[df['belongs_to_collection'].notnull()].copy()

# Each value is a stringified dict; parse it and keep only the collection name.
# Values that do not parse to a dict become NaN.
franchise_df['belongs_to_collection'] = (
    franchise_df['belongs_to_collection']
    .apply(ast.literal_eval)
    .apply(lambda collection: collection['name'] if isinstance(collection, dict) else np.nan)
)

# Drop the rows that ended up without a collection name.
franchise_df = franchise_df[franchise_df['belongs_to_collection'].notnull()]

In [None]:
# Revenue statistics per collection: mean, sum and count of revenue values,
# with the collection name turned back into a regular column.
franchise_pivot = franchise_df.pivot_table(
    values='revenue',
    index='belongs_to_collection',
    aggfunc={'revenue': ['mean', 'sum', 'count']},
).reset_index()

In [None]:
franchise_pivot.sort_values('sum', ascending=False).head(10)

In [None]:
franchise_pivot.sort_values('mean', ascending=False).head(10)

In [None]:
franchise_pivot.sort_values('count', ascending=False).head(10)

Production companies

In [None]:
# Work on a copy: plain `companies_df = df` only creates a second name for the
# same frame, so the parsing below would overwrite df['production_companies']
# and a re-run of this cell would hand already-parsed lists to ast.literal_eval.
companies_df = df.copy()

# The column holds stringified lists of dicts. Missing values are treated as
# an empty list, then every string is parsed into real Python objects.
companies_df['production_companies'] = (
    companies_df['production_companies'].fillna('[]').apply(ast.literal_eval)
)

# Keep only the company names; anything that is not a list becomes [].
companies_df['production_companies'] = companies_df['production_companies'].apply(
    lambda entries: [entry['name'] for entry in entries] if isinstance(entries, list) else []
)

In [None]:
# One row per (movie, company) pair, still indexed by the movie's row label so
# it can be joined back onto companies_df. explode() avoids constructing a
# pd.Series for every row; dropna() removes the NaN placeholder that explode()
# emits for movies with an empty company list.
companies = companies_df['production_companies'].explode().dropna()
companies.name = 'companies'

In [None]:
# Swap the list-valued column for the flattened `companies` series, joined on
# the row index. Note that companies_df is rebound here, so it no longer has a
# 'production_companies' column after this cell has run.
companies_df = companies_df.drop('production_companies', axis=1).join(companies)

In [None]:
# Revenue per production company: total, mean and count of revenue values.
# Each statistic is sorted from largest to smallest and stored as a
# single-column frame before the three are combined side by side.
revenue_by_company = companies_df.groupby('companies')['revenue']

companies_sum = revenue_by_company.sum().sort_values(ascending=False).to_frame('Total')
companies_mean = revenue_by_company.mean().sort_values(ascending=False).to_frame('Average')
companies_count = revenue_by_company.count().sort_values(ascending=False).to_frame('Number')

companies_pivot = pd.concat([companies_sum, companies_mean, companies_count], axis=1)

In [None]:
#highest earning production company

In [None]:
companies_pivot.sort_values('Total', ascending=False).head(10)

original language

In [None]:
#all languages

df['original_language'].unique()

In [None]:
#90 unique languages

len(df['original_language'].unique())

English obviously the top language

In [None]:
# Movie count per original language, most frequent first. The language code
# stays as the index and is also copied into a regular 'language' column.
language_df = (
    df['original_language']
    .value_counts()
    .to_frame(name='number')
    .assign(language=lambda counts: counts.index)
)
language_df.head()

Take a look at the other top languages: French and Italian lead, while Japanese and Hindi are the top Asian languages.

In [None]:
plt.figure(figsize=(12,5))
sns.barplot(x='language', y='number', data=language_df.iloc[1:11])
plt.show()

popularity

In [None]:
df[['title', 'popularity', 'release_year']].sort_values('popularity', ascending=False).head(10)

In [None]:
#positively skewed

df['popularity'].describe()

In [None]:
#vote count only 1. To include filter condition for vote_count

df[['title', 'vote_average', 'vote_count' ,'release_year']].sort_values('vote_average', ascending=False).head(10)

In [None]:
#majority of # of votes is less than 50

df['vote_count'].describe()

In [None]:
df[df['vote_count'] > 2000][['title', 'vote_average', 'vote_count' ,'release_year']].sort_values('vote_average', ascending=False).head(10)

In [None]:
df['vote_average'].describe()

In [None]:
#large % with rating of 6
sns.distplot(df['vote_average'])

In [None]:
sns.distplot(df['popularity'])

In [None]:
sns.distplot(df['vote_count'])

In [None]:
df['spoken_languages']

In [None]:
# Work on a copy: plain `spoken_languages_df = df` only creates a second name
# for the same frame, so the parsing below would overwrite
# df['spoken_languages'] and a re-run of this cell would hand already-parsed
# lists to ast.literal_eval.
spoken_languages_df = df.copy()

# spoken_languages holds stringified lists of dicts. Missing values are
# treated as an empty list, then every string is parsed into Python objects.
spoken_languages_df['spoken_languages'] = (
    spoken_languages_df['spoken_languages'].fillna('[]').apply(ast.literal_eval)
)

# Keep only the language names; anything that is not a list becomes [].
spoken_languages_df['spoken_languages'] = spoken_languages_df['spoken_languages'].apply(
    lambda entries: [entry['name'] for entry in entries] if isinstance(entries, list) else []
)

In [None]:
spoken_languages_df['spoken_languages']

In [None]:
# Length of each movie's language list.
spoken_languages_df['number_of_spoken_languages'] = spoken_languages_df['spoken_languages'].map(len)

In [None]:
spoken_languages_df['number_of_spoken_languages']

In [None]:
spoken_languages_df['number_of_spoken_languages'].describe()

In [None]:
#mostly one language

sns.distplot(spoken_languages_df['number_of_spoken_languages'])

In [None]:
spoken_languages_df['number_of_spoken_languages'].value_counts()

In [None]:
# Movies with 10 or more spoken languages, largest count first.

spoken_languages_df[spoken_languages_df['number_of_spoken_languages'] >= 10][['title', 'release_year', 'number_of_spoken_languages']].sort_values('number_of_spoken_languages', ascending=False)

Vision of Europe is a collection of 25 short films by 25 European directors, which explains its large number of spoken languages.

Status

In [None]:
#most movies were released

df['status'].value_counts()

runtime

In [None]:
df['runtime'].describe()

In [None]:
#most movies less than 200 mins

sns.distplot(df['runtime'])

In [None]:
#most movies between 90-100 mins

plt.figure(figsize=(12,6))
sns.distplot(df[(df['runtime'] < 200) & (df['runtime'] > 0)]['runtime'])

In [None]:
df[df['runtime'] > 0][['runtime', 'title', 'release_year']].sort_values('runtime', ascending=False).head(10)

The very long runtimes are due to the fact that some of these 'movies' are actually mini-series.

genres

In [None]:
# genres_df is the same object as df (no copy is made), so the assignment
# below also rewrites df['genres']; the next cell reads the parsed lists.
genres_df = df


def _parse_genre_names(raw_value):
    """Return the list of genre names stored in one ``genres`` cell.

    Accepts the stringified list of dicts found in the CSV, an already parsed
    list (of dicts or of plain names), or a missing value. Accepting parsed
    lists keeps this cell re-runnable: it overwrites ``df['genres']`` in place
    and ``ast.literal_eval`` raises on anything that is not a string.
    """
    if isinstance(raw_value, str):
        raw_value = ast.literal_eval(raw_value)
    if not isinstance(raw_value, list):
        # Missing values (NaN) and anything unexpected count as "no genres".
        return []
    return [item['name'] if isinstance(item, dict) else item for item in raw_value]


genres_df['genres'] = df['genres'].apply(_parse_genre_names)

In [None]:
# One row per (movie, genre) pair, still indexed by the movie's row label so
# it can be joined back onto genres_df. explode() avoids constructing a
# pd.Series for every row; dropna() removes the NaN placeholder that explode()
# emits for movies with an empty genre list. The column is read from
# genres_df, the frame whose 'genres' column the previous cell parsed.
genres = genres_df['genres'].explode().dropna()
genres.name = 'genre'

In [None]:
genres_df = genres_df.drop('genres', axis=1).join(genres)

In [None]:
genres_df['genre'].value_counts()

In [None]:
len(genres_df['genre'].value_counts())

In [None]:
# Number of movie rows per genre as a two-column frame, most frequent first.
genre_count = (
    genres_df['genre']
    .value_counts()
    .rename_axis('genre')
    .reset_index(name='count')
)
genre_count.head(10)

In [None]:
plt.figure(figsize=(18,8))
sns.barplot(x='genre', y='count', data=genre_count.head(10))
plt.show()

How have genre shares changed across the years?

In [None]:
genre_of_movie = ['Drama', 'Comedy', 'Thriller', 'Romance', 'Action', 'Horror', 'Crime', 'Documentary', 'Adventure', 'Science Fiction']

In [None]:
def clean_numeric(x):
    """Convert ``x`` to ``float``, or return ``np.nan`` if it cannot be converted.

    Parameters
    ----------
    x : any
        Value to convert, e.g. a year stored as a string.

    Returns
    -------
    float
        ``float(x)`` on success, ``np.nan`` when ``float`` raises
        ``TypeError``, ``ValueError`` or ``OverflowError``.

    Only conversion errors are caught; a bare ``except`` would also swallow
    ``KeyboardInterrupt`` and ``SystemExit``.
    """
    try:
        return float(x)
    except (TypeError, ValueError, OverflowError):
        return np.nan

In [None]:
# Turn release_year into floats; values clean_numeric cannot convert become NaN.
genres_df['release_year'] = genres_df['release_year'].map(clean_numeric).astype(float)

In [None]:
# Restrict to the selected genres and to release years 2000 through 2017.
is_selected_genre = genres_df['genre'].isin(genre_of_movie)
in_year_window = genres_df['release_year'].between(2000, 2017)
pop_gen_movies = genres_df[is_selected_genre & in_year_window]

# Count movies per (year, genre), then divide every row by its total so each
# year's values are that year's genre shares.
genre_counts = pd.crosstab([pop_gen_movies['release_year']], pop_gen_movies['genre'])
ctab = genre_counts.div(genre_counts.sum(axis=1), axis=0)

In [None]:
ctab[genre_of_movie].plot(kind='line', stacked=False, colormap='jet', figsize=(12,8)).legend(loc='center left', bbox_to_anchor=(1, 0.5))
plt.show()

genre and revenue

In [None]:
genres_df['revenue'].describe()

top 10 genres against revenue

In [None]:
movies_genres = genres_df[(genres_df['genre'].isin(genre_of_movie))]

In [None]:
# Revenue distribution per genre. plt.subplots() already creates the figure
# that is drawn on; the additional plt.figure() call that used to precede it
# only produced a second, empty figure.
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(15, 8))
sns.boxplot(x='genre', y='revenue', data=movies_genres, palette="muted", ax=ax)
# Show revenues from 0 to 3e8 only; larger values fall outside the view.
ax.set_ylim([0, 3e8])
plt.show()

release_month

In [None]:
# Bar chart of the number of movies per release month, in the order given by
# month_order.
# NOTE(review): `month_order` is not defined in any cell visible in this
# notebook — confirm it is created before this cell, otherwise a fresh
# "Restart & Run All" fails here with NameError.
plt.figure(figsize=(12,6))
plt.title("Number of Movies released in a particular month.")
sns.countplot(x='release_month', data=df, order=month_order)

In [None]:
# Revenue distribution per release month for movies with a known revenue.
# NOTE(review): `month_order` is not defined in any cell visible in this
# notebook — confirm it is created before this cell runs on a fresh kernel.
fig, ax = plt.subplots(nrows=1, ncols=1,figsize=(15, 8))
sns.boxplot(x='release_month', y='revenue', data=df[df['revenue'].notnull()], palette="muted", ax =ax, order=month_order)
# Show revenues from 0 to 3e8 only; larger values fall outside the view.
ax.set_ylim([0, 3e8])

release_day

In [None]:
# Bar chart of the number of movies per release day, in the order given by
# day_order.
# NOTE(review): `day_order` is not defined in any cell visible in this
# notebook, and the data dictionary lists a `release_weekday` column rather
# than `release_day` — confirm both names before relying on this cell.
plt.figure(figsize=(10,5))
plt.title("Number of Movies released on a particular day.")
sns.countplot(x='release_day', data=df, order=day_order)

release_year

In [None]:
year_count = df.groupby('release_year')['title'].count()
plt.figure(figsize=(18,5))
year_count.plot()

budget

In [None]:
df['budget'].describe()

In [None]:
sns.distplot(df[df['budget'].notnull()]['budget'])

In [None]:
df[df['budget'].notnull()][['title', 'budget', 'revenue', 'profit_loss', 'release_year']].sort_values('budget', ascending=False).head(10)

revenue

In [None]:
df['revenue'].describe()

In [None]:
sns.distplot(df[df['revenue'].notnull()]['revenue'])

In [None]:
# Ten highest-grossing movies. The cell ends with the frame itself so the
# table is actually rendered; assigning it alone shows nothing.
gross_top = df[['title', 'budget', 'revenue', 'release_year']].sort_values('revenue', ascending=False).head(10)
gross_top

In [None]:
def clean_numeric(x):
    """Convert ``x`` to ``float``, or return ``np.nan`` if it cannot be converted.

    This repeats the ``clean_numeric`` helper defined earlier in the notebook
    (in the genre section); the later definition is the one in effect from
    here on.

    Parameters
    ----------
    x : any
        Value to convert, e.g. a year stored as a string.

    Returns
    -------
    float
        ``float(x)`` on success, ``np.nan`` when ``float`` raises
        ``TypeError``, ``ValueError`` or ``OverflowError``.

    Only conversion errors are caught; a bare ``except`` would also swallow
    ``KeyboardInterrupt`` and ``SystemExit``.
    """
    try:
        return float(x)
    except (TypeError, ValueError, OverflowError):
        return np.nan

# Turn release_year into floats; values clean_numeric cannot convert become NaN.
df['release_year'] = df['release_year'].map(clean_numeric).astype(float)

In [None]:
# Highest single-movie revenue per release year (nominal values; still to be
# adjusted for inflation).
#
# Rows without a usable year are excluded with a numeric check. The earlier
# filter compared release_year with the string 'NaT', which never matches
# once the column holds floats, so it filtered nothing.
release_year = pd.to_numeric(df['release_year'], errors='coerce')
has_year_and_revenue = df['revenue'].notnull() & release_year.notnull()
year_revenue = (
    df.loc[has_year_and_revenue, 'revenue']
    .groupby(release_year[has_year_and_revenue])
    .max()
)

plt.figure(figsize=(18,5))
plt.plot(year_revenue.index, year_revenue)
# One x tick per decade from 1900 up to (not including) 2018.
plt.xticks(np.arange(1900, 2018, 10.0))
plt.show()

In [None]:
#most successful movies

df[(df['profit_loss'].notnull()) & (df['budget'] > 5e6)][['title', 'budget', 'revenue', 'profit_loss', 'release_year']].sort_values('profit_loss', ascending=False).head(10)

In [None]:
#worst movies

df[(df['profit_loss'].notnull()) & (df['budget'] > 5e6) & (df['revenue'] > 10000)][['title', 'budget', 'revenue', 'profit_loss', 'release_year']].sort_values('profit_loss').head(10)