In [2]:
%matplotlib inline
from IPython.display import Image, HTML
import json
import datetime
import ast
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from scipy import stats
from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor
from sklearn.dummy import DummyClassifier, DummyRegressor
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier, XGBRegressor
from wordcloud import WordCloud, STOPWORDS
import plotly
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls
import warnings
warnings.filterwarnings('ignore')


# Global plotting / display configuration.
# `sns.set(...)` re-applies seaborn's default "darkgrid" style on every call, so
# the style has to be passed in the same call; a separate
# `sns.set_style('whitegrid')` issued before it is silently overridden.
sns.set(style='whitegrid', font_scale=1.25)
# Truncate long text cells when frames are displayed.
pd.set_option('display.max_colwidth', 50)

In [3]:
# Load the movie metadata and preview the first five rows, transposed so that
# every column of the frame is displayed as a row.
df = pd.read_csv('movies.csv')
df.head(5).transpose()

Unnamed: 0,0,1,2,3,4
id,505642,315162,646389,956101,536554
title,Black Panther: Wakanda Forever,Puss in Boots: The Last Wish,Plane,The Eighth Clause,M3GAN
genres,Action-Adventure-Science Fiction,Animation-Adventure-Comedy,Action-Adventure-Thriller,Thriller,Science Fiction-Horror-Comedy
original_language,en,en,en,la,en
overview,Queen Ramonda Shuri M’Baku Okoye and the Dora ...,Puss in Boots discovers that his passion for a...,After a heroic job of successfully landing his...,Kat and Borja appear to be a perfect couple bu...,A brilliant toy company roboticist uses artifi...
popularity,3952.862,3351.139,3337.265,2259.303,1836.162
production_companies,Marvel Studios,Universal Pictures-DreamWorks Animation,Di Bonaventura Pictures-MadRiver Pictures-Rive...,SDB Films-El Hombre Orquesta,Universal Pictures-Blumhouse Productions-Atomi...
release_date,2022-11-09,2022-12-07,2023-01-12,2022-04-29,2022-12-28
budget,250000000.0,90000000.0,25000000.0,0.0,12000000.0
revenue,855099029.0,442000000.0,46000000.0,0.0,167643991.0


In [4]:
# List every column name of the freshly loaded frame.
df.columns



Index(['id', 'title', 'genres', 'original_language', 'overview', 'popularity',
       'production_companies', 'release_date', 'budget', 'revenue', 'runtime',
       'status', 'tagline', 'vote_average', 'vote_count', 'credits',
       'keywords', 'poster_path', 'backdrop_path', 'recommendations'],
      dtype='object')

In [5]:
# (rows, columns) of the frame as loaded.
df.shape

(724194, 20)

In [6]:
# Discard columns that the rest of the notebook never reads.
unused_columns = ['id', 'recommendations', 'backdrop_path', 'status', 'tagline']
df = df.drop(columns=unused_columns)


In [8]:
# The keywords column is not used either.
df = df.drop(columns=['keywords'])

In [9]:
# (rows, columns) after the column drops.
df.shape

(724194, 14)

In [10]:
# Keep only rows that have a value in every remaining column.
df = df.dropna(axis=0, how='any')

In [11]:
# (rows, columns) after removing rows with missing values.
df.shape

(215997, 14)

In [13]:
# How many rows report a revenue of exactly zero?
df.loc[df['revenue'].eq(0)].shape


(201632, 14)

In [14]:
# Treat a revenue of zero as "unknown" by turning it into NaN.
df['revenue'] = df['revenue'].replace(to_replace=0, value=np.nan)


In [15]:
# Coerce budget to numbers (anything unparseable becomes NaN), treat a zero
# budget as missing, then count the rows left without a budget.
budget_numeric = pd.to_numeric(df['budget'], errors='coerce')
df['budget'] = budget_numeric.replace(to_replace=0, value=np.nan)
df.loc[df['budget'].isna()].shape

(193747, 14)

In [16]:
# Return is revenue divided by budget; it is NaN whenever either side is NaN.
df['return'] = df['revenue'].div(df['budget'])
# Count the rows for which no return could be computed.
df.loc[df['return'].isna()].shape

(206513, 15)

In [22]:
# A runtime of zero is recorded as NaN instead.
df['runtime'] = df['runtime'].replace(to_replace=0, value=np.nan)

In [23]:
# Drop every row that still holds a NaN in any column. Because zero revenue,
# budget and runtime were replaced by NaN above (and return is derived from
# them), this keeps only rows where all of those values are present.
df = df.dropna(axis=0, how='any')

In [33]:
# NOTE(review): `application_df` is never defined in this notebook, so this cell
# fails with the NameError recorded below. It looks like it was pasted from a
# different notebook -- remove it or point it at the intended frame.

# Look at APPLICATION_TYPE value counts for binning
application_counts = application_df["APPLICATION_TYPE"].value_counts()

# Application types that occur fewer than 500 times get folded into one bucket.
application_types_to_replace = list(application_counts[application_counts < 500].index)

# Relabel all rare types in a single vectorized pass instead of rescanning the
# whole column once per rare type.
is_rare_type = application_df['APPLICATION_TYPE'].isin(application_types_to_replace)
application_df['APPLICATION_TYPE'] = application_df['APPLICATION_TYPE'].mask(is_rare_type, "Other")

# Check to make sure binning was successful
application_df['APPLICATION_TYPE'].value_counts()

NameError: name 'application_df' is not defined

Model

In [26]:
# Release year as a string: parse the date (unparseable values become NaT) and
# keep the text before the first '-'. A NaT therefore ends up as the literal
# string 'NaT', which later cells filter on.
release_timestamps = pd.to_datetime(df['release_date'], errors='coerce')
df['year'] = release_timestamps.apply(lambda ts: str(ts).split('-')[0])


In [27]:
# Turn each poster path into an <img> tag so posters render inside HTML tables.
base_poster_url = 'http://image.tmdb.org/t/p/w185/'
poster_src = base_poster_url + df['poster_path']
df['poster_path'] = "<img src='" + poster_src + "' style='height:100px;'>"

In [28]:
# Make sure both free-text columns hold plain Python strings.
for text_col in ('title', 'overview'):
    df[text_col] = df[text_col].astype(str)

In [None]:
# Concatenate every title / overview into one space-separated string each.
title_corpus = ' '.join(df['title'].tolist())
overview_corpus = ' '.join(df['overview'].tolist())

In [None]:
# Word cloud built from all movie titles, with the stock stop words removed.
title_wordcloud = WordCloud(
    width=4000,
    height=2000,
    background_color='white',
    stopwords=STOPWORDS,
).generate(title_corpus)

plt.figure(figsize=(16, 8))
plt.imshow(title_wordcloud)
plt.axis('off')
plt.show()

In [None]:
# Word cloud built from all movie overviews, with the stock stop words removed.
overview_wordcloud = WordCloud(
    width=4000,
    height=2000,
    background_color='white',
    stopwords=STOPWORDS,
).generate(overview_corpus)

plt.figure(figsize=(16, 8))
plt.imshow(overview_wordcloud)
plt.axis('off')
plt.show()

In [None]:
def clean_numeric(x):
    """Convert ``x`` to ``float``, or return ``np.nan`` when that is not possible.

    Parameters
    ----------
    x : object
        Any value accepted by ``float()`` (numbers, numeric strings, ...).

    Returns
    -------
    float
        ``float(x)``, or ``np.nan`` if the conversion fails.
    """
    try:
        return float(x)
    # Only conversion failures mean "not a number"; anything else (for example
    # KeyboardInterrupt) must propagate instead of being turned into NaN.
    except (TypeError, ValueError, OverflowError):
        return np.nan

In [None]:
# Force the three rating-related columns to float; values that cannot be
# converted become NaN via clean_numeric.
for numeric_col in ('popularity', 'vote_count', 'vote_average'):
    df[numeric_col] = df[numeric_col].apply(clean_numeric).astype('float')

In [None]:
# Summary statistics of the popularity column.
df['popularity'].describe()


In [None]:
# Distribution of popularity, with missing values replaced by the median.
popularity_filled = df['popularity'].fillna(df['popularity'].median())
sns.distplot(popularity_filled)
plt.show()

In [None]:
# Histogram of popularity with a logarithmic count axis.
df['popularity'].plot.hist(logy=True)


In [None]:
# Ten most popular movies.
popularity_cols = ['title', 'popularity', 'year']
df[popularity_cols].sort_values(by='popularity', ascending=False).head(10)


In [None]:
# Summary statistics of the vote_count column.
df['vote_count'].describe()


In [None]:
# Ten movies with the highest number of votes.
vote_count_cols = ['title', 'vote_count', 'year']
df[vote_count_cols].sort_values(by='vote_count', ascending=False).head(10)


In [None]:
# An average vote of zero is treated as missing before summarising.
df['vote_average'] = df['vote_average'].replace(to_replace=0, value=np.nan)
df['vote_average'].describe()

In [None]:
# Distribution of the average vote, with missing values replaced by the median.
vote_average_filled = df['vote_average'].fillna(df['vote_average'].median())
sns.distplot(vote_average_filled)


In [None]:
# Ten best-rated movies among those with more than 2000 votes.
has_many_votes = df['vote_count'] > 2000
rating_cols = ['title', 'vote_average', 'vote_count', 'year']
df.loc[has_many_votes, rating_cols].sort_values(by='vote_average', ascending=False).head(10)


In [None]:
# Joint distribution of average vote (x) and popularity (y).
sns.jointplot(data=df, x='vote_average', y='popularity')


In [None]:
# Joint distribution of average vote (x) and vote count (y).
sns.jointplot(data=df, x='vote_average', y='vote_count')


In [None]:
# Calendar-ordered month and weekday abbreviations, used for labels and ordering.
month_order = 'Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec'.split()
day_order = 'Mon Tue Wed Thu Fri Sat Sun'.split()

In [None]:
def get_month(x):
    """Return the month abbreviation for a ``YYYY-MM-DD`` style value.

    Parameters
    ----------
    x : object
        Value whose string form has the month number as its second
        ``'-'``-separated field.

    Returns
    -------
    str or float
        The matching entry of ``month_order``, or ``np.nan`` when the month
        field is missing, not an integer, or outside 1..12.
    """
    try:
        month_number = int(str(x).split('-')[1])
    except (IndexError, ValueError):
        return np.nan
    # Range-check explicitly: a month of 0 would otherwise index
    # month_order[-1] and be reported as 'Dec'.
    if not 1 <= month_number <= 12:
        return np.nan
    return month_order[month_number - 1]


In [None]:
def get_day(x):
    """Return the weekday abbreviation for a ``YYYY-MM-DD`` string.

    Parameters
    ----------
    x : str
        Date with exactly three ``'-'``-separated integer fields
        (year, month, day).

    Returns
    -------
    str or float
        The matching entry of ``day_order`` (Monday first), or ``np.nan`` when
        ``x`` is not a string in that form or is not a valid calendar date.
    """
    try:
        year, month, day = (int(part) for part in x.split('-'))
        weekday_index = datetime.date(year, month, day).weekday()
    # Only malformed input maps to NaN; unrelated errors (for example
    # KeyboardInterrupt) are allowed to propagate.
    except (AttributeError, TypeError, ValueError, OverflowError):
        return np.nan
    return day_order[weekday_index]

In [None]:
# Derive weekday and month labels from the release date string.
release_dates = df['release_date']
df['day'] = release_dates.apply(get_day)
df['month'] = release_dates.apply(get_month)

In [None]:
# Number of movies per release month, in calendar order.
plt.figure(figsize=(12, 6))
plt.title("Number of Movies released in a particular month.")
sns.countplot(data=df, x='month', order=month_order)

In [None]:
# Mean revenue per release month, restricted to movies grossing over 1e8.
blockbusters = df[df['revenue'] > 1e8]
month_mean = blockbusters.groupby('month')['revenue'].mean().to_frame()
# Expose the month label as a regular column so it can be used as the x value.
month_mean['mon'] = month_mean.index

plt.figure(figsize=(12, 6))
plt.title("Average Gross by the Month for Blockbuster Movies")
sns.barplot(data=month_mean, x='mon', y='revenue', order=month_order)

In [None]:
# Return distribution per release month; the y axis is clipped to [0, 12].
fig, ax = plt.subplots(figsize=(15, 8))
rows_with_return = df[df['return'].notnull()]
sns.boxplot(data=rows_with_return, x='month', y='return', order=month_order, palette="muted", ax=ax)
ax.set_ylim(0, 12)

In [None]:
# Number of movies per release weekday, Monday first.
plt.figure(figsize=(10, 5))
plt.title("Number of Movies released on a particular day.")
sns.countplot(data=df, x='day', order=day_order)

In [None]:
# Number of titles per release year, drawn as a line.
year_count = df['title'].groupby(df['year']).count()
plt.figure(figsize=(18, 5))
year_count.plot()

In [None]:
# Ten earliest movies, skipping rows whose year is the 'NaT' placeholder.
has_year = df['year'] != 'NaT'
df.loc[has_year, ['title', 'year']].sort_values(by='year').head(10)


In [None]:
# Month abbreviation -> month number (1-12).
months = {
    abbreviation: number
    for number, abbreviation in enumerate(
        ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'],
        start=1,
    )
}


In [None]:
# Movies released from 2000 onwards, counted per (month, year).
# The year is converted with `pd.to_numeric(..., errors='coerce')` so that the
# 'NaT' placeholder becomes NaN, and it is cast to int only after the >= 2000
# filter has removed those NaNs. This keeps the pivot's column labels integer
# (2000, 2001, ...) whether or not any 'NaT' rows exist, and avoids assigning
# through a chained-index selection.
df_21 = df.assign(year=pd.to_numeric(df['year'], errors='coerce'))
df_21 = df_21[df_21['year'] >= 2000].astype({'year': int})
hmap_21 = pd.pivot_table(data=df_21, index='month', columns='year', aggfunc='count', values='title')
# Month/year combinations without any movie count as 0.
hmap_21 = hmap_21.fillna(0)


In [None]:
# Heat map of movie counts per month (rows) and year (columns).
# Only the font scale is changed here; `sns.set(...)` would also reset the
# active style and palette to seaborn's defaults.
sns.set_context('notebook', font_scale=1)
f, ax = plt.subplots(figsize=(16, 8))
# pivot_table sorts its index alphabetically ('Apr', 'Aug', 'Dec', ...), so the
# rows must be put into calendar order before they are labelled with
# month_order; otherwise the labels do not match the data. Months that are
# absent from the pivot are shown as 0.
hmap_by_month = hmap_21.reindex(index=month_order, fill_value=0)
sns.heatmap(hmap_by_month, annot=True, linewidths=.5, ax=ax, fmt='n', yticklabels=month_order)

In [None]:
# Restore the larger font scale for the following plots. `set_context` changes
# only the scaling; `sns.set(...)` would also reset the active style and palette.
sns.set_context('notebook', font_scale=1.25)


In [None]:
# Summary statistics of the runtime column.
df['runtime'].describe()


In [None]:
# Make sure runtime is stored as float.
df['runtime'] = df['runtime'].astype(float)


In [None]:
plt.figure(figsize=(12,6))
sns.distplot(df[(df['runtime'] < 300) & (df['runtime'] > 0)]['runtime'])

In [None]:
# Return versus runtime for movies with a known return below 10 and a positive
# runtime.
df_mat = df[(df['return'].notnull()) & (df['runtime'] > 0) & (df['return'] < 10)]
# x / y are passed by keyword: recent seaborn releases no longer accept them
# positionally, and the other jointplot cells already use the keyword form.
sns.jointplot(x='return', y='runtime', data=df_mat)
plt.show()


In [None]:
# Budget versus runtime for movies with a known budget and a positive runtime.
df_mat = df[(df['budget'].notnull()) & (df['runtime'] > 0)]
# x / y are passed by keyword: recent seaborn releases no longer accept them
# positionally, and the other jointplot cells already use the keyword form.
sns.jointplot(x='budget', y='runtime', data=df_mat)
plt.show()

In [None]:
# Mean runtime per release year.
plt.figure(figsize=(18, 5))
year_runtime = df[df['year'] != 'NaT'].groupby('year')['runtime'].mean()
# The year labels are strings at this point; plotting them directly creates a
# categorical axis on which the numeric ticks set below do not line up with
# the data. Convert them to numbers for a true time axis.
plot_years = pd.to_numeric(year_runtime.index)
plt.plot(plot_years, year_runtime.values)
plt.xticks(np.arange(1874, 2024, 10.0))
plt.show()

In [None]:
# Ten shortest movies with a positive runtime.
has_runtime = df['runtime'] > 0
df.loc[has_runtime, ['runtime', 'title', 'year']].sort_values(by='runtime').head(10)


In [None]:
# Ten longest movies with a positive runtime.
has_runtime = df['runtime'] > 0
df.loc[has_runtime, ['runtime', 'title', 'year']].sort_values(by='runtime', ascending=False).head(10)

In [None]:
# Summary statistics of the budget column.
df['budget'].describe()


In [None]:
# Distribution of the known (non-null) budgets.
known_budgets = df['budget'].dropna()
sns.distplot(known_budgets)


In [None]:
# Histogram of budgets with a logarithmic count axis.
df['budget'].plot.hist(logy=True)


In [None]:
# Ten most expensive movies among those with a known budget.
has_budget = df['budget'].notnull()
money_cols = ['title', 'budget', 'revenue', 'return', 'year']
df.loc[has_budget, money_cols].sort_values(by='budget', ascending=False).head(10)


In [None]:
# Budget versus revenue for movies whose return is known.
rows_with_return = df[df['return'].notnull()]
sns.jointplot(data=rows_with_return, x='budget', y='revenue')


In [None]:
# Summary statistics of the revenue column.
df['revenue'].describe()


In [None]:
# Distribution of the known (non-null) revenues.
known_revenues = df['revenue'].dropna()
sns.distplot(known_revenues)


In [None]:
# Ten highest-grossing movies, rendered as an HTML table so the poster <img>
# tags are displayed as images (hence escape=False).
gross_cols = ['poster_path', 'title', 'budget', 'revenue', 'year']
gross_top = df[gross_cols].sort_values(by='revenue', ascending=False).head(10)
# Widen the column limit so the <img> markup is not truncated.
pd.set_option('display.max_colwidth', 100)
HTML(gross_top.to_html(escape=False))

In [None]:
# Put the column width limit back to 50.
pd.options.display.max_colwidth = 50


In [None]:
# Highest revenue achieved by any movie in each release year.
plt.figure(figsize=(18, 5))
year_revenue = df[(df['revenue'].notnull()) & (df['year'] != 'NaT')].groupby('year')['revenue'].max()
# The year labels are strings at this point; plotted directly they form a
# categorical axis (even spacing regardless of gaps, one tick label per year).
# Convert them to numbers to get a true time axis.
plot_years = pd.to_numeric(year_revenue.index)
plt.plot(plot_years, year_revenue.values)
plt.show()

In [None]:
# Ten highest returns among movies with a budget above 5e6.
is_big_budget = df['return'].notnull() & (df['budget'] > 5e6)
money_cols = ['title', 'budget', 'revenue', 'return', 'year']
df.loc[is_big_budget, money_cols].sort_values(by='return', ascending=False).head(10)

In [None]:
# Ten lowest returns among movies with a budget above 5e6 and a revenue above
# 10000.
is_big_flop_candidate = df['return'].notnull() & (df['budget'] > 5e6) & (df['revenue'] > 10000)
money_cols = ['title', 'budget', 'revenue', 'return', 'year']
df.loc[is_big_flop_candidate, money_cols].sort_values(by='return').head(10)

In [None]:
# Turn the 'NaT' placeholder string into a real missing value.
df['year'] = df['year'].replace(to_replace='NaT', value=np.nan)


In [None]:
# Convert the year to float; anything that cannot be converted becomes NaN.
df['year'] = df['year'].map(clean_numeric)


In [None]:
# Lower-triangle heat map of the pairwise correlations between numeric columns.
# Only the font scale is changed here; `sns.set(...)` would also reset the
# active style and palette to seaborn's defaults.
sns.set_context('notebook', font_scale=1)
# Select the numeric columns explicitly: the frame still holds text columns,
# and recent pandas releases raise on them instead of silently dropping them.
corr = df.select_dtypes(include=[np.number]).corr()
# Boolean mask that hides the upper triangle (including the diagonal).
mask = np.zeros_like(corr, dtype=bool)
mask[np.triu_indices_from(mask)] = True
with sns.axes_style("white"):
    plt.figure(figsize=(9, 9))
    ax = sns.heatmap(corr, mask=mask, vmax=.3, square=True, annot=True)

In [None]:
# Restore the larger font scale. `set_context` changes only the scaling;
# `sns.set(...)` would also reset the active style and palette.
sns.set_context('notebook', font_scale=1.25)
