# IMDB Dataset

### Import Library and Data

In [None]:
import numpy as np
import pandas as pd
import missingno as msno
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [None]:
# Load the raw IMDb movie table from the dataset CSV.
movies = pd.read_csv(
    filepath_or_buffer='../input/imdb-extensive-dataset/IMDb movies.csv',
)

### Take a sneak peek

In [None]:
# movies.info()

In [None]:
# movies.describe(include=['int64', 'float64', 'object'])

In [None]:
# Nullity matrix: one column per field, gaps mark missing values.
msno.matrix(df=movies)

### Explore data through visualization

In [None]:
# plt.figure(figsize=(8,8))
# sns.pairplot(data=movies)

There seems to be a positive correlation between avg_vote (the IMDb score) and metascore.

In [None]:
# Scatter + linear fit of metascore against the IMDb score; the very low
# alpha keeps the densely populated regions readable.
sns.lmplot(
    x='avg_vote',
    y='metascore',
    data=movies,
    scatter_kws=dict(alpha=0.025, color='crimson'),
)

Distribution of IMDB score and metascore

In [None]:
# Density of the IMDb score. `fill=True` replaces the deprecated `shade=True`
# (same filled-curve rendering, no deprecation warning on current seaborn).
sns.kdeplot(movies['avg_vote'], fill=True)

In [None]:
# Density of the metascore. `fill=True` replaces the deprecated `shade=True`
# (same filled-curve rendering, no deprecation warning on current seaborn).
sns.kdeplot(movies['metascore'], fill=True, color='crimson')

In [None]:
# Mean and median of each score column, one line per column.
for score_col in ('avg_vote', 'metascore'):
    scores = movies[score_col]
    print(scores.mean(), scores.median())

# Question

1. How does the release date affect gross income and profit (USA and worldwide)?
    - Month
    - Week
    - Day
    - Day of week
2. How does the release date affect the IMDb score and the Metascore? Does it affect income and profit? Do the scores affect profitability?
    - Month
    - Week
    - Day
    - Day of week
3. How is the release date related to genre (and gross income)?
    - Month
    - Week
    - Day
    - Day of week

### Transforming

Correct the spelling from worlwide to worldwide

In [None]:
# Fix only the misspelled column instead of overwriting every column name by
# position: a positional overwrite silently mislabels data if the CSV layout
# ever changes. Assumes the raw header is spelled 'worlwide_gross_income',
# as the note above this cell states.
movies = movies.rename(columns={'worlwide_gross_income': 'worldwide_gross_income'})

In [None]:
# The analysis looks at worldwide income only, so the USA-only income
# column is removed; every other column is kept in its existing order.
movies = movies.drop(columns='usa_gross_income')

In [None]:
# Preview the first two rows after the column clean-up.
movies.head(n=2)

In [None]:
# Inspect the raw publication dates before parsing them.
movies.loc[:, 'date_published']

Some entries are inconsistent and need to be cleaned. In this case there is one such value: 'TV Movie 2019'.

In [None]:
# Replace the one non-date value with a placeholder date so the column can be
# parsed. regex=False makes this an explicit literal substitution rather than
# relying on the default of `regex`, which differs between pandas versions.
movies['date_published'] = movies['date_published'].str.replace(
    'TV Movie 2019', '2019-01-01', regex=False
)

Convert data type to datetime and extract part of date

In [None]:
# Parse the cleaned strings into datetimes, then show the converted column.
movies['date_published'] = movies['date_published'].pipe(pd.to_datetime)
movies['date_published']

In [None]:
# Calendar year of the publication date.
movies = movies.assign(publish_year=lambda frame: frame['date_published'].dt.year)

In [None]:
# Month of the publication date, kept numeric (1-12); no mapping to
# month names is applied.
movies = movies.assign(publish_month=lambda frame: frame['date_published'].dt.month)

In [None]:
# ISO week number of the publication date.
movies['publish_week'] = movies['date_published'].dt.isocalendar()['week']

In [None]:
# Day of the month of the publication date.
movies = movies.assign(publish_day=lambda frame: frame['date_published'].dt.day)

In [None]:
# Day of the week as an integer (Monday=0 ... Sunday=6); no mapping to
# day names is applied.
movies['publish_dayofweek'] = movies['date_published'].dt.weekday

The genre column can hold more than one genre per entry. I split it on commas into genre_1 and genre_2 and ignore the rest.

In [None]:
# Split the comma-separated genre string once and reuse the result for both
# columns (the split previously ran twice on the whole column).
genre_parts = movies['genre'].str.split(',', expand=True)
# Strip surrounding whitespace so values separated by ', ' do not keep a
# leading space; this is a no-op when there is none.
movies['genre_1'] = genre_parts[0].str.strip()
movies['genre_2'] = genre_parts[1].str.strip()

The budget and worldwide income columns have a dollar sign ($) in their entries.

In [None]:
# Keep only rows whose budget AND worldwide gross are both quoted in US
# dollars, i.e. the value starts with "$" followed by whitespace. Rows with a
# missing value or another currency in either column are dropped.
# `str.match` anchors at the start of the string and `na=False` treats missing
# values as non-matching; the raw string avoids invalid-escape warnings.
usd_pattern = r'\$\s'
is_usd = (
    movies['budget'].str.match(usd_pattern, na=False)
    & movies['worldwide_gross_income'].str.match(usd_pattern, na=False)
)

# .copy() makes clean_df an independent frame, so the column assignments here
# and in later cells do not write into a slice of `movies`.
clean_df = movies[is_usd].copy()

# Remove the leading "$ " prefix and convert to float. A regex replace is used
# because str.strip() takes a set of characters, not a pattern.
for money_col in ('budget', 'worldwide_gross_income'):
    clean_df[money_col] = (
        clean_df[money_col]
        .str.replace(r'^\$\s*', '', regex=True)
        .astype('float64')
    )

clean_df.info()

A lot of rows are lost in this step, but that is necessary because the analysis focuses on gross income, so rows with null values there have to be dropped. The budget column also contains several currencies, so I decided to keep only amounts in US dollars.

In [None]:
# Worldwide gross income expressed as a percentage of the budget.
clean_df['ww_percent_revenue'] = (
    clean_df['worldwide_gross_income'].div(clean_df['budget']).mul(100)
)
clean_df['ww_percent_revenue']

In [None]:
# Absolute worldwide profit: gross income minus budget.
clean_df['ww_profit'] = clean_df['worldwide_gross_income'].sub(clean_df['budget'])
clean_df['ww_profit']

In [None]:
# Worldwide profit expressed as a percentage of the budget.
clean_df['ww_percent_profit'] = (
    clean_df['worldwide_gross_income']
    .sub(clean_df['budget'])
    .div(clean_df['budget'])
    .mul(100)
)
clean_df['ww_percent_profit']

In [None]:
# Assign the filled column back instead of calling fillna(inplace=True) on a
# column selection: the chained in-place form may act on a temporary object
# and has no effect under pandas copy-on-write.
# NOTE(review): treating a missing metascore as 0 pulls down every
# mean/median computed on this column later on; confirm this is intended.
clean_df['metascore'] = clean_df['metascore'].fillna(0)

In [None]:
# Nullity matrix of the cleaned frame, to check what is still missing.
msno.matrix(df=clean_df)

## 1. How does the release date affect gross income and profit (USA and worldwide)?

In [None]:
# Budget against worldwide gross income, then against worldwide profit,
# each with a linear fit; both plots share the same settings.
lm_kwargs = dict(data=clean_df, x='budget', height=5, scatter_kws={'alpha': 0.25})
sns.lmplot(y='worldwide_gross_income', **lm_kwargs)
sns.lmplot(y='ww_profit', **lm_kwargs)

There seems to be positive correlation between film budget and their gross income and profit

In [None]:
# Density of worldwide gross income, shown as its own figure.
# `fill=True` replaces the deprecated `shade=True` (same rendering).
# The commented-out USA plots were removed: they referred to columns that are
# dropped or never created earlier in the notebook.
sns.kdeplot(clean_df['worldwide_gross_income'], fill=True)
plt.show()

# Density of worldwide profit.
sns.kdeplot(clean_df['ww_profit'], fill=True)

The distributions are very skewed, so the mean is not a representative statistic here. The median is used instead.

In [None]:
# Define function for dual plot
def dual_plot_median(data, group, col1, col2):
    """Plot the per-group medians of two columns on twin y-axes.

    A new 10x5 figure is created. The median of ``col1`` within each
    ``group`` value is drawn in steel blue against the left y-axis, and the
    median of ``col2`` in crimson against a second y-axis on the right that
    shares the same x-axis.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the ``group``, ``col1`` and ``col2`` columns.
    group : str
        Column to group by; the group keys form the x-axis.
    col1 : str
        Column plotted on the left axis.
    col2 : str
        Column plotted on the right axis.

    Returns
    -------
    None
        The figure is produced as a side effect only.
    """
    group, col1, col2 = str(group), str(col1), str(col2)
    # Build the grouping once and reuse it for both columns.
    grouped = data.groupby(group)

    fig, ax1 = plt.subplots(figsize=(10, 5))
    ax1.plot(grouped[col1].median(), color='steelblue', label=col1)
    # Label the axes so each figure can be read without the calling code.
    ax1.set_xlabel(group)
    ax1.set_ylabel(col1, color='steelblue')
    ax1.tick_params(axis='y', labelcolor='steelblue')

    # Second y-axis: the two columns can be on very different scales.
    ax2 = ax1.twinx()
    ax2.plot(grouped[col2].median(), color='crimson', label=col2)
    ax2.set_ylabel(col2, color='crimson')
    ax2.tick_params(axis='y', labelcolor='crimson')

    # Figure-level legend so the lines from both axes are listed together.
    fig.legend(loc='upper center')

### Publish Year

In [None]:
# dual_plot_median('publish_year', 'budget', 'usa_gross_income')
# dual_plot_median('publish_year', 'budget', 'usa_profit')
# dual_plot_median('publish_year', 'budget', 'usa_percent_profit')

In [None]:
# Median trends by release year: budget against each outcome metric, then
# profit against percent profit.
year_metric_pairs = [
    ('budget', 'worldwide_gross_income'),
    ('budget', 'ww_profit'),
    ('budget', 'ww_percent_profit'),
    ('ww_profit', 'ww_percent_profit'),
]
for left_metric, right_metric in year_metric_pairs:
    dual_plot_median(clean_df, 'publish_year', left_metric, right_metric)

Although Median value of budget is rising, median value of profit remains the same level.

### Publish Month

In [None]:
# dual_plot_median('publish_month', 'budget', 'usa_gross_income')
# dual_plot_median('publish_month', 'budget', 'usa_profit')
# dual_plot_median('publish_month', 'budget', 'usa_percent_profit')
# dual_plot_median('publish_month', 'usa_profit', 'usa_percent_profit')

In [None]:
# Median trends by release month: budget against each outcome metric, then
# profit against percent profit.
month_metric_pairs = [
    ('budget', 'worldwide_gross_income'),
    ('budget', 'ww_profit'),
    ('budget', 'ww_percent_profit'),
    ('ww_profit', 'ww_percent_profit'),
]
for left_metric, right_metric in month_metric_pairs:
    dual_plot_median(clean_df, 'publish_month', left_metric, right_metric)

In [None]:
# Five release months with the highest median budget.
(
    clean_df.groupby('publish_month')[['budget']]
    .median()
    .sort_values('budget', ascending=False)
    .head(5)
)

In [None]:
# Five release months with the highest median worldwide gross income.
(
    clean_df.groupby('publish_month')[['worldwide_gross_income']]
    .median()
    .sort_values('worldwide_gross_income', ascending=False)
    .head(5)
)

In [None]:
# Five release months with the highest median worldwide profit.
(
    clean_df.groupby('publish_month')[['ww_profit']]
    .median()
    .sort_values('ww_profit', ascending=False)
    .head(5)
)

In [None]:
# Five release months with the highest median percent profit.
(
    clean_df.groupby('publish_month')[['ww_percent_profit']]
    .median()
    .sort_values('ww_percent_profit', ascending=False)
    .head(5)
)

* Films with high budgets tend to be released in December, February, May, and October, in that order.
* Films with high profit tend to be released in December and February.
* Films with high percent profit tend to be released in December, February, September, and November, in that order.

### Publish Week

In [None]:
# dual_plot_median('publish_week', 'budget', 'usa_gross_income')
# dual_plot_median('publish_week', 'budget', 'usa_profit')
# dual_plot_median('publish_week', 'budget', 'usa_percent_profit')
# dual_plot_median('publish_week', 'usa_profit', 'usa_percent_profit')

In [None]:
# Median trends by ISO release week: budget against each outcome metric,
# then profit against percent profit.
week_metric_pairs = [
    ('budget', 'worldwide_gross_income'),
    ('budget', 'ww_profit'),
    ('budget', 'ww_percent_profit'),
    ('ww_profit', 'ww_percent_profit'),
]
for left_metric, right_metric in week_metric_pairs:
    dual_plot_median(clean_df, 'publish_week', left_metric, right_metric)

In [None]:
# Five release weeks with the highest median budget.
(
    clean_df.groupby('publish_week')[['budget']]
    .median()
    .sort_values('budget', ascending=False)
    .head(5)
)

In [None]:
# Five release weeks with the highest median worldwide gross income.
(
    clean_df.groupby('publish_week')[['worldwide_gross_income']]
    .median()
    .sort_values('worldwide_gross_income', ascending=False)
    .head(5)
)

In [None]:
# Five release weeks with the highest median worldwide profit.
(
    clean_df.groupby('publish_week')[['ww_profit']]
    .median()
    .sort_values('ww_profit', ascending=False)
    .head(5)
)

In [None]:
# Five release weeks with the highest median percent profit.
# Fix: this cell belongs to the "Publish Week" section but grouped by
# 'publish_month' (copy-paste slip), so it repeated the monthly table.
(
    clean_df.groupby('publish_week')
    .agg({'ww_percent_profit': 'median'})
    .sort_values(by='ww_percent_profit', ascending=False)
    .head(5)
)

* Films with high budgets tend to be released in week numbers 51, 27, and 20, in that order.
* Films with high profit tend to be released in week numbers 51, 7, and 8, in that order.
* Films with high percent profit tend to be released in week numbers 12, 2, and 9, in that order.

### Publish Day

In [None]:
# dual_plot_median('publish_day', 'budget', 'usa_gross_income')
# dual_plot_median('publish_day', 'budget', 'usa_profit')
# dual_plot_median('publish_day', 'budget', 'usa_percent_profit')
# dual_plot_median('publish_day', 'usa_profit', 'usa_percent_profit')

In [None]:
# Median trends by day of the month: budget against each outcome metric,
# then profit against percent profit.
day_metric_pairs = [
    ('budget', 'worldwide_gross_income'),
    ('budget', 'ww_profit'),
    ('budget', 'ww_percent_profit'),
    ('ww_profit', 'ww_percent_profit'),
]
for left_metric, right_metric in day_metric_pairs:
    dual_plot_median(clean_df, 'publish_day', left_metric, right_metric)

In [None]:
# Five days of the month with the highest median budget.
(
    clean_df.groupby('publish_day')[['budget']]
    .median()
    .sort_values('budget', ascending=False)
    .head(5)
)

In [None]:
# Five days of the month with the highest median worldwide gross income.
(
    clean_df.groupby('publish_day')[['worldwide_gross_income']]
    .median()
    .sort_values('worldwide_gross_income', ascending=False)
    .head(5)
)

In [None]:
# Five days of the month with the highest median worldwide profit.
(
    clean_df.groupby('publish_day')[['ww_profit']]
    .median()
    .sort_values('ww_profit', ascending=False)
    .head(5)
)

In [None]:
# Five days of the month with the highest median percent profit.
(
    clean_df.groupby('publish_day')[['ww_percent_profit']]
    .median()
    .sort_values('ww_percent_profit', ascending=False)
    .head(5)
)

* Films with high budgets tend to be released on the 31st, 30th, and 22nd of the month, in that order.
* Films with high profit tend to be released on the 18th, 16th, and 14th of the month, in that order.
* Films with high percent profit tend to be released on the 18th, 16th, and 25th of the month, in that order.

### Publish Day of Week

In [None]:
# dual_plot_median('publish_dayofweek', 'budget', 'usa_gross_income')
# dual_plot_median('publish_dayofweek', 'budget', 'usa_profit')
# dual_plot_median('publish_dayofweek', 'budget', 'usa_percent_profit')
# dual_plot_median('publish_dayofweek', 'usa_profit', 'usa_percent_profit')

In [None]:
# Median trends by day of the week: budget against each outcome metric,
# then profit against percent profit.
weekday_metric_pairs = [
    ('budget', 'worldwide_gross_income'),
    ('budget', 'ww_profit'),
    ('budget', 'ww_percent_profit'),
    ('ww_profit', 'ww_percent_profit'),
]
for left_metric, right_metric in weekday_metric_pairs:
    dual_plot_median(clean_df, 'publish_dayofweek', left_metric, right_metric)

In [None]:
# Days of the week ranked by median budget (top five).
(
    clean_df.groupby('publish_dayofweek')[['budget']]
    .median()
    .sort_values('budget', ascending=False)
    .head(5)
)

In [None]:
# Days of the week ranked by median worldwide gross income (top five).
(
    clean_df.groupby('publish_dayofweek')[['worldwide_gross_income']]
    .median()
    .sort_values('worldwide_gross_income', ascending=False)
    .head(5)
)

In [None]:
# Days of the week ranked by median worldwide profit (top five).
(
    clean_df.groupby('publish_dayofweek')[['ww_profit']]
    .median()
    .sort_values('ww_profit', ascending=False)
    .head(5)
)

In [None]:
# Days of the week ranked by median percent profit (top five).
(
    clean_df.groupby('publish_dayofweek')[['ww_percent_profit']]
    .median()
    .sort_values('ww_percent_profit', ascending=False)
    .head(5)
)

* Films with high budgets tend to be released on Friday, Wednesday, and Thursday, in that order.
* Films with high profit tend to be released on Wednesday, Thursday, and Friday, in that order.
* Films with high percent profit tend to be released on Wednesday, Thursday, and Friday, in that order.

In [None]:
# Median profit for every (month, day) release date, with a zero line
# marking break-even.
by_month_day = clean_df.groupby(['publish_month', 'publish_day'])
by_month_day[['ww_profit']].median().plot(figsize=(20, 5))
plt.axhline(0, color='black')

# Same view for percent profit; the dashed line marks 50 %.
by_month_day[['ww_percent_profit']].median().plot(figsize=(20, 5), color='salmon')
plt.axhline(0, color='black')
plt.axhline(50, color='black', linestyle='--')

# Median profit again, additionally split by day of the week.
by_month_day_weekday = clean_df.groupby(['publish_month', 'publish_day', 'publish_dayofweek'])
by_month_day_weekday[['ww_profit']].median().plot(figsize=(20, 5), color='salmon')
plt.axhline(0, color='black')

In [None]:
# Five (month, day) release dates with the highest median budget.
(
    clean_df.groupby(['publish_month', 'publish_day'])[['budget']]
    .median()
    .sort_values('budget', ascending=False)
    .head(5)
)

In [None]:
# Five (month, day) release dates with the highest median worldwide profit.
(
    clean_df.groupby(['publish_month', 'publish_day'])[['ww_profit']]
    .median()
    .sort_values('ww_profit', ascending=False)
    .head(5)
)

In [None]:
# Five (month, day) release dates with the highest median percent profit.
(
    clean_df.groupby(['publish_month', 'publish_day'])[['ww_percent_profit']]
    .median()
    .sort_values('ww_percent_profit', ascending=False)
    .head(5)
)

## 2. How does the release date influence the IMDb score and the metascore?

In [None]:
# Subset with an IMDb score of at least 5 and a metascore of at least 50.
score_mask = clean_df['avg_vote'].ge(5) & clean_df['metascore'].ge(50)
tmp = clean_df[score_mask]

In [None]:
# print(clean_df['avg_vote'].mean(), clean_df['avg_vote'].median())
# print(clean_df['metascore'].mean(), clean_df['metascore'].median())

### Publish Year

In [None]:
# Median IMDb score vs. metascore by release year: first for the filtered
# subset, then for the full cleaned frame.
for frame in (tmp, clean_df):
    dual_plot_median(frame, 'publish_year', 'avg_vote', 'metascore')

### Release date(Month) and score

In [None]:
# Median IMDb score vs. metascore by release month: first for the filtered
# subset, then for the full cleaned frame.
for frame in (tmp, clean_df):
    dual_plot_median(frame, 'publish_month', 'avg_vote', 'metascore')

In [None]:
# IMDb score: five release months with the highest median.
(
    clean_df.groupby('publish_month')[['avg_vote']]
    .median()
    .sort_values('avg_vote', ascending=False)
    .head(5)
)

In [None]:
# Metascore: five release months with the highest median.
(
    clean_df.groupby('publish_month')[['metascore']]
    .median()
    .sort_values('metascore', ascending=False)
    .head(5)
)

* Films released in December, February, and January tend to get the highest IMDb scores, in that order.
* Films released in December, February, and March tend to get the highest metascores, in that order.

### Release date(Week) and score

In [None]:
# Median IMDb score vs. metascore by ISO release week: first for the
# filtered subset, then for the full cleaned frame.
for frame in (tmp, clean_df):
    dual_plot_median(frame, 'publish_week', 'avg_vote', 'metascore')

In [None]:
# IMDb score: five release weeks with the highest median.
(
    clean_df.groupby('publish_week')[['avg_vote']]
    .median()
    .sort_values('avg_vote', ascending=False)
    .head(5)
)

In [None]:
# Metascore: five release weeks with the highest median.
(
    clean_df.groupby('publish_week')[['metascore']]
    .median()
    .sort_values('metascore', ascending=False)
    .head(5)
)

* Films released in week numbers 51, 49, and 50 tend to get the highest IMDb scores, in that order.
* Films released in week numbers 51, 50, and 42 tend to get the highest metascores, in that order.

### Release date(Day) and score

In [None]:
# Median IMDb score vs. metascore by day of the month: first for the
# filtered subset, then for the full cleaned frame.
for frame in (tmp, clean_df):
    dual_plot_median(frame, 'publish_day', 'avg_vote', 'metascore')

In [None]:
# IMDb score: five days of the month with the highest median.
(
    clean_df.groupby('publish_day')[['avg_vote']]
    .median()
    .sort_values('avg_vote', ascending=False)
    .head(5)
)

In [None]:
# Metascore: five days of the month with the highest median.
(
    clean_df.groupby('publish_day')[['metascore']]
    .median()
    .sort_values('metascore', ascending=False)
    .head(5)
)

* Films released on the 16th, 19th, and 18th of the month tend to get the highest IMDb scores, in that order.
* Films released on the 27th, 16th, and 18th of the month tend to get the highest metascores, in that order.

### Release date(Day of week) and score

In [None]:
# Median IMDb score vs. metascore by day of the week: first for the
# filtered subset, then for the full cleaned frame.
for frame in (tmp, clean_df):
    dual_plot_median(frame, 'publish_dayofweek', 'avg_vote', 'metascore')

In [None]:
# IMDb score: days of the week ranked by median (top five).
(
    clean_df.groupby('publish_dayofweek')[['avg_vote']]
    .median()
    .sort_values('avg_vote', ascending=False)
    .head(5)
)

In [None]:
# Metascore: days of the week ranked by median (top five).
(
    clean_df.groupby('publish_dayofweek')[['metascore']]
    .median()
    .sort_values('metascore', ascending=False)
    .head(5)
)

* Films released on Saturday, Tuesday, and Wednesday tend to get the highest IMDb scores, in that order.
* Films released on Wednesday, Thursday, and Friday tend to get the highest metascores, in that order.

In [None]:
# Median IMDb score, then median metascore, for every (month, day) release
# date; each is drawn as its own figure.
scores_by_month_day = clean_df.groupby(['publish_month', 'publish_day'])
scores_by_month_day[['avg_vote']].median().plot(figsize=(20, 5))
scores_by_month_day[['metascore']].median().plot(figsize=(20, 5), color='salmon')

In [None]:
# Five (month, day) release dates with the highest median IMDb score.
(
    clean_df.groupby(['publish_month', 'publish_day'])[['avg_vote']]
    .median()
    .sort_values('avg_vote', ascending=False)
    .head(5)
)

In [None]:
# Five (month, day) release dates with the highest median metascore.
(
    clean_df.groupby(['publish_month', 'publish_day'])[['metascore']]
    .median()
    .sort_values('metascore', ascending=False)
    .head(5)
)

* Films released on 16th December tend to get the highest IMDb scores.
* Films released on 11th February tend to get the highest metascores.

### 3. How is the release date related to genre? // How does genre influence the release date?

ref: https://stackoverflow.com/questions/17775935/sql-like-window-functions-in-pandas-row-numbering-in-python-pandas-dataframe/48454871

In [None]:
# Number of titles per (release month, primary genre), ordered within each
# month from the most to the least frequent genre.
tmp = (
    clean_df.groupby(['publish_month', 'genre_1'])[['title']]
    .count()
    .sort_values(by=['publish_month', 'title'], ascending=[True, False])
    .reset_index()
)

# Rank genres inside each month by title count (rank 1 = most titles) and
# keep those ranked 10 or better.
tmp['rank'] = tmp.groupby('publish_month')['title'].rank(ascending=False)
tmp = tmp.loc[tmp['rank'] <= 10, ['publish_month', 'genre_1', 'title']].set_index('publish_month')

# Wide layout: one row per month, one column per genre.
tmp_pivot = tmp.pivot(columns='genre_1')
tmp_pivot

In [None]:
# Stacked bars, then lines, of the per-month genre counts; both legends are
# placed outside the axes on the right.
legend_outside = dict(bbox_to_anchor=(1.05, 1.0), loc='upper left')
tmp_pivot.plot.bar(stacked=True, figsize=(10, 5))
plt.legend(**legend_outside)
tmp_pivot.plot.line(figsize=(10, 5))
plt.legend(**legend_outside)

In [None]:
# Median worldwide profit per (release month, primary genre), ordered within
# each month from the most to the least profitable genre.
tmp = (
    clean_df.groupby(['publish_month', 'genre_1'])[['ww_profit']]
    .median()
    .sort_values(by=['publish_month', 'ww_profit'], ascending=[True, False])
    .reset_index()
)

# Rank genres inside each month by median profit and keep the top three.
tmp['rank'] = tmp.groupby('publish_month')['ww_profit'].rank(ascending=False)
tmp = tmp.loc[tmp['rank'] <= 3, ['publish_month', 'genre_1', 'ww_profit']].set_index('publish_month')

# Wide layout: one row per month, one column per genre.
tmp_pivot = tmp.pivot(columns='genre_1')

# Stacked bars with the legend placed outside the axes on the right.
tmp_pivot.plot.bar(stacked=True, figsize=(10, 5))
plt.legend(bbox_to_anchor=(1.05, 1.0), loc='upper left')

In [None]:
# Display the pivoted frame (publish_month rows, genre_1 columns) produced by
# the preceding cell.
tmp_pivot

In [None]:
# Annotated heatmap of the month x genre pivot on a red-yellow-green scale;
# the colour scale is capped at 9e7 via vmax.
fig, ax = plt.subplots(figsize=(15, 10))
sns.heatmap(
    tmp_pivot,
    ax=ax,
    annot=True,
    cmap='RdYlGn',
    vmax=9e7,
)