# TMDB Box Office Prediction

In [None]:
# Import packages
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
# The 'seaborn' style sheet was renamed 'seaborn-v0_8' in newer Matplotlib releases and the
# old alias was later removed; use whichever name the installed version provides.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')
# Use the fully qualified option name: the bare 'max_columns' pattern is ambiguous on newer
# pandas (it also matches 'styler.render.max_columns') and raises OptionError.
pd.set_option('display.max_columns', None)
import ast
from collections import Counter
from wordcloud import WordCloud
from sklearn.model_selection import train_test_split, KFold
from sklearn.model_selection import GridSearchCV
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostRegressor
import shap
import time
from datetime import datetime
from sklearn.metrics import mean_squared_error
import warnings
warnings.filterwarnings('ignore')

## Data Loading and EDA

Let's see some basic information first.

In [None]:
train = pd.read_csv('../input/tmdb-box-office-prediction/train.csv')
test = pd.read_csv('../input/tmdb-box-office-prediction/test.csv')

In [None]:
train.head()

In [None]:
train.info()

In [None]:
test.info()

In [None]:
# Check missing values
train.isnull().sum().sort_values(ascending=False)

In [None]:
# Check missing values
test.isnull().sum().sort_values(ascending=False)

In [None]:
# Revising some wrong information for training and test sets
# The information is from:
# https://www.kaggle.com/kamalchhirang/eda-feature-engineering-lgb-xgb-cat#Feature-Engineering-&-Prediction

train.loc[train['id'] == 16,'revenue'] = 192864          # Skinning
train.loc[train['id'] == 90,'budget'] = 30000000         # Sommersby          
train.loc[train['id'] == 118,'budget'] = 60000000        # Wild Hogs
train.loc[train['id'] == 149,'budget'] = 18000000        # Beethoven
train.loc[train['id'] == 313,'revenue'] = 12000000       # The Cookout 
train.loc[train['id'] == 451,'revenue'] = 12000000       # Chasing Liberty
train.loc[train['id'] == 464,'budget'] = 20000000        # Parenthood
train.loc[train['id'] == 470,'budget'] = 13000000        # The Karate Kid, Part II
train.loc[train['id'] == 513,'budget'] = 930000          # From Prada to Nada
train.loc[train['id'] == 797,'budget'] = 8000000         # Welcome to Dongmakgol
train.loc[train['id'] == 819,'budget'] = 90000000        # Alvin and the Chipmunks: The Road Chip
train.loc[train['id'] == 850,'budget'] = 90000000        # Modern Times
train.loc[train['id'] == 1007,'budget'] = 2              # Zyzzyx Road 
train.loc[train['id'] == 1112,'budget'] = 7500000        # An Officer and a Gentleman
train.loc[train['id'] == 1131,'budget'] = 4300000        # Smokey and the Bandit   
train.loc[train['id'] == 1359,'budget'] = 10000000       # Stir Crazy 
train.loc[train['id'] == 1542,'budget'] = 1              # All at Once
train.loc[train['id'] == 1570,'budget'] = 15800000       # Crocodile Dundee II
train.loc[train['id'] == 1571,'budget'] = 4000000        # Lady and the Tramp
train.loc[train['id'] == 1714,'budget'] = 46000000       # The Recruit
train.loc[train['id'] == 1721,'budget'] = 17500000       # Cocoon
train.loc[train['id'] == 1865,'revenue'] = 25000000      # Scooby-Doo 2: Monsters Unleashed
train.loc[train['id'] == 1885,'budget'] = 12             # In the Cut
train.loc[train['id'] == 2091,'budget'] = 10             # Deadfall
train.loc[train['id'] == 2268,'budget'] = 17500000       # Madea Goes to Jail budget
train.loc[train['id'] == 2491,'budget'] = 6              # Never Talk to Strangers
train.loc[train['id'] == 2602,'budget'] = 31000000       # Mr. Holland's Opus
train.loc[train['id'] == 2612,'budget'] = 15000000       # Field of Dreams
train.loc[train['id'] == 2696,'budget'] = 10000000       # Nurse 3-D
train.loc[train['id'] == 2801,'budget'] = 10000000       # Fracture
train.loc[train['id'] == 335,'budget'] = 2 
train.loc[train['id'] == 348,'budget'] = 12
train.loc[train['id'] == 470,'budget'] = 13000000 
train.loc[train['id'] == 513,'budget'] = 1100000
train.loc[train['id'] == 640,'budget'] = 6 
train.loc[train['id'] == 696,'budget'] = 1
train.loc[train['id'] == 797,'budget'] = 8000000 
train.loc[train['id'] == 850,'budget'] = 1500000
train.loc[train['id'] == 1199,'budget'] = 5 
train.loc[train['id'] == 1282,'budget'] = 9               # Death at a Funeral
train.loc[train['id'] == 1347,'budget'] = 1
train.loc[train['id'] == 1755,'budget'] = 2
train.loc[train['id'] == 1801,'budget'] = 5
train.loc[train['id'] == 1918,'budget'] = 592 
train.loc[train['id'] == 2033,'budget'] = 4
train.loc[train['id'] == 2118,'budget'] = 344 
train.loc[train['id'] == 2252,'budget'] = 130
train.loc[train['id'] == 2256,'budget'] = 1 
train.loc[train['id'] == 2696,'budget'] = 10000000

test.loc[test['id'] == 6733,'budget'] = 5000000
test.loc[test['id'] == 3889,'budget'] = 15000000
test.loc[test['id'] == 6683,'budget'] = 50000000
test.loc[test['id'] == 5704,'budget'] = 4300000
test.loc[test['id'] == 6109,'budget'] = 281756
test.loc[test['id'] == 7242,'budget'] = 10000000
test.loc[test['id'] == 7021,'budget'] = 17540562       #  Two Is a Family
test.loc[test['id'] == 5591,'budget'] = 4000000        # The Orphanage
test.loc[test['id'] == 4282,'budget'] = 20000000       # Big Top Pee-wee
test.loc[test['id'] == 3033,'budget'] = 250 
test.loc[test['id'] == 3051,'budget'] = 50
test.loc[test['id'] == 3084,'budget'] = 337
test.loc[test['id'] == 3224,'budget'] = 4  
test.loc[test['id'] == 3594,'budget'] = 25  
test.loc[test['id'] == 3619,'budget'] = 500  
test.loc[test['id'] == 3831,'budget'] = 3  
test.loc[test['id'] == 3935,'budget'] = 500  
test.loc[test['id'] == 4049,'budget'] = 995946 
test.loc[test['id'] == 4424,'budget'] = 3  
test.loc[test['id'] == 4460,'budget'] = 8  
test.loc[test['id'] == 4555,'budget'] = 1200000 
test.loc[test['id'] == 4624,'budget'] = 30 
test.loc[test['id'] == 4645,'budget'] = 500 
test.loc[test['id'] == 4709,'budget'] = 450 
test.loc[test['id'] == 4839,'budget'] = 7
test.loc[test['id'] == 3125,'budget'] = 25 
test.loc[test['id'] == 3142,'budget'] = 1
test.loc[test['id'] == 3201,'budget'] = 450
test.loc[test['id'] == 3222,'budget'] = 6
test.loc[test['id'] == 3545,'budget'] = 38
test.loc[test['id'] == 3670,'budget'] = 18
test.loc[test['id'] == 3792,'budget'] = 19
test.loc[test['id'] == 3881,'budget'] = 7
test.loc[test['id'] == 3969,'budget'] = 400
test.loc[test['id'] == 4196,'budget'] = 6
test.loc[test['id'] == 4221,'budget'] = 11
test.loc[test['id'] == 4222,'budget'] = 500
test.loc[test['id'] == 4285,'budget'] = 11
test.loc[test['id'] == 4319,'budget'] = 1
test.loc[test['id'] == 4639,'budget'] = 10
test.loc[test['id'] == 4719,'budget'] = 45
test.loc[test['id'] == 4822,'budget'] = 22
test.loc[test['id'] == 4829,'budget'] = 20
test.loc[test['id'] == 4969,'budget'] = 20
test.loc[test['id'] == 5021,'budget'] = 40 
test.loc[test['id'] == 5035,'budget'] = 1 
test.loc[test['id'] == 5063,'budget'] = 14 
test.loc[test['id'] == 5119,'budget'] = 2 
test.loc[test['id'] == 5214,'budget'] = 30 
test.loc[test['id'] == 5221,'budget'] = 50 
test.loc[test['id'] == 4903,'budget'] = 15
test.loc[test['id'] == 4983,'budget'] = 3
test.loc[test['id'] == 5102,'budget'] = 28
test.loc[test['id'] == 5217,'budget'] = 75
test.loc[test['id'] == 5224,'budget'] = 3 
test.loc[test['id'] == 5469,'budget'] = 20 
test.loc[test['id'] == 5840,'budget'] = 1 
test.loc[test['id'] == 5960,'budget'] = 30
test.loc[test['id'] == 6506,'budget'] = 11 
test.loc[test['id'] == 6553,'budget'] = 280
test.loc[test['id'] == 6561,'budget'] = 7
test.loc[test['id'] == 6582,'budget'] = 218
test.loc[test['id'] == 6638,'budget'] = 5
test.loc[test['id'] == 6749,'budget'] = 8 
test.loc[test['id'] == 6759,'budget'] = 50 
test.loc[test['id'] == 6856,'budget'] = 10
test.loc[test['id'] == 6858,'budget'] =  100
test.loc[test['id'] == 6876,'budget'] =  250
test.loc[test['id'] == 6972,'budget'] = 1
test.loc[test['id'] == 7079,'budget'] = 8000000
test.loc[test['id'] == 7150,'budget'] = 118
test.loc[test['id'] == 6506,'budget'] = 118
test.loc[test['id'] == 7225,'budget'] = 6
test.loc[test['id'] == 7231,'budget'] = 85
test.loc[test['id'] == 5222,'budget'] = 5
test.loc[test['id'] == 5322,'budget'] = 90
test.loc[test['id'] == 5350,'budget'] = 70
test.loc[test['id'] == 5378,'budget'] = 10
test.loc[test['id'] == 5545,'budget'] = 80
test.loc[test['id'] == 5810,'budget'] = 8
test.loc[test['id'] == 5926,'budget'] = 300
test.loc[test['id'] == 5927,'budget'] = 4
test.loc[test['id'] == 5986,'budget'] = 1
test.loc[test['id'] == 6053,'budget'] = 20
test.loc[test['id'] == 6104,'budget'] = 1
test.loc[test['id'] == 6130,'budget'] = 30
test.loc[test['id'] == 6301,'budget'] = 150
test.loc[test['id'] == 6276,'budget'] = 100
test.loc[test['id'] == 6473,'budget'] = 100
test.loc[test['id'] == 6842,'budget'] = 30

test.loc[test['release_date'].isnull() == True, 'release_date'] = '01/01/98'

Several columns hold JSON-like strings (lists of dictionaries). Let's convert them to Python objects first and then analyse them one by one.

In [None]:
# Convert the stringified list-of-dict columns into real Python objects
text_cols = ['belongs_to_collection', 'genres', 'production_companies',
                'production_countries', 'spoken_languages', 'Keywords', 'cast', 'crew']

def _parse_literal(value):
    """Return the Python object encoded in `value`; a missing value becomes {}."""
    # Already parsed (e.g. the cell was re-run on the same frame): leave it untouched,
    # otherwise ast.literal_eval would raise on a non-string argument.
    if isinstance(value, (list, dict)):
        return value
    if pd.isna(value):
        return {}
    return ast.literal_eval(value)

def text_to_dict(df):
    """Parse every column listed in `text_cols` from its string form into Python objects.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing all columns named in `text_cols`. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same `df` object. Missing cells hold an empty dict `{}`; other cells hold
        whatever literal the string encoded. Cells that are already a list or dict are
        kept as they are, so calling the function twice is safe.
    """
    for col in text_cols:
        df[col] = df[col].apply(_parse_literal)
    return df

train = text_to_dict(train)
test = text_to_dict(test)

### Belongs to Collection

In [None]:
# Show the first five rows
for i, e in enumerate(train['belongs_to_collection'][:5]):
    print(i, e)

In [None]:
# Return unique value and count
train['belongs_to_collection'].apply(lambda x: len(x) if x != {} else 0).value_counts()

In [None]:
# Bar plot of the 20 most frequent collections.
# Movies without a collection map to None, which value_counts() ignores, so the ranking no
# longer depends on a placeholder value happening to sort first.
collection_names = train['belongs_to_collection'].apply(lambda x: x[0]['name'] if x != {} else None)
collections = collection_names.value_counts().head(20)
fig = plt.figure(figsize=(8, 5))
# Keyword arguments: newer seaborn releases no longer accept x/y positionally
sns.barplot(x=collections.values, y=collections.index)
plt.xlabel('Count')
plt.title('Top 20 Collection Count')
plt.show()

Only about 20% of the rows have information about collections, and the rest are empty.
Both poster_path and backdrop_path are image information, so only collection name can be used for modeling.

In [None]:
# Show the distribution of revenue for movies with or without a collection
train['has_collection'] = train['belongs_to_collection'].apply(lambda x: len(x) if x != {} else 0)
plt.figure(figsize=(8, 5))
sns.catplot(x='has_collection', y='revenue', data=train)
plt.xlabel('Does the movie belong to a collection?')
plt.ylabel('Revenue')
plt.show()

### Genres

In [None]:
# Show the first five rows
for i, e in enumerate(train['genres'][:5]):
    print(i, e)

Some movies only have one type of genre, while others have more than one.

In [None]:
# Return unique value and count, and the frequency bar plot
genres_num = train['genres'].apply(lambda x: len(x) if x != {} else 0).value_counts()
print(genres_num)
fig = plt.figure(figsize=(8, 5))
sns.barplot(genres_num, genres_num.index, orient="h", order=genres_num.sort_values(ascending = False).index)
plt.xlabel('Count')
plt.title('Number of Genres in Movies')
plt.show()

In [None]:
# Extract genres of each film
genres_per = train['genres'].apply(lambda x: [i['name'] for i in x] if x != {} else [])
genres_per

In [None]:
# To show which genre is the most common
genres_count = Counter([i for j in genres_per for i in j]).most_common()
fig = plt.figure(figsize=(8, 5))
sns.barplot([val[1] for val in genres_count],[val[0] for val in genres_count])
plt.xlabel('Count')
plt.title('Top 20 Genre Count')
plt.show()

### Production Companies

In [None]:
# Show the first five rows
for i, e in enumerate(train['production_companies'][:5]):
    print(i, e)

In [None]:
# Unique value and count, visualization
companies_num = train['production_companies'].apply(lambda x: len(x) if x != {} else 0).value_counts()
print(companies_num)
fig = plt.figure(figsize=(8, 5))
sns.barplot(companies_num, companies_num.index, orient="h", order=companies_num.sort_values(ascending = False).index)
plt.xlabel('Count')
plt.title('Number of Production Companies of Movies')
plt.show()

In [None]:
# Show top 20 production company (ranked by number of movies)
companies_per = train['production_companies'].apply(lambda x: [i['name'] for i in x] if x != {} else [])
companies_count = Counter([i for j in companies_per for i in j]).most_common(20)
fig = plt.figure(figsize=(8, 5))
sns.barplot([val[1] for val in companies_count],[val[0] for val in companies_count])
plt.xlabel('Count')
plt.title('Top 20 Production Company Count')
plt.show()

### Production Countries

In [None]:
# Show the first five rows
for i, e in enumerate(train['production_countries'][:5]):
    print(i, e)

In [None]:
# count
countries_num = train['production_countries'].apply(lambda x: len(x) if x != {} else 0).value_counts()
countries_num

In [None]:
# Show which country produced most movies
countries_per = train['production_countries'].apply(lambda x: [i['name'] for i in x] if x != {} else [])
countries_count = Counter([i for j in countries_per for i in j]).most_common(20)
fig = plt.figure(figsize=(8, 5))
sns.barplot([val[1] for val in countries_count],[val[0] for val in countries_count])
plt.xlabel('Count')
plt.title('Top 20 Production Country Count')
plt.show()

Most of the movies were produced in the US or the UK.

### Spoken Language

In [None]:
# Show the first five rows
for i, e in enumerate(train['spoken_languages'][:5]):
    print(i, e)

In [None]:
# Unique values and frequency
languages_num = train['spoken_languages'].apply(lambda x: len(x) if x != {} else 0).value_counts()
print(languages_num)
fig = plt.figure(figsize=(8, 5))
sns.barplot(languages_num, languages_num.index, orient="h", order=languages_num.sort_values(ascending = False).index)
plt.xlabel('Count')
plt.title('Number of Spoken Languages of Movies')
plt.show()

In [None]:
# Show languages which are used frequently
languages_per = train['spoken_languages'].apply(lambda x: [i['name'] for i in x] if x != {} else [])
languages_count = Counter([i for j in languages_per for i in j]).most_common(20)
fig = plt.figure(figsize=(8, 5))
sns.barplot([val[1] for val in languages_count],[val[0] for val in languages_count])
plt.xlabel('Count')
plt.title('Top 20 Spoken Language Count')
plt.show()

Let's also look at the original language column.

In [None]:
train['original_language'].value_counts()[:10]

We can build dummy variables for it later.

### Keywords

In [None]:
# Show the first five rows
for i, e in enumerate(train['Keywords'][:5]):
    print(i, e)

In [None]:
# count and visualize
keywords_per = train['Keywords'].apply(lambda x: [i['name'] for i in x] if x != {} else [])
keywords_count = Counter([i for j in keywords_per for i in j]).most_common(20)
fig = plt.figure(figsize=(8, 5))
sns.barplot([val[1] for val in keywords_count],[val[0] for val in keywords_count])
plt.xlabel('Count')
plt.title('Top 20 Keywords Count')
plt.show()

In [None]:
# Create a word cloud for keywords
plt.figure(figsize = (10, 6))
text = ' '.join(['_'.join(i.split(' ')) for j in keywords_per for i in j])
wordcloud = WordCloud(max_font_size=None, collocations=False, background_color="white", width=1000, height=600).generate(text)
plt.imshow(wordcloud)
plt.title('Top keywords')
plt.axis("off")
plt.show()

### Cast

In [None]:
train['cast'][1][1]

In [None]:
# Most popular actors
cast_per = train['cast'].apply(lambda x: [i['name'] for i in x] if x != {} else [])
cast_count = Counter([i for j in cast_per for i in j]).most_common(20)
fig = plt.figure(figsize=(8, 5))
sns.barplot([val[1] for val in cast_count],[val[0] for val in cast_count])
plt.xlabel('Count')
plt.title('Top 20 Actor Count')
plt.show()

### Crew

In [None]:
train['crew'][0][0]

In [None]:
# Show the crew members who worked on the most movies.
# Read the 'crew' column (the original read 'cast', so this chart repeated the actor ranking).
crew_per = train['crew'].apply(lambda x: [i['name'] for i in x] if x != {} else [])
crew_count = Counter([i for j in crew_per for i in j]).most_common(20)
fig = plt.figure(figsize=(8, 5))
# Keyword arguments: newer seaborn releases no longer accept x/y positionally
sns.barplot(x=[val[1] for val in crew_count], y=[val[0] for val in crew_count])
plt.xlabel('Count')
plt.title('Top 20 Crew Count')
plt.show()

Now let's continue to explore other categorical variables and their relationships with revenue.

### Homepage

In [None]:
# Show unique value and count
train['homepage'].isna().value_counts()

In [None]:
# Ranked by frequency
train['homepage'].value_counts().sort_values(ascending=False)[:5]

Homepages are almost always unique to a movie, so there is no point in creating a dummy variable for each URL.  
A binary variable is enough.

In [None]:
# Show the distribution of revenue
train['has_homepage'] = (1 - train.homepage.isna())
plt.figure(figsize=(8, 5))
sns.catplot(x='has_homepage', y='revenue', data=train)
plt.xlabel('Does the movie have a homepage?')
plt.ylabel('Revenue')
plt.show()

### Status

In [None]:
train['status'].value_counts()

Only four movies haven't been released, so I think this column is not very useful for modeling.

### Original Language

In [None]:
# Number of movies for the 10 most common original languages
lang_counts = train['original_language'].value_counts()
top10_lang = lang_counts.head(10)
plt.figure(figsize=(8, 5))
# Keyword arguments: newer seaborn releases no longer accept x/y positionally
sns.barplot(x=top10_lang.values, y=top10_lang.index)
# Title and x label now match what is drawn: ten languages, movie counts on the x axis
plt.title('Top 10 Original Language Count')
plt.ylabel('Original Language')
plt.xlabel('Count')
plt.show()

In [None]:
# Show top 15 languages
train['original_language'].value_counts().head(15)

In [None]:
# The percentage of English movies
(train['original_language'] == 'en').mean()

In [None]:
# Show the difference of revenue between movies in different languages
top20_lang = train.loc[train['original_language'].isin(lang_counts[:20].index),:]
plt.figure(figsize=(8, 5))
sns.catplot(x='original_language', y='revenue', data=top20_lang)
plt.title('Revenue of Top 20 Languages')
plt.xlabel('Original Language')
plt.ylabel('Revenue')
plt.show()

There is no obvious rule for the influence of language on revenue.

### Title

In [None]:
# A word cloud for title
text = ' '.join(train['title'].apply(lambda x:x if x is not np.nan else ''))
plt.figure(figsize = (10, 6))
wordcloud = WordCloud(max_font_size=None, collocations=False, background_color="white", width=1000, height=600).generate(text)
plt.imshow(wordcloud)
plt.title('Top Words in Titles')
plt.axis("off")
plt.show()

In [None]:
train['is_title_different'] = 1 - (train['original_title'] == train['title'])
sns.catplot(x="is_title_different", y="revenue", data=train)
plt.xlabel('Does the movie have multiple titles?')
plt.ylabel('Revenue')
plt.show()

### Overview

In [None]:
# A word cloud for overview
text = ' '.join(train['overview'].apply(lambda x:x if x is not np.nan else ''))
plt.figure(figsize = (10, 6))
wordcloud = WordCloud(max_font_size=None, collocations=False, background_color="white", width=1000, height=600).generate(text)
plt.imshow(wordcloud)
plt.title('Top Words in Overview')
plt.axis("off")
plt.show()

### Tagline

In [None]:
# A word cloud for tagline
text = ' '.join(train['tagline'].apply(lambda x:x if x is not np.nan else ''))
plt.figure(figsize = (10, 6))
wordcloud = WordCloud(max_font_size=None, collocations=False, background_color="white", width=1000, height=600).generate(text)
plt.imshow(wordcloud)
plt.title('Top Words in Tagline')
plt.axis("off")
plt.show()

In [None]:
# Does the tagline influence the revenue?
train['has_tagline'] = 1 - train['tagline'].isna()
sns.catplot(x="has_tagline", y="revenue", data=train)
plt.xlabel('Does the movie have a tagline?')
plt.ylabel('Revenue')
plt.show()

### Release Date

Before studying the numeric variables, let's fix and convert the release date column for time analysis.

In [None]:
# Expand the two-digit year of an 'm/d/yy' date string to four digits.
# Years 00-19 are read as 20xx; everything else is read as 19xx.
def fix_date(x):
    """Return `x` with its trailing two-digit year replaced by a four-digit year.

    `x` is split on '/' and the third component is taken as the year; the last two
    characters of `x` are replaced by the century prefix followed by that year.
    """
    short_year = x.split('/')[2]
    century = '20' if int(short_year) <= 19 else '19'
    return x[:-2] + century + short_year

train['release_date'] = train['release_date'].apply(lambda x: fix_date(x))
test['release_date'] = test['release_date'].apply(lambda x: fix_date(x))

# Derive calendar features from the release date
def date_features(df):
    """Replace `release_date` with year / month / day / day-of-week columns.

    The frame is modified in place: `release_year`, `release_month`, `release_day` and
    `release_dayofweek` are added (in that order) and `release_date` is removed.
    The same `df` object is returned.
    """
    released = pd.to_datetime(df['release_date'])
    for part in ('year', 'month', 'day', 'dayofweek'):
        df['release_' + part] = getattr(released.dt, part)
    del df['release_date']
    return df

train = date_features(train)
test = date_features(test)

In [None]:
# Number of movies by year
plt.figure(figsize=(15,6))
sns.countplot(train['release_year'].sort_values())
plt.title("Movie Release Count by Year")
plt.xlabel("Release Year")
plt.xticks(fontsize=8, rotation=90)
plt.show()

In [None]:
# Number of movies by month
plt.figure(figsize=(15,6))
sns.countplot(train['release_month'].sort_values())
plt.title("Movie Release Count by Month")
plt.xlabel("Release Month")
plt.show()

In [None]:
# Number of movies by day of month
plt.figure(figsize=(15,6))
sns.countplot(train['release_day'].sort_values())
plt.title("Movie Release Count by Day of Month")
plt.xlabel("Release Day of Month")
plt.show()

In [None]:
# Number of movies by day of week
plt.figure(figsize=(15,6))
sns.countplot(train['release_dayofweek'].sort_values())
plt.title("Movie Release Count by Day of Week")
plt.xlabel("Release Day of Week")
plt.gca().set_xticklabels(["Monday","Tuesday","Wednesday","Thursday","Friday","Saturday","Sunday"]) # add labels
plt.show()

### Revenue

In [None]:
# Distribution of target variable
train['log_revenue'] = np.log1p(train['revenue'])
fig, ax = plt.subplots(1, 2, figsize=(12, 4))
plt.subplot(1, 2, 1)
sns.distplot(train['revenue'], kde=False)
plt.title('Distribution of Revenue')
plt.xlabel('Revenue')
plt.subplot(1, 2, 2)
sns.distplot(train['log_revenue'], kde=False)
plt.title('Distribution of Log of Revenue')
plt.xlabel('Log of Revenue')
plt.show()

Using log of revenue is better.

#### Revenue By Year

In [None]:
# Mean revenue by year
MeanRevenueByYear = train.groupby('release_year')['revenue'].agg('mean')
MeanRevenueByYear.plot(figsize=(15,6))
plt.xlabel('Year')
plt.ylabel('Revenue')
plt.title('Mean Revenue By Year')
plt.show()

In [None]:
# Mean revenue by month
MeanRevenueByMonth = train.groupby('release_month')['revenue'].agg('mean')
MeanRevenueByMonth.plot(figsize=(15,6), kind='bar')
plt.xlabel('Month')
plt.ylabel('Revenue')
plt.title('Mean Revenue By Month')
plt.xticks(rotation=360)
plt.show()

In [None]:
# Mean revenue by day of month
MeanRevenueByDayOfMonth = train.groupby('release_day')['revenue'].agg('mean')
MeanRevenueByDayOfMonth.plot(figsize=(15,6), kind='bar')
plt.xlabel('Day Of Month')
plt.ylabel('Revenue')
plt.title('Mean Revenue By Day Of Month')
plt.xticks(rotation=360)
plt.show()

In [None]:
# Mean revenue by day of week
MeanRevenueByDayOfWeek = train.groupby('release_dayofweek')['revenue'].agg('mean')
MeanRevenueByDayOfWeek.plot(figsize=(15,6), kind='bar')
plt.xlabel('Day Of Week')
plt.ylabel('Revenue')
plt.title('Mean Revenue By Day Of Week')
plt.gca().set_xticklabels(["Monday","Tuesday","Wednesday","Thursday","Friday","Saturday","Sunday"])
plt.xticks(rotation=360)
plt.show()

### Budget

In [None]:
# Distribution of budget
train['log_budget'] = np.log1p(train['budget'])
fig, ax = plt.subplots(1, 2, figsize=(12, 4))
plt.subplot(1, 2, 1)
sns.distplot(train['budget'], kde=False)
plt.title('Distribution of Budget')
plt.xlabel('Budget')
plt.subplot(1, 2, 2)
sns.distplot(train['log_budget'], bins=30, kde=False)
plt.title('Distribution of Log of Budget')
plt.xlabel('Log of Budget')
plt.show()

In [None]:
# Mean budget by year
MeanBudgetByYear = train.groupby('release_year')['budget'].agg('mean')
MeanBudgetByYear.plot(figsize=(15,6))
plt.xlabel('Year')
plt.ylabel('Budget')
plt.title('Mean Budget By Year')
plt.show()

Like the revenue, the mean budget also increases year by year.

#### Budget vs Revenue

In [None]:
# Relationship between budget and revenue
fig, ax = plt.subplots(1, 2, figsize=(12, 4))
plt.subplot(1, 2, 1)
plt.scatter(x='budget', y='revenue', data=train)
plt.xlabel('Budget')
plt.ylabel('Revenue')
plt.subplot(1, 2, 2)
plt.scatter(x='log_budget', y='log_revenue', data=train)
plt.xlabel('Log of Budget')
plt.ylabel('Log of Revenue')
plt.show()

There is an obvious linear relationship between budget and revenue.

### Popularity

In [None]:
# Distribution of popularity
train['log_popularity'] = np.log1p(train['popularity'])
fig, ax = plt.subplots(1, 2, figsize=(12, 4))
plt.subplot(1, 2, 1)
sns.distplot(train['popularity'], kde=False)
plt.title('Distribution of Popularity')
plt.xlabel('Popularity')
plt.subplot(1, 2, 2)
sns.distplot(train['log_popularity'], bins=30, kde=False)
plt.title('Distribution of Log of Popularity')
plt.xlabel('Log of Popularity')
plt.show()

In [None]:
# Relationship between popularity and revenue
fig, ax = plt.subplots(1, 2, figsize=(12, 4))
plt.subplot(1, 2, 1)
plt.scatter(x='popularity', y='revenue', data=train)
plt.xlabel('Popularity')
plt.ylabel('Revenue')
plt.subplot(1, 2, 2)
plt.scatter(x='popularity', y='log_revenue', data=train)
plt.xlabel('Popularity')
plt.ylabel('Log of Revenue')
plt.show()

Popularity doesn't have an obvious relationship with revenue.

### Runtime

In [None]:
# Distribution of runtime
sns.distplot(train['runtime'], kde=False)
plt.title('Distribution of Runtime')
plt.xlabel('Runtime')
plt.show()

In [None]:
# Mean runtime by year
MeanRuntimeByYear = train.groupby('release_year')['runtime'].agg('mean')
MeanRuntimeByYear.plot(figsize=(15,6))
plt.xlabel('Year')
plt.ylabel('Runtime')
plt.title('Mean Runtime By Year')
plt.show()

As the film industry has developed and matured, the mean runtime has settled into a narrow range.  
This change has been especially obvious since the 1980s.

In [None]:
# Relationship between runtime and revenue
fig, ax = plt.subplots(1, 2, figsize=(12, 4))
plt.subplot(1, 2, 1)
plt.scatter(x='runtime', y='revenue', data=train)
plt.xlabel('Runtime')
plt.ylabel('Revenue')
plt.subplot(1, 2, 2)
plt.scatter(x='runtime', y='log_revenue', data=train)
plt.xlabel('Runtime')
plt.ylabel('Log of Revenue')
plt.show()

Runtime doesn't have an obvious relationship with revenue.

## Feature Engineering

Show relationships between numeric variables.

In [None]:
num_vars = ['revenue','budget','popularity','runtime','release_year','release_month','release_day','release_dayofweek']
# Compute the correlation matrix once and reuse it for both the mask and the plot
corr = train[num_vars].corr()
# Boolean mask hiding the upper triangle (diagonal included). The builtin `bool` is used
# because the `np.bool` alias no longer exists in current NumPy releases.
mask = np.triu(np.ones(corr.shape, dtype=bool))
f,ax = plt.subplots(figsize=(10, 10))
cmap = sns.diverging_palette(220, 10, as_cmap=True)
sns.heatmap(corr, annot=True, mask=mask, cmap=cmap, vmax=.3, center=0,
            square=True, linewidths=.5, cbar_kws={"shrink": .5})
plt.show()

There are strong correlations between budget, popularity, runtime and release year, so we will create some interaction features from them later.

In [None]:
def _safe_ratio(numerator, denominator):
    """Divide two aligned Series, mapping inf, -inf and NaN results to 0."""
    return (numerator / denominator).replace([np.inf, -np.inf, np.nan], 0)


def prepare_data(df):
    """Add the engineered features to `df` in place and return it.

    Columns read: belongs_to_collection, homepage, status, original_title, title,
    overview, tagline, cast, crew, budget, popularity, runtime, release_year.
    `belongs_to_collection`, `cast` and `crew` must already be parsed into Python
    containers (each cast/crew entry is indexed by the key 'gender').

    Columns added: has_collection, has_homepage, is_released, letter/word counts for
    original_title, title, overview and tagline, has_tagline, per-gender counts for cast
    and crew, log_budget, log_popularity, and ratio features relating budget, popularity
    and runtime to each other, to release_year and to the per-year mean.

    Every ratio feature maps non-finite results (division by zero, 0/0, missing operand)
    to 0, so no inf value survives. Finally every remaining missing value in the frame is
    filled with 0.0.

    Note: the per-year means are computed on the frame passed in, so train and test each
    use their own yearly means.
    """

    # belongs to collection: number of entries in the parsed value (0 when empty)
    df['has_collection'] = df['belongs_to_collection'].apply(lambda x: len(x) if x != {} else 0)

    # homepage
    df['has_homepage'] = 1 - df['homepage'].isna()

    # status
    df['is_released'] = (df['status'] == 'Released')*1

    # original title, title, overview: character and whitespace-separated word counts
    for text_col in ['original_title', 'title', 'overview']:
        df[text_col + '_letter_count'] = df[text_col].str.len()
        df[text_col + '_word_count'] = df[text_col].str.split().str.len()

    # tagline
    df['has_tagline'] = 1 - df['tagline'].isna()
    df['tagline_letter_count'] = df['tagline'].str.len()
    df['tagline_word_count'] = df['tagline'].str.split().str.len()

    # number of cast / crew entries per value of the 'gender' field (0, 1, 2)
    for role in ['cast', 'crew']:
        for gender in (0, 1, 2):
            df['genders_%d_%s' % (gender, role)] = df[role].apply(
                lambda x, g=gender: sum(1 for i in x if i['gender'] == g))

    # log
    df['log_budget'] = np.log1p(df['budget'])
    df['log_popularity'] = np.log1p(df['popularity'])

    # create new features about budget
    df['ratio_budget_runtime'] = _safe_ratio(df['log_budget'], df['runtime'])
    df['ratio_budget_popularity'] = _safe_ratio(df['log_budget'], df['log_popularity'])
    df['ratio_budget_year'] = _safe_ratio(df['log_budget'], df['release_year'])
    df['budget_to_mean_year'] = _safe_ratio(
        df['log_budget'], df.groupby('release_year')['log_budget'].transform('mean'))

    # create new features about popularity
    df['ratio_popularity_year'] = _safe_ratio(df['log_popularity'], df['release_year'])
    df['popularity_to_mean_year'] = _safe_ratio(
        df['log_popularity'], df.groupby('release_year')['log_popularity'].transform('mean'))

    # create new features about runtime
    df['ratio_runtime_year'] = _safe_ratio(df['runtime'], df['release_year'])
    df['runtime_to_mean_year'] = _safe_ratio(
        df['runtime'], df.groupby('release_year')['runtime'].transform('mean'))

    # fill in null values (text counts of missing strings, missing runtime, ...)
    df.fillna(value=0.0, inplace = True)

    return df

In [None]:
train = prepare_data(train)
test = prepare_data(test)

In [None]:
# Process dist columns, create dummy variables for values with high frequency

def dist_processing(train, test, col, key, min_count=10): # cast character, crew job/department?
    """Encode a list-of-dicts column as a count plus indicator columns.

    Every cell of ``col`` is expected to be an iterable of dicts, or ``{}``
    when there are no entries. The value stored under ``key`` identifies an
    entry. The set of indicator columns is decided from ``train`` only and
    then applied to both frames, so both end up with the same new columns.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames containing ``col``. They are not modified; new frames are
        returned.
    col : str
        Name of the list-of-dicts column to encode.
    key : str
        Dict key whose value identifies an entry (e.g. ``'name'``).
    min_count : int, default 10
        A value gets its own indicator column only when it occurs more than
        ``min_count`` times in ``train``.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train, test)`` without ``col`` and with ``'num_' + col`` (number of
        entries) and one 0/1 column ``col + '_' + value`` per frequent value.
    """

    def extract(cell):
        # {} is the placeholder for "no entries"
        return [item[key] for item in cell] if cell != {} else []

    train_values = train[col].apply(extract)
    test_values = test[col].apply(extract)

    # Frequent values are determined on the training set only
    value_count = Counter(v for values in train_values for v in values).most_common()
    top_list = [value for value, freq in value_count if freq > min_count]

    def encode(df, values):
        out = df.drop(col, axis=1)  # new frame, so the caller's frame stays untouched
        out['num_' + col] = values.apply(len)
        members = values.apply(lambda v: set(v))
        for p in top_list:
            # Exact membership: a substring test on the joined names would also
            # flag e.g. 'Ann' for a row that only contains 'Anna'.
            out[col + '_' + p] = members.apply(lambda s: 1 if p in s else 0)
        return out

    return encode(train, train_values), encode(test, test_values)
    
# Columns whose entries are identified by their 'name' field
text_cols = ['genres', 'production_companies', 'production_countries', 'Keywords', 'cast', 'crew']

for col in text_cols:
    train, test = dist_processing(train, test, col, 'name')

# Spoken languages are identified by their 'iso_639_1' code instead of a name
train, test = dist_processing(train, test, 'spoken_languages', 'iso_639_1')

# original language: one indicator column per language that appears more than
# 10 times in the training set
value_count = Counter(train['original_language']).most_common()
top_list = [m[0] for m in value_count if m[1] > 10]

for p in top_list:
    # Vectorised exact comparison; a substring test (`p in x`) would match
    # partial codes and raise on non-string cells.
    train['original_language_' + p] = (train['original_language'] == p).astype(int)
    test['original_language_' + p] = (test['original_language'] == p).astype(int)

In [None]:
# Delete useless columns
# Columns removed from both sets
shared_drop_cols = ['id', 'belongs_to_collection', 'homepage', 'status', 'original_language',
                    'original_title', 'title', 'overview', 'tagline', 'imdb_id', 'poster_path',
                    'budget', 'popularity']
# 'revenue' and 'is_title_different' are additionally removed from the training set
train = train.drop(shared_drop_cols + ['revenue', 'is_title_different'], axis=1)
test = test.drop(shared_drop_cols, axis=1)

In [None]:
# Remove non-ASCII characters in feature names (for lightGBM)
# encode(..., "ignore") silently drops every character that has no ASCII form
newnames = [col.encode("ascii", "ignore").decode() for col in train.columns.values]
train.columns = newnames

newnames = [col.encode("ascii", "ignore").decode() for col in test.columns.values]
test.columns = newnames

In [None]:
# Check the number of columns in two data sets
# (rows, columns) of the training set on the first line, of the test set on the second
print(train.shape, test.shape, sep='\n')

## Modeling and Prediction

In [None]:
# Target is the log_revenue column; every other column is a feature
y = train['log_revenue']
X = train.drop(columns='log_revenue')

# Make the order of features consistent (for XGBoost):
# keep only the training feature columns in the test set, in the same order
X_col = X.columns.tolist()
test = test.loc[:, X_col]

# Hold out 10% of the rows for validation
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y,
    test_size=0.1,
    random_state=22,
)

In [None]:
# Use 10-fold cross-validation
n_fold = 10
kfold = KFold(n_splits=n_fold, shuffle=True, random_state=22)
# Materialise the (train indices, validation indices) pairs once so they can be iterated repeatedly
folds = list(kfold.split(X))

**Note: The next two parts are feature analysis and parameter search before formal modeling.**

### Relevance of Important Independent Features
Before creating certain interactions in feature engineering, run this part to check the correlation.    
***Important: This part does not need to be run for the final modeling and predictions.***

In [None]:
# Train a LGB model
# (trailing comments keep the alternative values noted by the author)
params = dict(
    objective='regression',
    num_leaves=30,
    min_data_in_leaf=20,
    max_depth=5,  # 4
    learning_rate=0.005,
    boosting="gbdt",
    feature_fraction=0.9,  # 0.7
    bagging_freq=1,
    bagging_fraction=0.9,
    bagging_seed=22,
    metric='rmse',
    lambda_l1=0.2,
    verbosity=-1,
)
model = lgb.LGBMRegressor(**params, n_estimators = 100000, nthread = 4, n_jobs = -1)

# Evaluate on both the training split and the hold-out split while fitting
eval_pairs = [(X_train, y_train), (X_valid, y_valid)]
model.fit(
    X_train,
    y_train,
    eval_set=eval_pairs,
    eval_metric='rmse',
    early_stopping_rounds=200,
    verbose=False,
)

In [None]:
# Explain the model
# The explainer is built from the fitted LGB model with X_train passed as its
# second argument; SHAP values are then computed for the same X_train rows.
explainer = shap.TreeExplainer(model, X_train)
shap_values = explainer.shap_values(X_train)

In [None]:
# Create a summary plot
# Plots the SHAP values held in `shap_values` against the X_train features
shap.summary_plot(shap_values, X_train)

In [None]:
# Create dependence plot for important features
# "Important" here means the ten features whose SHAP values have the largest
# standard deviation across the X_train rows.
shap_spread = shap_values.std(0)
ranking = np.argsort(shap_spread)[::-1]
top_cols = X_train.columns[ranking][:10]
for col in top_cols:
    shap.dependence_plot(col, shap_values, X_train)

Many important features still show obvious linear or nonlinear relationships, so more interaction features are created below.

In [None]:
def new_interactions(df):
    """Add ratio-style interaction features between important columns.

    The frame is modified in place and also returned. It must already contain
    ``log_budget``, ``log_popularity``, ``release_year``, ``num_crew``,
    ``genders_2_crew``, ``budget_to_mean_year``, ``popularity_to_mean_year``,
    ``ratio_popularity_year`` and ``ratio_budget_runtime``.

    Divisions by zero (e.g. a zero ``log_budget``) would leave +/-inf in the
    new columns, which ``fillna`` does not touch. Those values are mapped to 0
    together with NaN, matching how ``ratio_budget_runtime`` is cleaned when it
    is created.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature frame to extend.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with nine new columns and no NaN left anywhere.
    """

    # Create new interactions for important features.
    # Every new column depends only on pre-existing columns, so they can all be
    # computed up front; dict order keeps the original column order.
    year = df['release_year']
    interactions = {
        'ratio_budget_year2': df['log_budget'].fillna(0) / (year * year),
        'ratio_year_budget': year / df['log_budget'],
        'popularity_to_budget_to_mean_year': df['popularity_to_mean_year'] / df['budget_to_mean_year'],
        'genders_2_crew_to_budget_to_mean_year': df['genders_2_crew'] / df['budget_to_mean_year'],
        'num_crew_to_ratio_popularity_year': df['num_crew'] / df['ratio_popularity_year'],
        'popularity_runtime_to_budget': df['log_popularity'] / df['ratio_budget_runtime'],
        'mean_budget_to_year': df['budget_to_mean_year'] / year,
        'budget_to_runtime_to_year': df['ratio_budget_runtime'] / year,
        'ratio_year_popularity': year / df['log_popularity'],
    }

    for name, values in interactions.items():
        # x / 0 gives +/-inf; turn it into NaN so the fill below resets it to 0
        df[name] = values.replace([np.inf, -np.inf], np.nan)

    # Same whole-frame fill as before (also covers 0 / 0 results)
    df.fillna(value=0.0, inplace = True)

    return df

# Add the interaction features to the training features and to the test set
X = new_interactions(X)
test = new_interactions(test)
# Split again with the same seed so that X_train / X_valid contain the new columns
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.1, random_state=22)

### Find Optimal Parameter
I referenced a few notebooks and used grid search to find the optimal parameter set.      
***Important: This part does not need to be run for the final modeling and predictions.***

In [None]:
# Grid Search for LGB
# The search is only needed while tuning, so it is skipped by default.
# Set the flag to True to run it again.
RUN_LGB_GRID_SEARCH = False

if RUN_LGB_GRID_SEARCH:
    lg = lgb.LGBMRegressor()
    param_dist = {
        'learning_rate': [0.01,0.005],
        'boosting_type': ['gbdt'],
        'max_depth': [5,7,9],
        'num_leaves': [25,30,35,40],
        'min_data_in_leaf': [10,20,25],
        'feature_fraction': [0.7,0.8,0.9],
        'bagging_freq': [1],
        'bagging_fraction': [0.7,0.8,0.9],
        'lambda_l1': [0.2],
        'objective': ['regression'],
        'random_state': [22]
    }
    start = time.time()
    lg_search = GridSearchCV(lg, param_grid=param_dist, cv = 3, scoring='neg_mean_squared_error', verbose=5, n_jobs=-1)
    lg_search.fit(X_train,y_train)
    end = time.time()
    print('Time elapsed: {0:.2f} m'.format((end-start)/60))
    # Printed explicitly: a bare expression inside an `if` block is not displayed
    print(lg_search.best_params_)

In [None]:
# Grid Search for XGB
# The search is only needed while tuning, so it is skipped by default.
# Set the flag to True to run it again.
RUN_XGB_GRID_SEARCH = False

if RUN_XGB_GRID_SEARCH:
    xg = xgb.XGBRegressor()
    param_dist = {
        'objective': ['reg:squarederror'],
        'eta': [0.01],
        'gamma': [0,1,1.45],
        'max_depth': [5,6,7],
        'min_child_weight': [1,3],
        'subsample': [0.6,0.7,0.8],
        'colsample_bytree': [0.6,0.7,0.8],
        'colsample_bylevel': [0.5,1],
        'seed': [22]
    }
    start = time.time()
    xg_search = GridSearchCV(xg, param_grid=param_dist, cv = 3, scoring='neg_mean_squared_error', verbose=10, n_jobs=-1)
    xg_search.fit(X_train, y_train)
    end = time.time()
    print('Time elapsed: {0:.2f} m'.format((end-start)/60))
    # Printed explicitly: a bare expression inside an `if` block is not displayed
    print(xg_search.best_params_)

In [None]:
# Grid Search for CAT
# The search is only needed while tuning, so it is skipped by default.
# Set the flag to True to run it again.
RUN_CAT_GRID_SEARCH = False

if RUN_CAT_GRID_SEARCH:
    ct = CatBoostRegressor()
    params = {
        'learning_rate': [0.002, 0.004, 0.01],
        'depth': [5,6,7],
        'l2_leaf_reg': [1,3,4,9],#default=3
        'colsample_bylevel': [0.7,0.8],
        'bagging_temperature': [0.2],
        'random_seed': [22]
    }
    start = time.time()
    ct_search = GridSearchCV(ct, params, scoring='neg_mean_squared_error', cv = 3)
    ct_search.fit(X_train, y_train)
    end = time.time()
    print('Time elapsed: {0:.2f} m'.format((end-start)/60))
    # Printed explicitly: a bare expression inside an `if` block is not displayed
    print(ct_search.best_params_)

### Build Models and Functions

This function returns the test-set predictions averaged over the cross-validation folds and, optionally, plots the feature importance.

In [None]:
def models(X_train, X_valid, y_train, y_valid, test, model_type, params, plot_feature_importance=False):
    """Train one model per cross-validation fold and average the test predictions.

    Cross-validation runs over the module-level ``X``, ``y`` and ``folds``
    (a list of positional ``(train_index, valid_index)`` pairs). The
    ``X_train``, ``X_valid``, ``y_train`` and ``y_valid`` arguments are kept
    so existing calls keep working, but they are not used: each fold builds
    its own split from ``X`` / ``y``.

    Parameters
    ----------
    X_train, X_valid, y_train, y_valid
        Unused; accepted for backward compatibility.
    test : pandas.DataFrame
        Rows to predict, with the same feature columns as ``X``.
    model_type : {'lgb', 'xgb', 'cat'}
        Which library to train with.
    params : dict
        Keyword arguments forwarded to the model constructor.
    plot_feature_importance : bool, default False
        When True, draw a bar plot of the 50 features with the highest mean
        importance over the folds.

    Returns
    -------
    numpy.ndarray
        Mean over all folds of the predictions for ``test``.

    Raises
    ------
    ValueError
        If ``model_type`` is not one of the supported values.
    """

    # Fail before any training instead of hitting an unbound variable later
    if model_type not in ('lgb', 'xgb', 'cat'):
        raise ValueError("model_type must be 'lgb', 'xgb' or 'cat', got {!r}".format(model_type))

    n_splits = len(folds)
    pred = np.zeros(test.shape[0])
    scores = []
    fold_importances = []

    for fold_n, (train_index, valid_index) in enumerate(folds):

        print('Fold', fold_n, 'started at', time.ctime())
        # KFold.split yields row positions, so select by position (.iloc);
        # label-based selection is only equivalent for a default RangeIndex.
        X_fold_train = X.iloc[train_index]
        y_fold_train = y.iloc[train_index]
        X_fold_valid = X.iloc[valid_index]
        y_fold_valid = y.iloc[valid_index]

        # lightGBM model

        if model_type == 'lgb':

            model = lgb.LGBMRegressor(**params, n_estimators = 100000, nthread = 4, n_jobs = -1)
            model.fit(X_fold_train, y_fold_train,
                      eval_set=[(X_fold_train, y_fold_train), (X_fold_valid, y_fold_valid)],
                      eval_metric='rmse',
                      verbose=False,
                      early_stopping_rounds=200)

            y_pred_valid = model.predict(X_fold_valid, num_iteration = model.best_iteration_)
            y_pred = model.predict(test, num_iteration = model.best_iteration_)

        # XGBoost model

        elif model_type == 'xgb':

            model = xgb.XGBRegressor(**params, n_estimators = 10000)
            model.fit(X_fold_train, y_fold_train,
                    eval_set=[(X_fold_train, y_fold_train), (X_fold_valid, y_fold_valid)],
                    eval_metric='rmse',
                    verbose=False,
                    early_stopping_rounds=200)

            y_pred_valid = model.predict(X_fold_valid, ntree_limit = model.best_ntree_limit)
            y_pred = model.predict(test, ntree_limit = model.best_ntree_limit)

        # CatBoost model

        else:

            model = CatBoostRegressor(**params, iterations=20000, eval_metric = 'RMSE')
            model.fit(X_fold_train, y_fold_train,
                      eval_set=(X_fold_valid, y_fold_valid),
                      use_best_model=True,
                      verbose=False)

            y_pred_valid = model.predict(X_fold_valid)
            y_pred = model.predict(test)

        # Record scores

        scores.append(mean_squared_error(y_fold_valid, y_pred_valid) ** 0.5) # RMSE for valid
        pred += y_pred

        # Feature importance of this fold; all folds are concatenated once after the loop

        fold_importances.append(pd.DataFrame({'feature': X.columns,
                                              'importance': model.feature_importances_}))

    pred /= n_splits
    print('Mean RMSE: {0:.5f}, std: {1:.5f}.'.format(np.mean(scores), np.std(scores)))

    # Plot the importance/weight of features

    if plot_feature_importance:
        feature_importance = pd.concat(fold_importances, axis=0)
        # The per-fold values are not divided by the number of folds here: the
        # groupby mean and the bar plot already average over the folds, so an
        # extra division would shrink the plotted values by that factor.
        cols = feature_importance[['feature', 'importance']].groupby('feature').mean().sort_values(
            by='importance', ascending=False)[:50].index
        best_features = feature_importance.loc[feature_importance.feature.isin(cols)]

        plt.figure(figsize=(16, 12))
        sns.barplot(x='importance', y='feature', data=best_features.sort_values(by='importance', ascending=False))
        plt.title('Important Features (avg over folds)')

    return pred

### Training and Prediction

#### Training with LightGBM

In [None]:
# Best parameters by grid search
lgb_params = dict(
    objective='regression',
    num_leaves=40,
    min_data_in_leaf=10,
    max_depth=7,
    learning_rate=0.01,
    boosting="gbdt",
    feature_fraction=0.9,
    bagging_freq=1,
    bagging_fraction=0.7,
    bagging_seed=22,
    metric='rmse',
    lambda_l1=0.2,
    verbosity=-1,
)

# Time the cross-validated training and keep the averaged test predictions
start = time.time()
lgb_pred = models(X_train, X_valid, y_train, y_valid, test, 'lgb', lgb_params,
                  plot_feature_importance=True)
end = time.time()
print('Time elapsed: {0:.2f} m'.format((end-start)/60))

#### Training with XGBoost

In [None]:
# Best parameters by grid search
xgb_params = dict(
    objective='reg:squarederror',
    eta=0.01,
    max_depth=5,
    min_child_weight=3,
    subsample=0.8,
    colsample_bytree=0.7,  # 0.8
    colsample_bylevel=0.5,
    # gamma left at its default (0)
    eval_metric='rmse',
    seed=22,
)

# Time the cross-validated training and keep the averaged test predictions
start = time.time()
xgb_pred = models(X_train, X_valid, y_train, y_valid, test, 'xgb', xgb_params,
                  plot_feature_importance=True)
end = time.time()
print('Time elapsed: {0:.2f} m'.format((end-start)/60))

#### Training with CatBoost

In [None]:
# Best parameters by grid search
cat_params = dict(
    learning_rate=0.01,
    depth=7,
    colsample_bylevel=0.7,
    bagging_temperature=0.2,
    l2_leaf_reg=1,  # add new
    random_seed=22,
    allow_writing_files=False,
    early_stopping_rounds=200,
)

# Time the cross-validated training and keep the averaged test predictions
start = time.time()
cat_pred = models(X_train, X_valid, y_train, y_valid, test, 'cat', cat_params,
                  plot_feature_importance=True)
end = time.time()
print('Time elapsed: {0:.2f} m'.format((end-start)/60))

### Blending and Submitting

How are the blending weights chosen?  
Models with a smaller mean RMSE get a larger weight.

In [None]:
# Blending
# Weighted average of the three models' predictions (weights 0.4 / 0.2 / 0.4).
# The predictions are still on the scale of the log_revenue target at this point.
test_pred = lgb_pred*0.4 + xgb_pred*0.2 + cat_pred*0.4

In [None]:
# Submit
# Take the ids from the sample submission; the predictions are attached by position
sub = pd.read_csv('../input/tmdb-box-office-prediction/sample_submission.csv')
df_sub = pd.DataFrame({
    'id': sub['id'],
    # expm1 maps the blended log_revenue predictions back to revenue
    # (assumes the target was built with log1p outside this section)
    'revenue': np.expm1(test_pred),
})
df_sub.to_csv("submission.csv", index=False)