# Project 2

<b>From Kaggle:</b>
<p> https://www.kaggle.com/c/tmdb-box-office-prediction </p>

<b>Goal:</b> Predicting Movie revenue from given dataset

### Step:
<ol>
    <li>Load Data</li>
    <li>Treat missing values</li>
    <li>Visualization</li>
    <li>Build Model</li>
</ol>

## 1. Load Data

In [None]:
import pandas as pd

train = pd.read_csv('/kaggle/input/tmdb-box-office-prediction/train.csv')

print(train.shape)
train.head()

In [None]:
test = pd.read_csv('/kaggle/input/tmdb-box-office-prediction/test.csv')
print(test.shape)

In [None]:
# Stack train on top of test under a fresh 0..n-1 index so that both
# datasets go through exactly the same preprocessing steps.
data = pd.concat([train, test], sort=False, ignore_index=True)
print(data.shape)

Some columns hold strings in dictionary format.<br>
Let's parse those strings into Python objects

In [None]:
import ast

dict_columns = ['belongs_to_collection','genres','production_companies','production_countries','spoken_languages','Keywords','cast','crew']

def get_dict(item):
    """Parse a stringified Python literal taken from a raw CSV cell.

    Parameters
    ----------
    item : object
        Cell value. Usually a string holding the repr of a list of dicts;
        missing cells (NaN/None) and malformed strings are tolerated.

    Returns
    -------
    object
        The evaluated literal, or an empty dict when ``item`` cannot be
        parsed.
    """
    try:
        return ast.literal_eval(item)
    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
        # These are the errors ``ast.literal_eval`` documents for malformed
        # input. Anything else (e.g. KeyboardInterrupt) must propagate
        # instead of being silently turned into an empty value.
        return {}

# Turn every stringified-literal column of the combined frame into real
# Python objects; cells that cannot be parsed become an empty dict.
for col in dict_columns:
    data[col] = data[col].apply(get_dict)

## 2. Treat missing values

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

train_null_pct = train.isnull().sum().sort_values() / len(train)
test_null_pct = test.isnull().sum().sort_values() / len(test)

fig, ax = plt.subplots(1,2, figsize=(10,8), sharey=False)
fig.subplots_adjust(wspace=0.8)
ax[0].barh(train_null_pct.index, train_null_pct)
ax[0].set_title('Train dataset Null')
ax[0].set_xlabel('Null proportion')
ax[1].barh(test_null_pct.index, test_null_pct)
ax[1].set_title('Test dataset Null')
ax[1].set_xlabel('Null proportion')

print(data.isnull().sum())

<p>There are missing values in 'homepage', 'tagline', 'overview', 'poster_path', 'release_date', 'runtime', 'status', 'title'</p>

#### homepage
Create new column 'has_homepage', and put 1 if homepage is available, else 0<br>
And drop 'homepage' column 

In [None]:
# Flag movies that list a homepage (1) versus none (0), then discard the
# raw URL column, which carries no further information for the model.
data['has_homepage'] = data['homepage'].notnull().astype(int)
data = data.drop(columns='homepage')
data.shape

#### tagline

Fill empty string('') in missing values

In [None]:
data['tagline'] = data['tagline'].fillna('')

#### overview
Fill empty string('') in missing values

In [None]:
data['overview'] = data['overview'].fillna('')

#### poster_path
Drop the 'poster_path' column. We won't analyze pictures

In [None]:
data = data.drop('poster_path', axis=1)
data.shape

#### release_date
Since we have only 1 missing value in 'release_date' column, <br>
We can search on the internet

In [None]:
# Show the title of the movie whose release date is missing so it can be
# looked up by hand.
data.loc[data['release_date'].isnull(), 'title']
# Title: "Jails, Hospitals & Hip-Hop" -- released in May 2000.
# Keep the column's own two-digit-year 'mm/dd/yy' layout: a four-digit year
# would be read as 1900 + 2000 = 3900 by the year parsing further down.
data.loc[data['release_date'].isnull(), 'release_date'] = '05/01/00'

#### title
There are only 3 missing values on 'title' column.<br>
However, the 'original_title' is available, so we can search the english title on the internet

In [None]:
data.loc[data['title'].isnull(), ['id','original_title']]
data.loc[data['id']==5399, 'title'] = 'The Life of Guskou Budori'  #グスコーブドリの伝記
data.loc[data['id']==5426, 'title'] = ''  #La Vérité si je Mens ! 3  # couldn't find english title
data.loc[data['id']==6629, 'title'] = 'Barefoot'  #Barefoot

#### runtime
Fill missing values in the 'runtime' column with the median.<br>
Also replace a runtime of 0 with the median

In [None]:
# Compute the median once, from genuine runtimes only: NaN and the
# placeholder value 0 are excluded so they cannot drag the median down
# (NaN > 0 evaluates to False, so missing rows drop out of the mask too).
runtime_median = data.loc[data['runtime'] > 0, 'runtime'].median()
# Missing and zero runtimes both receive that same median.
needs_runtime = data['runtime'].isnull() | (data['runtime'] == 0)
data.loc[needs_runtime, 'runtime'] = runtime_median

#### status

In [None]:
print(data['status'].isnull().sum())
print(train['status'].value_counts())
print(test['status'].value_counts())

The 'status' column has three different categorical values ('Released', 'Rumored', 'Post Production').<br>
However, only the test dataset has 'Post Production', and only a few movies have a status other than 'Released'.<br>
So it does not seem to be useful. Drop the column

In [None]:
data = data.drop('status', axis=1)
data.shape

## 3. Visualization

Since Movie data has many unique values, <br>
it is necessary to be transformed in representative values or categorical variables

#### revenue
Since revenue has big range, it is reasonable to take log on 'revenue'

In [None]:
data['revenue'].min()

In [None]:
import numpy as np

# log(1 + revenue) compresses the long right tail of revenue; log1p maps a
# revenue of 0 to 0 instead of -inf. Computed once and reused for the plot.
data['log_revenue'] = np.log1p(data['revenue'])

# Left: raw revenue distribution. Right: the same values on the log scale.
fig, ax = plt.subplots(1, 2, figsize=(10, 5))
ax[0].hist(data['revenue'])
ax[0].set_title('revenue')
ax[1].hist(data['log_revenue'])
ax[1].set_title('log_revenue')
plt.show()

#### belongs_to_collection

If it belongs to the collection, put 1 else 0

In [None]:
data['belongs_to_collection'] = data['belongs_to_collection'].apply(lambda x: len(x))

In [None]:
tmp_train = data.iloc[:3000]
isin_collection_rev = tmp_train.loc[tmp_train['belongs_to_collection']==1, 'revenue']
notin_collection_rev = tmp_train.loc[tmp_train['belongs_to_collection']==0, 'revenue']

isin_collection_rev_log = tmp_train.loc[tmp_train['belongs_to_collection']==1, 'log_revenue']
notin_collection_rev_log = tmp_train.loc[tmp_train['belongs_to_collection']==0, 'log_revenue']

fig, ax = plt.subplots(1,2, figsize=(12,5))
ax[0].boxplot([isin_collection_rev_log, notin_collection_rev_log])
ax[0].set_title('log_revenue')
ax[0].set_xticklabels(['in_collection', 'not_in_collection'])
ax[1].boxplot([isin_collection_rev, notin_collection_rev])
ax[1].set_title('revenue')
ax[1].set_xticklabels(['in_collection', 'not_in_collection'])
plt.show()

Movies that belong to a collection have relatively higher revenue than those that do not

#### budget

Some movies have a budget of '0', presumably by mistake.<br>
However, this feature is certainly a great predictor of the revenue<br>

In [None]:
plt.hist(data['budget'])
plt.xlabel('Budget')
plt.ylabel('Counts')

'budget' has a large scale, so it is also reasonable to take the log of 'budget'

In [None]:
data['log_budget'] = np.log(data['budget']+1)  # add 1 to avoid log(0)
fig, ax = plt.subplots(1,2, figsize=(10,5))
ax[0].scatter(data['budget'], data['revenue'], alpha=0.1)
ax[0].set_title('budget - revenue')
ax[0].set_xlabel('budget')
ax[0].set_ylabel('revenue')
ax[1].scatter(data['log_budget'], data['log_revenue'], alpha=0.1)
ax[1].set_title('log_budget - log_revenue')
ax[1].set_xlabel('log_budget')
ax[1].set_ylabel('log_revenue')

It looks like there is some relationship between 'budget' and 'revenue'

#### genres

In [None]:
gen_cnt = data['genres'].apply(lambda x: len(x)).value_counts()
plt.bar(gen_cnt.index, gen_cnt)
plt.xticks(range(gen_cnt.index.max()+1))
plt.title('Number of genres that movies in')
plt.xlabel('Number of genres')
plt.ylabel('Movie counts')
plt.show()

Most of movies are in 1~4 genres

In [None]:
import collections
from wordcloud import WordCloud

# Flat list of every genre name seen so far (duplicates kept); it is filled
# by gen_list as a side effect.
total_gen_list = []


def gen_list(x):
    """Append the 'name' of each genre entry in ``x`` to ``total_gen_list``."""
    total_gen_list.extend(entry['name'] for entry in x)

        
fig, ax = plt.subplots(1,2, figsize=(20,7))
data['genres'].apply(lambda x: gen_list(x))
gen_cnt = collections.Counter(total_gen_list).most_common()
for gen, cnt in gen_cnt[::-1]:
    ax[0].barh(gen,cnt)
ax[0].set_title('Genre Frequencies')

wordcloud = WordCloud(background_color='white', width=800, height=500).generate_from_frequencies(dict(gen_cnt))
ax[1].axis('off')
ax[1].imshow(wordcloud, interpolation = 'bilinear')
ax[1].set_title('Genre majorities')
plt.show()

There are total 20 different genres for these movies.<br>
Make dummy variables for each genres

In [None]:
data['genres_list'] = data['genres'].apply(lambda x: [i['name'] for i in x])

for gen in dict(gen_cnt).keys():
    data['genre_'+gen] = data['genres_list'].apply(lambda x: 1 if gen in x else 0)

In [None]:
tmp_train = data[:3000]
plt.figure(figsize=(6,6))
for idx, gen in enumerate(dict(gen_cnt).keys()):
    plt.boxplot(tmp_train.loc[tmp_train['genre_'+gen]==1,'log_revenue'], labels=[gen], positions=range(idx, idx+1), vert=False)
plt.xlabel('log_revenue')

In [None]:
data = data.drop(['genres', 'genres_list'], axis=1)

#### imdb_id
This is unique id.<br>
Drop the column

In [None]:
data = data.drop('imdb_id', axis=1)

#### release_date
'release_date' column has 'mm/dd/yy' data<br>

In [None]:
# Split 'mm/dd/yy' once and reuse the parts for both the year and the month.
date_parts = data['release_date'].str.split('/')


def _to_full_year(raw_year):
    """Expand a two-digit year string; pass a four-digit year through as is."""
    year = int(raw_year)
    if year >= 100:
        # Already a full year (e.g. the manually entered '05/01/2000'), so
        # adding a century would produce nonsense such as 3900.
        return year
    # Two-digit years below 19 are read as 20xx, everything else as 19xx.
    # NOTE(review): a release written as '19' becomes 1919 -- confirm the
    # dataset contains no 2019 releases.
    return 2000 + year if year < 19 else 1900 + year


data['year'] = date_parts.apply(lambda parts: _to_full_year(parts[2]))
data['month'] = date_parts.apply(lambda parts: int(parts[0]))
data = data.drop('release_date', axis=1)

In [None]:
fig, ax = plt.subplots(2,2, figsize=(13,10))
ax[0,0].scatter(data['year'], data['revenue'], alpha=0.3, s=20)
ax[0,0].set_title('year - revenue')
ax[0,0].set_xlabel('year')
ax[0,0].set_ylabel('revenue')
ax[0,1].scatter(data['year'], data['log_revenue'], alpha=0.3, s=20)
ax[0,1].set_xlabel('year')
ax[0,1].set_ylabel('log_revenue')
ax[0,1].set_title('year - log_revenue')
ax[1,0].scatter(data['month'], data['revenue'], alpha=0.3, s=20)
ax[1,0].set_xlabel('month')
ax[1,0].set_ylabel('revenue')
ax[1,0].set_title('month - revenue')
ax[1,1].scatter(data['month'], data['log_revenue'], alpha=0.3, s=20)
ax[1,1].set_title('month - log_revenue')
ax[1,1].set_xlabel('month')
ax[1,1].set_ylabel('log_revenue')
plt.show()

It looks like there is some relationship between year & revenue<br><br>
Revenue seems to have some pattern along to the month<br>
Make dummy variables for month

In [None]:
data = pd.get_dummies(data, columns=['month'], drop_first=True)

#### crew
People tend to have high expectation if the director and writer of a movie are popular or well known.<br>
However, the directors and writers are too sparse in our dataset. <br>
It is hard to use it as predictor. We dropped the column.

In [None]:
def find_director(x):
    """Return the name of the last crew entry whose job is 'Director'.

    ``x`` is an iterable of crew dicts carrying 'job' and 'name' keys.
    An empty string is returned when no entry has the job 'Director'
    (including when ``x`` is empty).
    """
    director = ''
    for member in x:
        if member['job'] == 'Director':
            # Keep overwriting so that the last matching entry wins.
            director = member['name']
    return director

def find_writer(x):
    """Return the name of the last crew entry whose job is 'Writer'.

    ``x`` is an iterable of crew dicts carrying 'job' and 'name' keys.
    An empty string is returned when no entry has the job 'Writer'
    (including when ``x`` is empty).
    """
    writer = ''
    for member in x:
        if member['job'] == 'Writer':
            # Keep overwriting so that the last matching entry wins.
            writer = member['name']
    return writer

data['director'] = data['crew'].apply(lambda x: find_director(x))
data['writer'] = data['crew'].apply(lambda x: find_writer(x))


In [None]:
collections.Counter(data['director']).most_common(20)

In [None]:
collections.Counter(data['writer']).most_common(20)

In [None]:
data = data.drop(['director', 'writer', 'crew'], axis = 1)

#### cast
Top actors can affect the revenue.<br>
Select the top 100 actors and create dummy variables.<br>
The number of cast members may also affect the revenue.

In [None]:
data['cast_list'] = data['cast'].apply(lambda x: [i['name'] for i in x])
data['n_cast'] = data['cast'].apply(lambda x: len(x))

In [None]:
# Every cast member name across all movies (duplicates kept); it is filled
# by get_cast_list as a side effect.
total_cast_list = []


def get_cast_list(x):
    """Add each name in ``x`` to the module-level ``total_cast_list``."""
    for name in x:
        total_cast_list.append(name)

data['cast_list'].apply(lambda x: get_cast_list(x))
top_cast = list(dict(collections.Counter(total_cast_list).most_common(100)).keys())

In [None]:
collections.Counter(total_cast_list).most_common(20)

In [None]:
for cast in top_cast:
    data['cast_'+cast] = data['cast_list'].apply(lambda x: 1 if cast in x else 0)

In [None]:
tmp_train = data[:3000]
for idx, cast in enumerate(top_cast[:23][::-1]):
    cast_rev = tmp_train.loc[tmp_train['cast_'+cast]==1, 'log_revenue']
    plt.boxplot(cast_rev, positions=range(idx, idx+1), labels=[cast], vert=False)
plt.xlabel('log_revenue')

In [None]:
data = data.drop(['cast', 'cast_list'], axis=1)

#### original_language & spoken_languages
The language is important for targeting the market.<br><br>
With original_language, create dummy variables,<br>
with spoken_languages, create the number of spoken languages which can be indicator of market expansion

In [None]:
# Keep the 17 most frequent original languages and add one 0/1 indicator
# column per language.
language_counts = data['original_language'].value_counts()
top_langs = dict(language_counts[:17]).keys()

for lang in top_langs:
    data['lang_' + lang] = (data['original_language'] == lang).astype(int)

In [None]:
data['n_spoken_languages'] = data['spoken_languages'].apply(lambda x: len(x))

In [None]:
for lang in list(top_langs)[::-1]:
    plt.barh(lang, data.loc[data['lang_'+lang]==1, 'log_revenue'])
plt.xlabel('log_revenue')
plt.title('original_language')

In [None]:
plt.scatter(data['n_spoken_languages'], data['log_revenue'], alpha=.3)
plt.xlabel('n_spoken_languages')
plt.ylabel('log_revenue')
plt.title('spoken_languages')

In [None]:
data = data.drop(['original_language','spoken_languages','n_spoken_languages'], axis=1)

#### title, tagline, Keywords, overview
Combine all string information and create clusters for dividing topics.

In [None]:
data['keyword_str'] = data['Keywords'].apply(lambda x: ', '.join([i['name'] for i in x]))

In [None]:
data['text'] = data['title'] + '. '+ data['tagline'] + '. ' + data['overview'] + '. ' + data['keyword_str']

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

vect = CountVectorizer(ngram_range=(1,3), stop_words='english')
X = vect.fit_transform(data['text'])
lda = LatentDirichletAllocation(n_components=10, random_state = 0)
document_topics = lda.fit_transform(X)

In [None]:
# Number of top tokens to list for each topic.
n = 8
# Vocabulary learned by CountVectorizer. ``get_feature_names`` was removed in
# scikit-learn 1.2, so prefer ``get_feature_names_out`` and fall back only on
# older releases that do not provide it yet.
if hasattr(vect, 'get_feature_names_out'):
    feature_names = np.asarray(vect.get_feature_names_out())
else:
    feature_names = np.array(vect.get_feature_names())
# Map each topic index to its n highest-weighted tokens.
topics = dict()
for idx, component in enumerate(lda.components_):
    # argsort is ascending, so walk backwards from the end to take the n
    # largest weights in descending order.
    top_n_indices = component.argsort()[:-(n + 1):-1]
    topic_tokens = [feature_names[i] for i in top_n_indices]
    topics[idx] = topic_tokens

topics

In [None]:
plt.figure(figsize=(5,5))
for k, v in collections.Counter(document_topics.argmax(axis=1)).items():
    plt.bar(k,v)
plt.xlabel('Topic clusters')
plt.ylabel('Number of movies')
plt.title('Topic frequencies')

In [None]:
data['topics'] = document_topics.argmax(axis=1)
tmp_train = data[:3000]
plt.figure(figsize=(5,5))
for idx in range(10):
    plt.boxplot(tmp_train.loc[tmp_train['topics']==idx, 'log_revenue'],positions=range(idx, idx+1), labels=[idx])
plt.xlabel('topics')
plt.ylabel('log_revenue')

In [None]:
data = pd.get_dummies(data, columns=['topics'], drop_first=True)

In [None]:
data = data.drop(['original_title','overview','tagline','title','Keywords', 'keyword_str','text'], axis=1)

#### production_companies, production_countries
We expect that the more countries and companies participate in filmmaking, the more revenue will be collected.<br><br>
Some companies are guaranteed to make high quality movies.<br>
Create dummy variables for the top companies.

In [None]:
data['n_production_countries'] = data['production_countries'].apply(lambda x: len(x))
data['n_production_companies'] = data['production_companies'].apply(lambda x: len(x))

In [None]:
# Flat list of every production company name seen (duplicates kept); it is
# filled by get_company_list as a side effect.
company_list = []


def get_company_list(x):
    """Append the 'name' of each company entry in ``x`` to ``company_list``."""
    company_list.extend(entry['name'] for entry in x)
data['production_companies'].apply(lambda x: get_company_list(x))
for company in dict(collections.Counter(company_list).most_common(30)).keys():
    data['production_'+company] = data['production_companies'].apply(lambda x: 1 if company in [i['name'] for i in x] else 0)

In [None]:
data = data.drop(['production_companies', 'production_countries'], axis=1)

In [None]:
fig, ax = plt.subplots(1,2, figsize=(12,5))
ax[0].scatter(data['n_production_countries'], data['log_revenue'], alpha=0.1)
ax[0].set_xlabel('n_production_countries')
ax[0].set_ylabel('log_revenue')
ax[1].scatter(data['n_production_companies'], data['log_revenue'], alpha=0.1)
ax[1].set_xlabel('n_production_companies')
ax[1].set_ylabel('log_revenue')

In [None]:
tmp_train = data[:3000]
plt.figure(figsize=(10,10))
for idx, company in enumerate(dict(collections.Counter(company_list).most_common(30)).keys()):
    com_rev = tmp_train.loc[tmp_train['production_'+company]==1, 'log_revenue']
    plt.boxplot(com_rev, positions=range(idx, idx+1), labels = [company], vert=False)
plt.xlabel('log_revenue')

#### runtime
There would be the most favorable runtime.<br><br>
Cutting the runtime in percentile.<br>

In [None]:
data['runtime_cat'] = pd.qcut(data['runtime'],10, labels=False)

In [None]:
plt.scatter(data['runtime_cat'], data['log_revenue'], alpha=0.1)

In [None]:
# Box plot of log_revenue per runtime bin, restricted to the first 3000 rows
# (the training portion of the concatenated frame).
tmp_train = data[:3000]
for i in range(10):
    # Build the mask from tmp_train itself so the mask and the frame it
    # filters share the same index (it used to come from the full `data`).
    in_bin = tmp_train['runtime_cat'] == i
    plt.boxplot(tmp_train.loc[in_bin, 'log_revenue'], positions=[i])
plt.xlabel('runtime categories')
plt.ylabel('log_revenue')

In [None]:
# One-hot encode the runtime bins (the first bin is dropped as the baseline),
# then discard the raw runtime column the bins were derived from.
data = pd.get_dummies(data, columns=['runtime_cat'], drop_first=True)
data = data.drop(['runtime'], axis=1)

## 4. Model Building
Models:
<ul>
    <li>Multiple Regression</li>
    <li>SVM - SVR</li>
    <li>Random Forest Regression</li>
    <li>NN - MLPRegressor</li>
    <li>SGD Regressor</li>
</ul>

In [None]:
def rmse_score(y1, y2):
    """Return the root-mean-square error between ``y1`` and ``y2``.

    Both arguments must support element-wise subtraction (e.g. NumPy arrays
    or pandas Series).
    """
    residual = y1 - y2
    mean_squared = np.power(residual, 2).mean()
    return np.sqrt(mean_squared)

In [None]:
# Columns that must not be fed to the models: the row identifier, the raw and
# log targets, and the raw budget (its log-transformed version stays).
non_feature_cols = ['id', 'revenue', 'log_revenue', 'budget']

# The first 3000 rows provide features and target for model fitting.
labelled_rows = data[:3000]
X = labelled_rows.drop(non_feature_cols, axis=1)
y = labelled_rows['log_revenue']

# The remaining rows are the ones to predict for the submission.
sub_X = data[3000:].drop(non_feature_cols, axis=1)

In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold, cross_val_score

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state = 0)

#### Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression

LR_model = LinearRegression()
LR_model.fit(X_train,y_train)
y_hat = LR_model.predict(X_test)
print(rmse_score(y_hat, y_test))
sub_y = LR_model.predict(sub_X)

In [None]:
# The target was built as log(revenue + 1), so invert it with expm1
# (exp(y) - 1); a plain exp overshoots every predicted revenue by 1.
sub_csv = pd.DataFrame({'id': data[3000:]['id'], 'revenue': np.expm1(sub_y)})
sub_csv.to_csv('LR_predict.csv', index=False)

## 2.40021

#### SVR

In [None]:
from sklearn.svm import SVR

SVR_model = SVR(C = 5)
SVR_model.fit(X_train, y_train)

y_hat = SVR_model.predict(X_test)
print(rmse_score(y_hat, y_test))
sub_y = SVR_model.predict(sub_X)

In [None]:
# The target was built as log(revenue + 1), so invert it with expm1
# (exp(y) - 1); a plain exp overshoots every predicted revenue by 1.
sub_csv = pd.DataFrame({'id': data[3000:]['id'], 'revenue': np.expm1(sub_y)})
sub_csv.to_csv('SVR_predict.csv', index=False)

## 2.27148   # c=1.0
## 2.21529   # c=5
## 2.21807   # c=10

#### RandomForest Regressor

In [None]:
from sklearn.ensemble import RandomForestRegressor

RF_model = RandomForestRegressor(random_state =0, n_estimators=500, max_depth=10)
RF_model.fit(X_train, y_train)

y_hat = RF_model.predict(X_test)
print(rmse_score(y_hat, y_test))
sub_y = RF_model.predict(sub_X)

In [None]:
# The target was built as log(revenue + 1), so invert it with expm1
# (exp(y) - 1); a plain exp overshoots every predicted revenue by 1.
sub_csv = pd.DataFrame({'id': data[3000:]['id'], 'revenue': np.expm1(sub_y)})
sub_csv.to_csv('RF_predict.csv', index=False)

## 2.20717   #n_esimators=200, max_depth=8
## 2.20054   #n_estimators=500, max_depth=10

#### MLP Regressor

In [None]:
from sklearn.neural_network import MLPRegressor
MLP_model = MLPRegressor(random_state = 0, hidden_layer_sizes=(50,50))
MLP_model.fit(X_train, y_train)

y_hat = MLP_model.predict(X_test)
print(rmse_score(y_hat, y_test))
sub_y = MLP_model.predict(sub_X)

In [None]:
# The target was built as log(revenue + 1), so invert it with expm1
# (exp(y) - 1); a plain exp overshoots every predicted revenue by 1.
sub_csv = pd.DataFrame({'id': data[3000:]['id'], 'revenue': np.expm1(sub_y)})
sub_csv.to_csv('MLP_predict.csv', index=False)

## 2.29088   hidden=(30,500)
## 2.40497   hidden=(100,)
## 2.26413   hidden=(200,30)
## 2.31559   hidden=(50,50)