In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and echo the full path of every file found,
# one path per line.
for dirname, _, filenames in os.walk('/kaggle/input'):
    full_paths = [os.path.join(dirname, filename) for filename in filenames]
    if full_paths:
        print("\n".join(full_paths))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Setup

In [None]:
# Python ≥3.5 is required
import sys
#assert sys.version_info >= (3, 5)

# Scikit-Learn ≥0.20 is required
import sklearn
#assert sklearn.__version__ >= "0.20"

# Common imports
import numpy as np
import os # operator system

# to make this notebook's output stable across runs
np.random.seed(42)

# ======= To plot pretty figures =======

# Make your plot outputs appear and be stored within the notebook.
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt

# Number format
import pandas as pd
pd.options.display.float_format = '{:,.4f}'.format

# Single colour applied to all figure text (general text, axis labels, tick labels).
# NOTE(review): the original comment read "For Black background only" — confirm
# that black text is really the intended choice for the notebook theme in use.
COLOR = 'black'

# One bulk update instead of several rc()/rcParams assignments; the keys and
# values are the same ones the individual calls set.
mpl.rcParams.update({
    'axes.labelsize': 14,
    'xtick.labelsize': 12,
    'ytick.labelsize': 12,
    'text.color': COLOR,
    'axes.labelcolor': COLOR,
    'xtick.color': COLOR,
    'ytick.color': COLOR,
})

# Ignore useless warnings
import warnings
warnings.filterwarnings(action="ignore", message="^internal gelsd")

In [None]:
# Separator placed between the values that InfoExtractor.extract_field pulls out
# of one cell; occurrences of it inside a value are replaced by "_" there.
SPLITTER = " "
#COLUMN_EXCLUDE_PATTERN = "id|revenue|homepage|title|overview|poster_path|tagline"
# Regex passed to DataFrame.filter(regex=...) to pick the columns that are dropped
# from the feature matrix (the label column "revenue" among them).
# NOTE(review): the pattern is not anchored, so presumably every column whose name
# merely contains "id" or "revenue" is dropped as well — confirm that is intended.
COLUMN_EXCLUDE_PATTERN = "id|revenue"

In [None]:
# (column, key) pairs consumed by InfoExtractor: for every pair, the value stored
# under `key` is taken from each dict of the list serialised in `column`.
TEXT_FIELDS = [
    ("belongs_to_collection", "id"),
    ("genres", "name"),
    ("production_companies", "id"),
    ("production_countries", "iso_3166_1"),
    ("spoken_languages", "name"),
    ("Keywords", "name"),
    ("cast", "name"),
    ("crew", "name"),
]

# Alternative (column, key) pairs; in this notebook they are only referenced from
# a commented-out pipeline step.
# NOTE(review): verify that every key listed here really exists in the source
# data before enabling it — a missing key raises KeyError during extraction.
TEXT_FIELDS2 = [
    ("production_companies", "name"),
    ("belongs_to_collection", "name"),
    ("cast", "character"),
    ("cast", "job"),
    ("cast", "profile_path"),
    ("crew", "job"),
    ("crew", "department"),
]

## Get Data

In [None]:
import pandas as pd
data = pd.read_csv("/kaggle/input/tmdb-box-office-prediction/train.csv")
data.head()

In [None]:
data.info()

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt

# plot histogram for all columns
data.hist(bins=50, figsize=(12,9)) # bins is number of groups of values
plt.show()

## Create a Test Set

In [None]:
from sklearn.model_selection import train_test_split
#train_set, test_set = train_test_split(data, test_size=0.2, random_state=42) # use 20% data for testing
train_set = data.copy()
test_set = pd.read_csv("/kaggle/input/tmdb-box-office-prediction/test.csv")

In [None]:
label_column = "revenue"

# Features: every training column whose name does not match COLUMN_EXCLUDE_PATTERN.
excluded_columns = train_set.filter(regex=COLUMN_EXCLUDE_PATTERN).columns
X_train = train_set.copy().drop(columns=excluded_columns)

# Target: an independent copy of the label column.
y_train = train_set[label_column].copy()

In [None]:
X_train.shape

## Data tuning 1

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin

class LimitedColumnsFilter(BaseEstimator, TransformerMixin):
    """Stateless transformer that narrows a DataFrame to a given set of columns.

    Parameters
    ----------
    filters : iterable of column labels
        Passed as ``items`` to ``DataFrame.filter``; labels that are absent
        from the frame are skipped by pandas rather than raising.
    """

    def __init__(self, filters):  # explicit arguments only (no *args / **kwargs)
        self.filters = filters

    def fit(self, X, y=None):
        # Nothing is learned from the data.
        return self

    def transform(self, X, y=None):
        """Return a new frame with only the requested columns; ``X`` is untouched."""
        working_copy = X.copy()
        return working_copy.filter(items=self.filters)

filters = ('budget', 'original_title', 'popularity', 'original_language') #test
result = LimitedColumnsFilter(filters).transform(X_train)
result.head()

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin

# Convert date-like string columns, e.g. "2/20/15", to datetime64 ("2015-02-20").
class DateTimeImputer(BaseEstimator, TransformerMixin):
    """Try to parse every object-dtype column as dates.

    A column is replaced by its ``datetime64`` version only when the whole
    column parses; columns that cannot be parsed are left exactly as they were.

    Parameters
    ----------
    replace : bool, default True
        Stored on the instance for API compatibility; this class does not
        read it.

    NOTE(review): two-digit years are expanded by the date parser, so
    presumably old dates can end up in the future — confirm against the data.
    """

    def __init__(self, replace=True): # no *args or **kargs
        self.replace = replace

    def fit(self, X, y=None):
        return self  # nothing else to do

    def transform(self, X, y=None):
        """Return a copy of ``X`` with parseable object columns converted to datetimes."""
        clone_X = X.copy()
        # The "object" string replaces the `np.object` alias, which was removed
        # from NumPy (1.24) and raises AttributeError there.
        for feature in X.select_dtypes(include=["object"]).columns:
            try:
                # `infer_datetime_format` is deprecated in pandas 2 (format
                # inference became the default), so it is no longer passed.
                clone_X[feature] = pd.to_datetime(X[feature])
            except (ValueError, TypeError, OverflowError):
                # Not a date column: keep the original values. Only parsing
                # failures are swallowed; any other error still surfaces.
                continue
        return clone_X

result = DateTimeImputer().transform(X_train)
filters = list(X_train.filter(like="date").columns)
result[filters].head()

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin

# Split a date such as "2015-02-20" into three numeric features: _Y=2015, _M=2, _D=20
class DateDissolver(BaseEstimator, TransformerMixin):
    """Expand every datetime64 column into ``<col>_Y``, ``<col>_M`` and ``<col>_D``.

    Parameters
    ----------
    replace : bool, default False
        When True the original datetime column is dropped from the result;
        otherwise it is kept next to the three new columns.
    """

    # (column suffix, `.dt` accessor attribute) for each extracted date part
    _PARTS = (("Y", "year"), ("M", "month"), ("D", "day"))

    def __init__(self, replace=False): # no *args or **kargs
        self.replace = replace

    def fit(self, X, y=None):
        return self  # nothing else to do

    def transform(self, X, y=None):
        """Return a copy of ``X`` with the date parts appended as new columns."""
        clone_X = X.copy()
        for feature in X.select_dtypes(include=[np.datetime64]).columns:
            if self.replace:
                clone_X = clone_X.drop([feature], axis=1)
            # No try/except here: the columns were selected by datetime dtype,
            # so the `.dt` accessor is always available. The former bare
            # `except: pass` could only hide real errors (and, with
            # replace=True, silently lose the column).
            for suffix, part in self._PARTS:
                values = getattr(X[feature].dt, part)
                # Some pandas versions return int32 from the `.dt` accessors,
                # while the numeric selection elsewhere in this notebook only
                # accepts int64/float64; widen so the parts are not filtered
                # out. Float results (dates containing NaT) are left alone.
                if pd.api.types.is_integer_dtype(values.dtype):
                    values = values.astype(np.int64)
                clone_X['{0}_{1}'.format(feature, suffix)] = values

        return clone_X

result = DateTimeImputer().transform(X_train)
result = DateDissolver(replace=True).transform(result)
filters = list(result.filter(like="date").columns)
result[filters].head()

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin

class NumberFilter(BaseEstimator, TransformerMixin):
    """Stateless transformer that keeps only the int64 and float64 columns.

    Columns of any other dtype (including other integer/float widths) are
    not selected.
    """

    def __init__(self):  # explicit arguments only (no *args / **kwargs)
        pass

    def fit(self, X, y=None):
        # Nothing is learned from the data.
        return self

    def transform(self, X, y=None):
        """Return a new frame holding the int64/float64 columns of ``X``."""
        accepted_dtypes = [np.int64, np.float64]
        working_copy = X.copy()
        return working_copy.select_dtypes(include=accepted_dtypes)

result = NumberFilter().transform(X_train)
result.head()

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin

class CategoryFilter(BaseEstimator, TransformerMixin):
    """Stateless transformer that keeps only the object-dtype columns."""

    def __init__(self): # no *args or **kargs
        pass

    def fit(self, X, y=None):
        return self  # nothing else to do

    def transform(self, X, y=None):
        """Return a new frame holding the object-dtype columns of ``X``."""
        # The "object" string replaces the `np.object` alias, which was removed
        # from NumPy (1.24) and raises AttributeError there.
        return X.copy().select_dtypes(include=["object"])

result = CategoryFilter().transform(X_train)
result.columns

## Data Tuning 2

In [None]:
import ast

class InfoExtractor(BaseEstimator, TransformerMixin):
    """Flatten columns that hold the string form of a list of dicts.

    Parameters
    ----------
    field : iterable of (column, key) pairs
        For each pair, the value under ``key`` is taken from every dict of the
        list serialised in ``column``.
    replace : bool, default False
        When True the extracted text overwrites ``column``; otherwise it is
        written to a new column named ``"<column>_<key>"``.
    """

    def __init__(self, field, replace=False):
        self.field = field
        self.replace = replace

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with the extracted text columns written in."""
        clone_X = X.copy()
        for feature, field_name in self.field:
            # Always read from the untouched input so that several pairs on
            # the same column do not see each other's output.
            target = feature if self.replace else "{0}_{1}".format(feature, field_name)
            clone_X[target] = X[feature].apply(self.extract_field, args=(field_name,))
        return clone_X

    def extract_field(self, data, field_name):
        """Join the ``field_name`` values of one cell into a single string.

        Returns ``np.nan`` for a missing cell (NaN, None, pd.NA, ...) and for
        an empty or blank string. Each value is formatted with ``"{}"``,
        occurrences of ``SPLITTER`` inside it become ``"_"``, and the values
        are joined with ``SPLITTER``.

        Raises whatever ``ast.literal_eval`` raises for content that is not a
        valid Python literal, and ``KeyError`` when an entry lacks
        ``field_name``.
        """
        if isinstance(data, str):
            if not data.strip():
                # Nothing to parse; literal_eval would raise SyntaxError.
                return np.nan
        elif pd.api.types.is_scalar(data) and pd.isna(data):
            # `pd.isna` recognises every missing-value marker, unlike the
            # identity check `data is not np.nan`, which only matches the one
            # NaN singleton.
            return np.nan
        info = ast.literal_eval(data)
        return SPLITTER.join(
            "{}".format(entry[field_name]).replace(SPLITTER, "_") for entry in info
        )
    
infoExtractor = InfoExtractor(field=TEXT_FIELDS, replace=True)
result = infoExtractor.transform(X_train)
#filters = list(result.filter(regex="collection|genres|crew").columns)
filters = list(result.filter(regex="collection|cast|production|genres|languages|Keywords|crew").columns)
result[filters].head()

In [None]:
from keras.preprocessing.text import Tokenizer

class TextEncoder(BaseEstimator, TransformerMixin):
    """Replace text columns by one number per row: the sum of the row's TF-IDF weights.

    Parameters
    ----------
    field : iterable of column labels or None, default None
        Columns to encode. ``None`` means every object-dtype column of the
        frame handed to ``fit`` (or to ``transform`` when the encoder was
        never fitted).
    replace : bool, default False
        When True the encoded values overwrite the column; otherwise they are
        written to a new column named ``"<column>_count"``.

    Attributes
    ----------
    tokenizers_ : dict
        Set by ``fit``; maps each encoded column to the Keras ``Tokenizer``
        fitted on it.

    Notes
    -----
    ``fit`` learns one tokenizer per column, and ``transform`` reuses them, so
    data transformed after fitting is encoded with the vocabulary and document
    statistics of the fitting data. Calling ``transform`` on an encoder that
    was never fitted keeps the previous behaviour: a throw-away tokenizer is
    fitted on the data being transformed. The ``field`` hyper-parameter is no
    longer modified by ``transform``.
    """

    def __init__(self, field=None, replace=False):
        self.field = field
        self.replace = replace

    def fit(self, X, y=None):
        """Fit one tokenizer per text column of ``X``."""
        self.tokenizers_ = {
            column: self._fit_tokenizer(X[column]) for column in self._text_columns(X)
        }
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with the text columns encoded as numbers."""
        clone_X = X.copy()
        fitted = getattr(self, "tokenizers_", None)
        columns = list(fitted) if fitted is not None else self._text_columns(X)
        for feature in columns:
            tokenizer = None if fitted is None else fitted[feature]
            target = feature if self.replace else "{0}_{1}".format(feature, 'count')
            clone_X[target] = pd.Series(
                data=self.encode_textBySum(X[feature], tokenizer), index=clone_X.index
            )
        return clone_X

    def _text_columns(self, X):
        """Resolve the columns to encode without touching ``self.field``."""
        if self.field is None:
            # The "object" string replaces the `np.object` alias, which was
            # removed from NumPy (1.24).
            return list(
                X.select_dtypes(include=["object"], exclude=[np.datetime64]).columns
            )
        return list(self.field)

    def _fit_tokenizer(self, df_feature):
        """Fit a fresh Keras tokenizer on one column (missing values count as '')."""
        tokenizer = Tokenizer()
        tokenizer.fit_on_texts(df_feature.fillna(''))
        return tokenizer

    def encode_textBySum(self, df_feature, tokenizer=None):
        """Return, per row, the sum of the TF-IDF matrix row built from the text.

        ``tokenizer`` is an already fitted tokenizer; when omitted, one is
        fitted on ``df_feature`` itself.
        """
        clone_feature = df_feature.fillna('')
        if tokenizer is None:
            tokenizer = self._fit_tokenizer(df_feature)
        encoded_docs = tokenizer.texts_to_matrix(clone_feature, mode='tfidf')
        # Collapse the (rows x vocabulary) matrix to a single value per row;
        # other aggregation strategies may be tried later.
        return np.sum(encoded_docs, axis=1)

    def encode_textForOneHot(self, df_feature, tokenizer=None):
        """Return, per row, the binary bag-of-words row rendered as a 0/1 string.

        ``tokenizer`` is an already fitted tokenizer; when omitted, one is
        fitted on ``df_feature`` itself.
        """
        clone_feature = df_feature.fillna('')
        if tokenizer is None:
            tokenizer = self._fit_tokenizer(df_feature)
        encoded_docs = tokenizer.texts_to_matrix(clone_feature, mode='binary')
        # Plain Python formatting instead of DataFrame.applymap, which is
        # deprecated in recent pandas releases.
        return pd.Series(
            ["".join("{:1.0f}".format(value) for value in row) for row in encoded_docs],
            dtype=object,
        )

# infoExtractor = InfoExtractor(field=TEXT_FIELDS, replace=True)
# textEncoder = TextEncoder(replace=True)
infoExtractor = InfoExtractor(field=[("cast", "name")], replace=True)                
textEncoder = TextEncoder(field=["homepage", "cast"], replace=True)
result = infoExtractor.transform(X_train)
result = textEncoder.transform(result)
filters = list(result.filter(regex="date|cast|homepage").columns)
#filters = list(result_text.filter(regex="collection|cast|production|genres|languages|Keywords|crew").columns)
result[filters].head()

## Transformation Pipelines

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import SimpleImputer

# Text branch: flatten the list-of-dict columns named in TEXT_FIELDS, then
# encode every remaining object column as a single number per row.
# Disabled steps, kept for reference:
#   ('cat_filter', CategoryFilter())
#   ('inf_extor2', InfoExtractor(field=TEXT_FIELDS2, replace=False))
text_pipeline = Pipeline(steps=[
    ('inf_extor', InfoExtractor(field=TEXT_FIELDS, replace=True)),
    ('txt_encoder', TextEncoder(replace=True)),
])

result = DateTimeImputer().transform(X_train)
result = DateDissolver(replace=True).transform(result)
result = text_pipeline.fit_transform(result)
result.head()

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import SimpleImputer

# Numeric branch: keep int64/float64 columns, fill gaps with the column median,
# then rescale every feature to the range [-1, 1].
# Disabled alternative scaler: ('std_scaler', StandardScaler())
num_pipeline = Pipeline(steps=[
    ('num_filter', NumberFilter()),
    ('imputer', SimpleImputer(strategy="median")),
    ('mm_scaler', MinMaxScaler(feature_range=(-1, 1))),
])

result = num_pipeline.fit_transform(X_train)
pd.DataFrame(data=result).describe()

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer

# Categorical branch: keep object columns and replace missing cells with the
# literal 'Missing'.
# Disabled step: ('cat', OneHotEncoder(handle_unknown='ignore', sparse=False))
cat_pipeline = Pipeline(steps=[
    ('cat_filter', CategoryFilter()),
    ('imputer', SimpleImputer(strategy='constant', fill_value='Missing')),
])

filters = ('budget', 'original_language') #test
result = LimitedColumnsFilter(filters).transform(X_train)
result = cat_pipeline.fit_transform(result)
result[0]

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_transformer
from sklearn.compose import make_column_selector


# Only the numeric branch is active: int64/float64 columns go through
# num_pipeline. The categorical branch
#   (cat_pipeline, make_column_selector(dtype_include=np.object))
# is currently disabled.
numeric_selector = make_column_selector(dtype_include=[np.int64, np.float64])
full_pipeline = make_column_transformer(
    (num_pipeline, numeric_selector),
)

# full_pipeline = ColumnTransformer([
#     ("num", num_pipeline, list(getNumberColumns(X_train))),
#     ("cat", cat_pipeline, list(getCategoryColumns(X_train))),
# ])

filters = list(X_train.filter(regex="date|budget|original_language").columns) #test
result = LimitedColumnsFilter(filters).transform(X_train)
result = full_pipeline.fit_transform(result)
result[0]

## Models & Scoring

In [None]:
#filters = list(X_train.filter(regex="date|budget|original_language").columns) #test
#X_train_pp_df = LimitedColumnsFilter(filters).transform(X_train)

# Frame-level preprocessing of the training features:
# parse dates -> split dates into parts -> encode text columns.
X_train_pp_df = text_pipeline.fit_transform(
    DateDissolver(replace=True).transform(
        DateTimeImputer().transform(X_train)
    )
)

# Fit the column transformer on the preprocessed frame to get the model matrix.
X_train_pp = full_pipeline.fit_transform(X_train_pp_df)
X_train_pp[0]

In [None]:
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_squared_log_error

# Baseline: linear support-vector regression.
svm_reg = SVR(kernel="linear")
svm_reg.fit(X_train_pp, y_train)
predictions = svm_reg.predict(X_train_pp)

# Both scores below are measured on the data the model was trained on.
mse = mean_squared_error(y_train, predictions)
print("rmse ->", np.sqrt(mse))
# A linear model can predict negative revenue, and mean_squared_log_error
# raises ValueError on negative inputs; floor the predictions at 0 for this
# metric only (`predictions` itself is left untouched).
msle = mean_squared_log_error(y_train, np.clip(predictions, 0, None))
print("rmsle ->", np.sqrt(msle))

In [None]:
print("Predicts -> ", list(predictions[0:5]))
print("Labels   -> ", list(y_train[0:5]))

## Fine Tune Model

In [None]:
X_train_pp.shape

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor

n_features = X_train_pp.shape[1]

# Single-candidate grid for now. Wider grids tried earlier:
#   {'n_estimators': [10, 50, 100], 'max_features': [5, 10, 20, n_features]}
#   {'n_estimators': [100], 'max_features': [5]}
param_grid = [
    {'n_estimators': [10], 'max_features': [n_features]},
]

# 5-fold cross-validation scored with negated mean squared log error.
grid_search = GridSearchCV(
    RandomForestRegressor(random_state=42),
    param_grid,
    cv=5,
    scoring='neg_mean_squared_log_error',
    return_train_score=True,
)
grid_search.fit(X_train_pp, y_train)

In [None]:
grid_search.best_estimator_

In [None]:
import re

# The model was trained on the output of full_pipeline, whose only active
# branch selects int64/float64 columns; those are therefore the only feature
# names that line up with feature_importances_.
num_columns = NumberFilter().transform(X_train_pp_df).columns
# The categorical branch of full_pipeline is disabled, so it contributes no
# features. When it is re-enabled, fill this from the encoder, e.g.
# cat_pipeline.named_steps['cat'].get_feature_names(input_features=...).
cat_columns = []
columns = list(num_columns) + list(cat_columns)

importances = grid_search.best_estimator_.feature_importances_
if len(columns) != len(importances):
    # Fail with an explicit message instead of an opaque pandas length error.
    raise ValueError(
        "feature name count ({0}) does not match the number of model features ({1})".format(
            len(columns), len(importances)
        )
    )
feature_importance = pd.Series(data=importances, index=np.array(columns))
#features_top = feature_importance.sort_values(ascending=False)[0:20].index
#features_top = list(map(lambda x: re.sub("_.+$", "", x), features_top)) #Remove _XX from OneHotEncoder

In [None]:
# Features are unordered categories, so draw bars rather than a connected line,
# and label the figure so it can be read on its own.
top_features = feature_importance.sort_values(ascending=False).head(5)
ax = top_features.plot.bar(figsize=(20, 10))
ax.set_title("Top 5 feature importances")
ax.set_xlabel("feature")
ax.set_ylabel("importance")
plt.show()

In [None]:
cvres = grid_search.cv_results_
# The stored scores are negated errors: flip the sign, then take the root.
root_scores = np.sqrt(-np.asarray(cvres["mean_test_score"]))
for root_score, params in zip(root_scores, cvres["params"]):
    print(root_score, params)

### Try Neural Network

In [None]:
X_train_pp.shape

In [None]:
import tensorflow as tf
from tensorflow import keras

# Seed TensorFlow as well, so repeated runs are comparable (NumPy is already
# seeded in the setup cell).
tf.random.set_seed(42)

# Fully connected regressor: a stack of ReLU layers ending in one linear output.
ann_model = keras.models.Sequential()
ann_model.add(keras.layers.Dense(5000 , activation="relu", input_shape=X_train_pp.shape[1:]))
ann_model.add(keras.layers.Dense(1000, activation="relu"))
ann_model.add(keras.layers.Dense(2000, activation="relu"))
ann_model.add(keras.layers.Dense(100, activation="relu"))
ann_model.add(keras.layers.Dense(500, activation="relu"))
ann_model.add(keras.layers.Dense(X_train_pp.shape[1], activation="relu"))
ann_model.add(keras.layers.Dense(1))

#model.compile(loss="sparse_categorical_crossentropy", optimizer="sgd", metrics=["accuracy"])
# `learning_rate` is the supported argument name; the old `lr` alias is
# deprecated and rejected by recent Keras releases.
ann_model.compile(loss="mean_squared_logarithmic_error", optimizer=keras.optimizers.SGD(learning_rate=1e-1))
history = ann_model.fit(X_train_pp, y_train, epochs=100, batch_size=32, validation_split=0.2)

In [None]:
#model.summary()
keras.utils.plot_model(ann_model, show_shapes=True)

## Predict the TEST

In [None]:
# Drop the same excluded columns from the test features as from the training set.
X_test = test_set.copy().drop(test_set.filter(regex=COLUMN_EXCLUDE_PATTERN), axis=1)

X_test_pp_df = DateTimeImputer().transform(X_test)
X_test_pp_df = DateDissolver(replace=True).transform(X_test_pp_df)
# Apply the pipeline as fitted on the training data; calling fit_transform
# here would re-fit it on the test set.
X_test_pp_df = text_pipeline.transform(X_test_pp_df)

X_test_pp = full_pipeline.transform(X_test_pp_df)
X_test_pp[0]

In [None]:
from sklearn.metrics import mean_squared_log_error

#final_model = grid_search.best_estimator_
final_model = ann_model
final_predictions = final_model.predict(X_test_pp)
#final_msle = mean_squared_log_error(y_test, final_predictions)
#print("rmsle -> ", np.sqrt(final_mse))

In [None]:
print("Predicts -> ", list(final_predictions[0:5]))
#print("Labels   -> ", list(y_test[0:5]))

In [None]:
# Keras returns predictions with shape (n, 1): flatten them to one value per
# row. Revenue cannot be negative and the log-error metric used throughout this
# notebook is undefined below zero, so floor the predictions at 0.
test_set['revenue'] = np.clip(np.ravel(final_predictions), 0, None)
test_set[['id', 'revenue']].to_csv('./submission_hvdang.csv', header=True, index=False)

In [None]:
test_set[['id', 'revenue']].head()