![](https://pbs.twimg.com/profile_images/789117657714831361/zGfknUu8_400x400.jpg)

## **1. Loading the dataset and libraries**

In [1]:
# package
import numpy as np 
import pandas as pd 
import os
import json
from pandas.io.json import json_normalize
import ast
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('ggplot')
import seaborn as sns
%matplotlib notebook
from scipy.stats import skew, boxcox
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls
from mpl_toolkits.mplot3d import Axes3D
import ast
import re
import yaml
import json
from collections import Counter
from nltk.corpus import stopwords
from wordcloud import WordCloud
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, KFold
import xgboost as xgb
import lightgbm as lgb
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn import model_selection
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
import eli5
import time
from datetime import datetime
from sklearn.preprocessing import LabelEncoder
import warnings  
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100)

In [2]:
# method
def date_features(df):
    df[['release_month','release_day','release_year']]=df['release_date'].str.split('/',expand=True).replace(np.nan, -1).astype(int)
    # 연도 끝 두자리수만 있기 때문에 앞에 19/20 붙이기
    df.loc[ (train['release_year'] <= 19) & (df['release_year'] < 100), "release_year"] += 2000
    df.loc[ (train['release_year'] > 19) & (df['release_year'] < 100), "release_year"] += 1900
    return df

def text_to_dict(df):
    for column in dict_columns:
        df[column] = df[column].apply(lambda x: {} if pd.isna(x) else ast.literal_eval(x) )
    return df

In [3]:
# Loading dataset
submission_path = "../input/sample_submission.csv"
train_path = "../input/train.csv"
test_path = "../input/test.csv"

train = pd.read_csv(train_path)
test = pd.read_csv(test_path)
submission = pd.read_csv(submission_path)
print( "train dataset:", train.shape,"\n","test dataset: ",test.shape,"\n","sample_submission dataset:", submission.shape)

train dataset: (3000, 23) 
 test dataset:  (4398, 22) 
 sample_submission dataset: (4398, 2)


In [4]:
# dataset 살펴보기
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000 entries, 0 to 2999
Data columns (total 23 columns):
id                       3000 non-null int64
belongs_to_collection    604 non-null object
budget                   3000 non-null int64
genres                   2993 non-null object
homepage                 946 non-null object
imdb_id                  3000 non-null object
original_language        3000 non-null object
original_title           3000 non-null object
overview                 2992 non-null object
popularity               3000 non-null float64
poster_path              2999 non-null object
production_companies     2844 non-null object
production_countries     2945 non-null object
release_date             3000 non-null object
runtime                  2998 non-null float64
spoken_languages         2980 non-null object
status                   3000 non-null object
tagline                  2403 non-null object
title                    3000 non-null object
Keywords             

In [5]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4398 entries, 0 to 4397
Data columns (total 22 columns):
id                       4398 non-null int64
belongs_to_collection    877 non-null object
budget                   4398 non-null int64
genres                   4382 non-null object
homepage                 1420 non-null object
imdb_id                  4398 non-null object
original_language        4398 non-null object
original_title           4398 non-null object
overview                 4384 non-null object
popularity               4398 non-null float64
poster_path              4397 non-null object
production_companies     4140 non-null object
production_countries     4296 non-null object
release_date             4397 non-null object
runtime                  4394 non-null float64
spoken_languages         4356 non-null object
status                   4396 non-null object
tagline                  3535 non-null object
title                    4395 non-null object
Keywords            

## 1.1. **데이터 수집**
#### 기존 데이터의 문제점
- budget과 revenue의 단위 불일치
- 데이터의 지리적 수익 산출 기준 불일치
- 데이터가 축소되어 적용된 값 존재
- 모델 성능 개선 중 데이터 수집 부분
- 모델 정확성을 높이기 위해 이를 개선한 데이터를 적용

In [6]:
# train 데이터 문제점 개선
train.loc[train['id'] == 16,'revenue'] = 192864 # Skinning
train.loc[train['id'] == 90,'budget'] = 30000000 # Sommersby
train.loc[train['id'] == 118,'budget'] = 60000000 # Wild Hogs
train.loc[train['id'] == 149,'budget'] = 18000000 # Beethoven
train.loc[train['id'] == 313,'revenue'] = 12000000 # The Cookout
train.loc[train['id'] == 451,'revenue'] = 12000000 # Chasing Liberty
train.loc[train['id'] == 464,'budget'] = 20000000 # Parenthood
train.loc[train['id'] == 470,'budget'] = 13000000 # The Karate Kid, Part II
train.loc[train['id'] == 513,'budget'] = 930000 # From Prada to Nada
train.loc[train['id'] == 797,'budget'] = 8000000 # Welcome to Dongmakgol
train.loc[train['id'] == 819,'budget'] = 90000000 # Alvin and the Chipmunks: The Road Chip
train.loc[train['id'] == 850,'budget'] = 90000000 # Modern Times
train.loc[train['id'] == 1007,'budget'] = 2 # Zyzzyx Road
train.loc[train['id'] == 1112,'budget'] = 7500000 # An Officer and a Gentleman
train.loc[train['id'] == 1131,'budget'] = 4300000 # Smokey and the Bandit
train.loc[train['id'] == 1359,'budget'] = 10000000 # Stir Crazy
train.loc[train['id'] == 1542,'budget'] = 1 # All at Once
train.loc[train['id'] == 1570,'budget'] = 15800000 # Crocodile Dundee II
train.loc[train['id'] == 1571,'budget'] = 4000000 # Lady and the Tramp
train.loc[train['id'] == 1714,'budget'] = 46000000 # The Recruit
train.loc[train['id'] == 1721,'budget'] = 17500000 # Cocoon
train.loc[train['id'] == 1865,'revenue'] = 25000000 # Scooby-Doo 2: Monsters Unleashed
train.loc[train['id'] == 1885,'budget'] = 12 # In the Cut
train.loc[train['id'] == 2091,'budget'] = 10 # Deadfall
train.loc[train['id'] == 2268,'budget'] = 17500000 # Madea Goes to Jail budget
train.loc[train['id'] == 2491,'budget'] = 6 # Never Talk to Strangers
train.loc[train['id'] == 2602,'budget'] = 31000000 # Mr. Holland's Opus
train.loc[train['id'] == 2612,'budget'] = 15000000 # Field of Dreams
train.loc[train['id'] == 2696,'budget'] = 10000000 # Nurse 3-D
train.loc[train['id'] == 2801,'budget'] = 10000000 # Fracture
train.loc[train['id'] == 335,'budget'] = 2
train.loc[train['id'] == 348,'budget'] = 12
train.loc[train['id'] == 470,'budget'] = 13000000
train.loc[train['id'] == 513,'budget'] = 1100000
train.loc[train['id'] == 640,'budget'] = 6
train.loc[train['id'] == 696,'budget'] = 1
train.loc[train['id'] == 797,'budget'] = 8000000
train.loc[train['id'] == 850,'budget'] = 1500000
train.loc[train['id'] == 1199,'budget'] = 5
train.loc[train['id'] == 1282,'budget'] = 9 # Death at a Funeral
train.loc[train['id'] == 1347,'budget'] = 1
train.loc[train['id'] == 1755,'budget'] = 2
train.loc[train['id'] == 1801,'budget'] = 5
train.loc[train['id'] == 1918,'budget'] = 592
train.loc[train['id'] == 2033,'budget'] = 4
train.loc[train['id'] == 2118,'budget'] = 344
train.loc[train['id'] == 2252,'budget'] = 130
train.loc[train['id'] == 2256,'budget'] = 1
train.loc[train['id'] == 2696,'budget'] = 10000000

# test 데이터 문제점 개선

test
test.loc[test['id'] == 6733,'budget'] = 5000000
test.loc[test['id'] == 3889,'budget'] = 15000000
test.loc[test['id'] == 6683,'budget'] = 50000000
test.loc[test['id'] == 5704,'budget'] = 4300000
test.loc[test['id'] == 6109,'budget'] = 281756
test.loc[test['id'] == 7242,'budget'] = 10000000
test.loc[test['id'] == 7021,'budget'] = 17540562 # Two Is a Family
test.loc[test['id'] == 5591,'budget'] = 4000000 # The Orphanage
test.loc[test['id'] == 4282,'budget'] = 20000000 # Big Top Pee-wee
test.loc[test['id'] == 3033,'budget'] = 250
test.loc[test['id'] == 3051,'budget'] = 50
test.loc[test['id'] == 3084,'budget'] = 337
test.loc[test['id'] == 3224,'budget'] = 4
test.loc[test['id'] == 3594,'budget'] = 25
test.loc[test['id'] == 3619,'budget'] = 500
test.loc[test['id'] == 3831,'budget'] = 3
test.loc[test['id'] == 3935,'budget'] = 500
test.loc[test['id'] == 4049,'budget'] = 995946
test.loc[test['id'] == 4424,'budget'] = 3
test.loc[test['id'] == 4460,'budget'] = 8
test.loc[test['id'] == 4555,'budget'] = 1200000
test.loc[test['id'] == 4624,'budget'] = 30
test.loc[test['id'] == 4645,'budget'] = 500
test.loc[test['id'] == 4709,'budget'] = 450
test.loc[test['id'] == 4839,'budget'] = 7
test.loc[test['id'] == 3125,'budget'] = 25
test.loc[test['id'] == 3142,'budget'] = 1
test.loc[test['id'] == 3201,'budget'] = 450
test.loc[test['id'] == 3222,'budget'] = 6
test.loc[test['id'] == 3545,'budget'] = 38
test.loc[test['id'] == 3670,'budget'] = 18
test.loc[test['id'] == 3792,'budget'] = 19
test.loc[test['id'] == 3881,'budget'] = 7
test.loc[test['id'] == 3969,'budget'] = 400
test.loc[test['id'] == 4196,'budget'] = 6
test.loc[test['id'] == 4221,'budget'] = 11
test.loc[test['id'] == 4222,'budget'] = 500
test.loc[test['id'] == 4285,'budget'] = 11
test.loc[test['id'] == 4319,'budget'] = 1
test.loc[test['id'] == 4639,'budget'] = 10
test.loc[test['id'] == 4719,'budget'] = 45
test.loc[test['id'] == 4822,'budget'] = 22
test.loc[test['id'] == 4829,'budget'] = 20
test.loc[test['id'] == 4969,'budget'] = 20
test.loc[test['id'] == 5021,'budget'] = 40
test.loc[test['id'] == 5035,'budget'] = 1
test.loc[test['id'] == 5063,'budget'] = 14
test.loc[test['id'] == 5119,'budget'] = 2
test.loc[test['id'] == 5214,'budget'] = 30
test.loc[test['id'] == 5221,'budget'] = 50
test.loc[test['id'] == 4903,'budget'] = 15
test.loc[test['id'] == 4983,'budget'] = 3
test.loc[test['id'] == 5102,'budget'] = 28
test.loc[test['id'] == 5217,'budget'] = 75
test.loc[test['id'] == 5224,'budget'] = 3
test.loc[test['id'] == 5469,'budget'] = 20
test.loc[test['id'] == 5840,'budget'] = 1
test.loc[test['id'] == 5960,'budget'] = 30
test.loc[test['id'] == 6506,'budget'] = 11
test.loc[test['id'] == 6553,'budget'] = 280
test.loc[test['id'] == 6561,'budget'] = 7
test.loc[test['id'] == 6582,'budget'] = 218
test.loc[test['id'] == 6638,'budget'] = 5
test.loc[test['id'] == 6749,'budget'] = 8
test.loc[test['id'] == 6759,'budget'] = 50
test.loc[test['id'] == 6856,'budget'] = 10
test.loc[test['id'] == 6858,'budget'] = 100
test.loc[test['id'] == 6876,'budget'] = 250
test.loc[test['id'] == 6972,'budget'] = 1
test.loc[test['id'] == 7079,'budget'] = 8000000
test.loc[test['id'] == 7150,'budget'] = 118
test.loc[test['id'] == 6506,'budget'] = 118
test.loc[test['id'] == 7225,'budget'] = 6
test.loc[test['id'] == 7231,'budget'] = 85
test.loc[test['id'] == 5222,'budget'] = 5
test.loc[test['id'] == 5322,'budget'] = 90
test.loc[test['id'] == 5350,'budget'] = 70
test.loc[test['id'] == 5378,'budget'] = 10
test.loc[test['id'] == 5545,'budget'] = 80
test.loc[test['id'] == 5810,'budget'] = 8
test.loc[test['id'] == 5926,'budget'] = 300
test.loc[test['id'] == 5927,'budget'] = 4
test.loc[test['id'] == 5986,'budget'] = 1
test.loc[test['id'] == 6053,'budget'] = 20
test.loc[test['id'] == 6104,'budget'] = 1
test.loc[test['id'] == 6130,'budget'] = 30
test.loc[test['id'] == 6301,'budget'] = 150
test.loc[test['id'] == 6276,'budget'] = 100
test.loc[test['id'] == 6473,'budget'] = 100
test.loc[test['id'] == 6842,'budget'] = 30

# 2.1 **Data Pre-processing & FE(Feature Enginering)**

## **2.1.1. date_features**

In [7]:
train = date_features(train)
test = date_features(test)

## 2.1.2. 범주형 변수 변환

## **Json Format Columns to Dictionary Format**
- Json 형식의 변수들을 사용하기 위해 dictionary 형식으로 변환하여 분석에 이용.


- 현재 범주형 데이터의 경우 데이터 전처리에서의 문제점이 발생하였으므로, 해결한 후에 다시 feature로 사용.

In [8]:
dict_columns = ['belongs_to_collection', 'genres', 'production_companies',
                'production_countries', 'spoken_languages', 'Keywords', 'cast', 'crew']

train = text_to_dict(train)
test = text_to_dict(test)

## **Belong_to collection**

In [9]:
# train['collection_name'] = train['belongs_to_collection'].apply(lambda x: x[0]['name'] if x != {} else 0)
train['has_collection'] = train['belongs_to_collection'].apply(lambda x: len(x) if x != {} else 0)

# test['collection_name'] = test['belongs_to_collection'].apply(lambda x: x[0]['name'] if x != {} else 0)
test['has_collection'] = test['belongs_to_collection'].apply(lambda x: len(x) if x != {} else 0)

## **Genres**
<br>

In [10]:
train['num_genres'] = train['genres'].apply(lambda x: len(x) if x != {} else 0)
# train['all_genres'] = train['genres'].apply(lambda x: ' '.join(sorted([i['name'] for i in x])) if x != {} else '')

test['num_genres'] = test['genres'].apply(lambda x: len(x) if x != {} else 0)
# test['all_genres'] = test['genres'].apply(lambda x: ' '.join(sorted([i['name'] for i in x])) if x != {} else '')

## **production_companies**

In [11]:
train['num_production_companies'] = train['production_companies'].apply(lambda x: len(x) if x != {} else 0)
# train['all_production_companies'] = train['production_companies'].apply(lambda x: ' '.join(sorted([i['name'] for i in x])) if x != {} else '')

test['num_production_companies'] = test['production_companies'].apply(lambda x: len(x) if x != {} else 0)
# test['all_production_companies'] = test['production_companies'].apply(lambda x: ' '.join(sorted([i['name'] for i in x])) if x != {} else '')

## **production_countries**

In [12]:
train['num_production_countries'] = train['production_countries'].apply(lambda x: len(x) if x != {} else 0)
# train['all_production_countries'] = train['production_countries'].apply(lambda x: ' '.join(sorted([i['name'] for i in x])) if x != {} else '')

test['num_production_countries'] = test['production_countries'].apply(lambda x: len(x) if x != {} else 0)
# test['all_production_countries'] = test['production_countries'].apply(lambda x: ' '.join(sorted([i['name'] for i in x])) if x != {} else '')


## **Cast**

In [13]:
# list_of_cast_names = list(train['cast'].apply(lambda x: [i['name'] for i in x] if x != {} else []).values)
# train['num_cast'] = train['cast'].apply(lambda x: len(x) if x != {} else 0)
# train['all_cast'] = train['cast'].apply(lambda x: ' '.join(sorted([i['name'] for i in x])) if x != {} else '')

# test['num_cast'] = test['cast'].apply(lambda x: len(x) if x != {} else 0)
# test['all_cast'] = test['cast'].apply(lambda x: ' '.join(sorted([i['name'] for i in x])) if x != {} else '')

## **Crew**

In [14]:
# list_of_crew_names = list(train['crew'].apply(lambda x: [i['name'] for i in x] if x != {} else []).values)
# train['num_crew'] = train['crew'].apply(lambda x: len(x) if x != {} else 0)
# train['all_crew'] = train['crew'].apply(lambda x: ' '.join(sorted([i['name'] for i in x])) if x != {} else '')

# test['num_crew'] = test['crew'].apply(lambda x: len(x) if x != {} else 0)
# test['all_crew'] = test['crew'].apply(lambda x: ' '.join(sorted([i['name'] for i in x])) if x != {} else '')

## **spoken_languages**

In [15]:
# list_of_spokenlanguage_names = list(train['spoken_languages'].apply(lambda x: [i['name'] for i in x] if x != {} else []).values)
train['num_spoken_languages'] = train['spoken_languages'].apply(lambda x: len(x) if x != {} else 0)
train['all_spoken_languages'] = train['spoken_languages'].apply(lambda x: ' '.join(sorted([i['name'] for i in x])) if x != {} else '')

test['num_spoken_languages'] = test['spoken_languages'].apply(lambda x: len(x) if x != {} else 0)
test['all_spoken_languages'] = test['spoken_languages'].apply(lambda x: ' '.join(sorted([i['name'] for i in x])) if x != {} else '')

## **keyword**

In [16]:
# list_of_Keywords = list(train['Keywords'].apply(lambda x: [i['name'] for i in x] if x != {} else []).values)
train['num_Keywords'] = train['Keywords'].apply(lambda x: len(x) if x != {} else 0)
# train['all_Keywords'] = train['Keywords'].apply(lambda x: ' '.join(sorted([i['name'] for i in x])) if x != {} else '')
test['num_Keywords'] = test['Keywords'].apply(lambda x: len(x) if x != {} else 0)
# test['all_Keywords'] = test['Keywords'].apply(lambda x: ' '.join(sorted([i['name'] for i in x])) if x != {} else '')

In [17]:
# category

# cat_features = ['all_production_companies',
# 'all_production_countries',
# 'all_genres',                
# 'all_cast',
# 'all_crew',
# 'all_spoken_languages',
# 'all_Keywords',
# 'collection_name']
 
# for i in enumerate (cat_features) :
#     ca = i[1]
#     train[ca] = train[ca].astype('category')
#     test[ca] = test[ca].astype('category')


## 2.3. **Missing data**

- 결측치가 존재하는 변수들은 NaN값 유무에 따른 변수로 변환하여 분석에 이용.

## **homepage**

In [18]:
train['has_homepage'] = 1
train.loc[pd.isnull(train['homepage']) ,"has_homepage"] = 0
test['has_homepage'] = 1
test.loc[pd.isnull(test['homepage']) ,"has_homepage"] = 0

## **tagline**

In [19]:
train['isTaglineNA'] = 0
train.loc[pd.isnull(train['tagline']) ,"isTaglineNA"] = 1
test['isTaglineNA'] = 0
test.loc[pd.isnull(test['tagline']) ,"isTaglineNA"] = 1

## **original_languages**

In [20]:
train['isOriginalLanguageEng'] = 0
test['isOriginalLanguageEng'] = 0

train.loc[train['original_language'] == "en" ,"isOriginalLanguageEng"] = 1
test.loc[test['original_language'] == "en" ,"isOriginalLanguageEng"] = 1

## **spoken_languages**

In [21]:
train['isSpokenLanguageEng'] = 0
train.loc[train['all_spoken_languages'] == "English" ,"isSpokenLanguageEng"] = 1
test['isSpokenLanguageEng'] = 0
test.loc[test['all_spoken_languages'] == "English" ,"isSpokenLanguageEng"] = 1

## 2.4. **연속형 변수 변환**
- 치우친 분포를 가진 변수들에 대해서 log scale 적용.
- 마이너스 무한대로 가지 않도록 1을 더함.

In [22]:
# Putting log variables for skewed data 
train['log_budget']=np.log1p(train['budget'] + 1)
test['log_budget']=np.log1p(test['budget'] + 1)

train['log_runtime']=np.log1p(train['runtime'] + 1)
test['log_runtime']=np.log1p(test['runtime'] + 1)

# **3. FE(Feature Engineering)**
- 팀 아이디어를 통해 만든 새로운 변수 도입.
- 모델 성능 개선과 예측 정확도를 높이기 위함.

## **Difference between Original-title and title**
- 같은 언어의 제목을 사용하거나, 원래의 제목 그대로를 사용하는 것이 수익에 영향이 있는가?

In [23]:
train['isTitleDifferent'] = 1
train.loc[ train['original_title'] == train['title'] ,"isTitleDifferent"] = 0
test['isTitleDifferent'] = 1
test.loc[ test['original_title'] == test['title'] ,"isTitleDifferent"] = 0

## **budget/year ratio**
- 연도별 예산이 수익에 영향을 있는가?

In [24]:
train['budget_year_ratio'] = train['budget'] / (train['release_year'] * train['release_year']) 
test['budget_year_ratio'] = test['budget'] / (test['release_year'] * test['release_year'])

## Top 10 cast and crew
- 영화에 가장 많이 등장한 cast와 crew가 얼마나 있는지가 수익에 영향이 있는가?

In [25]:
# cast
train['all_cast'] = train['cast'].apply(lambda x: sorted([i['name'] for i in x]) if x != {} else '')
test['all_cast'] = test['cast'].apply(lambda x: sorted([i['name'] for i in x]) if x != {} else '')

list_of_cast_names_tr = list(train['cast'].apply(lambda x: [i['name'] for i in x] if x != {} else []).values)
top_cast_names_tr = [m[0] for m in Counter([i for j in list_of_cast_names_tr for i in j]).most_common(10)]
list_of_cast_names_tt = list(test['cast'].apply(lambda x: [i['name'] for i in x] if x != {} else []).values)
top_cast_names_tt = [m[0] for m in Counter([i for j in list_of_cast_names_tt for i in j]).most_common(10)]


for g in top_cast_names_tr:
    train['cast_name_' + g] = train['all_cast'].apply(lambda x: 1 if g in x else 0)

for g in top_cast_names_tt:
    test['cast_name_' + g] = test['all_cast'].apply(lambda x: 1 if g in x else 0)

num_top_cast_tr = []
num_top_cast_tt = []

for i in range(len(train.index)):
    names = set(train.all_cast[i])
    topnames = set(top_cast_names_tr)
    num_top_cast_tr.append(len(names&topnames))
    
for i in range(len(test.index)):
    names = set(test.all_cast[i])
    topnames = set(top_cast_names_tt)
    num_top_cast_tt.append(len(names&topnames))

train["num_top_cast"] = num_top_cast_tr
test["num_top_cast"] = num_top_cast_tt

In [26]:
# crew

train['all_crew'] = train['crew'].apply(lambda x: sorted([i['name'] for i in x]) if x != {} else '')
test['all_crew'] = test['crew'].apply(lambda x: sorted([i['name'] for i in x]) if x != {} else '')

list_of_crew_names_tr = list(train['crew'].apply(lambda x: [i['name'] for i in x] if x != {} else []).values)
top_crew_names_tr = [m[0] for m in Counter([i for j in list_of_crew_names_tr for i in j]).most_common(10)]
list_of_crew_names_tt = list(test['crew'].apply(lambda x: [i['name'] for i in x] if x != {} else []).values)
top_crew_names_tt = [m[0] for m in Counter([i for j in list_of_crew_names_tt for i in j]).most_common(10)]


for g in top_crew_names_tr:
    train['crew_name_' + g] = train['all_crew'].apply(lambda x: 1 if g in x else 0)

for g in top_crew_names_tt:
    test['crew_name_' + g] = test['all_crew'].apply(lambda x: 1 if g in x else 0)

num_top_crew_tr = []
num_top_crew_tt = []

for i in range(len(train.index)):
    names = set(train.all_crew[i])
    topnames = set(top_crew_names_tr)
    num_top_crew_tr.append(len(names&topnames))
    
for i in range(len(test.index)):
    names = set(test.all_crew[i])
    topnames = set(top_crew_names_tt)
    num_top_crew_tt.append(len(names&topnames))

train["num_top_crew"] = num_top_crew_tr
test["num_top_crew"] = num_top_crew_tt

# **4. Data modeling**

In [27]:
# Formating for modeling

used_features = ['release_year',
                 'num_genres', 
                 'num_production_companies', 
                 'num_production_countries', 
                 'log_runtime',  
                 'num_spoken_languages',
                 'num_Keywords', 
                 'has_homepage',
                 'isTaglineNA',
                 'isTitleDifferent', 
                 'log_budget', 
                 'has_collection', 
                 'isOriginalLanguageEng',
                 'isSpokenLanguageEng',
                 'budget_year_ratio',
                 "num_top_crew",
                 "num_top_cast"]


X = train[used_features]
y = np.log1p(train['revenue'] + 1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

## **4.1. LGBM**
[LGBM 파라미터 조정](https://lightgbm.readthedocs.io/en/latest/Parameters-Tuning.html)

In [28]:
# parameter

# params = {'num_leaves': 30,
# #          'min_data_in_leaf': 20,
# #          'objective': 'regression',
#          'max_depth': 5,
#          'learning_rate': 0.01,
#          "boosting": "gbdt"}
# #          "feature_fraction": 0.9,
# #          "bagging_freq": 1,
# #          "bagging_fraction": 0.9,
# #          "bagging_seed": 11,
# #          "metric": 'rmse',
# #          "lambda_l1": 0.2,
#          "verbosity": -1}

In [29]:
lgb_model = lgb.LGBMRegressor(nthread = 4, n_jobs = -1)
lgb_model.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_test, y_test)], eval_metric='rmse',
        verbose=1000, early_stopping_rounds=200)
print("{}".format(lgb_model.get_params))
print("훈련 점수: {:.2f}".format(lgb_model.score(X_train, y_train)))
print("테스트 점수: {:.2f}".format(lgb_model.score(X_test, y_test)))

eli5.show_weights(lgb_model, feature_filter=lambda x: x != '<BIAS>')

Training until validation scores don't improve for 200 rounds.
Did not meet early stopping. Best iteration is:
[100]	training's l2: 1.66026	training's rmse: 1.28851	valid_1's l2: 4.22974	valid_1's rmse: 2.05663
<bound method LGBMModel.get_params of LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
       importance_type='split', learning_rate=0.1, max_depth=-1,
       min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
       n_estimators=100, n_jobs=-1, nthread=4, num_leaves=31,
       objective=None, random_state=None, reg_alpha=0.0, reg_lambda=0.0,
       silent=True, subsample=1.0, subsample_for_bin=200000,
       subsample_freq=0)>
훈련 점수: 0.82
테스트 점수: 0.52


Weight,Feature
0.5176,budget_year_ratio
0.1168,release_year
0.085,log_budget
0.0721,log_runtime
0.0536,num_Keywords
0.0403,num_production_companies
0.0255,has_collection
0.0186,num_genres
0.0128,num_spoken_languages
0.0126,isSpokenLanguageEng


## **4.2. XGBoost**

In [30]:
xgb_model = xgb.XGBRegressor()
xgb_model.fit(X_train, y_train,eval_set=[(X_train, y_train), (X_test, y_test)], eval_metric='rmse',
        verbose=1000, early_stopping_rounds=200)

print("훈련 점수: {:.2f}".format(xgb_model.score(X_train, y_train)))
print("테스트 점수: {:.2f}".format(xgb_model.score(X_test, y_test)))

eli5.show_weights(xgb_model, feature_filter=lambda x: x != '<BIAS>')

[0]	validation_0-rmse:14.2337	validation_1-rmse:14.2164
Multiple eval metrics have been passed: 'validation_1-rmse' will be used for early stopping.

Will train until validation_1-rmse hasn't improved in 200 rounds.
[99]	validation_0-rmse:1.78062	validation_1-rmse:2.04181
훈련 점수: 0.62
테스트 점수: 0.52


Weight,Feature
0.5293,budget_year_ratio
0.1039,log_budget
0.0473,isTaglineNA
0.0461,num_production_companies
0.0419,has_collection
0.0402,isOriginalLanguageEng
0.0307,isSpokenLanguageEng
0.0307,release_year
0.0251,num_Keywords
0.0219,num_spoken_languages


## **4.3. Esemble**
- 예정

# 5. **Result**
## **5.1. review**
- 추가적인 FE 필요
- 모델 파라미터 조정 필요
- 다른 방법의 모델 학습 필요 

## **5.2. 결과 제출**
- 데이터 개선 + FE + lgb base model

In [31]:
y_pred = lgb_model.predict(test[used_features])
submission['revenue'] = y_pred
submission.to_csv('2nd_submission.csv', index=False)

In [32]:
# 데이터 전처리 과정에서 문제점이 생긴 feature는 해결될 때까지 사용을 보류한다.
error_features = ['all_production_companies',
                  'all_production_countries',
                  'all_genres', 
                  'all_cast',
                  'all_crew',
                  'all_spoken_languages',
                  'all_Keywords',
                  'collection_name']