In [173]:
import warnings
warnings.filterwarnings(action='ignore')
import pandas as pd 
import numpy as np

# movies_train.csv / movies_test.csv
- title : 영화의 제목
- distributor : 배급사
- genre : 장르
- release_time : 개봉일
- time : 상영시간(분)
- screening_rat : 상영등급
- director : 감독이름
- dir_prev_bfnum : 해당 감독이 이 영화를 만들기 전 제작에 참여한 영화에서의 평균 관객수(단 관객수가 알려지지 않은 영화 제외)
- dir_prev_num : 해당 감독이 이 영화를 만들기 전 제작에 참여한 영화의 개수(단 관객수가 알려지지 않은 영화 제외)
- num_staff : 스텝수
- num_actor : 주연배우수
- box_off_num : 관객수

In [238]:
train = pd.read_csv('./data/movies_train.csv')
test = pd.read_csv('./data/movies_test.csv')

In [239]:
print(train.shape)
train.head()

(600, 12)


Unnamed: 0,title,distributor,genre,release_time,time,screening_rat,director,dir_prev_bfnum,dir_prev_num,num_staff,num_actor,box_off_num
0,개들의 전쟁,롯데엔터테인먼트,액션,2012-11-22,96,청소년 관람불가,조병옥,,0,91,2,23398
1,내부자들,(주)쇼박스,느와르,2015-11-19,130,청소년 관람불가,우민호,1161602.5,2,387,3,7072501
2,은밀하게 위대하게,(주)쇼박스,액션,2013-06-05,123,15세 관람가,장철수,220775.25,4,343,4,6959083
3,나는 공무원이다,(주)NEW,코미디,2012-07-12,101,전체 관람가,구자홍,23894.0,2,20,6,217866
4,불량남녀,쇼박스(주)미디어플렉스,코미디,2010-11-04,108,15세 관람가,신근호,1.0,1,251,2,483387


## 결측치, 필요 없는 컬럼 제거

In [240]:
train.isnull().sum()

title               0
distributor         0
genre               0
release_time        0
time                0
screening_rat       0
director            0
dir_prev_bfnum    330
dir_prev_num        0
num_staff           0
num_actor           0
box_off_num         0
dtype: int64

In [241]:
# dir_prev_dfnum이 전체 데이터 600개 중 330개나 있으므로 제거
train.drop('dir_prev_bfnum', axis=1, inplace=True)
test.drop('dir_prev_bfnum', axis=1, inplace=True)

In [242]:
train['director'].nunique()

472

In [243]:
#title : 의미 없음, director : 너무 다양함
train.drop(['title','director'], axis=1, inplace=True)
test.drop(['title','director'], axis=1, inplace=True)

## distributor 그룹화

In [244]:
train['distributor'].value_counts()[:15]

CJ 엔터테인먼트    54
롯데엔터테인먼트     52
(주)NEW       30
(주)마운틴픽쳐스    29
인디스토리        26
(주)쇼박스       26
골든타이드픽처스     15
(주) 케이알씨지    14
영화사 진진       10
(주)키노아이      10
어뮤즈          10
시네마달         10
시너지          10
(주) 영화사조제    10
인디플러그         9
Name: distributor, dtype: int64

In [245]:
# # 상위 14개의 배급사를 제외하고 '기타'로처리
# distributor_list = train.distributor.value_counts()[:14]
# def func(distributor):
#     if distributor in distributor_list:
#         return distributor
#     else:
#         return '기타'

# train['distributor'] = train['distributor'].apply(lambda x : func(x))
# test['distributor'] = test['distributor'].apply(lambda x : func(x))

In [246]:
#상영 영화 수 대비 관객 수 계산
dis_rank = (train.groupby(['distributor']).sum()['box_off_num'] / train.groupby(['distributor']).count()['box_off_num']).sort_values(ascending=False).to_frame().reset_index()
dis_rank['rank'] = [i for i in range(1, len(dis_rank)+1)]
dis_rank.drop('box_off_num', axis=1, inplace=True)

dis_rank_test = (train.groupby(['distributor']).sum()['box_off_num'] / train.groupby(['distributor']).count()['box_off_num']).sort_values(ascending=False).to_frame().reset_index()
dis_rank_test['rank'] = [i for i in range(1, len(dis_rank_test)+1)]
dis_rank_test.drop('box_off_num', axis=1, inplace=True)

train = pd.merge(train, dis_rank, how='left', on=['distributor'])
test = pd.merge(test, dis_rank, how='left', on=['distributor'])


In [247]:
train

Unnamed: 0,distributor,genre,release_time,time,screening_rat,dir_prev_num,num_staff,num_actor,box_off_num,rank
0,롯데엔터테인먼트,액션,2012-11-22,96,청소년 관람불가,0,91,2,23398,12
1,(주)쇼박스,느와르,2015-11-19,130,청소년 관람불가,2,387,3,7072501,2
2,(주)쇼박스,액션,2013-06-05,123,15세 관람가,4,343,4,6959083,2
3,(주)NEW,코미디,2012-07-12,101,전체 관람가,2,20,6,217866,7
4,쇼박스(주)미디어플렉스,코미디,2010-11-04,108,15세 관람가,1,251,2,483387,4
...,...,...,...,...,...,...,...,...,...,...
595,(주)NEW,드라마,2014-08-13,111,청소년 관람불가,1,510,7,1475091,7
596,(주)쇼박스,드라마,2013-03-14,127,15세 관람가,1,286,6,1716438,2
597,(주)마운틴픽쳐스,공포,2010-09-30,99,청소년 관람불가,0,123,4,2475,50
598,CJ 엔터테인먼트,느와르,2015-05-14,102,15세 관람가,0,431,4,2192525,8


In [248]:
genre_rank = train.groupby('genre').count()['rank'].sort_values(ascending=False).to_frame().reset_index().rename(columns={'rank':'genre_rank'})
genre_rank_test = test.groupby('genre').count()['rank'].sort_values(ascending=False).to_frame().reset_index().rename(columns={'rank':'genre_rank'})

train = pd.merge(train, genre_rank, how='left', on=['genre'])
test = pd.merge(test, genre_rank_test, how='left', on=['genre'])

In [249]:
# 개봉일을 바탕으로 년,월 변수 생성
train['년'] = train['release_time'].apply(lambda x: int(x[:4]))
train['월'] = train['release_time'].apply(lambda x: int(x[5:7]))
train =  train.drop(['release_time'],axis = 1)

test['년'] = test['release_time'].apply(lambda x: int(x[:4]))
test['월'] = test['release_time'].apply(lambda x: int(x[5:7]))
test =  test.drop(['release_time'],axis = 1)

In [265]:
test = test.fillna(0)

In [266]:
train.head(2)

Unnamed: 0,distributor,genre,time,screening_rat,dir_prev_num,num_staff,num_actor,box_off_num,rank,genre_rank,년,월
0,80,10,96,3,0,91,2,23398,12,28,2012,11
1,21,2,130,3,2,387,3,7072501,2,27,2015,11


In [267]:
py_train = train.copy()

In [268]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
cols = ['distributor','genre','screening_rat']
for i in cols:
    train[i] = le.fit_transform(train[i])
    test[i] = le.fit_transform(test[i])

In [269]:
train.head(2)

Unnamed: 0,distributor,genre,time,screening_rat,dir_prev_num,num_staff,num_actor,box_off_num,rank,genre_rank,년,월
0,80,10,96,3,0,91,2,23398,12,28,2012,11
1,21,2,130,3,2,387,3,7072501,2,27,2015,11


## 표준화

In [270]:
train_x = train.drop(['box_off_num'],axis= 1)
train_y = np.log1p(train['box_off_num'])

In [271]:
# from sklearn.preprocessing import MinMaxScaler
# # StandardScaler객체 생성
# scalerX = MinMaxScaler()
# scalerX.fit(train_x)
# train_scaled = scalerX.transform(train_x)
# train_scaled_df = pd.DataFrame(train_scaled, columns=train_x.columns)

# scalerX.fit(test)
# test_scaled = scalerX.transform(test)
# test_scaled_df = pd.DataFrame(test_scaled, columns=test.columns)

In [272]:
# scalery = MinMaxScaler()
# scalery.fit(train_y)
# y_train_scaled = scalery.transform(train_y)
# y_train_scaled_df = pd.DataFrame(y_train_scaled, columns=train_y.columns)

# 모델 정의 및 학습

In [273]:
from sklearn.ensemble import AdaBoostRegressor
from sklearn.linear_model import Ridge

model = AdaBoostRegressor(base_estimator=Ridge(alpha=2), n_estimators=200)

In [274]:
model.fit(train_x, train_y)

AdaBoostRegressor(base_estimator=Ridge(alpha=2), n_estimators=200)

In [275]:
print(model.score(train_x, train_y))

0.7542939061619774


In [276]:
test

Unnamed: 0,distributor,genre,time,screening_rat,dir_prev_num,num_staff,num_actor,rank,genre_rank,년,월
0,64,2,125,3,2,304,3,17.0,11,2010,1
1,16,5,113,0,4,275,3,2.0,34,2010,1
2,29,4,115,0,3,419,7,8.0,60,2010,1
3,16,10,116,1,2,408,2,2.0,17,2010,2
4,29,1,110,1,1,380,1,8.0,17,2010,2
...,...,...,...,...,...,...,...,...,...,...,...
238,42,4,78,1,1,4,4,0.0,60,2015,11
239,80,3,86,0,0,18,2,89.0,31,2015,10
240,23,2,107,3,0,224,4,0.0,11,2015,10
241,18,4,102,3,1,68,7,0.0,60,2015,10


In [277]:
y_pred = model.predict(test)

In [279]:
y_pred = np.expm1(y_pred)

In [282]:
submission = pd.read_csv('./data/submission.csv')
submission['box_off_num'] = y_pred
submission

Unnamed: 0,title,box_off_num
0,용서는 없다,2.772928e+05
1,아빠가 여자를 좋아해,4.250159e+05
2,하모니,1.064749e+06
3,의형제,5.782136e+05
4,평행 이론,4.427773e+05
...,...,...
238,해에게서 소년에게,7.439780e+04
239,울보 권투부,3.680019e+03
240,어떤살인,4.373175e+05
241,말하지 못한 비밀,1.635003e+05


In [284]:
submission.to_csv('./data/submission_adaboost.csv', index=False)

# GradientBoosting

In [286]:
#GradientBoostingRegressor
from sklearn.ensemble import GradientBoostingRegressor
reg_gbr = GradientBoostingRegressor(alpha=0.9, ccp_alpha=0.0, criterion='friedman_mse',
                          init=None, learning_rate=0.05, loss='ls', max_depth=3,
                          max_features='sqrt', max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=9, min_samples_split=8,
                          min_weight_fraction_leaf=0.0, n_estimators=1250,
                          n_iter_no_change=None, presort='deprecated',
                          random_state=None, subsample=0.8, tol=0.0001,
                          validation_fraction=0.1, verbose=0, warm_start=False)

In [302]:
#Hyper parameter tuning 
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

param_grid = {'min_samples_split':[2,4,6,8,10,20,40,60,100], 
              'min_samples_leaf':[1,3,5,7,9, 15, 20, 25, 30, 40, 50],
              'subsample':[0.7,0.75,0.8,0.85,0.9,0.95,1],
              'learning_rate':[0.15,0.1,0.05,0.01,0.005,0.001, 0.2], 
              'n_estimators':[10, 30, 50, 100,250,500,750,1000,1250,1500,1750],
              'max_features' : ['sqrt']
             }

asdf = GradientBoostingRegressor()

#clf = GridSearchCV(asdf, param_grid=param_grid, scoring='r2', n_jobs=-1)
clf = RandomizedSearchCV(asdf, param_grid, scoring='r2', n_jobs=-1)
 
clf.fit(train_x, train_y)

print(clf.best_estimator_)

GradientBoostingRegressor(learning_rate=0.15, max_features='sqrt',
                          min_samples_split=6, subsample=0.85)


In [303]:
new_gbr = GradientBoostingRegressor(learning_rate=0.15, max_features='sqrt',
                          min_samples_split=6, subsample=0.85)

In [304]:
new_gbr.fit(train_x, train_y)

GradientBoostingRegressor(learning_rate=0.15, max_features='sqrt',
                          min_samples_split=6, subsample=0.85)

In [311]:
y_pred = new_gbr.predict(test)

In [312]:
y_pred = np.expm1(y_pred)

submission = pd.read_csv('./data/submission.csv')
submission['box_off_num'] = y_pred
submission.to_csv('./data/submission_gradient_boosting.csv', index=False)

In [313]:
pd.read_csv('./data/submission_gradient_boosting.csv')

Unnamed: 0,title,box_off_num
0,용서는 없다,5.866570e+05
1,아빠가 여자를 좋아해,1.465475e+06
2,하모니,9.804724e+05
3,의형제,1.901820e+06
4,평행 이론,1.056428e+06
...,...,...
238,해에게서 소년에게,2.538945e+04
239,울보 권투부,4.393784e+03
240,어떤살인,1.078715e+06
241,말하지 못한 비밀,1.265785e+05


In [308]:
submission.sort_values(by = 'box_off_num')

Unnamed: 0,title,box_off_num
181,나는 야한 여자가 좋다,1.680403e+02
194,나는 야한 여자가 좋다 3,1.680403e+02
75,마티나,2.323466e+02
215,플랑크 상수,2.953317e+02
237,어떤이의 꿈,3.511245e+02
...,...,...
142,박수건달,6.053284e+06
229,베테랑,7.344193e+06
179,군도: 민란의 시대,8.898511e+06
135,용의자,9.582618e+06


In [300]:
submission.sort_values(by = 'box_off_num')

Unnamed: 0,title,box_off_num
75,마티나,8.340251e+01
23,엄지아빠,9.875188e+01
237,어떤이의 꿈,2.072114e+02
49,겨울냄새,2.507341e+02
181,나는 야한 여자가 좋다,2.730425e+02
...,...,...
81,내가 살인범이다,4.361302e+06
83,늑대소년,4.715625e+06
135,용의자,4.818537e+06
229,베테랑,5.266898e+06
