## 데이터 및 라이브러리 로딩

In [43]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
import numpy as np

# Load the training and test tables (paths are relative to the working directory).
train, test = (
    pd.read_csv(csv_name)
    for csv_name in ('movies_train.csv', 'movies_test.csv')
)

In [44]:
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error

## 데이터 탐색

In [45]:
# Print the column names, dtypes and non-null counts of the training frame.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 600 entries, 0 to 599
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   title           600 non-null    object 
 1   distributor     600 non-null    object 
 2   genre           600 non-null    object 
 3   release_time    600 non-null    object 
 4   time            600 non-null    int64  
 5   screening_rat   600 non-null    object 
 6   director        600 non-null    object 
 7   dir_prev_bfnum  270 non-null    float64
 8   dir_prev_num    600 non-null    int64  
 9   num_staff       600 non-null    int64  
 10  num_actor       600 non-null    int64  
 11  box_off_num     600 non-null    int64  
dtypes: float64(1), int64(5), object(6)
memory usage: 56.4+ KB


In [46]:
# Preview the first five training rows.
train.head(n=5)

Unnamed: 0,title,distributor,genre,release_time,time,screening_rat,director,dir_prev_bfnum,dir_prev_num,num_staff,num_actor,box_off_num
0,개들의 전쟁,롯데엔터테인먼트,액션,2012-11-22,96,청소년 관람불가,조병옥,,0,91,2,23398
1,내부자들,(주)쇼박스,느와르,2015-11-19,130,청소년 관람불가,우민호,1161602.5,2,387,3,7072501
2,은밀하게 위대하게,(주)쇼박스,액션,2013-06-05,123,15세 관람가,장철수,220775.25,4,343,4,6959083
3,나는 공무원이다,(주)NEW,코미디,2012-07-12,101,전체 관람가,구자홍,23894.0,2,20,6,217866
4,불량남녀,쇼박스(주)미디어플렉스,코미디,2010-11-04,108,15세 관람가,신근호,1.0,1,251,2,483387


In [47]:
# Preview the first five test rows.
test.head(n=5)

Unnamed: 0,title,distributor,genre,release_time,time,screening_rat,director,dir_prev_bfnum,dir_prev_num,num_staff,num_actor
0,용서는 없다,시네마서비스,느와르,2010-01-07,125,청소년 관람불가,김형준,300529.0,2,304,3
1,아빠가 여자를 좋아해,(주)쇼박스,멜로/로맨스,2010-01-14,113,12세 관람가,이광재,342700.2,4,275,3
2,하모니,CJ 엔터테인먼트,드라마,2010-01-28,115,12세 관람가,강대규,4206611.0,3,419,7
3,의형제,(주)쇼박스,액션,2010-02-04,116,15세 관람가,장훈,691342.0,2,408,2
4,평행 이론,CJ 엔터테인먼트,공포,2010-02-18,110,15세 관람가,권호영,31738.0,1,380,1


## 데이터 전처리

In [48]:
# Drop dir_prev_bfnum: it has many missing values
# (only 270 of 600 training rows are non-null).
train = train.drop(columns='dir_prev_bfnum')
test = test.drop(columns='dir_prev_bfnum')

In [49]:
# Drop the director name: it takes too many distinct values to be useful.
train = train.drop(columns='director')
test = test.drop(columns='director')

In [50]:
# Drop the title: it is not treated as a meaningful feature.
train = train.drop(columns='title')
test = test.drop(columns='title')

In [51]:
# Count how many training rows each distributor has.
train['distributor'].value_counts()

CJ 엔터테인먼트        54
롯데엔터테인먼트         52
(주)NEW           30
(주)마운틴픽쳐스        29
(주)쇼박스           26
                 ..
OAL(올)            1
(주)에이원 엔터테인먼트     1
(주)콘텐츠 윙          1
위더스필름             1
퍼스트런              1
Name: distributor, Length: 169, dtype: int64

In [52]:
# Keep the five most frequent training-set distributors and relabel every
# other distributor as '기타'.
distributor_list = train['distributor'].value_counts().iloc[:5]


def func(distributor):
    """Return *distributor* if it is one of the top five, otherwise '기타'."""
    # `in` on a Series tests its index labels, which here are distributor names.
    return distributor if distributor in distributor_list else '기타'


train['distributor'] = train['distributor'].map(func)
test['distributor'] = test['distributor'].map(func)


In [53]:
# Build integer year ('년') and month ('월') columns from the release date
# string (characters 0-3 and 5-6 respectively), then drop the raw column.
train = train.assign(
    **{
        '년': [int(released[:4]) for released in train['release_time']],
        '월': [int(released[5:7]) for released in train['release_time']],
    }
).drop(columns='release_time')

test = test.assign(
    **{
        '년': [int(released[:4]) for released in test['release_time']],
        '월': [int(released[5:7]) for released in test['release_time']],
    }
).drop(columns='release_time')


In [54]:
# One-hot encode the remaining categorical (object) columns.
train = pd.get_dummies(train)
test = pd.get_dummies(test)
# Each frame is encoded on its own, so a category that appears in only one of
# them would give train and test different columns and break prediction.
# Force test onto the training feature layout: categories missing from test
# become all-zero columns, and test-only categories are dropped.
test = test.reindex(
    columns=train.columns.drop('box_off_num', errors='ignore'),
    fill_value=0,
)

In [55]:
# Separate the target (box_off_num) from the feature columns.
train_y = train['box_off_num']
train_x = train.drop(columns='box_off_num')

In [56]:
# Apply log1p to num_actor.
# Work on copies: plain assignment would only alias train_x / test, so the
# originals would be modified too and re-running this cell would apply the
# transform a second time.
train_x_log_ac = train_x.copy()
train_x_log_ac['num_actor'] = np.log1p(train_x_log_ac['num_actor'])
test_ac = test.copy()
test_ac['num_actor'] = np.log1p(test_ac['num_actor'])

## 모델 정의 및 학습

In [57]:
def print_best_params(model, params, train_x, train_y, cv=10):
    """Grid-search *model* over *params* and print the best CV RMSE and parameters.

    Args:
        model: Unfitted estimator passed to GridSearchCV.
        params: Parameter grid (mapping of parameter name to candidate values).
        train_x: Training features.
        train_y: Training target.
        cv: Number of cross-validation folds; defaults to 10.
    """
    grid_model = GridSearchCV(
        model,
        param_grid=params,
        scoring='neg_mean_squared_error',
        cv=cv,
    )
    grid_model.fit(train_x, train_y)
    # The scorer is negated MSE, so flip the sign before taking the root.
    rmse = np.sqrt(-1 * grid_model.best_score_)
    # Print the message itself: wrapping it in braces built a one-element set,
    # which printed as {"..."}. The label now says "parameters" because
    # best_params_ holds the whole winning combination, not an alpha value.
    print('{0} {1} CV시 최적 평균 RMSE값:{2}, 최적 파라미터:{3}'.format(
        model.__class__.__name__, cv, np.round(rmse, 4), grid_model.best_params_))

In [58]:
# Grid search for XGBoost: every combination in xgb_params is evaluated, so
# the n_estimators / learning_rate given to the constructor are overridden.
xgb_params = {
    'n_estimators': [10, 25, 50, 75, 80, 100, 150],
    'learning_rate': [0.01, 0.025, 0.05],
}
xgb_reg_ac = XGBRegressor(
    n_estimators=100,
    learning_rate=0.05,
    colsample_bytree=0.5,
    subsample=0.8,
)
print_best_params(
    model=xgb_reg_ac,
    params=xgb_params,
    train_x=train_x_log_ac,
    train_y=train_y,
)

{"XGBRegressor 10 CV시 최적 평균 RMSE값:1423333.903, 최적 alpha:{'learning_rate': 0.05, 'n_estimators': 75}"}


In [59]:
# Grid search for LightGBM: every combination in lgbm_params is evaluated, so
# the n_estimators / learning_rate given to the constructor are overridden.
lgbm_params = {
    'n_estimators': [10, 50, 75, 100, 1000],
    'learning_rate': [0.01, 0.05, 0.075, 0.1],
}
lgbm_reg = LGBMRegressor(
    n_estimators=100,
    learning_rate=0.05,
    n_jobs=-1,
)
print_best_params(
    model=lgbm_reg,
    params=lgbm_params,
    train_x=train_x_log_ac,
    train_y=train_y,
)

{"LGBMRegressor 10 CV시 최적 평균 RMSE값:1488307.1457, 최적 alpha:{'learning_rate': 0.05, 'n_estimators': 50}"}


## 학습된 모델로 예측 데이터 생성

In [60]:
# Fit the final XGBoost model on the full training set and predict the test set.
# NOTE(review): n_estimators is 150 here, while the recorded grid-search
# output reports 75 as the best value — confirm this is intentional.
xgb_reg_ac = XGBRegressor(
    n_estimators=150,
    learning_rate=0.05,
    colsample_bytree=0.5,
    subsample=0.8,
).fit(train_x_log_ac, train_y)
pred_xgb_ac = xgb_reg_ac.predict(test_ac)



In [61]:
# Fit the final LightGBM model on the full training set and predict the test set.
lgbm_reg_ac = LGBMRegressor(
    n_estimators=50,
    learning_rate=0.05,
    n_jobs=-1,
).fit(train_x_log_ac, train_y)
pred_lgbm_ac = lgbm_reg_ac.predict(test_ac)

In [62]:
# Blend the two models' predictions: 60% XGBoost, 40% LightGBM.
pred_xgb_lgbm_ac = pred_xgb_ac * 0.6 + pred_lgbm_ac * 0.4

In [63]:
# Predictions are audience counts, so clamp any negative value to 0.
# np.clip does this in one vectorised call and keeps the result an ndarray
# instead of rebuilding it as a Python list element by element.
pred_xgb_lgbm_ac = np.clip(pred_xgb_lgbm_ac, 0, None)

## 제출파일 생성

In [64]:
# Load the sample submission, overwrite its box_off_num column with the
# blended predictions, and write the result to a new CSV file.
# The bare `submission_ac` expressions that used to sit between these
# statements were removed: mid-cell they neither displayed nor changed anything.
submission_ac = pd.read_csv('submission.csv')
submission_ac['box_off_num'] = pred_xgb_lgbm_ac
submission_ac.to_csv('xgb_lgbm_ac_0.csv', index=False, encoding='utf-8-sig')