# Try

- CV -> Data의 개수가 적어서 그런지 성능이 좋지 못함
- remove release_time -> 개봉 시기와의 관계를 모델이 잘 인지하지 못할 것이라고 생각하여 제외했으나 성능 저하
- 상영 등급에 따라 수치로 변경 -> 등급에 따른 차이를 수치형으로 반영하고자 했지만 카테고리로 생각하는 것이 더 합리적이라는 결과

# Todo

- 정규분포 변환

## 데이터 및 라이브러리 로딩

In [802]:
import numpy as np
import pandas as pd
import seaborn as sns
import os, sys, shutil, time
import matplotlib.pyplot as plt
import xgboost
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestRegressor

In [803]:
# Project-relative locations of the input data and of the generated outputs.
# NOTE(review): 'Resluts' / RESLUT_DIR look like misspellings of "Results";
# they are kept as-is because the string is the on-disk directory name.
ROOT_DIR = './'
DATA_ROOT_DIR, RESLUT_DIR = (
    os.path.join(ROOT_DIR, subdir) for subdir in ('Data', 'Resluts')
)

In [804]:
# Load the training set, the test set and the submission template from the
# data directory, in that order.
train, test, submission = (
    pd.read_csv(os.path.join(DATA_ROOT_DIR, csv_name))
    for csv_name in ('movies_train.csv', 'movies_test.csv', 'submission.csv')
)

## 데이터 탐색

In [805]:
# Print the training frame's column names, dtypes and non-null counts.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 600 entries, 0 to 599
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   title           600 non-null    object 
 1   distributor     600 non-null    object 
 2   genre           600 non-null    object 
 3   release_time    600 non-null    object 
 4   time            600 non-null    int64  
 5   screening_rat   600 non-null    object 
 6   director        600 non-null    object 
 7   dir_prev_bfnum  270 non-null    float64
 8   dir_prev_num    600 non-null    int64  
 9   num_staff       600 non-null    int64  
 10  num_actor       600 non-null    int64  
 11  box_off_num     600 non-null    int64  
dtypes: float64(1), int64(5), object(6)
memory usage: 56.4+ KB


In [806]:
# Preview the first rows of the training frame.
train.head()

Unnamed: 0,title,distributor,genre,release_time,time,screening_rat,director,dir_prev_bfnum,dir_prev_num,num_staff,num_actor,box_off_num
0,개들의 전쟁,롯데엔터테인먼트,액션,2012-11-22,96,청소년 관람불가,조병옥,,0,91,2,23398
1,내부자들,(주)쇼박스,느와르,2015-11-19,130,청소년 관람불가,우민호,1161602.5,2,387,3,7072501
2,은밀하게 위대하게,(주)쇼박스,액션,2013-06-05,123,15세 관람가,장철수,220775.25,4,343,4,6959083
3,나는 공무원이다,(주)NEW,코미디,2012-07-12,101,전체 관람가,구자홍,23894.0,2,20,6,217866
4,불량남녀,쇼박스(주)미디어플렉스,코미디,2010-11-04,108,15세 관람가,신근호,1.0,1,251,2,483387


In [807]:
# Preview the first rows of the test frame.
test.head()

Unnamed: 0,title,distributor,genre,release_time,time,screening_rat,director,dir_prev_bfnum,dir_prev_num,num_staff,num_actor
0,용서는 없다,시네마서비스,느와르,2010-01-07,125,청소년 관람불가,김형준,300529.0,2,304,3
1,아빠가 여자를 좋아해,(주)쇼박스,멜로/로맨스,2010-01-14,113,12세 관람가,이광재,342700.2,4,275,3
2,하모니,CJ 엔터테인먼트,드라마,2010-01-28,115,12세 관람가,강대규,4206611.0,3,419,7
3,의형제,(주)쇼박스,액션,2010-02-04,116,15세 관람가,장훈,691342.0,2,408,2
4,평행 이론,CJ 엔터테인먼트,공포,2010-02-18,110,15세 관람가,권호영,31738.0,1,380,1


## 데이터 전처리

In [808]:
# Replace every missing value with 0 in both frames.
train = train.fillna(0)
test = test.fillna(0)

In [809]:
# Director name: too many distinct values to be useful, so drop the column.
train, test = (frame.drop(columns=['director']) for frame in (train, test))

In [810]:
# Title: treated as carrying no predictive meaning, so drop the column.
train, test = (frame.drop(columns=['title']) for frame in (train, test))

In [811]:
# Alternative spellings / related labels of a distributor mapped to the single
# canonical name used for modelling. Keys are matched against the whole
# (already cleaned) value, never as substrings. No canonical name is itself a
# key, so applying the mapping in one pass is equivalent to applying the
# replacements one after another.
_DISTRIBUTOR_ALIASES = {
    '리틀빅픽처스': '리틀빅픽쳐스',
    '마운틴픽처스': '마운틴픽쳐스',
    '스폰지': '스폰지이엔티',
    'KT': '싸이더스',
    '싸이더스FNH': '싸이더스',
    '조이앤시네마': '조이앤컨텐츠그룹',
    '케이알씨지': '조이앤컨텐츠그룹',
    '스크린조이': '조이앤컨텐츠그룹',
    '드림팩트엔터테인먼트': '조이앤컨텐츠그룹',
    'CJE&MPictures': 'CJ엔터테인먼트',
    'CJE&M영화부문': 'CJ엔터테인먼트',
    'CGV무비꼴라쥬': 'CGV아트하우스',
}


def _normalize_distributor(names):
    """Return a copy of the distributor Series with canonical names.

    Strips the '(주)' marker and all spaces from every value, then maps the
    known aliases in ``_DISTRIBUTOR_ALIASES`` to their canonical name.

    Args:
        names: pandas Series of distributor names (string values).

    Returns:
        A new pandas Series with the same index and cleaned names.
    """
    cleaned = names.apply(lambda name: name.replace('(주)', '').replace(' ', ''))
    return cleaned.replace(_DISTRIBUTOR_ALIASES)


# Apply the exact same cleanup to both frames so the two cannot drift apart.
train['distributor'] = _normalize_distributor(train['distributor'])
test['distributor'] = _normalize_distributor(test['distributor'])

In [813]:
# # 영화 개수가 10개 이하인 배급사를 '기타'로처리
# distributor_list = list(train.distributor.value_counts()[train.distributor.value_counts() > 10].keys())
# def func(distributor):
#     if distributor in distributor_list:
#         return distributor
#     else:
#         return '기타'

# train['distributor'] = train['distributor'].apply(lambda x : func(x))
# test['distributor'] = test['distributor'].apply(lambda x : func(x))


In [814]:
# Keep the five most frequent distributors of the training set and bucket
# every other distributor as '기타'.
# value_counts() is indexed by distributor name, so the names are taken from
# its index explicitly rather than relying on `in <Series>` testing index
# labels.
distributor_list = list(train['distributor'].value_counts().index[:5])


def func(distributor):
    """Return `distributor` if it is in `distributor_list`, else '기타'."""
    return distributor if distributor in distributor_list else '기타'


train['distributor'] = train['distributor'].apply(func)
test['distributor'] = test['distributor'].apply(func)


In [815]:
# Derive year and month features from the release date string, then drop
# the raw date column. Both features are kept as strings
# (characters 0-3 and 5-6 of the date respectively).
for frame in (train, test):
    release = frame['release_time']
    frame['year'] = release.map(lambda stamp: str(stamp[:4]))
    frame['month'] = release.map(lambda stamp: str(stamp[5:7]))

train, test = (frame.drop(columns=['release_time']) for frame in (train, test))

In [816]:
# # 상영 등급에 따라 수치로 변경 - 악영향
# def convert_rat(rat):
#     if rat == '전체 관람가':
#         return 0
#     elif rat == '12세 관람가':
#         return 1
#     elif rat == '15세 관람가':
#         return 2
#     elif rat == '청소년 관람불가':
#         return 3
    
# train['screening_rat'] = train['screening_rat'].apply(lambda x: convert_rat(x))
# test['screening_rat'] = test['screening_rat'].apply(lambda x: convert_rat(x))


In [817]:
# # num_actor column 삭제 - 악영향
# train = train.drop(['num_actor'],axis= 1)
# test = test.drop(['num_actor'],axis= 1)

In [818]:
# Log-transform the number of actors, log(1 + x), in both frames.
for frame in (train, test):
    frame['num_actor'] = np.log1p(frame['num_actor'])

In [819]:
# One-hot encode the remaining categorical (object) columns.
train = pd.get_dummies(train)
test = pd.get_dummies(test)

# The two frames are encoded independently, so a category present in only one
# of them yields a different column set or order. Align the test frame to the
# training feature columns (everything except the target): categories unseen
# in test become all-zero columns, categories unseen in train are dropped.
feature_columns = train.columns.drop('box_off_num')
test = test.reindex(columns=feature_columns, fill_value=0)

## 모델정의 및 학습

In [820]:
# Split the training frame into the target (box_off_num) and the features
# (every other column).
train_y = train['box_off_num']
train_x = train.drop(columns=['box_off_num'])

In [821]:
# sc = MinMaxScaler()
# scaled_train_x  = sc.fit_transform(train_x)
# scaled_test     = sc.transform(test)

In [822]:
from sklearn.model_selection import KFold

# 10-fold cross-validation splitter; rows are shuffled with a fixed seed.
cv_settings = dict(n_splits=10, shuffle=True, random_state=42)
kf = KFold(**cv_settings)

In [823]:
# Random forest with 100 trees. The seed is fixed (same value as the KFold
# splitter) so that CV scores and the submission are reproducible between
# runs. The model is not fitted here.
model = RandomForestRegressor(n_estimators=100, random_state=42)

In [824]:
from sklearn.metrics import mean_squared_error

# Cross-validate the random forest. For every fold the model is refitted on
# the training part, scored (RMSE) on the held-out part, and used to predict
# the test set; rf_pred is the mean of the per-fold test predictions.
n_splits = kf.get_n_splits()  # keep the averaging in sync with the splitter
rmse_list = []
rf_pred = np.zeros(test.shape[0])
for tr_idx, val_idx in kf.split(train_x, train_y):
    tr_x, tr_train_y = train_x.iloc[tr_idx], train_y.iloc[tr_idx]
    val_x, val_train_y = train_x.iloc[val_idx], train_y.iloc[val_idx]

    model.fit(tr_x, tr_train_y)

    # Validation score of this fold.
    pred = model.predict(val_x)
    rmse = np.sqrt(mean_squared_error(val_train_y, pred))
    rmse_list.append(rmse)

    # Each fold contributes 1/n_splits of the final test prediction.
    sub_pred = model.predict(test)
    rf_pred += sub_pred / n_splits

### Xgboost

In [825]:
# Fit an XGBoost regressor with 100 boosting rounds on the full training set.
# fit() returns the estimator itself, so construction and fitting are chained.
xgb_model = xgboost.XGBRegressor(n_estimators=100).fit(train_x, train_y)

# 학습된 모델로 예측 데이터 생성

In [826]:
# Predict the test set with the XGBoost model fitted in the previous cell.
# The original used `model`, i.e. the random forest as fitted on the last CV
# fold only, which left `xgb_model` unused.
# NOTE(review): confirm XGBoost is the intended model here.
# The test frame is aligned to the training feature columns because XGBoost
# checks feature names at predict time.
pred = xgb_model.predict(test.reindex(columns=train_x.columns, fill_value=0))

## 제출파일 생성

In [827]:
# Reload a clean copy of the submission template. The path is built the same
# way as the initial load, without the stray './' segment.
submission = pd.read_csv(os.path.join(DATA_ROOT_DIR, 'submission.csv'))

In [828]:
# Write the fold-averaged random-forest predictions into the target column.
submission = submission.assign(box_off_num=rf_pred)

In [829]:
# Display the submission frame for a visual check.
submission

Unnamed: 0,title,box_off_num
0,용서는 없다,2343607.745
1,아빠가 여자를 좋아해,1284789.964
2,하모니,1865849.768
3,의형제,1609421.292
4,평행 이론,965423.642
...,...,...
238,해에게서 소년에게,95878.166
239,울보 권투부,8626.469
240,어떤살인,263972.711
241,말하지 못한 비밀,7758.831


In [830]:
# Save the submission under a timestamped name (month-day_hour-minute-second,
# local time) so earlier results are not overwritten.
now = time.strftime('%m%d_%H%M%S')
# to_csv does not create missing directories; make sure the target exists.
os.makedirs(RESLUT_DIR, exist_ok=True)
submission.to_csv(os.path.join(RESLUT_DIR, f'baseline_{now}.csv'), index=False)