## 데이터 및 라이브러리 로딩

In [70]:
pip install pycaret

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting markupsafe~=2.1.1
  Using cached MarkupSafe-2.1.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (25 kB)
Installing collected packages: markupsafe
  Attempting uninstall: markupsafe
    Found existing installation: MarkupSafe 2.0.1
    Uninstalling MarkupSafe-2.0.1:
      Successfully uninstalled MarkupSafe-2.0.1
Successfully installed markupsafe-2.1.1


In [71]:
!pip install markupsafe==2.0.1

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting markupsafe==2.0.1
  Using cached MarkupSafe-2.0.1-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (31 kB)
Installing collected packages: markupsafe
  Attempting uninstall: markupsafe
    Found existing installation: MarkupSafe 2.1.1
    Uninstalling MarkupSafe-2.1.1:
      Successfully uninstalled MarkupSafe-2.1.1
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
pandas-profiling 3.2.0 requires markupsafe~=2.1.1, but you have markupsafe 2.0.1 which is incompatible.[0m
Successfully installed markupsafe-2.0.1


In [72]:
import pandas as pd
import seaborn as sns
import os, sys, shutil
import matplotlib.pyplot as plt
from pycaret.regression import *
from sklearn.ensemble import RandomForestRegressor


In [73]:
ROOT_DIR = './'
DATA_ROOT_DIR = os.path.join(ROOT_DIR, 'Data')

In [74]:
train = pd.read_csv(os.path.join(DATA_ROOT_DIR, 'movies_train.csv'))
test = pd.read_csv(os.path.join(DATA_ROOT_DIR, 'movies_test.csv'))
submission = pd.read_csv(os.path.join(DATA_ROOT_DIR, 'submission.csv'))

## 데이터탐색

In [75]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 600 entries, 0 to 599
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   title           600 non-null    object 
 1   distributor     600 non-null    object 
 2   genre           600 non-null    object 
 3   release_time    600 non-null    object 
 4   time            600 non-null    int64  
 5   screening_rat   600 non-null    object 
 6   director        600 non-null    object 
 7   dir_prev_bfnum  270 non-null    float64
 8   dir_prev_num    600 non-null    int64  
 9   num_staff       600 non-null    int64  
 10  num_actor       600 non-null    int64  
 11  box_off_num     600 non-null    int64  
dtypes: float64(1), int64(5), object(6)
memory usage: 56.4+ KB


In [76]:
train.head()

Unnamed: 0,title,distributor,genre,release_time,time,screening_rat,director,dir_prev_bfnum,dir_prev_num,num_staff,num_actor,box_off_num
0,개들의 전쟁,롯데엔터테인먼트,액션,2012-11-22,96,청소년 관람불가,조병옥,,0,91,2,23398
1,내부자들,(주)쇼박스,느와르,2015-11-19,130,청소년 관람불가,우민호,1161602.5,2,387,3,7072501
2,은밀하게 위대하게,(주)쇼박스,액션,2013-06-05,123,15세 관람가,장철수,220775.25,4,343,4,6959083
3,나는 공무원이다,(주)NEW,코미디,2012-07-12,101,전체 관람가,구자홍,23894.0,2,20,6,217866
4,불량남녀,쇼박스(주)미디어플렉스,코미디,2010-11-04,108,15세 관람가,신근호,1.0,1,251,2,483387


In [77]:
test.head()

Unnamed: 0,title,distributor,genre,release_time,time,screening_rat,director,dir_prev_bfnum,dir_prev_num,num_staff,num_actor
0,용서는 없다,시네마서비스,느와르,2010-01-07,125,청소년 관람불가,김형준,300529.0,2,304,3
1,아빠가 여자를 좋아해,(주)쇼박스,멜로/로맨스,2010-01-14,113,12세 관람가,이광재,342700.2,4,275,3
2,하모니,CJ 엔터테인먼트,드라마,2010-01-28,115,12세 관람가,강대규,4206611.0,3,419,7
3,의형제,(주)쇼박스,액션,2010-02-04,116,15세 관람가,장훈,691342.0,2,408,2
4,평행 이론,CJ 엔터테인먼트,공포,2010-02-18,110,15세 관람가,권호영,31738.0,1,380,1


## 데이터 전처리

In [78]:
# Replace nan with zero
train.fillna(0, inplace=True)
test.fillna(0, inplace=True)

In [79]:
# 감독명 : 너무 다양해서 제거
train = train.drop(['director'],axis = 1)
test = test.drop(['director'],axis = 1)

In [80]:
# 제목 : 의미가 없기 때문에 제거
train = train.drop(['title'],axis= 1)
test = test.drop(['title'],axis= 1)

In [81]:
train.distributor.value_counts()

CJ 엔터테인먼트        54
롯데엔터테인먼트         52
(주)NEW           30
(주)마운틴픽쳐스        29
(주)쇼박스           26
                 ..
OAL(올)            1
(주)에이원 엔터테인먼트     1
(주)콘텐츠 윙          1
위더스필름             1
퍼스트런              1
Name: distributor, Length: 169, dtype: int64

In [82]:
# 상위 5개의 배급사를 제외하고 '기타'로처리
distributor_list = train.distributor.value_counts()[:5]
def func(distributor):
    if distributor in distributor_list:
        return distributor
    else:
        return '기타'

train['distributor'] = train['distributor'].apply(lambda x : func(x))
test['distributor'] = test['distributor'].apply(lambda x : func(x))


In [83]:
train.drop(['box_off_num'], axis=1)

Unnamed: 0,distributor,genre,release_time,time,screening_rat,dir_prev_bfnum,dir_prev_num,num_staff,num_actor
0,롯데엔터테인먼트,액션,2012-11-22,96,청소년 관람불가,0.00,0,91,2
1,(주)쇼박스,느와르,2015-11-19,130,청소년 관람불가,1161602.50,2,387,3
2,(주)쇼박스,액션,2013-06-05,123,15세 관람가,220775.25,4,343,4
3,(주)NEW,코미디,2012-07-12,101,전체 관람가,23894.00,2,20,6
4,기타,코미디,2010-11-04,108,15세 관람가,1.00,1,251,2
...,...,...,...,...,...,...,...,...,...
595,(주)NEW,드라마,2014-08-13,111,청소년 관람불가,3833.00,1,510,7
596,(주)쇼박스,드라마,2013-03-14,127,15세 관람가,496061.00,1,286,6
597,(주)마운틴픽쳐스,공포,2010-09-30,99,청소년 관람불가,0.00,0,123,4
598,CJ 엔터테인먼트,느와르,2015-05-14,102,15세 관람가,0.00,0,431,4


In [93]:
numeric_col = ['time', 'dir_prev_bfnum', 'dir_prev_num', 'num_staff', 'num_actor']
categori_col = ['distributor', 'genre', 'screening_rat']

In [107]:
s = setup(train, target='box_off_num', numeric_features=numeric_col, ignore_features=categori_col)

Unnamed: 0,Description,Value
0,session_id,4809
1,Target,box_off_num
2,Original Data,"(600, 10)"
3,Missing Values,False
4,Numeric Features,5
5,Categorical Features,0
6,Ordinal Features,False
7,High Cardinality Features,False
8,High Cardinality Method,
9,Transformed Train Set,"(419, 24)"


INFO:logs:create_model_container: 0
INFO:logs:master_model_container: 0
INFO:logs:display_container: 1
INFO:logs:Pipeline(memory=None,
         steps=[('dtypes',
                 DataTypes_Auto_infer(categorical_features=[],
                                      display_types=True,
                                      features_todrop=['distributor', 'genre',
                                                       'screening_rat'],
                                      id_columns=[], ml_usecase='regression',
                                      numerical_features=['time',
                                                          'dir_prev_bfnum',
                                                          'dir_prev_num',
                                                          'num_staff',
                                                          'num_actor'],
                                      target='box_off_num', time_features=[])),
                ('imputer',
                 Sim

In [108]:
top3 = compare_models(n_select=3, fold=5, sort="RMSE")

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
en,Elastic Net,781971.2,2362648000000.0,1466474.0,0.2675,4.0375,1934.0809,0.046
omp,Orthogonal Matching Pursuit,769406.9,2363361000000.0,1474102.0,0.2505,3.9207,1144.8854,0.02
lr,Linear Regression,841347.8,2440944000000.0,1505112.0,0.214,4.2371,1761.6198,0.028
lightgbm,Light Gradient Boosting Machine,753753.8,2427069000000.0,1507739.0,0.1859,3.6868,451.3162,0.04
ridge,Ridge Regression,841570.7,2458764000000.0,1511413.0,0.2059,4.2381,1695.2493,0.036
lasso,Lasso Regression,846037.1,2497022000000.0,1526671.0,0.1842,4.2325,1666.7658,0.026
llar,Lasso Least Angle Regression,848306.1,2499445000000.0,1527313.0,0.1837,4.2339,1666.4888,0.022
lar,Least Angle Regression,852738.4,2506606000000.0,1529887.0,0.1805,4.2439,1718.0559,0.042
rf,Random Forest Regressor,685946.2,2517612000000.0,1530683.0,0.18,3.0319,724.132,0.616
huber,Huber Regressor,619076.6,2968014000000.0,1653102.0,0.0478,5.6,2787.909,0.036


INFO:logs:create_model_container: 18
INFO:logs:master_model_container: 18
INFO:logs:display_container: 2
INFO:logs:[ElasticNet(alpha=1.0, copy_X=True, fit_intercept=True, l1_ratio=0.5,
           max_iter=1000, normalize=False, positive=False, precompute=False,
           random_state=4809, selection='cyclic', tol=0.0001, warm_start=False), OrthogonalMatchingPursuit(fit_intercept=True, n_nonzero_coefs=None,
                          normalize=True, precompute='auto', tol=None), LinearRegression(copy_X=True, fit_intercept=True, n_jobs=-1, normalize=False)]
INFO:logs:compare_models() succesfully completed......................................


In [109]:
blender_top3 = blend_models(top3)

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,643518.0,1037531000000.0,1018593.0,0.3619,4.0553,295.4815
1,741603.5,1236179000000.0,1111836.0,0.282,3.6612,216.2069
2,619320.0,941319900000.0,970216.4,-0.0992,4.3799,7494.5472
3,716885.9,1889900000000.0,1374736.0,0.2989,3.4719,121.5554
4,499826.7,575574000000.0,758665.9,-1.0141,3.8553,291.6604
5,746642.1,1684695000000.0,1297958.0,0.3178,3.9716,1230.6348
6,940039.7,5519856000000.0,2349437.0,0.3067,3.5628,109.8148
7,835745.9,3448485000000.0,1857010.0,0.2041,4.5675,7769.9282
8,1000298.0,3657346000000.0,1912419.0,0.3217,4.482,573.3951
9,927283.1,2825044000000.0,1680787.0,0.3984,3.8578,353.1475


INFO:logs:create_model_container: 19
INFO:logs:master_model_container: 19
INFO:logs:display_container: 3
INFO:logs:VotingRegressor(estimators=[('en',
                             ElasticNet(alpha=1.0, copy_X=True,
                                        fit_intercept=True, l1_ratio=0.5,
                                        max_iter=1000, normalize=False,
                                        positive=False, precompute=False,
                                        random_state=4809, selection='cyclic',
                                        tol=0.0001, warm_start=False)),
                            ('omp',
                             OrthogonalMatchingPursuit(fit_intercept=True,
                                                       n_nonzero_coefs=None,
                                                       normalize=True,
                                                       precompute='auto',
                                                       tol=None)),
               

In [110]:
pred = predict_model(blender_top3, data=test)

INFO:logs:Initializing predict_model()
INFO:logs:predict_model(estimator=VotingRegressor(estimators=[('en',
                             ElasticNet(alpha=1.0, copy_X=True,
                                        fit_intercept=True, l1_ratio=0.5,
                                        max_iter=1000, normalize=False,
                                        positive=False, precompute=False,
                                        random_state=4809, selection='cyclic',
                                        tol=0.0001, warm_start=False)),
                            ('omp',
                             OrthogonalMatchingPursuit(fit_intercept=True,
                                                       n_nonzero_coefs=None,
                                                       normalize=True,
                                                       precompute='auto',
                                                       tol=None)),
                            ('lr',
                      

In [112]:
submission['box_off_num'] = pred.Label

In [113]:
submission.to_csv('AutoML.csv', index=False)