#### 이번 대회에서는 범주형 변수 전처리를 위해 One-Hot Encoding과 for문을 사용했습니다.

#### 이는 train data로 fit한 One-Hot Encoder로 test data를 transform할 경우,
#### train data에는 속하지 않은 데이터가 test data에 있을 가능성이 있어 에러가 발생할 수 있기 때문입니다.

#### 이를 방지하기 위해 예외적인 상황에 대처할 수 있는 코드를 삽입해서 이중 for문을 작성했습니다.
#### 참고해 주시길 바랍니다.

# Import

In [1]:
import pandas as pd
import random
import os
import numpy as np

from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import Ridge

# Fixed RandomSeed

In [2]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)

seed_everything(42) # Seed 고정

# Data Load

In [3]:
from google.colab import drive
drive.mount('/content/gdrive/')

Mounted at /content/gdrive/


In [46]:
!unzip -qq "/content/gdrive/MyDrive/브동산.zip"

replace sample_submission.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: 

In [47]:
train_df = pd.read_csv('/content/train.csv')

In [48]:
train_x = train_df.drop(columns=['ID'])
train_y = train_df['monthlyRent(us_dollar)']

In [49]:
test_x =  pd.read_csv('/content/test.csv').drop(columns=['ID'])

# Data Pre-processing

#### 범주형 변수는 크게 명목형 변수와 순서형 변수로 나눌 수 있습니다.
#### 순서형 변수의 경우 그 순서대로 수치값을 레이블로 부여하여 간단히 수치화 할 수 있지만,
#### 명목형 변수의 경우 값들의 순서 관계가 없어 수치 레이블링으로는 그 관계를 정확히 표현할 수 없습니다.
#### 그렇기에 명목형 변수의 경우 값들 각각을 새로운 컬럼으로 만들고, 원래 해당하던 값에는 1을,
#### 아닐 경우 0을 부여하는 One-Hot Encoding 방법이 존재합니다. 

In [50]:
# qualitative column one-hot encoding
qual_col = ['propertyType','suburbName']
ohe = OneHotEncoder(sparse=False)

for i in qual_col:
    train_x = pd.concat([train_x, pd.DataFrame(ohe.fit_transform(train_x[[i]]), columns=ohe.categories_[0])], axis=1)
    
    for qual_value in np.unique(test_x[i]): 
        if qual_value not in np.unique(ohe.categories_): 
            ohe.categories_ = np.append(ohe.categories_, qual_value)
    # One Hot Encoder가 Test 데이터로부터 Fitting되는 것은 Data Leakage이므로, Test 데이터에는 Train 데이터로 Fitting된 One Hot Encoder로부터 transform만 수행되어야 합니다.
    test_x = pd.concat([test_x, pd.DataFrame(ohe.transform(test_x[[i]]), columns=ohe.categories_[0])], axis=1)
    
train_x = train_x.drop(qual_col, axis=1)
test_x = test_x.drop(qual_col, axis=1)
print('Done.')

Done.


In [40]:
!pip install --pre pycaret

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pycaret
  Downloading pycaret-3.0.0rc5-py3-none-any.whl (501 kB)
[K     |████████████████████████████████| 501 kB 5.1 MB/s 
Collecting plotly-resampler>=0.7.2.2
  Downloading plotly_resampler-0.8.3.tar.gz (45 kB)
[K     |████████████████████████████████| 45 kB 3.6 MB/s 
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
Collecting psutil>=5.9.0
  Downloading psutil-5.9.4-cp36-abi3-manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (280 kB)
[K     |████████████████████████████████| 280 kB 42.2 MB/s 
Collecting scikit-plot>=0.3.7
  Downloading scikit_plot-0.3.7-py3-none-any.whl (33 kB)
Collecting sktime>=0.14.0
  Downloading sktime-0.14.1-py3-none-any.whl (15.9 MB)
[K     |████████████████████████████████| 15.9 MB 62.2 MB/

In [41]:
from pycaret.regression import *

In [51]:
train_x.columns

Index(['bedrooms', 'latitude', 'longitude', 'distanceMetro(km)',
       'distanceAirport(km)', 'distanceHospital(km)', 'distanceRailway(km)',
       'area(square_meters)', 'monthlyRent(us_dollar)', 'Apartment',
       'Independent Floor', 'Independent House', 'Villa', 'Delhi Central',
       'Delhi East', 'Delhi North', 'Delhi South', 'Delhi West', 'Dwarka',
       'North Delhi', 'North West Delhi', 'Other', 'Rohini',
       'South West Delhi', 'West Delhi'],
      dtype='object')

In [42]:
from pycaret import regression

In [53]:
regression_setup = regression.setup(data=train_x, target='monthlyRent(us_dollar)',train_size = 0.8)

Unnamed: 0,Description,Value
0,Session id,4656
1,Target,monthlyRent(us_dollar)
2,Target type,Regression
3,Data shape,"(8692, 25)"
4,Train data shape,"(6953, 25)"
5,Test data shape,"(1739, 25)"
6,Numeric features,24
7,Preprocess,True
8,Imputation type,simple
9,Numeric imputation,mean


In [54]:
top5_model = compare_models(sort = 'MAE', n_select = 5)

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
rf,Random Forest Regressor,60.1765,10154.1796,100.6683,0.8059,0.2692,0.2089,3.222
lightgbm,Light Gradient Boosting Machine,60.2513,9877.4184,99.2641,0.8109,0.2655,0.2103,0.219
et,Extra Trees Regressor,60.5762,10326.3182,101.5844,0.8023,0.2713,0.2098,1.615
gbr,Gradient Boosting Regressor,63.5504,10227.0516,101.0439,0.8042,0.2784,0.2275,1.302
knn,K Neighbors Regressor,67.4788,12808.4408,113.0467,0.7551,0.2979,0.2329,0.137
dt,Decision Tree Regressor,77.8338,18224.0098,134.5832,0.6529,0.3504,0.2639,0.077
huber,Huber Regressor,82.0243,19228.8495,137.3598,0.6311,0.4035,0.2887,0.185
lasso,Lasso Regression,85.4873,20043.4387,138.8595,0.6145,0.4451,0.3262,0.032
ridge,Ridge Regression,85.7489,33000.446,158.6348,0.3574,0.4597,0.3359,0.029
lr,Linear Regression,86.0278,41071.9398,167.2334,0.1973,0.4575,0.3397,0.67


Processing:   0%|          | 0/81 [00:00<?, ?it/s]

# Model Hyperparameter Setting

#### 대부분의 모델들은 사람이 직접 설정할 수 있는 Hyperparameter를 가지고 있습니다.
#### 이런 Hyperparameter에 어떤 값이 설정되는가에 따라 모델의 성능은 크게 차이나게 됩니다. 
#### 본 Baseline에서 제공한 Ridge Regression 모델에서는 alpha를 Hyperparameter로 제공했습니다. 
#### alpha는 모델의 규제항으로, 모델의 오버피팅을 방지하는 역할을 합니다.


In [25]:
!pip install tqdm

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [26]:
from tqdm import tqdm
from time import sleep

https://dacon.io/en/competitions/open/235698/talkboard/404315

In [30]:
#GRIDSEARCH를 이용한 최적화
 
params = {
    'n_estimators':(50,42),
    'max_depth' : (5,6,7, 8),
    'random_state' : (0,42,50,100),
    'min_samples_leaf' : (2,8,9,10, 18),
    'min_samples_split' : (2,8,9,10, 16)
}
rf_run = RandomForestRegressor(random_state=0, n_jobs=-1)
grid_cv = GridSearchCV(rf_run, param_grid=params, cv=2, n_jobs=-1)
tqdm(grid_cv.fit(train_x, train_y))
 
 
print('최적 하이퍼 파라미터:', grid_cv.best_params_)
print('최적 예측 정확도: {0:.4f}'.format(grid_cv.best_score_))

0it [00:00, ?it/s]

최적 하이퍼 파라미터: {'max_depth': 8, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 42, 'random_state': 50}
최적 예측 정확도: 0.7965





In [31]:
model = RandomForestRegressor(random_state=50, max_depth=8, min_samples_leaf=2, min_samples_split=2,n_estimators=42)
model.fit(train_x, train_y)

RandomForestRegressor(max_depth=8, min_samples_leaf=2, n_estimators=42,
                      random_state=50)

# Model Fit

In [34]:
predictions = model.predict(test_x)

# Prediction

# Submit

In [32]:
submission = pd.read_csv('/content/sample_submission.csv')

In [35]:
submission

Unnamed: 0,ID,monthlyRent(us_dollar)
0,Test_0000,0
1,Test_0001,0
2,Test_0002,0
3,Test_0003,0
4,Test_0004,0
...,...,...
8688,Test_8688,0
8689,Test_8689,0
8690,Test_8690,0
8691,Test_8691,0


In [36]:
submission['monthlyRent(us_dollar)'] = predictions

In [37]:
submission

Unnamed: 0,ID,monthlyRent(us_dollar)
0,Test_0000,184.035403
1,Test_0001,231.951574
2,Test_0002,355.842142
3,Test_0003,185.487237
4,Test_0004,92.185731
...,...,...
8688,Test_8688,367.160479
8689,Test_8689,354.917172
8690,Test_8690,214.934788
8691,Test_8691,156.651634


In [38]:
submission.to_csv('./submit.csv', index=False)