In [21]:
import pandas as pd
import numpy as np
import random
import os
import math

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [22]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)

seed_everything(42) # Seed 고정

In [23]:
sample_submission = pd.read_csv('./data/sample_submission.csv')
train = pd.read_csv('./data/train.csv')
test = pd.read_csv('./data/test.csv')

In [5]:
train.head()

Unnamed: 0,id,temperature,pressure,humidity,wind_speed,wind_direction,precipitation,snowing,cloudiness,target
0,TRAIN_00000,27.8816,1013.6,46.8,2.4,232.4,0.0,False,0.0,1.18
1,TRAIN_00001,5.754,1026.6,76.6,3.6,172.0,0.0,False,8.8,8.581
2,TRAIN_00002,20.822,1016.2,64.8,2.2,206.0,0.06,False,23.0,3.978
3,TRAIN_00003,20.0758,1017.4,72.6,0.8,215.4,0.0,False,0.0,8.301
4,TRAIN_00004,7.526,1023.2,82.8,1.2,158.0,0.0,False,15.0,1.692


In [24]:
train_X = train.drop(['id', 'target'], axis = 1)
train_y = train['target']

test_X = test.drop('id', axis = 1)

from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
le = le.fit(train_X['snowing'])
train_X['snowing'] = le.transform(train_X['snowing'])

for label in np.unique(test_X['snowing']):
    if label not in le.classes_:
        le.classes_ = np.append(le.classes_, label)
    test_X['snowing'] = le.transform(test_X['snowing'])

train_X.head(5)

Unnamed: 0,temperature,pressure,humidity,wind_speed,wind_direction,precipitation,snowing,cloudiness
0,27.8816,1013.6,46.8,2.4,232.4,0.0,0,0.0
1,5.754,1026.6,76.6,3.6,172.0,0.0,0,8.8
2,20.822,1016.2,64.8,2.2,206.0,0.06,0,23.0
3,20.0758,1017.4,72.6,0.8,215.4,0.0,0,0.0
4,7.526,1023.2,82.8,1.2,158.0,0.0,0,15.0


In [29]:
X_train, X_valid, y_train, y_valid = train_test_split(train_X, train_y, test_size = 0.3)
print(X_train.shape, X_valid.shape, y_train.shape, y_valid.shape)

(13492, 8) (5783, 8) (13492,) (5783,)


# Random Forest Regressor

In [33]:
params = {
    'n_estimators':[10, 50, 100, 200],
    'max_depth' : [i for i in range(2, 10)],
    'min_samples_leaf' : [2, 4, 6, 8, 10],
    'min_samples_split' : [2, 4, 6, 8, 10]
}

rf_run = RandomForestRegressor(n_jobs = -1)
grid_cv = GridSearchCV(rf_run, param_grid = params, cv = 3, n_jobs = -1)
grid_cv.fit(X_train, y_train)
 
print('최적 하이퍼 파라미터:', grid_cv.best_params_)
print('최적 예측 정확도: {0:.4f}'.format(grid_cv.best_score_))

최적 하이퍼 파라미터: {'max_depth': 9, 'min_samples_leaf': 2, 'min_samples_split': 4, 'n_estimators': 200}
최적 예측 정확도: 0.2274


In [34]:
rf_run = RandomForestRegressor(max_depth = 9,
min_samples_leaf = 2,
min_samples_split = 4,
n_estimators = 200)

rf_run.fit(X_train, y_train)

In [35]:
train_predict = rf_run.predict(X_train)
valid_predict = rf_run.predict(X_valid)

print("train RMSE':{}".format(math.sqrt(mean_squared_error(train_predict, y_train))) )
print("validation RMSE':{}".format(math.sqrt(mean_squared_error(valid_predict, y_valid))) )
 
# predict
rf_run_pred = rf_run.predict(test_X)

train RMSE':2.523963067934637
validation RMSE':2.79709588035188


In [36]:
submit = pd.read_csv('./data/sample_submission.csv')
submit['target'] = rf_run_pred
submit.head()

Unnamed: 0,id,target
0,TEST_00000,4.760752
1,TEST_00001,5.500032
2,TEST_00002,6.113887
3,TEST_00003,4.548277
4,TEST_00004,5.540214


In [37]:
submit.to_csv('./submission/submit_rf_gridsearchcv.csv', index = False)

# StackingRegressor

In [38]:
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, StackingRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression

rf = RandomForestRegressor(max_depth = 9, min_samples_leaf = 2, min_samples_split = 4, n_estimators = 200)
gbr = GradientBoostingRegressor(n_estimators = 100, learning_rate = 0.1)
svr = SVR(kernel = 'linear')
meta_model = RandomForestRegressor()

stack = [('rf', rf), ('gbr', gbr), ('svr', svr)]

stacked_model = StackingRegressor(estimators = stack, final_estimator = meta_model)
stacked_model.fit(X_train, y_train)

In [39]:
stacked_pred_train = stacked_model.predict(X_train)
stacked_pred_valid = stacked_model.predict(X_valid)

print("train RMSE':{}".format(math.sqrt(mean_squared_error(stacked_pred_train, y_train))) )
print("validation RMSE':{}".format(math.sqrt(mean_squared_error(stacked_pred_valid, y_valid))) )
 
# predict
stacked_pred = stacked_model.predict(test_X)

train RMSE':2.6255940290924134
validation RMSE':2.962478276422581


In [40]:
submit = pd.read_csv('./data/sample_submission.csv')
submit['target'] = stacked_pred
submit.head()

Unnamed: 0,id,target
0,TEST_00000,4.79709
1,TEST_00001,6.83895
2,TEST_00002,7.47238
3,TEST_00003,5.26223
4,TEST_00004,6.52631


In [41]:
submit.to_csv('./submission/submit_stacking.csv', index = False)