In [8]:
import pandas as pd
import numpy as np
import random
import os
import math

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score, GridSearchCV, train_test_split, KFold
from sklearn.metrics import mean_absolute_error

In [4]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)

seed_everything(42) # Seed 고정

In [5]:
sample_submission = pd.read_csv('./data/sample_submission.csv')
train = pd.read_csv('./data/train.csv')
test = pd.read_csv('./data/test.csv')

In [6]:
train.head()

Unnamed: 0,id,temperature,pressure,humidity,wind_speed,wind_direction,precipitation,snowing,cloudiness,target
0,TRAIN_00000,27.8816,1013.6,46.8,2.4,232.4,0.0,False,0.0,1.18
1,TRAIN_00001,5.754,1026.6,76.6,3.6,172.0,0.0,False,8.8,8.581
2,TRAIN_00002,20.822,1016.2,64.8,2.2,206.0,0.06,False,23.0,3.978
3,TRAIN_00003,20.0758,1017.4,72.6,0.8,215.4,0.0,False,0.0,8.301
4,TRAIN_00004,7.526,1023.2,82.8,1.2,158.0,0.0,False,15.0,1.692


In [7]:
train_X = train.drop(['id', 'target'], axis = 1)
train_y = train['target']

test_X = test.drop('id', axis = 1)

from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
le = le.fit(train_X['snowing'])
train_X['snowing'] = le.transform(train_X['snowing'])

for label in np.unique(test_X['snowing']):
    if label not in le.classes_:
        le.classes_ = np.append(le.classes_, label)
    test_X['snowing'] = le.transform(test_X['snowing'])

train_X.head(5)

Unnamed: 0,temperature,pressure,humidity,wind_speed,wind_direction,precipitation,snowing,cloudiness
0,27.8816,1013.6,46.8,2.4,232.4,0.0,0,0.0
1,5.754,1026.6,76.6,3.6,172.0,0.0,0,8.8
2,20.822,1016.2,64.8,2.2,206.0,0.06,0,23.0
3,20.0758,1017.4,72.6,0.8,215.4,0.0,0,0.0
4,7.526,1023.2,82.8,1.2,158.0,0.0,0,15.0


In [11]:
X = np.array(train_X)
y  = train_y

kf = KFold(n_splits = 10, shuffle = True)

for train_index, test_index in kf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

In [29]:
X_train, X_valid, y_train, y_valid = train_test_split(train_X, train_y, test_size = 0.3)
print(X_train.shape, X_valid.shape, y_train.shape, y_valid.shape)

(13492, 8) (5783, 8) (13492,) (5783,)


# RandomForest Regressor

In [12]:
history = []

for train_index, test_index in kf.split(X):

    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    model = RandomForestRegressor(n_estimators = 5, max_depth = 10, min_samples_leaf = 4, min_samples_split = 4)
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    history.append(mean_absolute_error(y_pred, y_test))

print("각 분할의 MAE 기록 :", history)
print("평균 MAE :", np.mean(history))


각 분할의 정확도 기록 : [2.2211314265247046, 2.15745289540198, 2.1635049564215563, 2.224207495167557, 2.3070518445777863, 2.176233193075797, 2.2622865330545214, 2.235996738878551, 2.235553516426164, 2.1593646828672215]
평균 정확도 : 2.214278328239584


# xgbregressor

In [17]:
import xgboost

xgb_history = []

for train_index, test_index in kf.split(X):

    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    xgb_model = xgboost.XGBRegressor(n_estimators = 100, learning_rate = 0.08, gamma = 0, subsample = 0.75,
                                 colsample_bytree = 1, max_depth = 7)
    xgb_model.fit(X_train, y_train)

    y_pred = xgb_model.predict(X_test)
    xgb_history.append(mean_absolute_error(y_pred, y_test))

print("각 분할의 MAE 기록 :", xgb_history)
print("xgboost 평균 MAE :", np.mean(xgb_history))


각 분할의 정확도 기록 : [2.147238798621779, 2.1091607829401604, 2.0839776723063337, 2.1869606414232017, 2.141244092244825, 2.100634103595375, 2.1424366331778475, 2.144260598993326, 2.135823174059422, 2.1343504088035075]
xgboost 평균 정확도 : 2.1326086906165775


In [18]:
import xgboost

xgb_history = []

for train_index, test_index in kf.split(X):

    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    xgb_model = xgboost.XGBRegressor(n_estimators = 100, learning_rate = 0.08, gamma = 0, subsample = 0.75,
                                     colsample_bytree = 1, max_depth = 7, eta = 0.1, eval_metric = 'mae')
    xgb_model.fit(X_train, y_train)

    y_pred = xgb_model.predict(X_test)
    xgb_history.append(mean_absolute_error(y_pred, y_test))

print("각 분할의 MAE 기록 :", xgb_history)
print("xgboost 평균 MAE :", np.mean(xgb_history))


각 분할의 MAE 기록 : [2.163715243377626, 2.1213034611801884, 2.0146955141575997, 2.1698621713793624, 2.195672443064417, 2.119460921184597, 2.100841874070655, 2.1141640489238744, 2.1583622728307197, 2.1060761039829896]
xgboost 평균 MAE : 2.1264154054152025


In [57]:
import xgboost

xgb_history = []

for train_index, test_index in kf.split(X):

    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    xgb_model = xgboost.XGBRegressor(n_estimators = 50, learning_rate = 0.05, gamma = 0, subsample = 0.75,
                                     colsample_bytree = 1, max_depth = 10, eta = 0.1, eval_metric = 'mae')
    xgb_model.fit(X_train, y_train)

    y_pred = xgb_model.predict(X_test)
    xgb_history.append(mean_absolute_error(y_pred, y_test))

print("각 분할의 MAE 기록 :", xgb_history)
print("xgboost 평균 MAE :", np.mean(xgb_history))


각 분할의 MAE 기록 : [2.1561298373358873, 2.1181693638597783, 2.123151361091008, 2.093192208640803, 2.1282847796366915, 2.102128319951618, 2.0850064442060003, 2.1073106657205374, 2.039328862089287, 2.1381969700696]
xgboost 평균 MAE : 2.1090898812601213


In [59]:
xgb_pred = xgb_model.predict(test_X)
submit = pd.read_csv('./data/sample_submission.csv')
submit['target'] = xgb_pred
submit.head()

Unnamed: 0,id,target
0,TEST_00000,4.546853
1,TEST_00001,4.544888
2,TEST_00002,5.682084
3,TEST_00003,4.292146
4,TEST_00004,5.526654


In [61]:
submit.to_csv('./submission/submit_xgb.csv', index = False)

# 피처 선택

In [70]:
xgb_model.feature_importances_

array([0.08204587, 0.08940826, 0.07785956, 0.31594583, 0.09196253,
       0.09120276, 0.16873308, 0.08284205], dtype=float32)

In [71]:
train_X.columns

Index(['temperature', 'pressure', 'humidity', 'wind_speed', 'wind_direction',
       'precipitation', 'snowing', 'cloudiness'],
      dtype='object')

In [76]:
#wind_speed, snowing, wind_direction, precipitation
new_train_X = train[['wind_speed', 'snowing', 'wind_direction', 'precipitation']]
new_train_y = train['target']

new_test_X = test[['wind_speed', 'snowing', 'wind_direction', 'precipitation']]

from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
le = le.fit(new_train_X['snowing'])
new_train_X['snowing'] = le.transform(new_train_X['snowing'])

for label in np.unique(new_test_X['snowing']):
    if label not in le.classes_:
        le.classes_ = np.append(le.classes_, label)
    new_test_X['snowing'] = le.transform(new_test_X['snowing'])

new_train_X.head(5)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_train_X['snowing'] = le.transform(new_train_X['snowing'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_test_X['snowing'] = le.transform(new_test_X['snowing'])


Unnamed: 0,wind_speed,snowing,wind_direction,precipitation
0,2.4,0,232.4,0.0
1,3.6,0,172.0,0.0
2,2.2,0,206.0,0.06
3,0.8,0,215.4,0.0
4,1.2,0,158.0,0.0


In [78]:
new_X = np.array(new_train_X)
new_y  = new_train_y

kf = KFold(n_splits = 10, shuffle = True)

for train_index, test_index in kf.split(new_X):
    new_X_train, new_X_test = new_X[train_index], new_X[test_index]
    new_y_train, new_y_test = new_y[train_index], new_y[test_index]

In [79]:
import xgboost

new_xgb_history = []

for train_index, test_index in kf.split(new_X):

    new_X_train, new_X_test = new_X[train_index], new_X[test_index]
    new_y_train, new_y_test = new_y[train_index], new_y[test_index]

    new_xgb_model = xgboost.XGBRegressor(n_estimators = 50, learning_rate = 0.05, gamma = 0, subsample = 0.75,
                                     colsample_bytree = 1, max_depth = 10, eta = 0.1, eval_metric = 'mae')
    new_xgb_model.fit(new_X_train, new_y_train)

    new_y_pred = new_xgb_model.predict(new_X_test)
    new_xgb_history.append(mean_absolute_error(new_y_pred, new_y_test))

print("각 분할의 MAE 기록 :", new_xgb_history)
print("xgboost 평균 MAE :", np.mean(new_xgb_history))


각 분할의 MAE 기록 : [2.3327358487621876, 2.3591849655392756, 2.367902381088723, 2.349895808501857, 2.3292550634066593, 2.318583143729154, 2.3457182082031203, 2.3456883250495344, 2.318581778317953, 2.348110047016233]
xgboost 평균 MAE : 2.34156555696147


In [80]:
new_xgb_pred = new_xgb_model.predict(new_test_X)
submit = pd.read_csv('./data/sample_submission.csv')
submit['target'] = new_xgb_pred
submit.head()

Unnamed: 0,id,target
0,TEST_00000,4.500935
1,TEST_00001,5.152166
2,TEST_00002,6.010781
3,TEST_00003,4.271572
4,TEST_00004,4.359322


In [82]:
submit.to_csv('./submission/submit_xgb_new.csv', index = False)

# 표준화

In [25]:
from sklearn.preprocessing import StandardScaler, RobustScaler
std_scaler = StandardScaler()
std_X = std_scaler.fit_transform(X)
std_X

array([[ 1.4728183 , -0.01817001, -1.49084221, ..., -0.39653084,
        -0.08079242, -1.46071717],
       [-1.6036996 , -0.01306008,  0.5150079 , ..., -0.39653084,
        -0.08079242, -0.96151574],
       [ 0.49128457, -0.01714802, -0.2792549 , ..., -0.08581968,
        -0.08079242, -0.15598617],
       ...,
       [-0.49225127, -0.0235944 , -0.26579282, ...,  1.4677361 ,
        -0.08079242,  1.54583687],
       [ 0.28718072, -0.0158902 , -1.58507678, ..., -0.08581968,
        -0.08079242, -0.32616848],
       [-0.49449495, -0.02186488,  0.85155993, ..., -0.08581968,
        -0.08079242,  1.04663544]])

In [39]:
X_train_std = pd.DataFrame(std_X ,columns = train_X.columns)
std_X = np.array(X_train_std)
std_y  = train_y

print(X_train_std.shape, std_X.shape, std_y.shape)

kf = KFold(n_splits = 10, shuffle = True)

for train_index, test_index in kf.split(std_X):
    
    std_X_train, std_X_test = std_X[train_index], std_X[test_index]
    std_y_train, std_y_test = std_y[train_index], std_y[test_index]

print(std_X_train.shape, std_X_test.shape, std_y_train.shape, std_y_test.shape)

(19275, 8) (19275, 8) (19275,)
(17348, 8) (1927, 8) (17348,) (1927,)


In [42]:
import xgboost

std_xgb_history = []

for train_index, test_index in kf.split(std_X):

    std_X_train, std_X_test = std_X[train_index], std_X[test_index]
    std_y_train, std_y_test = std_y[train_index], std_y[test_index]

    xgb_model = xgboost.XGBRegressor(n_estimators = 50, learning_rate = 0.05, gamma = 0, subsample = 0.75,
                                     colsample_bytree = 1, max_depth = 10, eta = 0.1, eval_metric = 'mae')
    xgb_model.fit(std_X_train, std_y_train)

    std_y_pred = xgb_model.predict(std_X_test)
    std_xgb_history.append(mean_absolute_error(std_y_pred, std_y_test))

print("각 분할의 MAE 기록 :", std_xgb_history)
print("xgboost 평균 MAE :", np.mean(std_xgb_history))


각 분할의 MAE 기록 : [2.145942907053405, 2.017546757695586, 2.092173467260179, 2.172933436274034, 2.0423396994543275, 2.10673512335274, 2.111686885500672, 2.111497683711168, 2.149057383981219, 2.144541212644612]
xgboost 평균 MAE : 2.109445455692794


In [44]:
import xgboost

std_xgb_history = []

for train_index, test_index in kf.split(std_X):

    std_X_train, std_X_test = std_X[train_index], std_X[test_index]
    std_y_train, std_y_test = std_y[train_index], std_y[test_index]

    xgb_model = xgboost.XGBRegressor(n_estimators = 100, learning_rate = 0.01, gamma = 0, subsample = 0.75,
                                     colsample_bytree = 1, max_depth = 10, eta = 0.1, eval_metric = 'mae')
    xgb_model.fit(std_X_train, std_y_train)

    std_y_pred = xgb_model.predict(std_X_test)
    std_xgb_history.append(mean_absolute_error(std_y_pred, std_y_test))

print("각 분할의 MAE 기록 :", std_xgb_history)
print("xgboost 평균 MAE :", np.mean(std_xgb_history))


각 분할의 MAE 기록 : [2.572634179364596, 2.613520151843668, 2.626659084976956, 2.5472003165958332, 2.5343171444171695, 2.589167132611218, 2.501800306668106, 2.498886892166296, 2.5057603456420976, 2.5543426031420498]
xgboost 평균 MAE : 2.5544288157427992


In [45]:
import xgboost

std_xgb_history = []

for train_index, test_index in kf.split(std_X):

    std_X_train, std_X_test = std_X[train_index], std_X[test_index]
    std_y_train, std_y_test = std_y[train_index], std_y[test_index]

    xgb_model = xgboost.XGBRegressor(n_estimators = 100, learning_rate = 0.08, gamma = 0, subsample = 0.75,
                                     colsample_bytree = 1, max_depth = 5, eta = 0.1, eval_metric = 'mae')
    xgb_model.fit(std_X_train, std_y_train)

    std_y_pred = xgb_model.predict(std_X_test)
    std_xgb_history.append(mean_absolute_error(std_y_pred, std_y_test))

print("각 분할의 MAE 기록 :", std_xgb_history)
print("xgboost 평균 MAE :", np.mean(std_xgb_history))


각 분할의 MAE 기록 : [2.2397913612468607, 2.215826148523829, 2.147911403531355, 2.2097894885164076, 2.1505367661976713, 2.2328149728433537, 2.254095837678696, 2.19259105597547, 2.194956750158083, 2.222693395088049]
xgboost 평균 MAE : 2.2061007179759775


In [46]:
import xgboost

std_xgb_history = []

for train_index, test_index in kf.split(std_X):

    std_X_train, std_X_test = std_X[train_index], std_X[test_index]
    std_y_train, std_y_test = std_y[train_index], std_y[test_index]

    xgb_model = xgboost.XGBRegressor(n_estimators = 70, learning_rate = 0.05, gamma = 0, subsample = 0.75,
                                     colsample_bytree = 1, max_depth = 5, eta = 0.1, eval_metric = 'mae')
    xgb_model.fit(std_X_train, std_y_train)

    std_y_pred = xgb_model.predict(std_X_test)
    std_xgb_history.append(mean_absolute_error(std_y_pred, std_y_test))

print("각 분할의 MAE 기록 :", std_xgb_history)
print("xgboost 평균 MAE :", np.mean(std_xgb_history))


각 분할의 MAE 기록 : [2.229506760955352, 2.267197287599081, 2.209854504611977, 2.2494290732595434, 2.235188006371383, 2.1825461512636384, 2.2411997901845235, 2.2941809103114035, 2.251879190161863, 2.20248025279102]
xgboost 평균 MAE : 2.2363461927509785


# Random Forest Regressor GridCV

In [33]:
params = {
    'n_estimators':[10, 50, 100, 200],
    'max_depth' : [i for i in range(2, 10)],
    'min_samples_leaf' : [2, 4, 6, 8, 10],
    'min_samples_split' : [2, 4, 6, 8, 10]
}

rf_run = RandomForestRegressor(n_jobs = -1)
grid_cv = GridSearchCV(rf_run, param_grid = params, cv = 3, n_jobs = -1)
grid_cv.fit(X_train, y_train)
 
print('최적 하이퍼 파라미터:', grid_cv.best_params_)
print('최적 예측 정확도: {0:.4f}'.format(grid_cv.best_score_))

최적 하이퍼 파라미터: {'max_depth': 9, 'min_samples_leaf': 2, 'min_samples_split': 4, 'n_estimators': 200}
최적 예측 정확도: 0.2274


In [34]:
rf_run = RandomForestRegressor(max_depth = 9,
min_samples_leaf = 2,
min_samples_split = 4,
n_estimators = 200)

rf_run.fit(X_train, y_train)

In [35]:
train_predict = rf_run.predict(X_train)
valid_predict = rf_run.predict(X_valid)

print("train RMSE':{}".format(math.sqrt(mean_squared_error(train_predict, y_train))) )
print("validation RMSE':{}".format(math.sqrt(mean_squared_error(valid_predict, y_valid))) )
 
# predict
rf_run_pred = rf_run.predict(test_X)

train RMSE':2.523963067934637
validation RMSE':2.79709588035188


In [36]:
submit = pd.read_csv('./data/sample_submission.csv')
submit['target'] = rf_run_pred
submit.head()

Unnamed: 0,id,target
0,TEST_00000,4.760752
1,TEST_00001,5.500032
2,TEST_00002,6.113887
3,TEST_00003,4.548277
4,TEST_00004,5.540214


In [37]:
submit.to_csv('./submission/submit_rf_gridsearchcv.csv', index = False)

# StackingRegressor

In [38]:
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, StackingRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression

rf = RandomForestRegressor(max_depth = 9, min_samples_leaf = 2, min_samples_split = 4, n_estimators = 200)
gbr = GradientBoostingRegressor(n_estimators = 100, learning_rate = 0.1)
svr = SVR(kernel = 'linear')
meta_model = RandomForestRegressor()

stack = [('rf', rf), ('gbr', gbr), ('svr', svr)]

stacked_model = StackingRegressor(estimators = stack, final_estimator = meta_model)
stacked_model.fit(X_train, y_train)

In [39]:
stacked_pred_train = stacked_model.predict(X_train)
stacked_pred_valid = stacked_model.predict(X_valid)

print("train RMSE':{}".format(math.sqrt(mean_squared_error(stacked_pred_train, y_train))) )
print("validation RMSE':{}".format(math.sqrt(mean_squared_error(stacked_pred_valid, y_valid))) )
 
# predict
stacked_pred = stacked_model.predict(test_X)

train RMSE':2.6255940290924134
validation RMSE':2.962478276422581


In [40]:
submit = pd.read_csv('./data/sample_submission.csv')
submit['target'] = stacked_pred
submit.head()

Unnamed: 0,id,target
0,TEST_00000,4.79709
1,TEST_00001,6.83895
2,TEST_00002,7.47238
3,TEST_00003,5.26223
4,TEST_00004,6.52631


In [41]:
submit.to_csv('./submission/submit_stacking.csv', index = False)