In [None]:
# 시험환경 세팅 (코드 변경 X)
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

def exam_data_load(df, target, id_name="", null_name=""):
    """Split ``df`` into exam-style train/test feature and label frames.

    Parameters
    ----------
    df : pandas.DataFrame
        Full dataset, including the target column. The caller's frame is
        never modified.
    target : str
        Name of the label column.
    id_name : str, optional
        Name of an existing identifier column. When left empty, the index is
        moved into the columns with ``reset_index`` and a resulting column
        called ``index`` is renamed to ``id``, which is then used as the
        identifier.
    null_name : str, optional
        Placeholder that marks missing entries. Every cell equal to it is
        replaced with NaN. An empty string disables the replacement.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(X_train, X_test, y_train, y_test)``. The X frames hold every column
        except the identifier and the target; the y frames hold exactly
        ``[id_name, target]``.
    """
    if id_name == "":
        df = df.reset_index().rename(columns={"index": "id"})
        id_name = "id"

    if null_name != "":
        # mask() builds a new frame, so a DataFrame passed in by the caller is
        # left untouched (boolean-frame assignment would overwrite it in place).
        df = df.mask(df == null_name, np.nan)

    # Fixed 80/20 shuffled split; the seed keeps the partition repeatable.
    train_part, test_part = train_test_split(
        df, test_size=0.2, shuffle=True, random_state=2021
    )

    label_cols = [id_name, target]
    return (
        train_part.drop(columns=label_cols),
        test_part.drop(columns=label_cols),
        train_part[label_cols],
        test_part[label_cols],
    )
    
# Read the competition training file and derive the exam-style train/test split.
df = pd.read_csv("../input/bike-sharing-demand/train.csv")
X_train, X_test, y_train, y_test = exam_data_load(df, target='count')

# Shapes of the four frames, in the order they were returned.
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

## EDA

In [None]:
# Show the first rows of the training features.
X_train.head()

In [None]:
# Show the first rows of the training labels (identifier and target columns).
y_train.head()

In [None]:
# Column dtypes and non-null counts of the training features.
X_train.info()

In [None]:
# Summary statistics of the training features.
X_train.describe()

In [None]:
# Columns that are cast to object dtype in the following cell.
change_cols = ['season', 'holiday', 'workingday', 'weather']

In [None]:
# Cast the selected columns to object dtype in both splits so that they are
# picked up as categorical by the dtype-based column selection later on.
for frame in (X_train, X_test):
    for c in change_cols:
        frame[c] = frame[c].astype('object')

In [None]:
# Re-check the training dtypes after the cast to object.
X_train.info()

In [None]:
# Re-check the test dtypes after the cast to object.
X_test.info()

In [None]:
# Number of missing values per column in the training features.
X_train.isnull().sum()

In [None]:
# Number of missing values per column in the test features.
X_test.isnull().sum()

## Preprocessing

In [None]:
# Remove the 'datetime' column from both splits, then show the resulting shapes.
X_train, X_test = (part.drop(columns=['datetime']) for part in (X_train, X_test))
tuple(part.shape for part in (X_train, X_test))

In [None]:
# Object-dtype columns are the categorical features; `cols` is reused later
# for one-hot encoding.
cols = X_train.select_dtypes("object").columns

for col in cols:
    print("\n=====", col, "=====")
    # Outer-align the two frequency tables so that a category missing from one
    # split shows up with a count of 0. An inner merge on the index would drop
    # that row and hide the mismatch.
    level_counts = (
        pd.concat(
            [X_train[col].value_counts(), X_test[col].value_counts()],
            axis=1,
            keys=["train", "test"],
        )
        .fillna(0)
        .astype(int)
    )
    print(level_counts)
    print('\n')

In [None]:
# Columns of X_train whose dtype is not object.
numeric_cols = X_train.select_dtypes(exclude="object").columns

def OutlierDetect(name, df, cols=None):
    """Report and replace IQR outliers in the numeric columns of ``df``.

    For every inspected column, values outside
    ``[Q1 - 1.5 * IQR, Q3 + 1.5 * IQR]`` are counted, their share of the rows
    is printed as a percentage, and they are overwritten in place with the
    column median (computed before the replacement, outliers included).

    Parameters
    ----------
    name : str
        Label printed in the section header.
    df : pandas.DataFrame
        Frame to clean. Modified in place.
    cols : iterable of str, optional
        Columns to inspect. Defaults to every numeric column of ``df``, so the
        function does not rely on a variable defined elsewhere in the notebook.

    Returns
    -------
    None
    """
    if cols is None:
        cols = df.select_dtypes(include="number").columns

    print("\n=====", name, "=====")
    n_rows = len(df)
    for col in cols:
        q1 = df[col].quantile(0.25)
        q3 = df[col].quantile(0.75)
        iqr = q3 - q1
        outliers = (df[col] < q1 - 1.5 * iqr) | (q3 + 1.5 * iqr < df[col])

        n_out = int(outliers.sum())
        # An empty frame has no outliers; avoid dividing by zero.
        ratio = n_out / n_rows * 100 if n_rows else 0.0
        print(col, "이상치 비율: ", round(ratio, 2), '%')

        if n_out == 0:
            continue

        median = df[col].median()
        # A fractional median cannot be stored in an integer column; widen the
        # column first instead of relying on an implicit upcast during assignment.
        if pd.api.types.is_integer_dtype(df[col]) and float(median) != int(median):
            df[col] = df[col].astype(float)
        df.loc[outliers, col] = median
        
# Run the outlier pass on each split separately; the quantiles and the median
# replacement value are computed from the frame that is passed in.
OutlierDetect('train', X_train)
OutlierDetect('test', X_test)
        

## One-hot encoding

In [None]:
# Expand every categorical column listed in `cols` into indicator columns,
# applying the same call to both splits.
X_train, X_test = (pd.get_dummies(part, columns=cols) for part in (X_train, X_test))

In [None]:
# Inspect the training columns after one-hot encoding.
X_train.head()

In [None]:
# Inspect the test columns after one-hot encoding.
X_test.head()

In [None]:
# Give the test frame exactly the training columns, in the same order.
# Indicator columns for categories that never occur in the test split are
# created and filled with 0; columns unknown to the training frame are dropped.
# Unlike a hard-coded `weather_4 = 0`, this never overwrites a real column.
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

## Train/validation split

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the training rows as a validation set (seeded split).
X_tr, X_val, y_tr, y_val = train_test_split(
    X_train,
    y_train['count'],
    test_size=0.2,
    random_state=2021,
)
tuple(part.shape for part in (X_tr, X_val, y_tr, y_val))

## Modeling

In [None]:
# GridSearch 

from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor

# rf_rgs = RandomForestRegressor()

# rf_params = {'n_estimators': [10, 50, 100], 
#            'max_depth' : [7, 10, 15],
#             "max_features": [8, 10, 15],
#            'min_samples_split' : [4, 6, 8]}

# rf_grid = RandomizedSearchCV(rf_rgs, rf_params, n_jobs = -1, cv = 5, verbose = 2, n_iter =10).fit(X_tr, y_tr)

# rf_grid.best_params_

In [None]:
from sklearn.metrics import mean_squared_error

# Random forest evaluated on the validation split.
rf_model = RandomForestRegressor(
    n_estimators=50,
    min_samples_split=6,
    max_features=8,
    max_depth=10,
    random_state=2021,  # same seed as the data splits, so the score is repeatable
)
rf_model.fit(X_tr, y_tr)
rf_pred = rf_model.predict(X_val)

# Validation RMSE, rounded to the nearest integer.
print(round(np.sqrt(mean_squared_error(y_val, rf_pred))))

In [None]:
from xgboost import XGBRegressor

# Gradient-boosted trees fitted and scored on the same train/validation split.
xgb_model = XGBRegressor(n_estimators=50, max_depth=10)
xgb_model.fit(X_tr, y_tr)
xgb_pred = xgb_model.predict(X_val)

# Validation RMSE, rounded to the nearest integer.
xgb_rmse = np.sqrt(mean_squared_error(y_val, xgb_pred))
print(round(xgb_rmse))

In [None]:
# Refit the random forest on the full training set and predict the test rows.
final_model = RandomForestRegressor(
    n_estimators=50,
    min_samples_split=6,
    max_features=8,
    max_depth=10,
    random_state=2021,  # seeded so the submission can be regenerated exactly
)
final_model.fit(X_train, y_train['count'])
prediction = final_model.predict(X_test)

# Pair each test identifier with its predicted count.
submission = pd.DataFrame({
    'id': y_test['id'],
    'count': prediction,
})

submission.head()

In [None]:
# Write the id/count predictions to a CSV file without the DataFrame index.
submission.to_csv("12345.csv", index = False)

In [None]:
# RMSE of the final model on the held-out test labels, rounded to an integer.
test_rmse = np.sqrt(mean_squared_error(y_test['count'], prediction))
print(round(test_rmse))