# 1. Data Loading & Checking

> #### 1.1 Data Loading
> #### 1.2 Train, Test Data 정보 확인
> #### 1.3 결측치 확인
> #### 1.4 Feature 타입 확인 

## 1.1 Data Loading

In [None]:
# 초기 설정
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn import metrics
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from pathlib import Path
from IPython.display import display
from datetime import datetime
from pandas import DataFrame
from typing import List, NamedTuple, Tuple

# allow plots to appear directly in the notebook
%matplotlib 

# Suppress Warnings
import warnings
warnings.filterwarnings('ignore')

pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 50)
pd.set_option('display.width', 1000)

In [None]:
# Helper function declarations
def load(path: Path) -> DataFrame:
    """Read a CSV file into a DataFrame indexed by its ``datetime`` column.

    Args:
        path: Location of the CSV file; it must contain a ``datetime`` column.

    Returns:
        The parsed frame, with ``datetime`` used as the index and
        ``parse_dates=True`` so that the index is parsed as dates.
    """
    frame = pd.read_csv(path, index_col="datetime", parse_dates=True)
    return frame

In [None]:
# Locations of the competition CSV files and the name of the label column.
ROOT_DIR = Path("/kaggle/input/bike-sharing-demand")
TRAIN_DATA_PATH = ROOT_DIR.joinpath("train.csv")
TEST_DATA_PATH = ROOT_DIR.joinpath("test.csv")
TARGET_NAME = 'count'

# Keep the raw frames untouched; later cells work on copies of them.
original_train: DataFrame = load(path=TRAIN_DATA_PATH)
original_test: DataFrame = load(path=TEST_DATA_PATH)

## 1.2 Train, Test Data 정보 확인

In [None]:
original_train.head()

In [None]:
original_test.head()

In [None]:
# Columns present in train but absent from test (the target is excluded).
only_train_columns = [
    column
    for column in original_train.columns
    if column != TARGET_NAME and column not in original_test.columns
]
print('Only Train Columns')
print(only_train_columns)

# Columns present in test but absent from train.
only_test_columns = [
    column
    for column in original_test.columns
    if column not in original_train.columns
]
print('Only Test Columns')
print(only_test_columns)

## 1.3 결측치 확인

In [None]:
original_train.isnull().sum()

In [None]:
original_test.isnull().sum()

## 1.4 Feature 타입 확인

In [None]:
original_train.dtypes

In [None]:
original_test.dtypes

# 2. EDA

> #### 2.1 Target 분포 확인 및 정규화
> #### 2.2 Feature 분포 확인 및 정규화
> #### 2.3 Feature간 관계 확인 및 정규화
> #### 2.4 Feature와 Target의 관계 확인 및 정규화

In [None]:
feature_columns = [c1 for c1 in original_train.columns if c1 in original_test.columns and c1 != TARGET_NAME]
feature_columns

In [None]:
categorical_feature_columns=['season', 'holiday', 'workingday', 'weather',]
numeric_feature_columns=['temp', 'atemp', 'humidity', 'windspeed']

## 2.1 Target 분포 확인

In [None]:
sns.distplot(original_train[TARGET_NAME])

## 2.2 Feature 분포 확인

In [None]:
def draw_distplot(df, name, fig, m, n, idx):
    """Draw the distribution of one column in one cell of a subplot grid.

    Args:
        df: Frame holding the data.
        name: Column of ``df`` to plot.
        fig: Matplotlib figure that receives the subplot.
        m: Number of grid rows.
        n: Number of grid columns.
        idx: 1-based position of the subplot inside the ``m`` x ``n`` grid.
    """
    ax = fig.add_subplot(m, n, idx)
    # Pass the axes explicitly so the plot lands on the subplot created above
    # rather than on whatever pyplot currently considers the "current axes".
    sns.distplot(df[name], ax=ax)

def draw_distplots(df, columns):
    """Draw one distribution plot per column on a two-column subplot grid.

    Args:
        df: Frame holding the data.
        columns: Names of the columns of ``df`` to plot; nothing is drawn
            when it is empty.
    """
    if len(columns) == 0:
        return

    n_cols = 2
    # Ceiling division. round() rounds halves to the nearest even integer
    # (round(2.5) == 2, round(0.5) == 0), which left the grid one row short
    # for 1, 5, 9, ... columns and made add_subplot fail.
    n_rows = (len(columns) + n_cols - 1) // n_cols
    fig = plt.figure(figsize=[n_cols * 10, n_rows * 6])
    for position, name in enumerate(columns, start=1):
        draw_distplot(df=df, name=name, fig=fig, m=n_rows, n=n_cols, idx=position)
        
draw_distplots(df=original_train, columns=feature_columns)

## 2.3 Feature간 관계 확인

In [None]:
sns.pairplot(original_train[feature_columns])

In [None]:
sns.heatmap(original_train[feature_columns].corr(), annot=True)

## 2.4 Target&Feature 관계 확인

In [None]:
def draw_boxplot(df, x_name, y_name, fig, m, n, idx):
    """Draw a box plot of ``y_name`` grouped by ``x_name`` in one grid cell.

    Args:
        df: Frame holding the data.
        x_name: Column used for the x axis (the grouping variable).
        y_name: Column used for the y axis.
        fig: Matplotlib figure that receives the subplot.
        m: Number of grid rows.
        n: Number of grid columns.
        idx: 1-based position of the subplot inside the ``m`` x ``n`` grid.
    """
    ax = fig.add_subplot(m, n, idx)
    # Pass the axes explicitly so the plot lands on the subplot created above
    # rather than on whatever pyplot currently considers the "current axes".
    sns.boxplot(data=df, x=x_name, y=y_name, ax=ax)

def draw_boxplots(df, x_columns, y_name):
    """Draw one box plot of ``y_name`` per x column on a two-column grid.

    Args:
        df: Frame holding the data.
        x_columns: Columns of ``df`` to use, one per subplot, as the x axis;
            nothing is drawn when it is empty.
        y_name: Column plotted on the y axis of every subplot.
    """
    if len(x_columns) == 0:
        return

    n_cols = 2
    # Ceiling division. round() rounds halves to the nearest even integer
    # (round(2.5) == 2, round(0.5) == 0), which left the grid one row short
    # for 1, 5, 9, ... columns and made add_subplot fail.
    n_rows = (len(x_columns) + n_cols - 1) // n_cols
    fig = plt.figure(figsize=[n_cols * 10, n_rows * 6])
    for position, name in enumerate(x_columns, start=1):
        draw_boxplot(df=df, x_name=name, y_name=y_name, fig=fig, m=n_rows, n=n_cols, idx=position)
        
draw_boxplots(df=original_train, x_columns=categorical_feature_columns, y_name=TARGET_NAME)

In [None]:
def draw_scatterplot(df, x_name, y_name, fig, m, n, idx):
    """Draw a scatter plot of ``y_name`` against ``x_name`` in one grid cell.

    Args:
        df: Frame holding the data.
        x_name: Column used for the x axis.
        y_name: Column used for the y axis.
        fig: Matplotlib figure that receives the subplot.
        m: Number of grid rows.
        n: Number of grid columns.
        idx: 1-based position of the subplot inside the ``m`` x ``n`` grid.
    """
    ax = fig.add_subplot(m, n, idx)
    # Pass the axes explicitly so the plot lands on the subplot created above
    # rather than on whatever pyplot currently considers the "current axes".
    sns.scatterplot(data=df, x=x_name, y=y_name, ax=ax)

def draw_scatterplots(df, x_columns, y_name):
    """Draw one scatter plot of ``y_name`` per x column on a two-column grid.

    Args:
        df: Frame holding the data.
        x_columns: Columns of ``df`` to use, one per subplot, as the x axis;
            nothing is drawn when it is empty.
        y_name: Column plotted on the y axis of every subplot.
    """
    if len(x_columns) == 0:
        return

    n_cols = 2
    # Ceiling division. round() rounds halves to the nearest even integer
    # (round(2.5) == 2, round(0.5) == 0), which left the grid one row short
    # for 1, 5, 9, ... columns and made add_subplot fail.
    n_rows = (len(x_columns) + n_cols - 1) // n_cols
    fig = plt.figure(figsize=[n_cols * 10, n_rows * 6])
    for position, name in enumerate(x_columns, start=1):
        draw_scatterplot(df=df, x_name=name, y_name=y_name, fig=fig, m=n_rows, n=n_cols, idx=position)
        
draw_scatterplots(df=original_train, x_columns=numeric_feature_columns, y_name=TARGET_NAME)

# 3. Data Preprocessing

> #### 3.1 결측치 확인 및 처리
> #### 3.2 Feature 타입 변환
> #### 3.3 Column 값 표준화(대소문자, 같은 의미 데이터 통합)



## 3.1 결측치 확인 및 처리

In [None]:
train_data = original_train.copy()
test_data = original_test.copy()

In [None]:
original_train.isnull().sum()

In [None]:
original_test.isnull().sum()

## 3.2 Feature 타입 변환

In [None]:
# Replace categorical columns with their one-hot encoded equivalents

def replaced_with_onehot_cols(data: DataFrame, col_names: List[str]) -> DataFrame:
    """Return a copy of ``data`` with each listed column one-hot encoded.

    Every column in ``col_names`` is expanded with ``pd.get_dummies`` (the new
    columns are prefixed with the source column's name), joined onto the
    frame, and the source column is then removed. The input frame is not
    modified.

    Args:
        data: Frame containing the columns to encode.
        col_names: Names of the columns to replace.

    Returns:
        A new frame with the indicator columns in place of the originals.
    """
    encoded = data.copy()

    for column in col_names:
        indicators = pd.get_dummies(encoded[column], prefix=column)
        # The source column is redundant once its indicators are attached.
        encoded = encoded.join(indicators).drop(columns=[column])
    return encoded

In [None]:
train_data = replaced_with_onehot_cols(data=train_data, col_names=categorical_feature_columns)
test_data = replaced_with_onehot_cols(data=test_data, col_names=categorical_feature_columns)

In [None]:
train_data.head()

# 4. Feature Engineering

> #### 4.1 Feature 생성/추출/변환
> #### 4.2 Target/Feature 정규화
> #### 4.3 Feature importances 확인

## 4.1 Feature 생성/추출/변환

In [None]:
#remove only_train_columns
train_data = train_data.drop(only_train_columns, axis=1)
train_data.head()

In [None]:
# Separate the datetime index into its components
def expanded_index_datetime_col(data: DataFrame) -> DataFrame:
    """Return a copy of ``data`` with date parts of its index added as columns.

    The columns ``hour``, ``weekday``, ``month`` and ``year`` are read from
    the frame's index, which therefore has to expose those attributes (as a
    ``DatetimeIndex`` does). The input frame is not modified.

    Args:
        data: Frame whose index carries the timestamps.

    Returns:
        A new frame with the four extra columns appended in that order.
    """
    timestamps = data.index
    return data.assign(
        hour=timestamps.hour,
        weekday=timestamps.weekday,
        month=timestamps.month,
        year=timestamps.year,
    )

In [None]:
train_data = expanded_index_datetime_col(data=train_data)
test_data = expanded_index_datetime_col(data=test_data)
train_data.head()

In [None]:
#change datetime data to one-hot data
datetime_cols = ['hour', 'weekday','month','year']
train_data = replaced_with_onehot_cols(data=train_data, col_names=datetime_cols)
test_data = replaced_with_onehot_cols(data=test_data, col_names=datetime_cols)
train_data.head()

## 4.2 Target/Feature 정규화

In [None]:
from sklearn.preprocessing import MinMaxScaler

def normalize_cols(df: DataFrame, scaler) -> DataFrame:
    """Fit ``scaler`` on ``df`` and return the scaled values as a new frame.

    Note that ``scaler.fit_transform`` is always used, so the scaler is
    (re)fitted on whatever frame is passed in.

    Args:
        df: Frame whose values are scaled; it is copied before use.
        scaler: Object exposing ``fit_transform(values)``.

    Returns:
        A frame with the scaled values and the index and columns of ``df``.
    """
    snapshot = df.copy()
    scaled_values = scaler.fit_transform(snapshot.values)
    return DataFrame(scaled_values, index=snapshot.index, columns=snapshot.columns)

# Fit the feature scaler on the training features only.
x_scaler = MinMaxScaler()
x = train_data.drop(TARGET_NAME, axis=1)
x = normalize_cols(df=x, scaler=x_scaler)

# The target gets its own scaler so predictions can be inverse-transformed.
y_scaler = MinMaxScaler()
y = train_data[[TARGET_NAME]]
y = normalize_cols(df=y, scaler=y_scaler)

# Scale the test set with the statistics learned from the training features.
# normalize_cols() calls fit_transform(), which would re-fit x_scaler on the
# test data and put train and test features on different scales, so only
# transform() is used here.
# The columns are first aligned to the training feature order; an indicator
# column missing from the test set is filled with 0.
test_data = test_data.reindex(columns=x.columns, fill_value=0)
test_data = DataFrame(
    x_scaler.transform(test_data.values),
    columns=test_data.columns,
    index=test_data.index,
)

In [None]:
x.head()

In [None]:
y.head()

## 4.3 Feature Importance 확인

In [None]:
from sklearn.ensemble import RandomForestRegressor
# Random forest used only to inspect feature importances.
model = RandomForestRegressor()

# y is a single-column frame; flatten it so the estimator receives the 1-D
# target it expects instead of an (n, 1) column vector.
result = model.fit(x.values, y.values.ravel())

In [None]:
# Pair every feature name with the importance reported by the fitted model.
features = pd.DataFrame({
    "features": x.columns,
    "coefficient": model.feature_importances_,
})

# Most important features first, so they appear at the top of the bar chart.
features = features.sort_values(by="coefficient", ascending=False)
fig, ax = plt.subplots()
fig.set_size_inches(20, 20)
sns.barplot(data=features, x="coefficient", y="features");

# 5. Modeling

> #### 5.1 학습
> #### 5.2 모델 선택 및 튜닝

## 5.1 학습

In [None]:
!pip install livelossplot tensorflow-gpu
import tensorflow.keras.backend as K
import tensorflow as tf

In [None]:
#Split train, test data
from sklearn.model_selection import train_test_split
# Fixed seed so the shuffled 70/30 split is reproducible.
random_seed = 5

x_train, x_validation, y_train, y_validation = train_test_split(
    x,
    y,
    test_size=0.3,
    shuffle=True,
    random_state=random_seed,
)

print(f'train data count : {len(x_train)}')
print(f'test data count : {len(x_validation)}')

In [None]:
#make cost function
from sklearn import metrics
def rmsle_K(y, pred):
    """Root mean squared logarithmic error built from backend tensor ops.

    Args:
        y: Tensor of true values.
        pred: Tensor of predicted values.

    Returns:
        The square root of the mean squared difference of ``log1p(y)`` and
        ``log1p(pred)``.
    """
    log_error = tf.math.log1p(y) - tf.math.log1p(pred)
    mean_squared = K.mean(K.square(log_error))
    return K.sqrt(mean_squared)
def rmsle(y, pred):
    """Return the root mean squared logarithmic error of ``pred`` against ``y``.

    The previous implementation returned the plain RMSE (no log transform),
    which did not match the function's name or the ``rmsle_K`` training loss.

    Both inputs are flattened first, so a single-column frame or an ``(n, 1)``
    array can be compared with a 1-D prediction vector. Negative values are
    clipped to 0 before ``log1p`` because the logarithm is undefined at or
    below -1 and the quantity being modelled is non-negative.

    Args:
        y: True values (array-like).
        pred: Predicted values (array-like) with the same number of elements.

    Returns:
        The RMSLE as a float.

    Raises:
        ValueError: If ``y`` and ``pred`` hold a different number of elements.
    """
    actual = np.asarray(y, dtype=float).ravel()
    predicted = np.asarray(pred, dtype=float).ravel()
    if actual.shape != predicted.shape:
        raise ValueError(
            f"y and pred must have the same number of elements, "
            f"got {actual.size} and {predicted.size}"
        )

    log_error = np.log1p(np.clip(predicted, 0, None)) - np.log1p(np.clip(actual, 0, None))
    return np.sqrt(np.mean(np.square(log_error)))

In [None]:
#make result
class ModelResult:
    """Record of one trained model together with its validation cost."""

    def __init__(self, name, cost, model):
        # Display label, validation cost and the fitted model object.
        self.name, self.cost, self.model = name, cost, model

    def __str__(self):
        # Renders as "<name>\tcost:<cost>"; ``name`` must be a string.
        return "\tcost:".join((self.name, str(self.cost)))
results = []

In [None]:
#Make DL Models
from tensorflow.keras.layers import Input, Dense, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau

_, NUM_FEATURES = x_train.shape

def make_dl_model()-> Model:
    """Build and compile the fully connected regression network.

    The network takes ``NUM_FEATURES`` inputs and stacks Dense(32) + Dropout,
    Dense(32) + Dropout and Dense(16) (all ReLU, dropout rate 0.4) before a
    single ReLU output unit. It is compiled with the Adam optimizer, the
    ``rmsle_K`` loss and an MSE metric.

    Returns:
        The compiled Keras model.
    """
    inputs = Input(shape=(NUM_FEATURES, ))

    # (units, dropout rate) per hidden layer; None means no dropout after it.
    hidden_spec = ((32, 0.4), (32, 0.4), (16, None))
    hidden = inputs
    for units, dropout_rate in hidden_spec:
        hidden = Dense(units, activation='relu')(hidden)
        if dropout_rate is not None:
            hidden = Dropout(dropout_rate)(hidden)

    output = Dense(1, activation='relu')(hidden)
    network = Model(inputs=inputs, outputs=output)
    network.compile(optimizer='adam', loss=rmsle_K, metrics=['mse'])
    return network


In [None]:
model = make_dl_model()

# Lower the learning rate when the validation loss stalls, and stop training
# once it has not improved for 10 epochs.
lr_on_plateau = ReduceLROnPlateau(monitor='val_loss', factor=0.4, patience=5, min_lr=0.000001, verbose=1)
early_stopping = EarlyStopping(monitor="val_loss", patience=10, verbose=0)

model.fit(
    x_train,
    y_train,
    validation_data=(x_validation, y_validation),
    epochs=200,
    batch_size=128,
    verbose=1,
    callbacks=[lr_on_plateau, early_stopping],
)

# Score the network on the validation split and record it with the others.
dl_cost = rmsle(y_validation, model.predict(x_validation))
results.append(ModelResult(name='DL', cost=dl_cost, model=model))

In [None]:
#Make ML Models
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.linear_model import ElasticNet
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

# Candidate regressors, all with their default hyper-parameters.
ml_models = {
    'LinearRegression': LinearRegression(),
    'LassoRegression': Lasso(),
    'RidgeRegression': Ridge(),
    'ElasticNet': ElasticNet(),
    'RandomForestRegressor': RandomForestRegressor(),
    'XGBRegressor': XGBRegressor(),
}

# y_train is a single-column frame; flatten it once so every estimator is
# fitted on a 1-D target instead of an (n, 1) column vector.
y_train_1d = np.ravel(y_train)

# Train each model and record its validation cost.
for name, model in ml_models.items():
    model.fit(x_train, y_train_1d)
    results.append(ModelResult(name=name, cost=rmsle(y_validation, model.predict(x_validation)), model=model))

In [None]:
for r in results:
    print(r)

## 5.2 모델 선택 및 튜닝

In [None]:
# Rank the collected results from lowest to highest validation cost.
sorted_result = list(results)
sorted_result.sort(key=lambda entry: entry.cost)
for ranked in sorted_result:
    print(ranked)

# 6. Evaluation

In [None]:
# Predict with the lowest-cost model and map the result back to raw counts.
best_model = sorted_result[0].model
y_pred = np.asarray(best_model.predict(test_data)).reshape(-1, 1)
y_pred = y_scaler.inverse_transform(y_pred)

# Counts cannot be negative, and a bare astype(int) truncates toward zero:
# clip at 0 and round to the nearest integer before casting.
y_pred = np.rint(np.clip(y_pred, 0, None)).astype(int)

In [None]:
# Save submission
# Build only the two submitted columns instead of copying every feature
# column, and flatten the (n, 1) prediction array before using it as a column.
submission = DataFrame({
    "datetime": test_data.index,
    "count": np.asarray(y_pred).ravel().astype(int),
})
submission.to_csv('submission.csv', index=False)
