In [None]:
import os
import datetime
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Data Fields

## Time series
**datetime** - hourly date + timestamp  

## Categorical
**season** -  1 = spring, 2 = summer, 3 = fall, 4 = winter  
**holiday** - whether the day is considered a holiday  
**workingday** - whether the day is neither a weekend nor holiday  
**weather** -  
1: Clear, Few clouds, Partly cloudy  
2: Mist + Cloudy, Mist + Broken clouds, Mist + Few clouds, Mist  
3: Light Snow, Light Rain + Thunderstorm + Scattered clouds, Light Rain + Scattered clouds  
4: Heavy Rain + Ice Pellets + Thunderstorm + Mist, Snow + Fog  

## Numeric
**temp** - temperature in Celsius  
**atemp** - "feels like" temperature in Celsius  
**humidity** - relative humidity  
**windspeed** - wind speed  
**casual** - number of non-registered user rentals initiated  
**registered** - number of registered user rentals initiated  
**count** - number of total rentals (casual + registered)

In [None]:
# Load the Kaggle bike-sharing train and test CSVs
train_path = '/kaggle/input/bike-sharing-demand/train.csv'
test_path = '/kaggle/input/bike-sharing-demand/test.csv'
train, test = pd.read_csv(train_path), pd.read_csv(test_path)
train.head()

In [None]:
# Preview the first rows of the test set
test.head()

In [None]:
# Column dtypes and non-null counts of the training set
train.info()

In [None]:
# Column dtypes and non-null counts of the test set
test.info()

In [None]:
# Summary statistics of the numeric training columns
train.describe()

# Visualize dataset
  * Boxplot and histogram of the 'count' column.
  * Scatter plots of the numeric columns against the 'count' column.
  * Boxplots of the 'count' column for each categorical column.
  * Correlation between all features.

In [None]:
# Distribution of the raw rental count as a horizontal boxplot
sns.boxplot(data=train, x='count', color='mediumpurple', ax=plt.gca())
plt.show()

In [None]:
# Histogram of count (the distribution is right-skewed)
sns.set_style('darkgrid')
# histplot(kde=True, stat='density') is the replacement for the deprecated
# sns.distplot and draws the same histogram + density curve.
sns.histplot(train['count'], bins=100, color='green', kde=True, stat='density')
plt.show()

In [None]:
# Scatter plots of count against each numeric feature.
# The columns are named explicitly rather than sliced by position, so the
# selection does not silently change if the column order does.
fields = ['temp', 'atemp', 'humidity', 'windspeed']
print(fields)

fig = plt.figure(figsize=(17, 3))

# One panel per numeric feature, laid out on a single row
for i, f in enumerate(fields, start=1):
    ax = fig.add_subplot(1, len(fields), i)
    ax.scatter(train[f], train['count'])
    ax.set_xlabel(f)
    ax.set_ylabel('count')

plt.show()

# 'temp' and 'atemp' look similar.

In [None]:
# Boxplots of count for each categorical feature (2 x 2 grid)
categorical_cols = ['season', 'holiday', 'workingday', 'weather']

fig, axes = plt.subplots(nrows=2, ncols=2)
fig.set_size_inches(20, 10)

# axes.flat walks the grid row by row, matching the order of categorical_cols
for ax, col in zip(axes.flat, categorical_cols):
    sns.boxplot(data=train, x=col, y='count', ax=ax)
    ax.set(xlabel=col.capitalize(), ylabel='Count')

# Render the figure explicitly so the cell does not echo the repr of the
# last expression.
plt.show()

In [None]:
# Pearson correlation between all numeric features.
# 'datetime' is still a string column at this point, and recent pandas
# versions raise when corr() is given non-numeric columns, so restrict the
# frame to numeric dtypes first.
numeric_train = train.select_dtypes(include='number')

plt.figure(figsize=(10, 10))
sns.heatmap(numeric_train.corr(method='pearson'),
            vmin=-1, vmax=1,
            cmap='coolwarm',
            annot=True,
            square=True)
plt.show()

# Data preprocessing
* What we have to do
  - Convert the 'datetime' column to datetime objects and split it into year, month, day, day of week and hour.
  - Take the log of the 'count' column to mitigate skew.
  - Eliminate outliers.
  - Apply MinMax scaling to the numeric columns.
  - Split the training data into train and validation sets.

In [None]:
# Parse the 'datetime' strings into pandas timestamps on both frames
# (the split into year/month/day/dayofweek/hour happens in split_datetime)
for frame in (train, test):
    frame['datetime'] = pd.to_datetime(frame['datetime'])
train.head()

In [None]:
def split_datetime(df):
    """Replace the 'datetime' column with separate calendar feature columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'datetime' column holding timestamps (or strings that
        ``pd.to_datetime`` can parse).

    Returns
    -------
    pandas.DataFrame
        A new frame with 'year', 'month', 'day', 'dayofweek' and 'hour'
        appended (in that order) and 'datetime' removed. The input frame is
        left untouched.
    """
    # No-op for an already-parsed column; also accepts raw timestamp strings.
    stamps = pd.to_datetime(df['datetime'])
    # The vectorised .dt accessor replaces a Python-level lambda per row.
    return df.assign(
        year=stamps.dt.year,
        month=stamps.dt.month,
        day=stamps.dt.day,
        dayofweek=stamps.dt.dayofweek,
        hour=stamps.dt.hour,
    ).drop(columns=['datetime'])

In [None]:
# Expand the timestamp into calendar features on both frames. The training
# frame also drops 'casual' and 'registered', which per the data description
# add up to the target 'count'.
train = split_datetime(train).drop(columns=['casual', 'registered'])
test = split_datetime(test)
train.head()

In [None]:
# Mean count per year and per month (bar plots), and per hour split by day of week (point plot)
fig, axes = plt.subplots(nrows=1, ncols=3)
fig.set_size_inches(25, 5)
sns.barplot(data=train, x='year', y='count', ax=axes[0])
sns.barplot(data=train, x='month', y='count', ax=axes[1])
sns.pointplot(data=train, x='hour', y='count', hue='dayofweek', ax=axes[2])
# Render explicitly so the cell does not echo the Axes repr.
plt.show()

In [None]:
# The count column is skewed.
# histplot(kde=True, stat='density') replaces the deprecated sns.distplot.
sns.histplot(train['count'], kde=True, stat='density')
plt.show()

In [None]:
# Replace count with log(1 + count) to reduce the skew.
# NOTE: running this cell twice applies the transform twice.
train = train.assign(count=np.log1p(train['count']))

In [None]:
# Distribution of count after the log1p transform.
# histplot(kde=True, stat='density') replaces the deprecated sns.distplot.
sns.histplot(train['count'], kde=True, stat='density')
plt.show()

In [None]:
# Keep only rows whose (log) count lies within 3 standard deviations of the mean
count_mean = train['count'].mean()
count_std = train['count'].std()
within_three_sigma = (train['count'] - count_mean).abs() <= 3 * count_std
train = train[within_three_sigma]

In [None]:
# Boxplot of the log-transformed count after the 3-sigma filter
sns.boxplot(data=train, x='count', color='mediumpurple', ax=plt.gca())
plt.show()

In [None]:
# Pairwise scatter plots of the numeric features, used to spot outliers.
# Grid layout: row i, column j shows fields[j] on the x-axis and fields[i]
# on the y-axis.
n_fields = len(fields)
fig = plt.figure(figsize=(15, 15))
for i, f1 in enumerate(fields):
    for j, f2 in enumerate(fields):
        ax = fig.add_subplot(n_fields, n_fields, i * n_fields + j + 1)
        # Plot the column feature on x and the row feature on y so that the
        # data agrees with the axis labels set below.
        ax.scatter(train[f2], train[f1])
        ax.set_xlabel(f2)
        ax.set_ylabel(f1)
plt.show()

In [None]:
# Drop rows where 20 < atemp < 40 while 10 < temp < 20.
# NOTE(review): the thresholds are presumably read off the temp/atemp scatter
# plot above — confirm before changing them.
atemp_in_band = train['atemp'].gt(20) & train['atemp'].lt(40)
temp_in_band = train['temp'].gt(10) & train['temp'].lt(20)
drop_idx = train.index[atemp_in_band & temp_in_band]
train = train.drop(index=drop_idx)

In [None]:
# Standard Scaling numeric columns
from sklearn.preprocessing import MinMaxScaler

def scaling(df):
    scaler = MinMaxScaler()
    num_cols = ['temp', 'atemp', 'humidity', 'windspeed']
    df[num_cols] = scaler.fit_transform(df[num_cols])
    return df

In [None]:
train = scaling(train)
test = scaling(test)

In [None]:
# Preview the training frame after scaling
train.head()

In [None]:
# Hold out 30% of the training data for validation
from sklearn.model_selection import train_test_split

# A fixed seed makes the split (and every score computed from it)
# reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    train.drop(['count'], axis=1),
    train['count'],
    test_size=0.3,
    random_state=42,
)

# Define metric (RMSLE)

In [None]:
"""
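np.log1p(x): log(0) diverges to negative infinity, so 1 is added to x before taking the log.
np.expm1(): subtracts 1 after exponentiating, which restores the original value.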
"""
def rmsle(y, pred):
    """Root mean squared logarithmic error between ``y`` and ``pred``.

    Both inputs are passed through ``np.log1p`` before the difference is
    taken; the result is the square root of the mean squared difference.
    """
    log_diff = np.log1p(y) - np.log1p(pred)
    return np.sqrt(np.mean(log_diff ** 2))

# Model selection

In [None]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

from sklearn.model_selection import GridSearchCV

In [None]:
def evaluate(reg_cls, params=None):
    """Fit a regressor on the training split and report RMSLE on the hold-out split.

    Parameters
    ----------
    reg_cls : class
        Regressor class; it is instantiated with no arguments.
    params : dict, optional
        Parameter grid. When given (and non-empty) the regressor is wrapped
        in ``GridSearchCV`` and the best parameters are printed.

    Returns
    -------
    tuple
        ``(estimator, pred_exp)``: the fitted estimator (the grid search's
        ``best_estimator_`` when ``params`` was used) and the hold-out
        predictions after ``np.expm1``.

    Reads the module-level ``X_train``, ``y_train``, ``X_test`` and ``y_test``.
    """
    use_search = bool(params)

    model = reg_cls()
    if use_search:
        model = GridSearchCV(model, param_grid=params, refit=True)
    model.fit(X_train, y_train)
    log_pred = model.predict(X_test)

    print('\n', reg_cls)
    if use_search:
        print(model.best_params_)
        model = model.best_estimator_

    # The target was log1p-transformed, so map both sides back before scoring.
    actual = np.expm1(y_test)
    pred_exp = np.expm1(log_pred)
    print(rmsle(actual, pred_exp))
    return model, pred_exp

In [None]:
# Baseline run of every candidate model with its default hyper-parameters
baseline_classes = [
    LinearRegression,
    Ridge,
    Lasso,
    RandomForestRegressor,
    GradientBoostingRegressor,
    XGBRegressor,
    LGBMRegressor,
]
(
    (lr_reg, pred_lr),
    (rg_reg, pred_rg),
    (ls_reg, pred_ls),
    (rf_reg, pred_rf),
    (gb_reg, pred_gb),
    (xg_reg, pred_xg),
    (lg_reg, pred_lg),
) = [evaluate(model_cls) for model_cls in baseline_classes]

# Grid-search n_estimators for the two boosted models; the tuned estimators
# replace the baseline xg_reg / lg_reg.
params = {'n_estimators': list(range(100, 600, 100))}
xg_reg, pred_xg = evaluate(XGBRegressor, params)
lg_reg, pred_lg = evaluate(LGBMRegressor, params)

In [None]:
def feature_importances(reg):
    """Plot ``reg.feature_importances_`` as a horizontal bar chart, largest first.

    Feature names are taken from the module-level ``X_train.columns``. The
    type of ``reg`` is printed before the chart.
    """
    plt.figure(figsize=(20, 10))
    print(type(reg))
    name_value_pairs = sorted(zip(X_train.columns, reg.feature_importances_))
    importance_df = pd.DataFrame(name_value_pairs, columns=['Feature', 'Value'])
    ranked = importance_df.sort_values(by="Value", ascending=False)
    sns.barplot(x="Value", y="Feature", data=ranked)
    plt.show()

In [None]:
# XGBRegressor feature importances
feature_importances(xg_reg)

In [None]:
# LGBMRegressor (LightGBM) feature importances
feature_importances(lg_reg)

# Submission

In [None]:
# Load the sample submission from the same absolute input directory used for
# train.csv / test.csv, and preview it instead of dumping the whole frame.
submission = pd.read_csv('/kaggle/input/bike-sharing-demand/sampleSubmission.csv')
submission.head()

In [None]:
# (rows, columns) of the preprocessed test set
test.shape

In [None]:
# Predict on the Kaggle test set with the grid-searched LightGBM model
pred = lg_reg.predict(test)
# Undo the log1p applied to the target. expm1 of a negative prediction is
# negative, and a rental count cannot be, so clip at zero.
pred_exp = np.clip(np.expm1(pred), 0, None)
print(pred_exp)

In [None]:
# Replace the whole column rather than writing through .loc[:, 'count']:
# .loc tries to keep the existing column's dtype, and recent pandas versions
# warn when float predictions are written into it that way.
submission['count'] = pred_exp
submission.head()

In [None]:
# Write the submission file without the index column
submission.to_csv('submission.csv', index=False)