In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found
for root, _, files in os.walk('/kaggle/input'):
    full_paths = (os.path.join(root, name) for name in files)
    for path in full_paths:
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Import Libraries

In [None]:
# Import libraries
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
%config InlineBackend.figure_formats = {'png', 'retina'}
sns.set(color_codes=True)

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from keras.wrappers.scikit_learn import KerasRegressor
from keras.models import Sequential
from keras.layers import Dense

from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

from sklearn.inspection import permutation_importance

import warnings
warnings.simplefilter('ignore')

## Load and Explore Dataset

In [None]:
# Read the training CSV from the Kaggle input directory
df = pd.read_csv(
    "/kaggle/input/bike-sharing-system-washington-dc/train_bikes.csv"
)
# Report the (rows, columns) shape, then preview the first rows
print("Data Shape: ", df.shape)
df.head()

In [None]:
# Print a concise summary of the DataFrame (columns, non-null counts, dtypes)
df.info()

In [None]:
# Show descriptive statistics for the DataFrame's columns
df.describe()

### Univariate Analysis

In [None]:
# Bar charts of how often each level occurs, one panel per categorical variable
categorical_cols = ['season', 'holiday', 'workingday', 'weather']
fig, ax = plt.subplots(nrows=1, ncols=4, sharey=True, figsize=(14,4))
for panel, col in zip(ax, categorical_cols):
    sns.countplot(x=col, data=df, palette='winter', ax=panel)
plt.tight_layout()
plt.show()

In [None]:
# Visualize the probability density of continuous variables
# NOTE(review): sns.distplot is deprecated since seaborn 0.11 (histplot/displot
# replace it); it is kept here because the installed seaborn version is not
# visible from this notebook.
continuous_cols = ['temp', 'atemp', 'windspeed', 'humidity',
                   'casual', 'registered', 'count']
fig, ax = plt.subplots(nrows=2, ncols=4, sharey=True, figsize=(14,6))
panels = ax.ravel()  # row-major order: top row first, then bottom row
for panel, col in zip(panels, continuous_cols):
    sns.distplot(df[col], color='royalblue', ax=panel)
# Seven variables on a 2x4 grid leave one panel unused; hide it so the figure
# does not show an empty framed axes
for panel in panels[len(continuous_cols):]:
    panel.set_visible(False)
plt.tight_layout()
plt.show()

### Segmented Univariate Analysis

In [None]:
# Distribution of bike counts within each level of every categorical variable
segment_cols = ['season', 'holiday', 'workingday', 'weather']
fig, ax = plt.subplots(nrows=1, ncols=4, sharey=True, figsize=(16,4))
for panel, col in zip(ax, segment_cols):
    sns.boxenplot(x=col, y='count', data=df, palette='winter', linewidth=0.1, ax=panel)
plt.tight_layout()
plt.show()

### Bivariate Analysis

In [None]:
# Scatter each continuous variable against the bike count, one panel per variable
numeric_cols = ['temp', 'atemp', 'windspeed', 'humidity', 'casual', 'registered']
fig, ax = plt.subplots(nrows=2, ncols=3, sharey=True, figsize=(16,8))
# ravel() walks the 2x3 grid row by row, matching the column order above
for panel, col in zip(ax.ravel(), numeric_cols):
    sns.scatterplot(x=col, y='count', data=df, color='royalblue', ax=panel)
plt.tight_layout()
plt.show()

## Data Preprocessing

In [None]:
# One-hot encode the categorical columns (the first level of each is dropped),
# then remove the columns that are not used as model inputs
encoded_cols = ['season','holiday','workingday','weather']
unused_cols = ['datetime','casual', 'registered']
df_dummies = (
    pd.get_dummies(df, drop_first=True, columns=encoded_cols)
    .drop(columns=unused_cols)
)
df_dummies.head()

In [None]:
# Visualize the pairwise correlation between the encoded variables

# Square 8x8-inch canvas for the correlation matrix
f, ax = plt.subplots(figsize=(8, 8))

# Colour scale is centred on 0 and saturates at 0.3; cell annotations are off
corr_matrix = df_dummies.corr()
heatmap_options = dict(cmap="coolwarm", vmax=.3, center=0, square=True,
                       linewidths=2.5, cbar_kws={"shrink": .5},
                       annot=False, fmt="1.1f")
sns.heatmap(corr_matrix, ax=ax, **heatmap_options)

In [None]:
# Separate the target column ('count') from the predictor columns
target_col = 'count'
y = df_dummies[target_col]
X = df_dummies.drop(columns=[target_col])

In [None]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=1,
)

In [None]:
# Standardize X

cols = X.columns

# Fit the scaler on the training split only, then apply that same transform to
# the test split
sc = StandardScaler()
X_train_scaled = sc.fit_transform(X_train)
X_test_scaled = sc.transform(X_test)

# Rebuild DataFrames with the original column names AND the original row index,
# so the scaled rows stay label-aligned with y_train / y_test (a bare
# pd.DataFrame(array) would replace the index with 0..n-1)
X_train = pd.DataFrame(X_train_scaled, columns=cols, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=cols, index=X_test.index)

## Regression by Deep Neural Network

### Define and Train DNN model

In [None]:
def reg_model():
    """Build and compile the feed-forward regression network.

    The input width is read from the global ``X_train`` (one input per column).
    Three ReLU hidden layers (20, 20 and 10 units) feed a single output unit
    that has no activation specified. The model is compiled with
    mean-squared-error loss and the Adam optimizer.

    Returns:
        The compiled ``Sequential`` model.
    """
    n_features = X_train.shape[1]
    layers = [
        Dense(20, input_dim=n_features, activation='relu'),
        Dense(20, activation='relu'),
        Dense(10, activation='relu'),
        Dense(1),
    ]
    network = Sequential(layers)
    network.compile(optimizer='adam', loss='mean_squared_error')
    return network

In [None]:
# Wrap the model builder in a scikit-learn style regressor
dnn = KerasRegressor(
    build_fn=reg_model,
    epochs=5000,
    batch_size=20,
    verbose=1,
)

# Train on the training split and keep the returned history object
dnn_history = dnn.fit(X_train, y_train)

In [None]:
# Visualize the DNN learning
loss_train = dnn_history.history['loss']
# Derive the x-axis from the recorded history instead of hard-coding the epoch
# count, so the plot stays valid if the number of training epochs changes
epochs = range(1, len(loss_train) + 1)
plt.figure(figsize=(8,6))
plt.plot(epochs, loss_train, 'royalblue', label='Training loss', linewidth=3)
plt.title('Training loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.show()

### Model Evaluation

In [None]:
# Predict on the held-out test split
dnn_y_pred = dnn.predict(X_test)

# Score the predictions against the true test targets
dnn_r2 = r2_score(y_test, dnn_y_pred)
dnn_mae = mean_absolute_error(y_test, dnn_y_pred)
dnn_mse = mean_squared_error(y_test, dnn_y_pred)

# Print each metric on its own line
metric_report = (
    ("DNN R2: ", dnn_r2),
    ("DNN MAE: ", dnn_mae),
    ("DNN MSE: ", dnn_mse),
)
for label, value in metric_report:
    print(label, value)

### Feature Importance

In [None]:
# Perform permutation importance.
# The network was trained on standardized features, so it must be scored on data
# in that same feature space: use the scaled, held-out test split rather than the
# raw, unscaled X. A fixed random_state makes the shuffles repeatable.
results = permutation_importance(
    dnn, X_test, y_test, scoring='neg_mean_squared_error', random_state=1
)
# Mean importance of each feature across the permutation repeats
importance = results.importances_mean

# Plot feature importance
df_importance = pd.DataFrame({
    'feature_importance': importance,
    'features': list(X_test.columns),
})
sns.barplot(x='feature_importance', y='features', data=df_importance)