In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Overview of Dataset

In [None]:
# Read the competition's train and test tables, in that order.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/bike-sharing-demand/train.csv',
        '/kaggle/input/bike-sharing-demand/test.csv',
    )
)

In [None]:
# Preview the first rows of the training table.
train.head()

In [None]:
# Preview the first rows of the test table.
test.head()

The test set does not contain the target column `count`, nor the `casual` and `registered` columns, which split the rentals between unregistered (casual) users and registered users.

In [None]:
# Report (rows, columns) for both tables.
train_shape, test_shape = train.shape, test.shape
print('Train shape:', train_shape)
print('Test shape:', test_shape)

In [None]:
# Column dtypes and non-null counts for the training table.
train.info()

In [None]:
# Column dtypes and non-null counts for the test table.
test.info()

# Data Manipulation

In [None]:
# Replace the numeric season codes in the training table with readable labels.
# NOTE: Series.map turns any value missing from the dict into NaN, so this cell
# must run exactly once on the freshly loaded data.
season_labels = {1: 'spring', 2: 'summer', 3: 'fall', 4: 'winter'}
train['season'] = train['season'].map(season_labels)

I replaced the numeric codes in the season column with season names so that the column is easier to understand.

And I did the same thing for weather column below.

In [None]:
# Replace the numeric weather codes in the training table with readable labels.
# NOTE: Series.map turns any value missing from the dict into NaN, so this cell
# must run exactly once on the freshly loaded data.
weather_labels = {1: 'clear', 2: 'cloudy', 3: 'drizzle', 4: 'rainstorm'}
train['weather'] = train['weather'].map(weather_labels)

In [None]:
# Apply the same season / weather relabelling to the test table.
# NOTE: Series.map turns any value missing from the dict into NaN, so this cell
# must run exactly once on the freshly loaded data.
test_label_maps = {
    'season': {1: 'spring', 2: 'summer', 3: 'fall', 4: 'winter'},
    'weather': {1: 'clear', 2: 'cloudy', 3: 'drizzle', 4: 'rainstorm'},
}
for column, labels in test_label_maps.items():
    test[column] = test[column].map(labels)

In [None]:
# Parse the datetime strings of both tables into pandas datetime values.
for frame in (train, test):
    frame['datetime'] = pd.to_datetime(frame['datetime'])

I converted the datetime column from string to datetime type so that we can extract more information from it, such as the month, day, and hour, for further analysis.

In [None]:
# Derive month, day-of-month and hour columns from the parsed datetime of each table.
for frame in (train, test):
    stamps = frame['datetime'].dt
    frame['month'] = stamps.month
    frame['day'] = stamps.day
    frame['hour'] = stamps.hour

# Data Visualization

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

### Count for season, holiday, workingday, and weather

In [None]:
# 2x2 grid of row counts per level of each categorical feature.
fig, ax = plt.subplots(2, 2, figsize=(15, 10))

# ax.flat walks the grid row by row: [0,0], [0,1], [1,0], [1,1].
for panel, column in zip(ax.flat, ['season', 'holiday', 'workingday', 'weather']):
    sns.countplot(x=column, data=train, ax=panel)

### Distribution of temp, atemp, humidity, and windspeed

In [None]:
# 2x2 grid of histograms for the continuous weather features.
fig, ax = plt.subplots(2, 2, figsize=(15, 10))

# Per-column histplot options; an empty dict keeps seaborn's default binning.
hist_specs = [
    ('temp', {}),
    ('atemp', {}),
    ('humidity', {'bins': 30}),
    ('windspeed', {'bins': 20}),
]
for panel, (column, options) in zip(ax.flat, hist_specs):
    sns.histplot(train[column], ax=panel, **options)

### Distribution for number of rented bikes

In [None]:
# Histogram of the hourly rental count with a tick every 100 rentals.
count_fig, count_ax = plt.subplots(figsize=(12, 6))

sns.histplot(train['count'], bins=40, ax=count_ax)
count_ax.set_xticks(range(0, 1001, 100))
count_ax.set_xlabel('Number of Rented Bikes');

In [None]:
# Box plot of the rental count on its own, to show spread and outliers.
# NOTE(review): `x='count'` together with `orient='v'` looks contradictory — confirm
# which orientation is intended.
sns.boxplot(data=train, x='count', orient='v');

### Box plots for count in different features

In [None]:
# 2x2 grid: distribution of the rental count within each level of every categorical feature.
fig, ax = plt.subplots(2, 2, figsize=(15, 12))

for panel, column in zip(ax.flat, ['season', 'holiday', 'workingday', 'weather']):
    sns.boxplot(data=train, y='count', x=column, orient='v', ax=panel)

In [None]:
# Rental count distribution by month, day of month and hour, one panel each.
fig, ax = plt.subplots(3, 1, figsize=(10, 15))

for panel, column in zip(ax, ['month', 'day', 'hour']):
    sns.boxplot(data=train, y='count', x=column, orient='v', ax=panel)

# Overlay the mean count per hour on the hourly box plot.
# The aggregated Series has one row per hour, so it must not be passed next to
# `data=train`: seaborn would align it to train's rows by index, which only gives
# the right picture if the first rows of train happen to be hours 0, 1, 2, ... in
# order. Plot the aggregate on its own instead. The box plot places its sorted
# categories at positions 0..n-1, and groupby returns hours sorted the same way,
# so the positional range lines up with the boxes.
hourly_mean = train.groupby('hour')['count'].mean()
sns.lineplot(x=list(range(len(hourly_mean))), y=hourly_mean.to_numpy(), ax=ax[2], color='r');

### Changes of count across time

In [None]:
# Reshape to long form: one row per (hour, user type) observation, where `variable`
# is 'casual' or 'registered' and `value` is the corresponding rental count.
hour_transformed = pd.melt(train[["hour","casual","registered"]], id_vars=['hour'], value_vars=['casual', 'registered'])

# Mean rentals per hour and user type. Select the `value` column explicitly:
# GroupBy.mean's first positional parameter is `numeric_only`, so the previous
# `.mean('value')` did not select a column, it just passed a truthy flag.
hour_aggregated = hour_transformed.groupby(['hour', 'variable'])['value'].mean().reset_index()

plt.figure(figsize=(10, 6))
sns.pointplot(data=hour_aggregated, y='value', x='hour', hue='variable');

In [None]:
# Mean rental count for every (hour, season) pair, as a flat table.
season_avg = train.groupby(['hour', 'season'], as_index=False)['count'].mean()

plt.figure(figsize=(10, 6))
sns.pointplot(data=season_avg, x='hour', y='count', hue='season');

In [None]:
# Mean rental count per calendar month, as a flat table.
month_avg = train.groupby(['month'], as_index=False)['count'].mean()

plt.figure(figsize=(10, 6))
sns.barplot(data=month_avg, x='month', y='count');

In [None]:
# Mean rental count for every (hour, workingday) pair, as a flat table.
workday_avg = train.groupby(['hour', 'workingday'], as_index=False)['count'].mean()

plt.figure(figsize=(10, 6))
sns.pointplot(data=workday_avg, x='hour', y='count', hue='workingday');

# Model Prediction

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Look at the training table again, now including the derived date columns.
train.head()

In [None]:
# get rid of some unused columns
X = train.drop(['datetime', 'casual', 'registered', 'count', 'day'], axis=1)
y = train['count']
test_df = test.drop(['datetime', 'day'], axis=1)

I think it is a good idea to treat month and hour as categorical features and to create dummy variables from them.

In [None]:
# Mark month and hour as categorical in both feature tables so that
# pd.get_dummies one-hot encodes them instead of leaving them numeric.
for frame in (X, test_df):
    for column in ('month', 'hour'):
        frame[column] = frame[column].astype('category')
X.head()

In [None]:
# One-hot encode the categorical columns; drop_first=True omits the first level
# of each encoded column.
X = pd.get_dummies(X, drop_first=True)
test_df = pd.get_dummies(test_df, drop_first=True)

# get_dummies only creates columns for the levels present in the frame it is given,
# so the two frames are not guaranteed to end up with the same columns. Force
# test_df onto X's columns and column order (dummies absent from the test data
# become all-zero, extras are dropped) so anything fitted on X can consume it.
test_df = test_df.reindex(columns=X.columns, fill_value=0)
X.head()

In [None]:
# Model the target on the log(1 + count) scale; np.expm1 is the exact inverse.
y_log = np.log1p(y)

I applied a log transform (`log1p`) to y so that its distribution is less skewed and more centered.

In [None]:
# Side-by-side histograms of the target before and after the log transform.
fig, ax = plt.subplots(1, 2, figsize=(15,5))

for panel, values, title in zip(ax, (y, y_log), ('Before log', 'After log')):
    sns.histplot(values, ax=panel)
    panel.set_title(title)

In [None]:
# Hold out 30% of the rows for validation; the fixed seed makes the split repeatable.
# The targets are the log1p-transformed counts.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y_log,
    test_size=0.3,
    random_state=87,
)

In [None]:
from sklearn.metrics import mean_squared_log_error


def RMSLE(y_true, y_pred):
    """Root mean squared logarithmic error.

    Parameters
    ----------
    y_true : array-like
        Reference target values.
    y_pred : array-like
        Predicted target values.

    Returns
    -------
    The square root of ``mean_squared_log_error(y_true, y_pred)``.
    """
    msle = mean_squared_log_error(y_true, y_pred)
    return np.sqrt(msle)

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Learn the min-max scaling from the training split only, then apply the same
# fitted scaler to the training, validation and test features.
scaler = MinMaxScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_val_scaled = scaler.transform(X_val)
test_scaled = scaler.transform(test_df)

### Linear Regression

In [None]:
# Fit ordinary least squares on the log1p-transformed target.
lr = LinearRegression().fit(X_train, y_train)

train_pred = lr.predict(X_train)
val_pred = lr.predict(X_val)

# Score on the original count scale. The target was built with np.log1p, so its
# inverse is np.expm1 (np.exp would score count + 1 instead of count).
# A linear model can predict below 0 in log space, which expm1 maps to a negative
# count; clip those to 0 because mean_squared_log_error rejects negative values.
# Arguments follow RMSLE's (y_true, y_pred) order.
train_score = RMSLE(np.expm1(y_train), np.clip(np.expm1(train_pred), 0, None))
val_score = RMSLE(np.expm1(y_val), np.clip(np.expm1(val_pred), 0, None))

print('Train score:', train_score)
print('validation score:', val_score)

### Random Forest

In [None]:
from sklearn.model_selection import GridSearchCV

# Hyperparameter grid: every combination of tree depth and forest size is tried.
params = {
    'max_depth': [20, 40, 60, 80],
    'n_estimators': [200, 400, 600],
}

# Cross-validated search over the grid using all available cores.
base_forest = RandomForestRegressor(random_state=87)
clf = GridSearchCV(base_forest, params, n_jobs=-1, verbose=1)
clf.fit(X_train, y_train)

In [None]:
# Parameter combination that scored best in the grid search.
clf.best_params_

In [None]:
# Refit a forest with the hyperparameters selected by the grid search, taken from
# clf.best_params_ so this cell stays in sync if the grid or the data changes.
rf = RandomForestRegressor(**clf.best_params_, random_state=87).fit(X_train, y_train)

train_pred = rf.predict(X_train)
val_pred = rf.predict(X_val)

# Score on the original count scale. The target was built with np.log1p, so its
# inverse is np.expm1 (np.exp would score count + 1 instead of count).
# Arguments follow RMSLE's (y_true, y_pred) order.
train_score = RMSLE(np.expm1(y_train), np.expm1(train_pred))
val_score = RMSLE(np.expm1(y_val), np.expm1(val_pred))

print('Train score:', train_score)
print('validation score:', val_score)

### Dense Layers

In [None]:
from tensorflow.keras.layers import Dense, Input, Dropout
from tensorflow.keras.models import Model

In [None]:
# Fully connected regression network: five ReLU hidden layers of decreasing width
# followed by a single linear output unit.
i = Input(shape=(X_train.shape[1],))

x = i
for units in (1024, 512, 256, 128, 64):
    x = Dense(units, activation='relu')(x)
x = Dense(1)(x)

model = Model(inputs=i, outputs=x)

In [None]:
from tensorflow.keras.optimizers import Adam

In [None]:
# Minimise mean squared error on the log-scale target with Adam.
# The network is fit on X_train / X_val as they are, not on the min-max scaled copies.
optimizer = Adam(learning_rate=0.001)
model.compile(optimizer=optimizer, loss='mse')
model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=50)

In [None]:
# Training RMSLE on the original count scale.
# np.expm1 inverts the log1p applied to the target (np.exp would score count + 1).
# The network's output is unbounded, so clip back-transformed predictions at 0:
# mean_squared_log_error rejects negative values.
RMSLE(np.expm1(y_train), np.clip(np.expm1(model.predict(X_train).ravel()), 0, None))

In [None]:
# Validation RMSLE on the original count scale.
# np.expm1 inverts the log1p applied to the target (np.exp would score count + 1).
# The network's output is unbounded, so clip back-transformed predictions at 0:
# mean_squared_log_error rejects negative values.
RMSLE(np.expm1(y_val), np.clip(np.expm1(model.predict(X_val).ravel()), 0, None))

In [None]:
# Predict rental counts for the test table.
# The network was fit on the unscaled features (X_train), so it must be given the
# unscaled test features too; feeding it the min-max scaled matrix would put every
# input on a different scale from the one seen in training.
# np.expm1 inverts the log1p applied to the target (np.exp would make every
# prediction too high by 1), and counts cannot be negative, so clip at 0.
ann_pred = np.clip(np.expm1(model.predict(test_df).ravel()), 0, None)

In [None]:
# Keep the timestamps of the test rows; they identify each row in the submission.
test_datetime = test['datetime']

In [None]:
# Two-column submission table: test timestamp and the network's predicted count.
submission_columns = {
    'datetime': test_datetime,
    'count': ann_pred,
}
submission = pd.DataFrame(submission_columns)

In [None]:
# Write the submission to the working directory without the DataFrame index column.
submission.to_csv('bike_ann_predictions.csv', index=False)