<div class="alert alert-dark" role="alert">
    <h1 align = 'center'>Bike Sharing Demand</h1>
    <h2 align = 'center'>predict bike sharing demand with machine learning models</h2>
    <hr>
</div>

# 1: Importing the libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn import metrics
from sklearn.preprocessing import PolynomialFeatures
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

# 2: Load dataset

In [None]:
# train.csv becomes the working frame `df`; test.csv is kept aside as
# `unseen_data` and is only used in the final prediction section.
df = pd.read_csv('../input/bike-sharing-demand/train.csv')
unseen_data = pd.read_csv('../input/bike-sharing-demand/test.csv')
df.head(5)

# 3: EDA

In [None]:
# (rows, columns) of the training frame.
df.shape

In [None]:
# Per-column dtype and non-null count, used to spot missing values.
df.info()

In [None]:
# Number of rows that are exact duplicates of an earlier row.
df.duplicated().sum()

<div class="alert alert-light" role="alert">
    <p align = 'center'>No missing or duplicated values were found.</p>
</div>

In [None]:
# Column labels of the training frame.
df.columns  

In [None]:
# dtypes before any conversion; `datetime` is parsed in the next cell.
df.dtypes

In [None]:
# Parse the timestamp strings, then confirm the element type of the first row.
parsed_datetime = pd.to_datetime(df['datetime'])
df['datetime'] = parsed_datetime
type(df['datetime'].iloc[0])

In [None]:
# Break the timestamp into calendar parts, one numeric column each.
stamp = df['datetime'].dt
df['year'] = stamp.year
df['month'] = stamp.month
df['day_of_month'] = stamp.day
df['day_of_week'] = stamp.day_of_week
df['hour'] = stamp.hour

# The raw timestamp is no longer needed once decomposed. `casual` and
# `registered` are dropped with it -- presumably because they are components
# of the `count` target; verify against the data description.
df = df.drop(columns=['datetime', 'casual', 'registered'])

df.dtypes

In [None]:
# Distinct values per column; low counts indicate categorical features.
df.nunique()

In [None]:
# Summary statistics for the columns treated as continuous (target included).
cont_columns = ['temp', 'atemp', 'humidity', 'windspeed', 'count']
df[cont_columns].describe()

In [None]:
# Total rentals per season; only the `count` column is aggregated.
df.groupby('season')['count'].sum().to_frame()

<div class="alert alert-light" role="alert">
    <p align = 'center'>most bikes are used in fall.</p>
</div>

In [None]:
# Share of records in each weather category, normalised within each season row.
pd.crosstab(index=df['season'], columns=df['weather'], normalize='index')

<div class="alert alert-light" role="alert">
    <p align = 'center'>As expected, most of the recorded hours fall under weather 1 (Clear, Few clouds, Partly cloudy). Note that this table counts records, not riders.</p>
</div>

In [None]:
# Number of rows for each value of the `workingday` flag.
df['workingday'].value_counts()

In [None]:
# Pairwise correlation matrix over all remaining columns; `corr` is reused by
# the next two cells.
corr = df.corr() 
corr

In [None]:
# Correlation of every column with the target, sorted ascending.
corr['count'].sort_values()

In [None]:
# Annotated heatmap of the correlation matrix on an explicit square canvas.
fig, ax = plt.subplots(figsize=(15, 15))
sns.heatmap(corr, annot=True, square=True, cmap='GnBu', ax=ax)

# 4: Data preprocessing

#### 4.1: split data into train and test sets

In [None]:
# Features are every column except the target; the target is `count`.
x, y = df.drop(columns='count'), df['count']

# 80/20 hold-out split with a fixed seed so the split is reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=1
)
print('x train :', x_train.shape, '\t\tx test :', x_test.shape)
print('y train :', y_train.shape, '\t\ty test :', y_test.shape)

In [None]:
# Keep only the raw values of the hold-out target (drops the pandas index).
y_test = y_test.values

#### 4.2: feature scaling

In [None]:
# Fit the min-max scaler on the training split only, then apply it to both
# splits. The fitted `mms` is reused later on the unseen competition data.
mms = MinMaxScaler()
x_train = mms.fit_transform(x_train)
x_test = mms.transform(x_test)

In [None]:
# Standardise the already min-max-scaled features (fitted on train only).
# The fitted `sc` is reused later on the unseen competition data.
# NOTE(review): StandardScaler after MinMaxScaler looks redundant -- confirm
# that both steps are intended.
sc = StandardScaler()
x_train = sc.fit_transform(x_train)
x_test = sc.transform(x_test)

# 5: Storytelling - Visualization

In [None]:
# Distribution of hourly rentals for each weekday.
fig, ax = plt.subplots(figsize=(10, 5))
bp1 = sns.boxplot(data=df, x='day_of_week', y='count', palette='GnBu_r', ax=ax)
# pandas encodes day_of_week as 0 = Monday ... 6 = Sunday.
day_of_week = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
bp1.set_xticklabels(day_of_week, rotation=45)
plt.title('boxplot for each day')

In [None]:
# Same weekday boxplot, split by the `workingday` flag.
plt.figure(figsize=(10, 5))
ax = sns.boxplot(
    data=df,
    x='day_of_week',
    y='count',
    hue='workingday',
    palette='GnBu_r',
)
_ = ax.set_xticklabels(day_of_week, rotation=45)

In [None]:
# Number of records per holiday flag, split by weather category.
# `holiday` is encoded 0 = regular day, 1 = holiday. The bar order is pinned
# to [0, 1] so the tick labels below are guaranteed to match: 0 -> 'No',
# 1 -> 'Yes'.
cntplt = sns.countplot(data = df, x = 'holiday', order = [0, 1], palette = 'GnBu' , hue = 'weather')
_ = cntplt.set_xticklabels(labels = ['No', 'Yes'])

In [None]:
# Four-colour palette (assumes four `season` levels); later plotting cells
# also index into `palette1`.
palette1 = ['#b5e48c','#52b69a', '#1a759f', '#184e77']
# Temperature against wind speed for the 2011 rows only, coloured by season.
sns.jointplot(data = df[df['year'] == 2011], x = 'temp', y = 'windspeed', hue = 'season', palette = palette1)

In [None]:
# Mean rentals for each hour of the day.
count_hour = df.groupby('hour')['count'].mean()

plt.figure(figsize=(10, 5))
sns.barplot(x=count_hour.index, y=count_hour.values, palette='GnBu')

In [None]:
# Mean rentals for each calendar month.
count_month = df.groupby('month')['count'].mean()

plt.figure(figsize=(10, 5))
brplt = sns.barplot(x=count_month.index, y=count_month.values, palette='GnBu')
month_labels = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
a = brplt.set_xticklabels(labels=month_labels)

In [None]:
# Mean rentals for each day of the month.
count_dayofmonth = df.groupby('day_of_month')['count'].mean()

plt.figure(figsize=(10, 5))
brplt = sns.barplot(
    x=count_dayofmonth.index,
    y=count_dayofmonth.values,
    palette='GnBu',
)

# 6: Train your model (Regression)

In [None]:
def drawGraph(ytest, ypred, limit, title):
    """Plot actual against predicted values for the first `limit` samples.

    Parameters
    ----------
    ytest : sequence
        True target values.
    ypred : sequence
        Predicted target values, aligned with `ytest`.
    limit : int
        Number of leading samples to draw from each sequence.
    title : str
        Figure title.

    The line colours come from the notebook-level `palette1`.
    """
    plt.figure(figsize= (10, 5))
    # Plot the arguments that were passed in, not the notebook-level
    # y_test / y_pred globals, so the figure always shows what the caller asked for.
    plt.plot(ytest[:limit], color = palette1[1])
    plt.plot(ypred[:limit], color = palette1[2])
    plt.title(title)
    plt.legend(['test set', 'predicted'])

In [None]:
# Running comparison table: one row of hold-out metrics per model.
mt = pd.DataFrame(columns = ['model name', 'MAE', 'MSE','R2', 'ME'])

def modelsTabel(modelName, mae, mse, r2s, me):
    """Record one model's metrics in the notebook-level table `mt`.

    Parameters
    ----------
    modelName : str
        Label stored in the 'model name' column.
    mae, mse, r2s, me : float
        Mean absolute error, mean squared error, R^2 score and max error.

    If a row for `modelName` already exists it is overwritten, so re-running
    a model's cell does not append a duplicate row. Otherwise a new row is
    appended at the end of the table.
    """
    row = [modelName, mae, mse, r2s, me]
    name_matches = (mt['model name'] == modelName).to_numpy()
    existing = mt.index[name_matches]
    # Rows are only ever added at label len(mt), so that label is always free.
    target = existing[0] if len(existing) else len(mt.index)
    mt.loc[target] = row

#### 6.1: Linear Regression

In [None]:
# Ordinary least squares on the scaled training features.
lr = LinearRegression().fit(x_train, y_train)
y_pred = lr.predict(x_test)

In [None]:
# Score the current predictions on the hold-out split.
mae, mse, r2s, me = (
    score(y_test, y_pred)
    for score in (
        metrics.mean_absolute_error,
        metrics.mean_squared_error,
        metrics.r2_score,
        metrics.max_error,
    )
)

for label, value in (
    ('Mean Absolute Error:', mae),
    ('Mean Squared Error:', mse),
    ('R^ 2 Score:', r2s),
    ('Max Error:', me),
):
    print(label, value)

In [None]:
# Log the metrics in the comparison table and plot the first 100 test points.
modelsTabel('Linear Regression', mae, mse, r2s, me)
drawGraph(y_test, y_pred, 100, 'Linear Regression Model')

#### 6.2: Polynomial Regression

In [None]:
# Degree-2 polynomial expansion of the scaled features, followed by a linear fit.
poly = PolynomialFeatures(degree = 2)
x_train_qua = poly.fit_transform(x_train)
qua = LinearRegression()
qua.fit(x_train_qua, y_train)
# Only transform the test split: the expansion was fitted on the training
# data and must never be refitted on held-out data.
y_pred = qua.predict(poly.transform(x_test))

In [None]:
# Score the current predictions on the hold-out split.
mae, mse, r2s, me = (
    score(y_test, y_pred)
    for score in (
        metrics.mean_absolute_error,
        metrics.mean_squared_error,
        metrics.r2_score,
        metrics.max_error,
    )
)

for label, value in (
    ('Mean Absolute Error:', mae),
    ('Mean Squared Error:', mse),
    ('R^ 2 Score:', r2s),
    ('Max Error:', me),
):
    print(label, value)

In [None]:
# Log the metrics in the comparison table and plot the first 100 test points.
modelsTabel('polynomial Regression', mae, mse, r2s, me)
drawGraph(y_test, y_pred, 100, 'polynomial Regression Model')

#### 6.3: KNN

In [None]:
# Baseline k-nearest-neighbours regressor with the library's default settings.
knn = KNeighborsRegressor().fit(x_train, y_train)
y_pred = knn.predict(x_test)

In [None]:
# Score the current predictions on the hold-out split.
mae, mse, r2s, me = (
    score(y_test, y_pred)
    for score in (
        metrics.mean_absolute_error,
        metrics.mean_squared_error,
        metrics.r2_score,
        metrics.max_error,
    )
)

for label, value in (
    ('Mean Absolute Error:', mae),
    ('Mean Squared Error:', mse),
    ('R^ 2 Score:', r2s),
    ('Max Error:', me),
):
    print(label, value)

In [None]:
# Sweep k = 1..14 and collect the hold-out R^2 for each value.
# Despite its name, `error_rate` holds R^2 scores, so higher is better.
error_rate = []
for i in range(1, 15):
    knn = KNeighborsRegressor(n_neighbors=i).fit(x_train, y_train)
    error_rate.append(metrics.r2_score(y_test, knn.predict(x_test)))

In [None]:
# R^2 against k; the x axis starts at 1 to match the sweep above.
plt.figure(figsize=(10, 6))
k_values = range(1, len(error_rate) + 1)
a = plt.plot(
    k_values,
    error_rate,
    color=palette1[0],
    linestyle='dashed',
    marker='o',
    markerfacecolor=palette1[2],
)

In [None]:
# Refit with k = 4 and score on the hold-out split.
knn = KNeighborsRegressor(n_neighbors=4).fit(x_train, y_train)
y_pred = knn.predict(x_test)

mae, mse, r2s, me = (
    score(y_test, y_pred)
    for score in (
        metrics.mean_absolute_error,
        metrics.mean_squared_error,
        metrics.r2_score,
        metrics.max_error,
    )
)

for label, value in (
    ('Mean Absolute Error:', mae),
    ('Mean Squared Error:', mse),
    ('R^ 2 Score:', r2s),
    ('Max Error:', me),
):
    print(label, value)

In [None]:
# Log the metrics in the comparison table and plot the first 100 test points.
modelsTabel('KNN', mae, mse, r2s, me)
drawGraph(y_test, y_pred, 100, 'KNN')

#### 6.4: Decision Tree

In [None]:
# Decision tree with a fixed seed; no other hyper-parameters are set.
dtr = DecisionTreeRegressor(random_state = 1)
dtr.fit(x_train, y_train)

In [None]:
# Predict with the fitted tree and score on the hold-out split.
y_pred = dtr.predict(x_test)

mae, mse, r2s, me = (
    score(y_test, y_pred)
    for score in (
        metrics.mean_absolute_error,
        metrics.mean_squared_error,
        metrics.r2_score,
        metrics.max_error,
    )
)

for label, value in (
    ('Mean Absolute Error:', mae),
    ('Mean Squared Error:', mse),
    ('R^ 2 Score:', r2s),
    ('Max Error:', me),
):
    print(label, value)

In [None]:
# Log the metrics in the comparison table and plot the first 100 test points.
modelsTabel('Decision Tree', mae, mse, r2s, me)
drawGraph(y_test, y_pred, 100, 'Decision Tree')

#### 6.5: Random Forest

In [None]:
# Baseline random forest with 10 trees and a fixed seed.
rfr = RandomForestRegressor(random_state = 1, n_estimators = 10)
rfr.fit(x_train, y_train)
y_pred = rfr.predict(x_test)
# Recompute R^2 for this model's predictions; otherwise the value left over
# from the previous model's cell would be printed.
r2s = metrics.r2_score(y_test, y_pred)
print('R^ 2 Score:', r2s)

In [None]:
# Plot the first 100 test points for the 10-tree baseline forest.
drawGraph(y_test, y_pred, 100, 'Random Forest')

In [None]:
# Find the n_estimators value in [10, 60) with the highest hold-out R^2.
# The winner is stored in `index` and used by the next cell.
# NOTE(review): the choice is made on the same x_test / y_test split that is
# later used for reporting, so the reported scores are optimistic; a separate
# validation split or cross-validation would avoid that.
best_r2 = float('-inf')  # start below any achievable score, since R^2 can be negative
index = -1
for i in range(10, 60):
    rfr = RandomForestRegressor(random_state = 1, n_estimators = i)
    rfr.fit(x_train, y_train)
    y_pred = rfr.predict(x_test)
    candidate_r2 = metrics.r2_score(y_test, y_pred)
    # Strict '>' keeps the smallest forest when several candidates tie.
    if candidate_r2 > best_r2:
        index = i
        best_r2 = candidate_r2


In [None]:
# Refit with the selected number of trees and score on the hold-out split.
rfr = RandomForestRegressor(random_state=1, n_estimators=index).fit(x_train, y_train)
y_pred = rfr.predict(x_test)

mae, mse, r2s, me = (
    score(y_test, y_pred)
    for score in (
        metrics.mean_absolute_error,
        metrics.mean_squared_error,
        metrics.r2_score,
        metrics.max_error,
    )
)

for label, value in (
    ('Mean Absolute Error:', mae),
    ('Mean Squared Error:', mse),
    ('R^ 2 Score:', r2s),
    ('Max Error:', me),
):
    print(label, value)

In [None]:
# Log the metrics in the comparison table and plot the first 100 test points.
modelsTabel('Random Forest', mae, mse, r2s, me)
drawGraph(y_test, y_pred, 100, 'Random Forest')

In [None]:
# Side-by-side metrics for every model recorded above.
mt

# 7: Predict Unseen Data

In [None]:
# Parse the timestamps and keep a copy in `times`: the column is dropped from
# the feature frame below but is needed again for the submission file.
unseen_data['datetime'] = pd.to_datetime(unseen_data['datetime'])
times = unseen_data['datetime']

In [None]:
# Derive the same calendar features, in the same order, as for the training frame.
unseen_stamp = unseen_data['datetime'].dt
unseen_data['year'] = unseen_stamp.year
unseen_data['month'] = unseen_stamp.month
unseen_data['day_of_month'] = unseen_stamp.day
unseen_data['day_of_week'] = unseen_stamp.day_of_week
unseen_data['hour'] = unseen_stamp.hour

unseen_data = unseen_data.drop(columns=['datetime'])
unseen_data.head(3)

In [None]:
# Apply the two scalers fitted on the training split, in the same order,
# then predict with the last fitted random forest.
unseen_data = sc.transform(mms.transform(unseen_data))
y_pred_unseen = rfr.predict(unseen_data)

In [None]:
# Pair each original timestamp with its prediction and write the result,
# with `datetime` as the index column of the CSV.
Submission = pd.DataFrame({'datetime': times, 'count': y_pred_unseen}).set_index('datetime')
Submission.to_csv('Submission.csv')