In [None]:
#Importing require libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings("ignore")

#Setting Format
pd.options.display.float_format = '{:.5f}'.format
pd.options.display.max_columns = None
pd.options.display.max_rows = None
np.random.seed(100)

In [None]:
# Importing our data

day = pd.read_csv("../input/bike-sharing-dataset/day.csv")
hour = pd.read_csv("../input/bike-sharing-dataset/hour.csv")

In [None]:
day.head()

In [None]:
hour.head()

In [None]:
hour.info()

In [None]:
#Description of our data

with open('../input/bike-sharing-dataset/Readme.txt', 'r') as txt:
    print(txt.read())

In [None]:
# According to the dataset description these columns were transformed (scaled down),
# so multiply each one back by its factor to read the values on their real scale.
_RESCALE_FACTORS = {'temp': 41, 'atemp': 50, 'hum': 100, 'windspeed': 67}

for _feature, _factor in _RESCALE_FACTORS.items():
    # Same operation on both frames; note this cell is not idempotent (re-running rescales again).
    day[_feature] = day[_feature] * _factor
    hour[_feature] = hour[_feature] * _factor

In [None]:
day.head()

In [None]:
hour.info() #Checking data type

In [None]:
day.isna().sum() # is their any Null vales?

In [None]:
hour.isna().sum()# is their any Null vales?

In [None]:
hour.info()

In [None]:
day.describe().T #Lets look at Mean, median and Standard Deviation

In [None]:
# These columns should be Category not int

col = ['season', 'yr', 'mnth', 'hr', 'holiday', 'weekday',
       'workingday', 'weathersit']

In [None]:
def change_dtype(data, col):
    """Cast the listed columns of ``data`` to the pandas ``category`` dtype, in place.

    Parameters
    ----------
    data : DataFrame that is modified in place.
    col : iterable of column names; names that are not columns of ``data`` are skipped.

    Returns
    -------
    None
    """
    available = data.columns.to_list()
    for column_name in (candidate for candidate in col if candidate in available):
        data[column_name] = data[column_name].astype('category')

In [None]:
for i in col:
    print("Name of {} col".format(i)) # Column name
    print("No. of NUnique", hour[i].nunique()) # Number of distinct values
    print("Unique Values", hour[i].unique()) # The distinct values themselves
    print('*'*30) # Visual separator between columns
    print()
    print()

In [None]:
change_dtype(day, col) #Changing Col
change_dtype(hour, col) #Changing Col

In [None]:
# Same summary as before, now that the columns have been cast to category

for i in col:
    print("Name of {} col".format(i)) # Column name
    print("No. of NUnique", hour[i].nunique()) # Number of distinct values
    print("Unique Values", hour[i].unique()) # The distinct values themselves
    print('*'*30) # Visual separator between columns
    print()
    print()

In [None]:
def drop_instant(data):
    """Remove the ``instant`` column from ``data`` in place.

    Raises ``KeyError`` if the column is not present (e.g. when the cell is run twice).
    Returns ``None``.
    """
    data.drop(columns=['instant'], inplace=True)
    
drop_instant(day)
drop_instant(hour)

In [None]:
day.describe().T

The mean and median in the "**Day**" data are approximately equal, except for "**Casual**".

In [None]:
hour.describe().T

The mean and median in the "**hour**" data are approximately equal, except for "**Casual, registered and cnt**".

## Let's do some Visualization

In [None]:
for i in day.select_dtypes(include='int'):
    sns.distplot(day[i]) #Lets check how data is distributed
    plt.show() 

In [None]:
for i in hour.select_dtypes(include='int'):
    sns.distplot(hour[i]) #Lets check how data is distributed
    plt.show()

In [None]:
for i in day.select_dtypes(include='int'):
    sns.boxplot(day[i]) #Is their any outlier
    plt.show()

In [None]:
for i in hour.select_dtypes(include='int'):
    sns.boxplot(hour[i]) #Is their any outlier
    plt.show()

Casual, Registered and cnt have outliers that need to be fixed.

Point to be noted that cnt is sum total of casual and Registered.

In [None]:
for i in day.select_dtypes(include='float'):
    sns.distplot(day[i])
    plt.show()

In [None]:
for i in hour.select_dtypes(include='float'):
    sns.distplot(hour[i])
    plt.show()

In [None]:
for i in day.select_dtypes(include='float'):
    sns.boxplot(day[i])
    plt.show()

In [None]:
for i in hour.select_dtypes(include='float'):
    sns.boxplot(hour[i])
    plt.show()

In [None]:
# How the numeric columns relate to each other.
# Non-numeric columns (the date string, the category columns) are excluded explicitly:
# recent pandas versions no longer drop them silently inside corr() and raise instead.
sns.heatmap(day.select_dtypes(include='number').corr())

In [None]:
# Correlation of every numeric column with the target variable (cnt).
# Restrict to numeric columns first so corr() does not fail on the date/category columns.
day.select_dtypes(include='number').corr()['cnt']

In [None]:
# Correlation of every numeric column with the target variable (cnt).
# Restrict to numeric columns first so corr() does not fail on the date/category columns.
hour.select_dtypes(include='number').corr()['cnt']

In [None]:
day.head()

In [None]:
def get_df_name(df):
    '''
    Return the name of the global variable that refers to ``df``.

    The lookup is by identity (``is``) over this module's globals. Names that
    start with an underscore (such as the interactive output-cache aliases
    ``_``, ``__``, ``_12``) are only used when no regular name matches, so the
    user-chosen variable name wins. If no global refers to ``df`` at all
    (for example when a temporary copy is passed in) the placeholder
    ``'unnamed'`` is returned instead of raising ``IndexError``.
    '''
    # Snapshot the items so the scan is unaffected by names created while iterating
    matches = [name for name, value in list(globals().items()) if value is df]
    public_matches = [name for name in matches if not name.startswith('_')]
    if public_matches:
        return public_matches[0]
    if matches:
        return matches[0]
    return 'unnamed'


def plot_stack_bar_chart(data, col, name):
    """Draw a stacked bar chart of total casual and registered counts per value of ``col``.

    Parameters
    ----------
    data : DataFrame with ``casual`` and ``registered`` columns plus the grouping column.
    col : name of the column to group by (one bar per distinct value).
    name : tick labels, given in the order of ``data[col].unique()``
        (first-appearance order), one label per distinct value.

    The bar positions are taken from the grouped result itself, so every bar's
    height is the total of the group it is drawn at. (Using ``unique()`` for the
    positions paired first-appearance order with the sorted order returned by
    ``groupby`` and put totals over the wrong group.)
    """
    # Aggregate once; the index of the result holds the group keys in groupby order
    totals = data.groupby(col)[['casual', 'registered']].sum()
    positions = list(totals.index)
    casual_totals = totals['casual'].to_numpy()
    registered_totals = totals['registered'].to_numpy()

    plt.figure(figsize=(12,8)) # Size of the plot
    p1 = plt.bar(positions, casual_totals) # Casual total per group
    p2 = plt.bar(positions, registered_totals, # Registered total per group ...
                 bottom=casual_totals)         # ... stacked on top of the casual bar

    plt.ylabel('Count')
    plt.title("Count by Casual and Registered for each {} in {} Data".format(col, get_df_name(data)))
    # Tick i sits at the i-th distinct value and carries the i-th label, as callers expect
    plt.xticks(list(data[col].unique()), name)
    plt.legend((p1[0], p2[0]), ('Casual', 'Registered')) # One legend entry per stacked series
    plt.show()

# Let's Visualize and understand **Day** Data

In [None]:
plot_stack_bar_chart(day, 'season', ('1:springer', '2:summer', '3:fall', '4:winter'))

In [None]:
plot_stack_bar_chart(day, 'yr', ('2011', '2012'))

In [None]:
plot_stack_bar_chart(day, 'mnth', [str(i) for i in day['mnth'].unique()])

In [None]:
# Labels are built from the flag values themselves (in unique() order, as the plotting
# function expects) so that 0 is always shown as 'No' and 1 as 'Yes'.
plot_stack_bar_chart(day, 'holiday', [{0: 'No', 1: 'Yes'}[flag] for flag in day['holiday'].unique()])

In [None]:
plot_stack_bar_chart(day, 'weekday', [str(i) for i in day['weekday'].unique()])

In [None]:
# Labels are built from the flag values themselves (in unique() order, as the plotting
# function expects) so that 0 is always shown as 'No' and 1 as 'Yes'.
plot_stack_bar_chart(day, 'workingday', [{0: 'No', 1: 'Yes'}[flag] for flag in day['workingday'].unique()])

In [None]:
# Map each weather code to its label in unique() order, so the label count always equals
# the number of codes present in this frame and each code gets its own label.
plot_stack_bar_chart(day, 'weathersit',
                     [{1: 'Clear', 2: 'Mist', 3: 'Light Snow', 4: 'Rain'}[code]
                      for code in day['weathersit'].unique()])

# Let's Visualize and understand Hour Data

In [None]:
plot_stack_bar_chart(hour, 'season', ('1:springer', '2:summer', '3:fall', '4:winter'))

In [None]:
plot_stack_bar_chart(hour, 'yr', ('2011', '2012'))

In [None]:
plot_stack_bar_chart(hour, 'mnth', [str(i) for i in hour['mnth'].unique()])

In [None]:
plot_stack_bar_chart(hour, 'hr', [str(i) for i in hour['hr'].unique()])

In [None]:
# Labels are built from the flag values themselves (in unique() order, as the plotting
# function expects) so that 0 is always shown as 'No' and 1 as 'Yes'.
plot_stack_bar_chart(hour, 'holiday', [{0: 'No', 1: 'Yes'}[flag] for flag in hour['holiday'].unique()])

In [None]:
plot_stack_bar_chart(hour, 'weekday', [str(i) for i in hour['weekday'].unique()])

In [None]:
# Labels are built from the flag values themselves (in unique() order, as the plotting
# function expects) so that 0 is always shown as 'No' and 1 as 'Yes'.
plot_stack_bar_chart(hour, 'workingday', [{0: 'No', 1: 'Yes'}[flag] for flag in hour['workingday'].unique()])

In [None]:
# Map each weather code to its label in unique() order, so the label count always equals
# the number of codes present in this frame and each code gets its own label.
plot_stack_bar_chart(hour, 'weathersit',
                     [{1: 'Clear', 2: 'Mist', 3: 'Light Snow', 4: 'Rain'}[code]
                      for code in hour['weathersit'].unique()])

In [None]:
plt.figure(figsize=(12,8))
sns.barplot(x = hour['holiday'], y = hour['cnt'],hue = hour['season'])
plt.title('Holiday wise distribution of counts')
plt.show()

In [None]:
plt.figure(figsize=(12,8))
sns.barplot(x = hour['workingday'], y = hour['cnt'],hue = hour['season'])
plt.title('Working Day wise distribution of counts')
plt.show()

In [None]:
plt.figure(figsize=(18,10))
sns.barplot(x = hour['mnth'], y = hour['cnt'], hue = hour['season'])
plt.title('Month wise distribution of counts')
plt.show()

In [None]:
plt.figure(figsize=(12,8))
sns.barplot(x = hour['weathersit'], y = hour['cnt'],hue = hour['season'])
plt.title('Weather Situation wise distribution of counts')
plt.show()

# Finding and Replacing outliers

In [None]:
sns.boxplot(hour['hum'])

In [None]:
sns.boxplot(hour['windspeed'])

In [None]:
hour.describe(include='all').T

In [None]:
def treat_outlier_iqr(data, col):
    """Replace IQR outliers of ``data[col]`` with the column mean, in place, and plot the result.

    A value is an outlier when it lies below ``Q1 - 1.5*IQR`` or above
    ``Q3 + 1.5*IQR``. The outlier mask and the replacement mean are both
    computed from the column as it is on entry, so low and high outliers
    receive the same fill value and a freshly filled value cannot be
    re-classified by a later step.

    Parameters
    ----------
    data : DataFrame that is modified in place.
    col : name of the numeric column to treat.

    Returns
    -------
    Whatever ``sns.boxplot`` returns for the treated column.
    """
    values = data[col]

    # 25th and 75th percentiles and the inter-quartile range
    q25, q75 = np.percentile(values, 25), np.percentile(values, 75)
    iqr = q75 - q25

    # Accepted range
    min_r, max_r = q25 - (iqr * 1.5), q75 + (iqr * 1.5)

    # Decide everything before writing anything back
    outlier_mask = (values < min_r) | (values > max_r)
    replacement = values.mean()
    data.loc[outlier_mask, col] = replacement

    return sns.boxplot(data[col])

In [None]:
treat_outlier_iqr(hour, 'hum') # Treating Outliers in Hum Column

In [None]:
treat_outlier_iqr(hour, 'windspeed') # Treating Outliers in Windspeed Column

In [None]:
hour.head()

In [None]:
plt.figure(figsize=(12,8))
# Correlate numeric columns only: the frame still holds the date string and category
# columns, which recent pandas versions refuse to coerce inside corr().
sns.heatmap(hour.select_dtypes(include='number').corr())

From above we can determine that

temp and atemp are highly Correlated

casual, Registered and Cnt are highly correlated as well

In [None]:
df = hour.copy()

In [None]:
hour.info()

In [None]:
hour.describe(include='all').T

## Checking Multi - Collinearity 

In [None]:
y = hour['cnt']
x = hour.drop(['cnt', 'dteday'], axis=1) # Remove the target (cnt) and the raw date column

from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant

# Add a constant (intercept) column to the feature frame
X = add_constant(x)

# Variance inflation factor for every column of X, labelled by column name

pd.Series([variance_inflation_factor(X.values, i) 
           for i in range(X.shape[1])], 
              index=X.columns)

# So temp and atemp are showing multicollinearity

In [None]:
def feature_eng(data, col):
    """Derive model features on ``data`` in place and drop the columns no longer needed.

    Steps:
      * ``temp_and_atemp`` — the average of ``temp`` and ``atemp``, replacing the
        two collinear columns with a single one.
      * ``day`` — day of the month taken from ``dteday``, stored as a category.
      * drops ``casual``, ``col``, ``dteday``, ``temp`` and ``atemp``.

    Parameters
    ----------
    data : DataFrame that is modified in place.
    col : name of one additional column to drop (the notebook passes ``'registered'``).

    Returns
    -------
    None
    """
    data['temp_and_atemp'] = (data['temp'] + data['atemp']) / 2

    # pd.to_datetime is used instead of astype('datetime64'): the unit-less dtype string
    # is rejected by recent pandas versions, and one conversion is enough.
    data['dteday'] = pd.to_datetime(data['dteday'])
    data['day'] = data['dteday'].dt.day.astype('category')

    data.drop(['casual', col, 'dteday', 'temp', 'atemp'], axis=1, inplace=True)

#Transforming Hour Data
feature_eng(hour, 'registered')

In [None]:
#Checking to see if their is still any multicollinearity

y = hour['cnt']
x = hour.drop(['cnt'], axis=1)

from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant

X = add_constant(x)

pd.Series([variance_inflation_factor(X.values, i) 
           for i in range(X.shape[1])], 
              index=X.columns)

In [None]:
hour.info()

In [None]:
x.head()

In [None]:
x.describe(include='all').T

In [None]:
hour.info()

In [None]:
# Getting all category columns to list
cat_col = hour.select_dtypes(include='category').columns.to_list()

#Printing all unique values in Category Columnns
for i in cat_col:
    print("Name of {} col".format(i))
    print("No. of NUnique", hour[i].nunique())
    print("Unique Values", hour[i].unique())
    print('*'*30)

In [None]:
def Categorical_transformation(data):
    '''
    Cast every ``category`` column of ``data`` to ``int64``, in place.

    Columns of any other dtype are left untouched. Always returns the
    string "Successful".
    '''
    for column_name in data.select_dtypes(include='category').columns:
        data[column_name] = data[column_name].astype('int64')
    return "Successful"

Categorical_transformation(hour)

In [None]:
hour.info()

In [None]:
y = hour['cnt']
x = hour.drop(['cnt'], axis=1)

In [None]:
#for Spliting Data and Hyperparameter Tuning 
from sklearn.model_selection import train_test_split, GridSearchCV

#Importing Machine Learning Model
from catboost import CatBoostRegressor
from sklearn import ensemble
from sklearn.linear_model import LinearRegression
from xgboost import XGBRFRegressor, XGBRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.svm import SVR
from lightgbm import LGBMRegressor

#statistical Tools
from sklearn import metrics

#To tranform data
from sklearn import preprocessing

In [None]:
# Spliting data into Training and Testing

x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.8, random_state=10)

In [None]:
accuracy = {}
rmse = {}
explained_variance = {}
max_error = {}
MAE = {}

def train_model(model, model_name):
    """Fit ``model`` on the training split, score it on the test split and record the scores.

    Reads the notebook-level ``x_train``, ``y_train``, ``x_test`` and ``y_test``.
    Each score is printed and stored under ``model_name`` in the matching
    notebook-level dict: ``accuracy`` (R2 as a percentage), ``rmse``,
    ``explained_variance``, ``max_error`` and ``MAE``.

    Parameters
    ----------
    model : estimator exposing ``fit(X, y)`` and ``predict(X)``.
    model_name : label used for printing and as the key in the score dicts.
    """
    def _report(label, value, store):
        # Print one score and remember it under this model's name
        print(label, value)
        store[model_name] = value

    print(model_name)
    model.fit(x_train, y_train)
    pred = model.predict(x_test)

    _report('R2_Score', metrics.r2_score(y_test, pred) * 100, accuracy)
    _report('RMSE : ', np.sqrt(metrics.mean_squared_error(y_test, pred)), rmse)
    _report('Explained_Variance : ', metrics.explained_variance_score(y_test, pred), explained_variance)
    _report('Max_Error : ', metrics.max_error(y_test, pred), max_error)
    _report("Mean Absolute Error", metrics.mean_absolute_error(y_test, pred), MAE)

In [None]:
# NOTE(review): n_estimators is set to x.shape[0], i.e. the number of rows in the full
# feature frame (not the training split) — confirm this many boosting rounds is intended.
xgb = XGBRegressor(n_jobs = 4, n_estimators = x.shape[0], max_depth = 5)

#Training Model
train_model(xgb, "Xtreme Gradient")

In [None]:
#Training Model
gbr = ensemble.GradientBoostingRegressor(learning_rate=0.01, n_estimators=1000, 
                                         max_depth=5, min_samples_split=8) # Gradient Boosting Model

train_model(gbr, "Gradient Boost")

In [None]:
#Training Model
cat = CatBoostRegressor(verbose=0, n_estimators = x_train.shape[0]) #Cat Booting Regression model

train_model(cat, "Cat Boost")

In [None]:
lgbr = LGBMRegressor(n_estimators = x_train.shape[0], learning_rate=0.01, max_depth=12, 
                     objective='tweedie', num_leaves=15, n_jobs = 4) #Light Gradient Boosting Model

#Training Model
train_model(lgbr, 'Light Gradient Boost')

In [None]:
#Training Model
# The `normalize` argument is not passed: newer scikit-learn releases removed it from
# LinearRegression, and for an unpenalised least-squares fit it does not change predictions.
lr = LinearRegression(n_jobs=4) #Linear Regression Model

train_model(lr, "Linear Regression")

In [None]:
#Training Model
rfc = ensemble.RandomForestRegressor(n_estimators=1000, bootstrap=True, min_samples_leaf=100, 
                                     n_jobs=-1, min_samples_split=8, max_depth=6) #Random Forest Bagging Model

train_model(rfc, "Random Forest")

In [None]:
#Training Model
ada = ensemble.AdaBoostRegressor(n_estimators=1000, learning_rate=0.01) #Adaptive Boosting Bodel

train_model(ada, "Ada Boost")

In [None]:
#Training Model
dtr = DecisionTreeRegressor(max_depth=15, min_samples_leaf=100) # Decision tree model

train_model(dtr, "Decision Tree")

In [None]:
#Training Model
mlp = MLPRegressor(hidden_layer_sizes=(200,2), learning_rate='adaptive', max_iter=400) #Multi-Layer Percepton Regression model

train_model(mlp, "Multi-layer Perceptron")

In [None]:
#Training Model
knn = KNeighborsRegressor(n_neighbors=10, n_jobs=4, leaf_size=50) # K Nearest Neighbors Regressor model

train_model(knn, "K Nearest Neighbors")

In [None]:
# Training model using Deep Learning Keras Library

import tensorflow as tf
from tensorflow import keras
from keras.models import Sequential
from keras import layers
from keras.layers import Dense

model = keras.Sequential([
    layers.Dense(256, activation = tf.keras.layers.ELU(), input_shape=[x_train.shape[1]]), #Input Layer
    layers.Dense(256, activation=tf.keras.layers.ELU()), #Hidden Layer
    layers.Dense(16, activation = 'relu'), #Hidden Layer
    layers.Dense(4, activation = 'relu'), #Hidden Layer
    layers.Dense(1) #Output Layer
   ])

# Compile the network :
model.compile(loss = tf.keras.losses.MeanSquaredError(), 
                 optimizer = 'adam', metrics = tf.keras.metrics.RootMeanSquaredError())
model.summary()

In [None]:
from sklearn.preprocessing import RobustScaler
scaler = RobustScaler()
# NOTE(review): `z` is not referenced by any later cell visible in this notebook (the
# network is fitted on the unscaled features), and the scaler is fitted on the full
# feature frame rather than the training split — confirm whether scaling was intended.
z = scaler.fit_transform(x)

In [None]:
#Creating a checkpoint callback so the best weights can be retrieved later
from keras.callbacks import ModelCheckpoint

checkpoint_name = 'Weights-{epoch:03d}--{val_loss:.5f}.hdf5' 
checkpoint = ModelCheckpoint(checkpoint_name, monitor='val_loss', verbose = 1, save_best_only = True, mode ='auto')
callbacks_list = [checkpoint]

# Fit on the training split only. Fitting on the full x, y would let the network see the
# rows that are later used as x_test / y_test, which inflates the reported test scores.
# validation_split still holds out 20% of the training rows for val_loss monitoring.
history = model.fit(x_train, y_train, epochs=200, batch_size=32, validation_split = 0.2, callbacks=callbacks_list)

In [None]:
#Creating Dataframe to check history
history_df = pd.DataFrame(history.history)

history_df.loc[:, ['loss', 'val_loss']].plot(title="Loss")
history_df.loc[:, ['root_mean_squared_error', 'val_root_mean_squared_error']].plot(title="Root Mean Square Error")

In [None]:
# Load the weights file of the best model.
# The checkpoint name embeds the epoch and val_loss of a particular run, so a hard-coded
# filename only exists for that one run. Instead, pick the saved checkpoint with the lowest
# val_loss encoded in its name ('Weights-<epoch>--<val_loss>.hdf5').
# Note: checkpoints left in the working directory by earlier runs also match the pattern.
import glob
import os

_checkpoints = glob.glob('./Weights-*--*.hdf5')
if not _checkpoints:
    raise FileNotFoundError("No 'Weights-*--*.hdf5' checkpoint found; run the training cell first.")

wights_file = min(
    _checkpoints,
    key=lambda path: float(os.path.basename(path)[:-len('.hdf5')].split('--')[-1]),
)

model.load_weights(wights_file) # load it
model.compile(loss = tf.keras.losses.MeanSquaredError(), 
                 optimizer = 'adam', metrics = tf.keras.metrics.RootMeanSquaredError())

In [None]:
class _PrefitModel:
    """Adapter that lets train_model score an already-trained network without re-fitting it."""

    def __init__(self, fitted_model):
        self._fitted_model = fitted_model

    def fit(self, *args, **kwargs):
        # Deliberate no-op: train_model always calls fit(), which would otherwise train
        # for another epoch on top of the restored best weights before scoring.
        return self

    def predict(self, features):
        # Flatten the (n, 1) network output to 1-D, like the other regressors return
        return np.ravel(self._fitted_model.predict(features))


train_model(_PrefitModel(model), "NN Model")

# The Light Gradient Boosting model gives the best result (lowest mean squared error)