In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))


# Any results you write to the current directory are saved as output.

## **Importing the libraries**

In [None]:
import numpy as np
import pandas as pd
import seaborn as sn
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

## **Loading the dataset**

In [None]:
df_train = pd.read_csv('/kaggle/input/bike-sharing-demand/train.csv')
df_test = pd.read_csv('/kaggle/input/bike-sharing-demand/test.csv')

In [None]:
df_train.describe()

## **Checking for the null values**

In [None]:
# Per-column count of missing values, first for the training frame and then
# for the test frame.
for frame in (df_train, df_test):
    print(frame.isnull().sum(axis = 0))

### Luckily we do not have to deal with null values

## **Lets first explore the output value (Count)**

In [None]:
# Histogram (left panel) and box plot (right panel) of the rental count.
fig, ax = plt.subplots(1, 2, figsize=(20, 5))
hist_axis, box_axis = ax
df_train['count'].plot(kind = 'hist', bins=100, ax = hist_axis)
df_train['count'].plot(kind = 'box', ax = box_axis)

### As we can see, the count contains outliers: hours in which far more bikes than usual were rented. We remove these outliers because they can distort the models' predictions.

In [None]:
# Keep only the rows whose count lies strictly within three standard
# deviations of the mean count, then renumber the index from zero.
print('Before removing the outliers ', df_train.shape)
count_mean = df_train['count'].mean()
count_std = df_train['count'].std()
within_three_sigma = (df_train['count'] - count_mean).abs() < 3 * count_std
df_train = df_train[within_three_sigma]
print('After removing the outliers ', df_train.shape)
df_train = df_train.reset_index(drop = True)

In [None]:
df_train.head()

### Lets visualize the distribution of the output variable

In [None]:
# One distribution plot per target column, side by side.
fig, ax = plt.subplots(1, 3, figsize=(20, 10))
count_axis, casual_axis, registered_axis = ax
sns.distplot(df_train["count"], ax = count_axis)
sns.distplot(df_train["casual"], ax = casual_axis)
sns.distplot(df_train["registered"], ax = registered_axis)

### As most of the machine learning models work best when the output variable is normally distributed, we will apply the log transformation to "Count" and "Registered" to make it more normally distributed

In [None]:
# Replace "count" and "registered" by log(x + 1); "casual" is left as is.
# NOTE: this overwrites the columns, so running the cell twice applies the
# transform twice.
for target in ('count', 'registered'):
    df_train[target] = np.log(df_train[target] + 1)

# Re-draw the three distributions after the transform.
fig, ax = plt.subplots(1, 3, figsize=(20, 10))
count_axis, casual_axis, registered_axis = ax
sns.distplot(df_train["count"], ax = count_axis)
sns.distplot(df_train["casual"], ax = casual_axis)
sns.distplot(df_train["registered"], ax = registered_axis)


### Now the distribution is closer to normal. Note that we added 1 before taking the log because np.log(0) evaluates to negative infinity

### Extract the month, hour, year and day of the week from the datetime column. The day of the month is not used as a feature because the training set only contains days 1 to 19 of each month, while the test set covers the 20th onwards.

In [None]:
def add_month(dataframe):
    """Return the month of every entry in the ``datetime`` column.

    The frame itself is not modified; the caller decides where to store the
    returned index.
    """
    timestamps = pd.DatetimeIndex(dataframe['datetime'])
    return timestamps.month

def add_time(dataframe):
    """Return the hour of every entry in the ``datetime`` column.

    The frame itself is not modified; the caller decides where to store the
    returned index.
    """
    timestamps = pd.DatetimeIndex(dataframe['datetime'])
    return timestamps.hour

def add_year(dataframe):
    """Return the year of every entry in the ``datetime`` column.

    The frame itself is not modified; the caller decides where to store the
    returned index.
    """
    timestamps = pd.DatetimeIndex(dataframe['datetime'])
    return timestamps.year

def add_day(dataframe):
    """Return the day of the week of every entry in the ``datetime`` column.

    Uses ``DatetimeIndex.dayofweek`` (Monday is 0, Sunday is 6). The frame
    itself is not modified; the caller decides where to store the result.
    """
    timestamps = pd.DatetimeIndex(dataframe['datetime'])
    return timestamps.dayofweek

In [None]:
# Derive the calendar features from the datetime column, one new column each.
calendar_features = (
    ('month', add_month),
    ('time', add_time),
    ('year', add_year),
    ('day', add_day),
)
for feature_name, extract in calendar_features:
    df_train[feature_name] = extract(df_train)
df_train.head()

### To see how the dependent variable (count) relates to each independent variable, we calculate the correlation between them

In [None]:
# Correlate numeric columns only. The `datetime` column is read from the CSV
# as plain strings; recent pandas versions raise on non-numeric columns in
# corr() instead of silently skipping them, so select the numeric ones first.
corr = df_train.select_dtypes(include = 'number').corr()
corr

### We can draw the following observations from it:
#### 1. The correlation between temp and atemp is very high so we can discard any of the variable
#### 2. The correlation between count and time is highest and the correlation of count with year, month, temp,humidity is also significant

## Lets explore how the count is varying with different features

### 1. Time 

In [None]:
df_train.groupby('time')['count'].mean().plot(kind = 'bar')

### We can see that the maximum is around 7-9 am and 5-6 pm 

### 2. Month and Season

In [None]:
# Mean (log-transformed) count per month (left) and per season (right).
fig, ax = plt.subplots(1, 2)
fig.set_size_inches(20,5)
# `kind` has to be passed by keyword: Series.plot() rejects positional
# arguments on current pandas versions.
df_train.groupby('month')['count'].mean().plot(kind = 'bar', ax = ax[0])
df_train.groupby('season')['count'].mean().plot(kind = 'bar', ax = ax[1])

### We can observe from the above graphs that the number of rented bikes are less in season 1 as compared to others

### 3. Temperature and Humidity

In [None]:
# Mean (log-transformed) count per temperature value (left) and per humidity
# value (right).
fig, ax = plt.subplots(1, 2)
fig.set_size_inches(20,5)
# `kind` has to be passed by keyword: Series.plot() rejects positional
# arguments on current pandas versions.
df_train.groupby('temp')['count'].mean().plot(kind = 'bar', ax = ax[0])
df_train.groupby('humidity')['count'].mean().plot(kind = 'bar', ax = ax[1])

### We can see that there is a positive relation of temperature with count and a negative relation of humidity with the count, this verifies the correlation values that we got above 

### 4. Holiday and Workingday

In [None]:
# Mean (log-transformed) count for holiday vs. non-holiday (left) and for
# working day vs. non-working day (right), followed by the numbers behind
# the holiday panel.
fig, ax = plt.subplots(1, 2)
fig.set_size_inches(20,5)
# `kind` has to be passed by keyword: Series.plot() rejects positional
# arguments on current pandas versions.
df_train.groupby('holiday')['count'].mean().plot(kind = 'bar', ax = ax[0])
df_train.groupby('workingday')['count'].mean().plot(kind = 'bar', ax = ax[1])
print("mean of count according to holidays ", df_train.groupby('holiday')['count'].mean())
print("No of holiday = 1 and holdays = 0 ", df_train.groupby('holiday')['count'].count())


Three reasons to discard holiday and working day: 
1. Correlation value is low
2. Examples where holiday = 1 is less than 3 percent
3. The mean is almost similar
But we will make different models one with holiday and workingday and one without these features

### 5. Windspeed

In [None]:
df_train[df_train['windspeed'] == 0].shape

### As a lot of values are 0 in windspeed we can consider the following scenarios:
1.  It can actually be 0 at these points.
2.  It is too low to be measured, for example varying from 0 to 5.
3.  All zeros or part of them are nothing but NAs.


## So we will be estimating the missing values using the random forest

In [None]:
df_train.groupby('windspeed')['count'].count().plot(kind='bar')
df_train.groupby('windspeed')['count'].count()

In [None]:
# Split the training rows by whether the recorded windspeed is exactly zero,
# and preview both parts.
zero_wind = df_train['windspeed'] == 0
df_train_windspeed_0 = df_train[zero_wind]
df_train_windspeed_not_0 = df_train[~zero_wind]
for subset in (df_train_windspeed_0, df_train_windspeed_not_0):
    print(subset.head())

In [None]:
print(df_train_windspeed_0.shape)
print(df_train_windspeed_not_0.shape)

In [None]:
columns_for_windspeed = ['holiday', 'season', 'workingday', 'month', 'time', 'year', 'day', 'temp', 'humidity']

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Learn windspeed from the rows where it is non-zero. The seed is fixed so the
# imputed values are the same on every run.
rf_windspeed = RandomForestRegressor(random_state = 42).fit(
    df_train_windspeed_not_0[columns_for_windspeed],
    df_train_windspeed_not_0['windspeed'],
)

# df_train_windspeed_0 is a slice of df_train; take an explicit copy before
# writing into it so the assignment is unambiguous (no SettingWithCopyWarning).
df_train_windspeed_0 = df_train_windspeed_0.copy()
df_train_windspeed_0['windspeed'] = rf_windspeed.predict(df_train_windspeed_0[columns_for_windspeed])

# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
# Row order is unchanged: imputed rows first, then the rest. The former
# `sort = 'datetime'` argument never sorted rows (`sort` only affects column
# alignment), so it is dropped.
df_train = pd.concat([df_train_windspeed_0, df_train_windspeed_not_0])

In [None]:
print(df_train.shape)
df_train.head()

In [None]:
print(df_train[df_train['windspeed'] == 0])
df_train.groupby('windspeed')['count'].count()

### As we can see now that the windspeed is not 0 for any example

### As we can not use the categorical values as input for models like linear regression we will convert it to one hot vector using pd.get_dummies

In [None]:
categorical_columns = ['holiday', 'season', 'workingday', 'weather', 'month', 'time', 'year', 'day']
# For each categorical column, add its indicator columns (named
# "<column>_<value>") next to the existing ones; the source column is kept.
for column_name in categorical_columns:
    indicator_columns = pd.get_dummies(df_train[column_name], prefix = column_name)
    df_train = df_train.join(indicator_columns)

df_train.head()

### The function below is equivalent to pd.get_dummies

In [None]:
'''
def one_hot_encode(dataframe, column):
    for i in dataframe.groupby(column).count().index:
        s = column + "_" + str(i)
        a = []
        for element in dataframe[column]:
            if element == i:
                a.append(1)
            else:
                a.append(0)
        dataframe[s] = a
    return dataframe
'''

In [None]:
df_train.columns

### Some algorithms performed better when the input data is normalized so we will normalize temp, humidity and windspeed

In [None]:
def normalize(dataframe, columns):
    """Min-max scale the given columns to the range [0, 1].

    Each listed column is rescaled with its own minimum and maximum as found
    in ``dataframe``. A column whose values are all equal is mapped to 0
    instead of producing NaN from a 0/0 division.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame holding the columns to scale. It is not modified.
    columns : iterable
        Labels of the columns to scale.

    Returns
    -------
    pandas.DataFrame
        A copy of ``dataframe`` with the listed columns rescaled.
    """
    # Work on a copy so the caller's frame is never changed behind its back.
    scaled = dataframe.copy()
    for column in columns:
        lowest = scaled[column].min()
        value_range = scaled[column].max() - lowest
        shifted = scaled[column] - lowest
        if value_range == 0:
            # Constant column: every value equals the minimum, so the shifted
            # values are already 0; skip the division to avoid 0/0 -> NaN.
            scaled[column] = shifted.astype(float)
        else:
            scaled[column] = shifted / value_range
    return scaled

In [None]:
df_train = normalize(df_train, columns=['temp', 'humidity', 'windspeed'])
df_train.head()

### As we have taken all the information from the datetime column we can remove that

In [None]:
def remove_columns(dataframe, columns):
    """Return a new frame without the given columns; the input is unchanged."""
    return dataframe.drop(columns = columns)

In [None]:
df_train = remove_columns(df_train, ['datetime', 'atemp']) 
print(df_train.columns)
df_train.head()

### Split the dataset into input and output

In [None]:
df_train_y = df_train[['count', 'casual', 'registered']]
df_train_x = remove_columns(df_train, ['casual', 'registered', 'count'])

### Split the dataset into training and testing

In [None]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(df_train_x, df_train_y, test_size=0.15, random_state=42)

### We will make 2 models: the first one directly estimates the variable "Count"; the second one estimates "Casual" and "Registered" separately, and "Count" is then their sum

In [None]:
# Pull the three target columns out of each split as separate series.
target_names = ('casual', 'registered', 'count')
y_train_casual, y_train_registered, y_train_total = (y_train[name] for name in target_names)
y_test_casual, y_test_registered, y_test_total = (y_test[name] for name in target_names)

### Importing the Machine Learning Libraries

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_log_error

### Using Linear Regression

In [None]:
all_predictions = []

# The one-hot columns already encode these features, so the raw categorical
# columns are dropped for the linear model.
categorical_data = ['holiday', 'season', 'workingday', 'month', 'time', 'year', 'day']
lr_train_x = remove_columns(x_train, categorical_data)
lr_test_x = remove_columns(x_test, categorical_data)

lr = LinearRegression().fit(lr_train_x, y_train_total)

# The model is fitted on log(count + 1), so predictions are mapped back with
# exp(.) - 1. This must be applied to BOTH splits; the train predictions
# previously missed the "- 1", which skewed the reported train error.
lr_predictions_on_test_data = np.exp(lr.predict(lr_test_x)) - 1
lr_predictions_on_train_data = np.exp(lr.predict(lr_train_x)) - 1

all_predictions.append(lr_predictions_on_train_data)
all_predictions.append(lr_predictions_on_test_data)

# Print the RMSLE on the train split, then on the test split.
for prediction, y_log_total in zip(all_predictions, (y_train_total, y_test_total)):
    # mean_squared_log_error does not accept negative values: clip them to 0.
    pre = np.clip(prediction, 0, None)
    print(np.sqrt(mean_squared_log_error( np.exp(y_log_total)-1, pre )))

### For the Random Forest we do not need the one hot encoding for the categorical variables

In [None]:
all_predictions = []

# Tree models can split on the raw categorical codes, so the one-hot columns
# are not used here.
training_columns = ['holiday', 'season', 'workingday', 'month', 'time', 'year', 'day', 'temp', 'humidity', 'windspeed']
train_x = x_train[training_columns]
test_x = x_test[training_columns]

# Fixed seed so the fitted forest and the scores below are reproducible.
rf = RandomForestRegressor(n_estimators=100, max_depth = 10, min_samples_split=5, random_state=42).fit(train_x, y_train_total)

# The model is fitted on log(count + 1), so predictions are mapped back with
# exp(.) - 1. This must be applied to BOTH splits; the train predictions
# previously missed the "- 1", which skewed the reported train error.
predictions_on_test_data = np.exp(rf.predict(test_x)) - 1
predictions_on_train_data = np.exp(rf.predict(train_x)) - 1

all_predictions.append(predictions_on_train_data)
all_predictions.append(predictions_on_test_data)

# Print the RMSLE on the train split, then on the test split.
for prediction, y_log_total in zip(all_predictions, (y_train_total, y_test_total)):
    # mean_squared_log_error does not accept negative values: clip them to 0.
    pre = np.clip(prediction, 0, None)
    print(np.sqrt(mean_squared_log_error( np.exp(y_log_total)-1, pre )))

### Random Forest with different model for "Casual" and "Registered"

In [None]:
all_predictions = []

training_columns = ['holiday', 'season', 'workingday', 'month', 'time', 'year', 'day', 'temp', 'humidity', 'windspeed']
train_x = x_train[training_columns]
test_x = x_test[training_columns]

# "casual" was left on its original scale, so this model's output is used
# directly.
rf_casual = RandomForestRegressor(n_estimators=300, max_depth = 10, min_samples_split=8).fit(train_x, y_train_casual)
# "registered" was replaced by log(registered + 1), so this model's output is
# mapped back with exp(.) - 1 below.
rf_registered = RandomForestRegressor().fit(train_x, y_train_registered)

# Total count = casual + registered, for the held-out split ...
predictions_casual = rf_casual.predict(test_x)
predictions_registered = np.exp(rf_registered.predict(test_x))-1
predictions = predictions_casual + predictions_registered

# ... and for the training split.
predictions_casual_train = rf_casual.predict(train_x)
predictions_registered_train = np.exp(rf_registered.predict(train_x))-1
predictions_train = predictions_casual_train + predictions_registered_train

all_predictions.extend([predictions_train, predictions])

# Print the RMSLE on the train split, then on the test split; negative
# predictions are replaced by 0 first.
for prediction, y_log_total in zip(all_predictions, (y_train_total, y_test_total)):
    pre = [0 if p < 0 else p for p in prediction]
    print(np.sqrt(mean_squared_log_error( np.exp(y_log_total)-1, pre )))

## Processing of the test data
### 1. Addition of the day, month, time, year
### 2. Removing the zero in the windspeed
### 3. Sorting the data according to datetime
### 4. Adding the one hot vector in case you want to predict the count using Linear Regression
### 5. Normalisation
### 6. Prediction
### 7. Storing

In [None]:
df_test.head()

In [None]:
# Derive the same calendar features for the test frame, one new column each.
calendar_features = (
    ('month', add_month),
    ('time', add_time),
    ('year', add_year),
    ('day', add_day),
)
for feature_name, extract in calendar_features:
    df_test[feature_name] = extract(df_test)
df_test.head()

In [None]:
# Take an explicit copy of the zero-windspeed rows: assigning into a plain
# slice of df_test is ambiguous and triggers SettingWithCopyWarning.
df_test_windspeed_0 = df_test[df_test['windspeed'] == 0].copy()
df_test_windspeed_not_0 = df_test[df_test['windspeed'] != 0]
columns_for_windspeed = ['holiday', 'season', 'workingday', 'month', 'time', 'year', 'day', 'temp', 'humidity']

# Reuse the windspeed model fitted on the training data.
df_test_windspeed_0['windspeed'] = rf_windspeed.predict(df_test_windspeed_0[columns_for_windspeed])

# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
# Row order is unchanged: imputed rows first, then the rest. The former
# `sort = 'datetime'` argument never sorted rows (`sort` only affects column
# alignment), so it is dropped.
df_test = pd.concat([df_test_windspeed_0, df_test_windspeed_not_0])

In [None]:
df_test.head()

In [None]:
df_test = df_test.sort_values(by='datetime')
df_test.head()

In [None]:
categorical_columns = ['holiday', 'season', 'workingday', 'weather', 'month', 'time', 'year', 'day']
# For each categorical column, add its indicator columns (named
# "<column>_<value>") next to the existing ones; the source column is kept.
for column_name in categorical_columns:
    indicator_columns = pd.get_dummies(df_test[column_name], prefix = column_name)
    df_test = df_test.join(indicator_columns)

print(df_test.head())
print(df_test.columns)

In [None]:
# Scale temp, humidity and windspeed of the test frame to [0, 1].
# NOTE(review): normalize() uses the min/max of the frame it is given, so the
# test features are scaled with the test set's own range, while the models
# were fitted on features scaled with the training set's range. The two
# scalings generally differ — confirm whether the training min/max should be
# reused here.
df_test = normalize(df_test, columns=['temp', 'humidity', 'windspeed'])
df_test.head()

In [None]:
df_datetime = df_test['datetime']
df_test = remove_columns(df_test, ['datetime', 'atemp']) 
print(df_test.columns)

In [None]:
df_test.columns.shape == df_train_x.columns.shape

In [None]:
training_columns = ['holiday', 'season', 'workingday', 'month', 'time', 'year', 'day', 'temp', 'humidity', 'windspeed']
df_test_final = df_test[training_columns]
predictions = np.exp(rf.predict(df_test_final))-1

### If you want to predict using 2 models

In [None]:
'''
predictions_casual = rf_casual.predict(df_test_final)

predictions_registered = np.exp(rf_registered.predict(df_test_final))-1

predictions = predictions_casual + predictions_registered

print(predictions[:5])
'''


In [None]:
data = {'datetime': df_datetime, 'count': predictions}
df = pd.DataFrame(data)
df.head()

In [None]:
df.to_csv('submission.csv', index = False)

## ** While training for submitting use the entire dataset to train **