In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subfolders, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# Any results you write to the current directory are saved as output.

# Importing important libraries

In [None]:
import matplotlib.pyplot as plt
import seaborn as sb
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_log_error
from sklearn.model_selection import train_test_split

# Loading the training dataset

In [None]:
# Load the competition's training data into a DataFrame.
train_csv_path = "/kaggle/input/bike-sharing-demand/train.csv"
file = pd.read_csv(train_csv_path)

In [None]:
# Summary statistics of the numeric columns, as a first look at their ranges.
file.describe()

# Checking for null values

In [None]:
# Number of missing values in each column.
file.isna().sum()

No data cleaning is required since no Null values are found!

In [None]:
# Column names available in the training data.
file.columns

# Data visualization:

## Heatmap of all the continuous values in the file.

In [None]:
# Pairwise correlations between the continuous columns and the rental counts.
continuous_cols = ['temp', 'atemp', 'humidity', 'windspeed', 'casual', 'registered', 'count']
corr = file[continuous_cols].corr()

In [None]:
f, axes = plt.subplots(1, 1, figsize=(7, 7))
# Correlations lie in [-1, 1], so the colour scale is centred on 0 (centring
# on 2 pushed every cell into one end of the colormap). `linewidths` is the
# parameter seaborn documents for the gap between cells.
sb.heatmap(corr, square=True, annot=True, linewidths=.5, center=0, ax=axes)

### The conclusions drawn are:
1. We can see that temp and atemp have a very strong positive correlation therefore we can use only temp as a variable without any loss of information. 

2. We can infer from the correlation matrix that windspeed has almost no correlation with casual, registered or count (the values we wish to predict), so we can remove it.

## Data visualization for non-continuous variables in the data

First we have to separate the individual date and time for each data point into hour,day,month and year.

In [None]:
# Parse the timestamp column once and derive every calendar feature from the
# same index, instead of re-parsing it for each new column.
timestamps = pd.DatetimeIndex(file['datetime'])
file['Date'] = timestamps.date
file['Hour'] = timestamps.hour
file['Day'] = timestamps.day
file['Month'] = timestamps.month
file['Year'] = timestamps.year

In [None]:
# Registered rentals over time, with one line per hour of the day.
f, axes = plt.subplots(nrows=1, ncols=1, figsize=(35, 11))
hue_options = dict(hue='Hour', legend='full', palette='bright')
sb.lineplot(data=file, x='Date', y='registered', ax=axes, **hue_options)

**The sudden periodic changes between the different regions are due to missing data. These gaps are the periods for which we have to predict the result.**

## 1. Season

In [None]:
f, axes = plt.subplots(1, 3, figsize=(17, 7))
sb.despine(left=True)
x = 'season'

# One panel per rental column, each grouped by season.
for panel, target in zip(axes, ('casual', 'registered', 'count')):
    sb.barplot(x=x, y=target, data=file, saturation=1, ax=panel)

## 2. Holiday

In [None]:
f, axes = plt.subplots(1, 3, figsize=(17, 7))
sb.despine(left=True)
x = 'holiday'

# One panel per rental column, each grouped by the holiday flag.
for panel, target in zip(axes, ('casual', 'registered', 'count')):
    sb.barplot(x=x, y=target, data=file, saturation=1, ax=panel)

## 3. Working day

In [None]:
f, axes = plt.subplots(1, 3, figsize=(17, 7))
sb.despine(left=True)
x = 'workingday'

# One panel per rental column, each grouped by the working-day flag.
for panel, target in zip(axes, ('casual', 'registered', 'count')):
    sb.barplot(x=x, y=target, data=file, saturation=1, ax=panel)

Combining the graphs of casual and registered into one may make it seem that holiday and workingday have no effect on count, but we can clearly see that a holiday increases the casual amount by up to 40%, and the reverse trend is observed for working days. It is therefore reasonable to build two different models, one for casual and another for registered.

Therefore, what I will attempt to do is build two separate models for casual and registered, train them separately, and then add their results to get the count.

## 4. Weather

In [None]:
f, axes = plt.subplots(1, 3, figsize=(17, 7))
sb.despine(left=True)
x = 'weather'

# One panel per rental column, each grouped by weather code.
for panel, target in zip(axes, ('casual', 'registered', 'count')):
    sb.barplot(x=x, y=target, data=file, saturation=1, ax=panel)

## 5. Date and Time

### 5.a. Hour

In [None]:
f, axes = plt.subplots(1, 3, figsize=(19, 7))
sb.despine(left=True)
x = 'Hour'

# One panel per rental column, each grouped by hour of the day.
for panel, target in zip(axes, ('casual', 'registered', 'count')):
    sb.barplot(x=x, y=target, data=file, saturation=1, ax=panel)

We can see that the final graph has two prominent peaks:
1. At 8 a.m.
2. At 5 p.m.

The trend for casual users, in contrast, is mostly the same throughout the day.

From this we can conclude that registered users are mostly people commuting to their jobs, which explains the peaks at the start and end of office hours. Clearly these people have a more definite and predictable schedule and are therefore more likely to be registered.

### 5.b. Day

In [None]:
f, axes = plt.subplots(1, 3, figsize=(19, 7))
sb.despine(left=True)
x = 'Day'

# One panel per rental column, each grouped by day of the month.
for panel, target in zip(axes, ('casual', 'registered', 'count')):
    sb.barplot(x=x, y=target, data=file, saturation=1, ax=panel)

**From the above graphs we can conclude that the feature day has hardly any influence over the features registered and count.**

### 5.c. Month

In [None]:
f, axes = plt.subplots(1, 3, figsize=(19, 7))
sb.despine(left=True)
x = 'Month'

# Months are plotted by number (1-12), one panel per rental column.
drawn_panels = [
    sb.barplot(x=x, y=target, data=file, saturation=1, ax=panel)
    for panel, target in zip(axes, ('casual', 'registered', 'count'))
]
# `plot` keeps a handle on the first (casual) panel.
plot = drawn_panels[0]

### 5.d. Year 

In [None]:
f, axes = plt.subplots(1, 3, figsize=(19, 7))
sb.despine(left=True)
x = 'Year'

# One panel per rental column, each grouped by year.
for panel, target in zip(axes, ('casual', 'registered', 'count')):
    sb.barplot(x=x, y=target, data=file, saturation=1, ax=panel)

In [None]:
# Summary statistics again; the numeric Hour/Day/Month/Year columns derived
# from `datetime` are now part of the frame.
file.describe()

In [None]:
# Column names after the date/time columns were added.
file.columns

# Adding indicator columns for each categorical column and removing unnecessary ones

## 1. Season

In [None]:
# One-hot encode `season`: add an integer 0/1 column s<k> for every season
# code k present in the data, in ascending order. A vectorised comparison
# replaces the per-row Python loop.
for level in np.sort(file['season'].dropna().unique()):
    file['s' + str(level)] = (file['season'] == level).astype(int)
file.sample(5)

## 2. Weather 

In [None]:
# One-hot encode `weather`: add an integer 0/1 column w<k> for every weather
# code k present in the data, in ascending order. A vectorised comparison
# replaces the per-row Python loop.
for level in np.sort(file['weather'].dropna().unique()):
    file['w' + str(level)] = (file['weather'] == level).astype(int)
file.sample(5)

## 3. Hour

In [None]:
# One-hot encode `Hour`: add an integer 0/1 column Hour<h> for every hour h
# present in the data, in ascending order. A vectorised comparison replaces
# the per-row Python loop.
for level in np.sort(file['Hour'].dropna().unique()):
    file['Hour' + str(level)] = (file['Hour'] == level).astype(int)
file.sample(5)

## 4.Month

In [None]:
# One-hot encode `Month`: add an integer 0/1 column Month<m> for every month
# m present in the data, in ascending order. A vectorised comparison replaces
# the per-row Python loop.
for level in np.sort(file['Month'].dropna().unique()):
    file['Month' + str(level)] = (file['Month'] == level).astype(int)
file.sample(5)

In [None]:
# Column names after the indicator columns were added.
file.columns

## Removing unnecessary columns

In [None]:
# Keep only the model inputs (indicator columns, Year, the holiday and
# workingday flags, temp and humidity) followed by the three rental columns.
hour_cols = ['Hour' + str(h) for h in range(24)]
month_cols = ['Month' + str(m) for m in range(1, 13)]
season_cols = ['s1', 's2', 's3', 's4']
weather_cols = ['w1', 'w2', 'w3', 'w4']
feed = file[
    hour_cols
    + month_cols
    + ['Year']
    + season_cols
    + ['holiday', 'workingday']
    + weather_cols
    + ['temp', 'humidity', 'casual', 'registered', 'count']
]

In [None]:
# Summary statistics of the selected columns.
feed.describe()

In [None]:
# Column names (and order) of the selected frame.
feed.columns

# Preparing training and testing sets

### 1. Training set

In [None]:
# Feature matrix: everything in `feed` except the three rental-count columns.
rental_columns = ['casual', 'registered', 'count']
df_train_x = feed.drop(columns=rental_columns)
df_train_x.describe()

### 2. Training target

In [None]:
# Target vector: the total rental count.
df_train_y = feed['count']
# Call describe() so the cell shows the statistics rather than the repr of
# the bound method.
df_train_y.describe()

# Machine learning model

### Splitting data into train and test sets

In [None]:
# Hold out 15% of the rows for evaluation; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    df_train_x,
    df_train_y,
    test_size=0.15,
    random_state=42,
)

### Fitting a linear regression on the dataset

In [None]:
# Fit a linear regression on the training split and predict the held-out rows.
reg = LinearRegression().fit(x_train, y_train)
predictions = reg.predict(x_test)

# Clamp the predicted values themselves to [1, 800]. The previous loop wrote
# the loop index into each slot, discarding the model's output. Keeping the
# values >= 1 also keeps them valid for the log-based metric below.
predictions = np.clip(predictions, 1, 800)

# Root mean squared logarithmic error on the held-out rows.
print(np.sqrt(mean_squared_log_error(y_test, predictions)))

Since we know that the output is never less than 1, we replace every prediction below 1 with 1 before calculating the error.

In [None]:
# Keep the returned Axes under its own name: binding it to `plt` would shadow
# the matplotlib.pyplot module and break every later (or re-run) plt.* call.
resid_ax = sb.residplot(x=predictions, y=y_test, lowess=True, color='r')

### Retraining the linear regression over the whole dataset for submission.

In [None]:
# Refit on every training row; fit() returns the fitted estimator itself.
full_data_model = LinearRegression()
reg = full_data_model.fit(df_train_x, df_train_y)

### Reading the test file

In [None]:
# Load the competition's test data into a DataFrame.
test_csv_path = '/kaggle/input/bike-sharing-demand/test.csv'
test_file = pd.read_csv(test_csv_path)

In [None]:
# Work on a copy so the feature columns added below do not leak into the raw
# `test_file` frame (a plain assignment would only alias it).
test = test_file.copy()
test.describe()

## Processing of the test file

In [None]:
def add_indicator_columns(frame, column, prefix):
    """Add one integer 0/1 column per distinct value of ``frame[column]``.

    Each new column is named ``prefix + str(value)`` and is written onto
    ``frame`` in place, in ascending order of value.
    """
    for level in np.sort(frame[column].dropna().unique()):
        frame[prefix + str(level)] = (frame[column] == level).astype(int)


# Parse the timestamp column once and derive the calendar features from it.
test_timestamps = pd.DatetimeIndex(test['datetime'])
test['mth'] = test_timestamps.month
test['yr'] = test_timestamps.year
test['dy'] = test_timestamps.day
test['hr'] = test_timestamps.hour

# Indicator columns: s<k> per season, w<k> per weather code, hr<h> per hour,
# m<k> per month.
add_indicator_columns(test, 'season', 's')
add_indicator_columns(test, 'weather', 'w')
add_indicator_columns(test, 'hr', 'hr')
add_indicator_columns(test, 'mth', 'm')
test.sample(10)

In [None]:
# Keep only the model inputs, in the same order as the training features:
# hours, months, year, seasons, holiday, workingday, weather, temp, humidity.
test_feature_columns = (
    ['hr' + str(h) for h in range(24)]
    + ['m' + str(m) for m in range(1, 13)]
    + ['yr', 's1', 's2', 's3', 's4', 'holiday', 'workingday',
       'w1', 'w2', 'w3', 'w4', 'temp', 'humidity']
)
test = test[test_feature_columns]
# Call describe() so the cell shows the statistics rather than the repr of
# the bound method.
test.describe()

## Predicting the output over test set

In [None]:
# The model was fitted on columns named Hour<h>, Month<m> and Year, while the
# test frame calls them hr<h>, m<m> and yr. Recent scikit-learn versions
# reject a frame whose column names differ from those seen in fit, so map the
# test names onto the training names before predicting. Column order is
# unchanged, and names already in training form are left alone.
train_feature_names = {'yr': 'Year'}
train_feature_names.update({'hr' + str(h): 'Hour' + str(h) for h in range(24)})
train_feature_names.update({'m' + str(m): 'Month' + str(m) for m in range(1, 13)})

final_predictions = pd.DataFrame()
final_predictions['cout'] = reg.predict(test.rename(columns=train_feature_names))

In [None]:
# Raise every prediction below 1 up to 1; values of 1 or more are kept as is.
final_predictions['count'] = final_predictions['cout'].clip(lower=1)

**Since we know that the output is never less than 1, we replace every prediction below 1 with 1.**

In [None]:
# Call describe() so the cell shows the statistics rather than the repr of
# the bound method.
final_predictions.describe()

In [None]:
# Attach the original timestamps and keep only the two submission columns.
with_timestamps = final_predictions.assign(datetime=test_file['datetime'])
final_predictions = with_timestamps[['datetime', 'count']]

In [None]:
# Summary statistics of the submission frame, as a final sanity check.
final_predictions.describe()

## Exporting output to csv

In [None]:
# Write the submission file to the working directory, without the index column.
submission_path = 'submission.csv'
final_predictions.to_csv(submission_path, index=False)