In [71]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.model_selection import train_test_split  
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt

In [62]:
df = pd.read_csv('nyc-east-river-bicycle-counts.csv')

In [45]:
df.head()

Unnamed: 0.1,Unnamed: 0,Date,Day,High Temp (°F),Low Temp (°F),Precipitation,Brooklyn Bridge,Manhattan Bridge,Williamsburg Bridge,Queensboro Bridge,Total
0,0,01-04-2016 00:00,01-04-2016 00:00,78.1,66.0,0.01,1704,3126,4115,2552,11497
1,1,02-04-2016 00:00,02-04-2016 00:00,55.0,48.9,0.15,827,1646,2565,1884,6922
2,2,03-04-2016 00:00,03-04-2016 00:00,39.9,34.0,0.09,526,1232,1695,1306,4759
3,3,04-04-2016 00:00,04-04-2016 00:00,44.1,33.1,0.47 (S),521,1067,1440,1307,4335
4,4,05-04-2016 00:00,05-04-2016 00:00,42.1,26.1,0,1416,2617,3081,2357,9471


Pre - Process DataSet

In [46]:
df.shape

(210, 11)

In [47]:
# removing NA
df = df.dropna()

In [48]:
df.shape

(210, 11)

In [53]:
# Checking and removing duplicates

df.duplicated().sum()
df.drop_duplicates()

Unnamed: 0.1,Unnamed: 0,Date,Day,High Temp (°F),Low Temp (°F),Precipitation,Brooklyn Bridge,Manhattan Bridge,Williamsburg Bridge,Queensboro Bridge,Total
0,0,01-04-2016 00:00,01-04-2016 00:00,78.1,66.0,0.01,1704,3126,4115,2552,11497
1,1,02-04-2016 00:00,02-04-2016 00:00,55.0,48.9,0.15,827,1646,2565,1884,6922
2,2,03-04-2016 00:00,03-04-2016 00:00,39.9,34.0,0.09,526,1232,1695,1306,4759
3,3,04-04-2016 00:00,04-04-2016 00:00,44.1,33.1,0.47 (S),521,1067,1440,1307,4335
4,4,05-04-2016 00:00,05-04-2016 00:00,42.1,26.1,0,1416,2617,3081,2357,9471
...,...,...,...,...,...,...,...,...,...,...,...
205,205,26-04-2016 00:00,26-04-2016 00:00,60.1,46.9,0.24,1997,3520,4559,2929,13005
206,206,27-04-2016 00:00,27-04-2016 00:00,62.1,46.9,0,3343,5606,6577,4388,19914
207,207,28-04-2016 00:00,28-04-2016 00:00,57.9,48.0,0,2486,4152,5336,3657,15631
208,208,29-04-2016 00:00,29-04-2016 00:00,57.0,46.9,0.05,2375,4178,5053,3348,14954


In [54]:
# Removing outliers

Q1 = np.percentile(df['High Temp (°F)'], 25,
                   interpolation = 'midpoint')
 
Q3 = np.percentile(df['High Temp (°F)'], 75,
                   interpolation = 'midpoint')
IQR = Q3 - Q1
 
print("Old Shape: ", df.shape)
 
# Upper bound
upper = np.where(df['High Temp (°F)'] >= (Q3+1.5*IQR))
# Lower bound
lower = np.where(df['High Temp (°F)'] <= (Q1-1.5*IQR))
 
''' Removing the Outliers '''
df['High Temp (°F)'].drop(upper[0], inplace = True)
df['High Temp (°F)'].drop(lower[0], inplace = True)

############################################################

Q1 = np.percentile(df['Low Temp (°F)'], 25,
                   interpolation = 'midpoint')
 
Q3 = np.percentile(df['Low Temp (°F)'], 75,
                   interpolation = 'midpoint')
IQR = Q3 - Q1
 
print("Old Shape: ", df.shape)
 
# Upper bound
upper = np.where(df['Low Temp (°F)'] >= (Q3+1.5*IQR))
# Lower bound
lower = np.where(df['Low Temp (°F)'] <= (Q1-1.5*IQR))
 
''' Removing the Outliers '''
df['Low Temp (°F)'].drop(upper[0], inplace = True)
df['Low Temp (°F)'].drop(lower[0], inplace = True)

 

Old Shape:  (210, 11)
Old Shape:  (210, 11)


In [64]:
df.drop(df[df['Precipitation'].str.isalpha()].index, inplace = True)
df.drop(df[df['Precipitation'].str.contains(' ')].index, inplace = True)
print(df.shape)
df

(196, 11)


Unnamed: 0.1,Unnamed: 0,Date,Day,High Temp (°F),Low Temp (°F),Precipitation,Brooklyn Bridge,Manhattan Bridge,Williamsburg Bridge,Queensboro Bridge,Total
0,0,01-04-2016 00:00,01-04-2016 00:00,78.1,66.0,0.01,1704,3126,4115,2552,11497
1,1,02-04-2016 00:00,02-04-2016 00:00,55.0,48.9,0.15,827,1646,2565,1884,6922
2,2,03-04-2016 00:00,03-04-2016 00:00,39.9,34.0,0.09,526,1232,1695,1306,4759
4,4,05-04-2016 00:00,05-04-2016 00:00,42.1,26.1,0,1416,2617,3081,2357,9471
5,5,06-04-2016 00:00,06-04-2016 00:00,45.0,30.0,0,1885,3329,3856,2849,11919
...,...,...,...,...,...,...,...,...,...,...,...
205,205,26-04-2016 00:00,26-04-2016 00:00,60.1,46.9,0.24,1997,3520,4559,2929,13005
206,206,27-04-2016 00:00,27-04-2016 00:00,62.1,46.9,0,3343,5606,6577,4388,19914
207,207,28-04-2016 00:00,28-04-2016 00:00,57.9,48.0,0,2486,4152,5336,3657,15631
208,208,29-04-2016 00:00,29-04-2016 00:00,57.0,46.9,0.05,2375,4178,5053,3348,14954


In [21]:
# Can normalize all features, but that doesn't make sense in this scenario. So ignoring normalization
# Attribute Selection using p-value
# Encoding doesn't apply here

In [67]:
#Splitting 80:20
x = df.iloc[:, 3:6]
y = df.iloc[:, 10]

x_train, x_test, y_train, y_test= train_test_split(x, y, test_size= 0.2, random_state=0)  

In [68]:
df['High Temp (°F)'] = df['High Temp (°F)'].astype(float)
df['Low Temp (°F)'] = df['Low Temp (°F)'].astype(float)
df['Precipitation'] = df['Precipitation'].astype(float)
#df['High Temp (°F)'] = df['High Temp (°F)'].astype(float)

In [69]:
regressor = LinearRegression()
regressor.fit(x_train,y_train)

LinearRegression()

In [70]:
y_pred = regressor.predict(x_test) 
y_pred

array([17026.72317105, 16518.85963137, 11712.74719287,  9473.60838413,
       10347.40940262, 17763.18321688,  8766.7693924 , 16518.85963137,
       18342.18289381, 18342.18289381,  7500.16248761, 11712.74719287,
       17212.93595949, 18437.67663147,  6592.88634422, 17212.93595949,
       14944.30425284, 11184.13787299, 22832.28790251,  9473.60838413,
       20851.8573255 , 11712.74719287, 16409.04183307, 11712.74719287,
        7500.16248761, 17026.72317105, 13164.91039802, 13164.91039802,
        7500.16248761, 18814.96253086, 17763.18321688, 20614.32672325,
        8920.13212976, 16162.98086041, 11184.13787299, 16409.04183307,
        8920.13212976, 17763.18321688, 13897.91359326, 13164.91039802])

In [73]:
print('Train Score: ', regressor.score(x_train, y_train))  
print('Test Score: ', regressor.score(x_test, y_test))  

Train Score:  0.6694588568812996
Test Score:  0.7496527598769736


In [74]:
# Polynomial regression
from sklearn.preprocessing import PolynomialFeatures 

In [82]:
poly_regs= PolynomialFeatures(degree= 2, include_bias=False)  
x_poly= poly_regs.fit_transform(x) 
x_train, x_test, y_train, y_test = train_test_split(x_poly, y, test_size=0.2, random_state=42)
lin_reg_2 =LinearRegression()  
lin_reg_2.fit(x_train, y_train)  

LinearRegression()

In [84]:
poly_pred = lin_reg_2.predict(x_test)
print(poly_pred)  

[18731.61962856  8260.96066511 21171.76467776 19899.21114496
 18273.4534163  17088.36732959 17879.77470438 18731.61962856
 19497.23962171  8260.96066511 21171.76467776 18814.94268689
 19211.45618746 10976.21485273  8292.96197415  3202.11403645
  9571.70759172 16974.26325186 17088.36732959 19899.21114496
 17357.34687571 16974.26325186 12293.80876782 17088.36732959
  8260.96066511 16074.79756439 21171.76467776  9571.70759172
  3202.11403645 17903.44305823 12293.80876782  5160.21849801
 18814.94268689 18342.30852017 17673.94164919 16074.79756439
 17357.34687571 19899.21114496  8821.10111729 12625.31124378]


In [85]:
print('Train Score: ', lin_reg_2.score(x_train, y_train))  
print('Test Score: ', lin_reg_2.score(x_test, y_test))  

Train Score:  0.818203754927706
Test Score:  0.7809262415956677
