# Notebook to create a simple linear regression model

- Using what we learned earlier in the class, we are going to train a simple linear regression model
- Once we have trained the model, we will save it as a pickle file so that we can load it in a Flask module and serve it

In [2]:
import pandas as pd
import numpy as np

In [11]:
# Load the bike-share dataset.
# The 'datetime' column is parsed as dates and used as the index; the target
# column 'count' is renamed to 'total_rentals' (presumably so it can be read
# as an attribute later without clashing with DataFrame.count).
url = 'data/bikeshare.csv'
bikes = (
    pd.read_csv(url, index_col='datetime', parse_dates=True)
    .rename(columns={'count': 'total_rentals'})
)

In [12]:
# one-hot encode the season column; the new columns are prefixed with 'season'
season_dummies = pd.get_dummies(bikes['season'], prefix='season')

In [14]:
# preview the first five rows of the raw data
bikes.head(n=5)

Unnamed: 0_level_0,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,total_rentals
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0,3,13,16
2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0,8,32,40
2011-01-01 02:00:00,1,0,0,1,9.02,13.635,80,0.0,5,27,32
2011-01-01 03:00:00,1,0,0,1,9.84,14.395,75,0.0,3,10,13
2011-01-01 04:00:00,1,0,0,1,9.84,14.395,75,0.0,0,1,1


In [15]:
# Place the season indicator columns next to the original columns
# (column-wise concatenation, rows aligned on the shared index).
bikes_dummies = pd.concat(objs=[bikes, season_dummies], axis='columns')

In [16]:
# preview the first five rows, now including the season indicator columns
bikes_dummies.head(n=5)

Unnamed: 0_level_0,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,total_rentals,season_1,season_2,season_3,season_4
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0,3,13,16,1,0,0,0
2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0,8,32,40,1,0,0,0
2011-01-01 02:00:00,1,0,0,1,9.02,13.635,80,0.0,5,27,32,1,0,0,0
2011-01-01 03:00:00,1,0,0,1,9.84,14.395,75,0.0,3,10,13,1,0,0,0
2011-01-01 04:00:00,1,0,0,1,9.84,14.395,75,0.0,0,1,1,1,0,0,0


In [17]:
# import required libraries
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.linear_model import LinearRegression

In [18]:
# prepare data
# Features: temperature, humidity and three of the four season indicators.
feature_cols = ['temp', 'season_2', 'season_3', 'season_4', 'humidity']
X = bikes_dummies[feature_cols]
y = bikes_dummies.total_rentals
# segment the dataset
# A fixed random_state makes the shuffle deterministic, so the split and the
# scores computed from it are reproducible after Restart & Run All.
# The train/test proportions are left at train_test_split's defaults.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [19]:
# Fit an ordinary least-squares model on the training split
# (fit() returns the fitted estimator, so it can be assigned directly).
linreg = LinearRegression().fit(X_train, y_train)

# R^2 on the held-out test split
linreg.score(X=X_test, y=y_test)

0.257841248318853

In [20]:
# R^2 on the training split, for comparison with the test-split score
linreg.score(X=X_train, y=y_train)

0.27946440655452554

In [21]:
# save the model as a pickle file
import pickle
filename = 'bike_model.sav'
# Use a context manager so the file is flushed and closed as soon as the dump
# finishes, instead of leaving the handle open until garbage collection.
with open(filename, 'wb') as model_file:
    pickle.dump(linreg, model_file)

In [23]:
# test importing the model from the pickle file
# NOTE: unpickling can execute arbitrary code; only load pickle files from a
# trusted source (here, the file written by this notebook).
# The context manager closes the file handle once the model has been read.
with open(filename, 'rb') as model_file:
    loaded_model = pickle.load(model_file)
# the reloaded model is scored on the same test split as the in-memory model
result = loaded_model.score(X_test, y_test)
print(result)

0.257841248318853


In [24]:
# A new observation to exercise the model with.
# Values follow the feature_cols order: temp, season_2, season_3, season_4, humidity.
datapoint = np.array((15, 0, 0, 0, 80))

In [25]:
# display the array to confirm its contents
datapoint

array([15,  0,  0,  0, 80])

In [27]:
# create a single-row dataframe to exercise predict
# The frame is built in one step from the row values rather than by enlarging
# an empty (object-dtype) frame with .loc, so the columns get a numeric dtype
# inferred from the data and the column names match those used in training.
one_row_X = pd.DataFrame([[15, 0, 0, 0, 80]], columns=feature_cols)

In [28]:
# show the single-row frame
one_row_X.head(n=5)

Unnamed: 0,temp,season_2,season_3,season_4,humidity
0,15,0,0,0,80


In [29]:
# Ask the reloaded model for a prediction on the single-row frame;
# predict returns an array with one value per input row.
pred_point = loaded_model.predict(X=one_row_X)

In [30]:
# display the prediction array (one element, since one row was passed in)
pred_point

array([77.22309229])

In [31]:
# pull the single predicted value out of the array
pred_point[0]

77.22309228699393