# Solution Guide

### Importing libraries

In [None]:

import pandas
import numpy as numpy
%matplotlib inline
import matplotlib.pyplot as plt

from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from numpy import loadtxt
from xgboost import XGBRegressor


### Loading the data

In [None]:

# Read the bike-sharing CSV into a DataFrame. The path is relative, so the file
# must sit in the notebook's working directory.
bike_rentals = pandas.read_csv("bike_sharing_dataset.csv")

### Exploring the data

In [None]:
# (number of rows, number of columns) of the loaded dataset.
bike_rentals.shape

In [None]:
# Column names, dtypes and non-null counts: a quick way to spot missing values
# and non-numeric columns.
bike_rentals.info()

In [None]:
# Preview the first five rows.
bike_rentals.head()

In [None]:
# Histogram of the cnt column, to look at the distribution of total rentals.
# The figure is titled and labelled so it can be read on its own, and
# plt.show() keeps the raw (counts, bins, patches) tuple out of the cell output.
fig, ax = plt.subplots()
ax.hist(bike_rentals["cnt"])
ax.set(title="Distribution of total rentals", xlabel="cnt", ylabel="Frequency")
plt.show()

In [None]:
# Use the corr method on the bike_rentals dataframe to explore how each column
# is correlated with cnt.
# numeric_only=True skips non-numeric columns (e.g. dteday, presumably a date
# string -- confirm with bike_rentals.info()). Since pandas 2.0, corr() raises
# on such columns instead of silently dropping them.

bike_rentals.corr(numeric_only=True)["cnt"]

In [None]:
def assign_label(hour):
    """Map an hour of the day to a coarse time-of-day label.

    Labels by hour range:
        [0, 6)   -> 4
        [6, 12)  -> 1
        [12, 18) -> 2
        [18, 24] -> 3   (24 itself is accepted and labelled 3)

    Any value outside 0..24 yields None.
    """
    # The last band is closed on the right, so handle its upper edge up front.
    if hour == 24:
        return 3

    # (start, stop, label) for each half-open band [start, stop).
    bands = (
        (0, 6, 4),
        (6, 12, 1),
        (12, 18, 2),
        (18, 24, 3),
    )
    for start, stop, label in bands:
        if start <= hour < stop:
            return label
    return None

# Derive a coarse time-of-day feature by applying assign_label to every value
# of the hr column. This adds a new time_label column to bike_rentals in place.
bike_rentals["time_label"] = bike_rentals["hr"].apply(assign_label)

**Error metric**

Mean squared error (MSE) makes the most sense as our error metric. MSE works on continuous numeric data, which fits the rental counts we are predicting quite well.

In [None]:
# Randomly sample 80% of the rows for training. random_state pins the shuffle
# so the train/test split, and every error computed from it, is the same on
# each Restart & Run All.
train = bike_rentals.sample(frac=.8, random_state=1)

In [None]:
# The test set is every row whose index label was not sampled into train.
held_out = ~bike_rentals.index.isin(train.index)
test = bike_rentals.loc[held_out]

### First baseline model

In [None]:


# Start from every column, then strip out the target (cnt) and the columns
# that must not be used as features: casual and registered (presumably
# components of cnt that would leak the target -- confirm against the dataset
# description) and dteday.
predictors = list(train.columns)
for excluded in ("cnt", "casual", "registered", "dteday"):
    # list.remove raises ValueError if a column is unexpectedly missing.
    predictors.remove(excluded)

# Baseline model: ordinary least squares on the remaining columns.
reg = LinearRegression()
reg.fit(train[predictors], train["cnt"])

In [None]:
# Mean squared error of the linear baseline on the held-out rows.
# numpy is already imported in the notebook's first code cell, so the
# redundant mid-notebook import has been dropped.
predictions = reg.predict(test[predictors])

numpy.mean((predictions - test["cnt"]) ** 2)

The error is very high. This may be because the data has a few extremely high rental counts but otherwise mostly low counts: MSE penalizes larger errors more heavily, which leads to a higher total error.

### Using a decision tree as second model (optional)

In [None]:
# Decision tree whose leaves must each contain at least 5 training rows.
# random_state is fixed because scikit-learn randomly permutes the features at
# every split, so ties between equally good splits could otherwise be broken
# differently from run to run.
reg = DecisionTreeRegressor(min_samples_leaf=5, random_state=1)

reg.fit(train[predictors], train["cnt"])


In [None]:
# Mean squared error of the fitted tree on the held-out rows.
predictions = reg.predict(test[predictors])
residuals = predictions - test["cnt"]

numpy.mean(residuals ** 2)


In [None]:
# Same experiment with smaller leaves (at least 2 training rows each), which
# lets the tree fit the training data more closely.
# random_state is fixed because scikit-learn randomly permutes the features at
# every split, so ties between equally good splits could otherwise be broken
# differently from run to run.
reg = DecisionTreeRegressor(min_samples_leaf=2, random_state=1)

reg.fit(train[predictors], train["cnt"])

predictions = reg.predict(test[predictors])

# Mean squared error on the held-out rows.
numpy.mean((predictions - test["cnt"]) ** 2)


By capturing nonlinear relationships between the predictors and the rental count, the decision tree regressor appears to achieve a much lower error than linear regression.

**XGBoost**

In [None]:
# Gradient-boosted trees with the library's default hyperparameters, trained
# on the same feature columns and target as the earlier models.
X_train, y_train = train[predictors], train["cnt"]

model = XGBRegressor()
model.fit(X_train, y_train)


In [None]:
# Mean squared error of the boosted model on the held-out rows.
predictions = model.predict(test[predictors])
squared_errors = (predictions - test["cnt"]) ** 2

numpy.mean(squared_errors)
