# Python Code

# Step 1: Loading a file and preparing the data

Here, we repeat the setup steps from the last lesson. 

In [None]:
import datetime
import os

import matplotlib.dates as md #Date locators/formatters for time-series axes
import matplotlib.pyplot as plt #Matplotlib allows us to draw graphs
import numpy as np #Numpy allows us to perform complex mathematical processes quickly
import pandas as pd #Pandas is another useful set of tools for statistics
# Print the full path of every file found under the Kaggle input directory,
# so we can see which datasets are available to this notebook.
for folder, _subfolders, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

In [None]:
# Load the bike hire data from the CSV file and, in the same step, turn the
# 'timestamp' text column into real pandas datetimes.
data = pd.read_csv("/kaggle/input/london-bike-hire/bike_hire.csv").assign(
    timestamp=lambda frame: pd.to_datetime(frame['timestamp'], format='%Y-%m-%d %H:%M:%S')
)


# Step 2 - Transforming the data into daily summaries

In [None]:
# forecasting

# The library was published as "fbprophet" before version 1.0 and as "prophet"
# from 1.0 onwards, so try the current name first and fall back to the old one.
try:
    from prophet import Prophet
except ImportError:
    from fbprophet import Prophet

# Total number of hires on each calendar day
hire_data = data.groupby(data.timestamp.dt.date)['count'].sum()
# Highest 't1' reading on each calendar day (used later as the temperature predictor)
temp_data = data.groupby(data.timestamp.dt.date)['t1'].max()
# One row per day, with the date moved out of the index into a 'timestamp' column
combined_data = pd.concat([hire_data, temp_data], axis=1).reset_index()
# Grouping by .dt.date leaves plain date objects in 'timestamp'; convert them
# back to pandas datetimes so they can be compared and plotted as dates.
combined_data['timestamp'] = pd.to_datetime(combined_data['timestamp'])

# Step 3 - Preparing the data for modelling

We need to break the data into training and testing, to see how well the forecasting algorithm works.

In [None]:
# Prophet expects the date column to be called 'ds' and the value to predict
# to be called 'y'; the daily 't1' value is carried along as 'temp'.
model_columns = {'timestamp': 'ds', 'count': 'y', 't1': 'temp'}
daily = combined_data[list(model_columns)].rename(columns=model_columns)

# Half-open date windows [start, end): every day of 2015 is used for training
# and every day of 2016 for testing, with no days lost between the two sets.
in_2015 = (daily['ds'] >= "2015-01-01") & (daily['ds'] < "2016-01-01")
in_2016 = (daily['ds'] >= "2016-01-01") & (daily['ds'] < "2017-01-01")

# .copy() makes each set an independent table rather than a view of `daily`
train_data = daily.loc[in_2015].copy()
test_data = daily.loc[in_2016].copy()

# A cell only shows its last expression automatically, so display both previews
# explicitly.
display(train_data.head(), test_data.head())

# Step 4 - Training a forecasting model

Here, we will use a forecasting package called Prophet to train a model to predict bike hire rates. 

In [None]:
# Switch on all three of Prophet's seasonal patterns
seasonality_options = {
    'daily_seasonality': True,
    'weekly_seasonality': True,
    'yearly_seasonality': True,
}
model = Prophet(**seasonality_options)
# Learn from the training data
model.fit(train_data)

# Predict bike hires for the dates in the test data, then plot the forecast
forecast = model.predict(test_data)
fig = model.plot(forecast)


We can break the model down into different components - an overall trend, weekly trend, yearly trend, and daily trend.

In [None]:
# Draw one panel for each component of the fitted model
fig = model.plot_components(fcst=forecast)

We can also compare the forecasted bike hires for 2016 to the real bike hire rates. First, we can use a scatter plot to view the correlation between forecasted and real rates. 

In [None]:
# Each point is one day in the test data: its real number of hires (x) against
# the number the model forecasted for that day (y).
fig, ax = plt.subplots()
ax.scatter(x=test_data['y'], y=forecast['yhat'])
# Label the axes so the figure can be read on its own
ax.set_xlabel('Real bike hires')
ax.set_ylabel('Forecasted bike hires')
ax.set_title('Forecasted vs. real daily bike hires');

Next, we can plot real and predicted bike hires over time. 

In [None]:
# Formatter that writes x-axis dates as year-month. It is created in this cell
# so the cell does not rely on a variable that is only defined further down
# the notebook.
xfmt = md.DateFormatter('%Y-%m')

fig, ax1 = plt.subplots()
# set up the 2nd axis (it shares the x-axis with the first one)
ax2 = ax1.twinx()

# real bike hires, read against the left-hand axis
ax1.plot(test_data['ds'], test_data['y'], color='blue')
ax1.set_xlabel('Timestamp')
ax1.set_ylabel('Bike hires')

# forecasted bike hires, read against the right-hand axis.
# NOTE: the two y-axes are scaled independently, so compare the shapes of the
# two lines rather than their absolute heights.
ax2.plot(forecast['ds'], forecast['yhat'], color='red')
ax2.set_ylabel('Forecasted bike hires')

# rotate the date labels so they don't overlap; tick_params also applies to
# ticks that are created after this call, unlike plt.xticks(rotation=...)
ax1.tick_params(axis='x', labelrotation=25)
ax1.xaxis.set_major_formatter(xfmt)

# latest date present in the daily data, printed for reference
print(combined_data['timestamp'].max())


The predictions look quite good, but there's a strange predicted drop in hires around September which doesn't show up in the real data. Perhaps including temperature data in the model will improve the predictions?

In [None]:
# Same seasonal settings as before
seasonality_options = {
    'daily_seasonality': True,
    'weekly_seasonality': True,
    'yearly_seasonality': True,
}
model = (
    Prophet(**seasonality_options)
    # here, we add temperature in as a predictor for the model
    .add_regressor('temp')
    .fit(train_data)
)

# Predict bike hires for the dates in the test data, then plot the forecast
forecast = model.predict(test_data)
fig = model.plot(forecast)


In [None]:
import matplotlib.dates as md
# Formatter that writes x-axis dates as year-month
xfmt = md.DateFormatter('%Y-%m')

fig, ax1 = plt.subplots()
# set up the 2nd axis (it shares the x-axis with the first one)
ax2 = ax1.twinx()

# real bike hires, read against the left-hand axis
ax1.plot(test_data['ds'], test_data['y'], color='blue')
ax1.set_xlabel('Timestamp')
ax1.set_ylabel('Bike hires')

# forecasted bike hires, read against the right-hand axis.
# NOTE: the two y-axes are scaled independently, so compare the shapes of the
# two lines rather than their absolute heights.
ax2.plot(forecast['ds'], forecast['yhat'], color='red')
ax2.set_ylabel('Forecasted bike hires')

# rotate the date labels so they don't overlap; tick_params also applies to
# ticks that are created after this call, unlike plt.xticks(rotation=...)
ax1.tick_params(axis='x', labelrotation=25)
ax1.xaxis.set_major_formatter(xfmt)

# latest date present in the daily data, printed for reference
print(combined_data['timestamp'].max())


Looks like this has resolved the problem. 