In [85]:
import pandas as pd
import numpy as np
import datetime as dt
import matplotlib.pyplot as plt
import seaborn as sns
import sys, os



import statsmodels.api as sm
from statsmodels.tsa.arima_model import ARIMA, ARIMAResults
from statsmodels.tsa.arima_process import ArmaProcess
from statsmodels.stats.diagnostic import acorr_ljungbox

from sklearn.metrics import mean_squared_error


# This one is only available in the development verison of statsmodels
# Run:
#    pip install git+https://github.com/statsmodels/statsmodels.git
# to install the development version.
from statsmodels.tsa.statespace.sarimax import SARIMAX


# grid search sarima hyperparameters
from math import sqrt
from multiprocessing import cpu_count
from joblib import Parallel
from joblib import delayed
from warnings import catch_warnings
from warnings import filterwarnings
from statsmodels.tsa.statespace.sarimax import SARIMAX
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import AdaBoostRegressor

from sklearn.tree import DecisionTreeRegressor


from scipy import signal
from scipy import stats

sys.path.append("..")
from eda_utilities import print_eda_stats
from data_pipeline import dockless_data_pipeline
from data_pipeline import weather_data_pipeline

In [2]:
# Read the dockless data from S3 bucket
# Load the data and and call the clean method automatically to call the work flow
dockless_data = pd.read_csv("https://s3.amazonaws.com/sameera-bucket-1/dockless_mobility/raw_data/Austin_Dockless_Vehicle_Trips.csv")
dockless_vehicles_columns = dockless_data.columns
print_eda_stats(dockless_data, dockless_vehicles_columns, "Austin Dockless Mobility Data")
# Call the dockless_data_pipeline()
dockless_data_pipeline(dockless_data)
dockless_vehicles_columns = dockless_data.columns
print_eda_stats(dockless_data, dockless_vehicles_columns, "Austin Dockless Mobility Data")

  interactivity=interactivity, compiler=compiler, result=result)


+--------------------------+---------+-------------+---------------+--------------+-------------------+
|       Column Name        |  dtype  | Total_Count | Unique_Values | df_totalrows | Null Value counts |
+--------------------------+---------+-------------+---------------+--------------+-------------------+
|            ID            |  object |   2746505   |    2746505    |   2746505    |         0         |
|        Device ID         |  object |   2746505   |     26999     |   2746505    |         0         |
|       Vehicle Type       |  object |   2746504   |       3       |   2746505    |         1         |
|      Trip Duration       | float64 |   2746504   |     10521     |   2746505    |         1         |
|      Trip Distance       | float64 |   2746504   |     16125     |   2746505    |         1         |
|        Start Time        |  object |   2746504   |     25694     |   2746505    |         1         |
|         End Time         |  object |   2746504   |     25768  

In [3]:
#Read the weather data from S3 bucket
weather_data = pd.read_csv("https://s3.amazonaws.com/sameera-bucket-1/dockless_mobility/raw_data/Austin_Bergstom_Airport_Weather.csv")
weather_columns = weather_data.columns
print_eda_stats(weather_data, weather_columns, "Austin Weather Data")
weather_data_pipeline(weather_data)
weather_columns = weather_data.columns
print_eda_stats(weather_data, weather_columns, "Austin Weather Data")

+-----------------+---------+-------------+---------------+--------------+-------------------+
|   Column Name   |  dtype  | Total_Count | Unique_Values | df_totalrows | Null Value counts |
+-----------------+---------+-------------+---------------+--------------+-------------------+
|     STATION     |  object |     436     |       1       |     436      |         0         |
|       NAME      |  object |     436     |       1       |     436      |         0         |
|     LATITUDE    | float64 |     436     |       1       |     436      |         0         |
|    LONGITUDE    | float64 |     436     |       1       |     436      |         0         |
|    ELEVATION    | float64 |     436     |       1       |     436      |         0         |
|       DATE      |  object |     436     |      436      |     436      |         0         |
|       AWND      | float64 |     435     |       73      |     436      |         1         |
| AWND_ATTRIBUTES |  object |     435     |       

In [4]:
# Drop all records where  trip distance is less .1 miles and greater than 500 miles (1609.34 mts - 804673 mts)
dockless_data = dockless_data[(dockless_data['Trip Distance'] > 1609.34) & (dockless_data['Trip Distance'] < 804673)]
dockless_data = dockless_data[(dockless_data['Trip Duration'] > 60) & (dockless_data['Trip Duration'] < 86400)]

In [5]:
# Add to pipeline so it easy for calculations (Done)
# dockless_data['count'] = 1

# Have only weather data which is from the start date of the scooter data
trip_counts = dockless_data.groupby('START_DATE').count()['count']
weather_data= weather_data[weather_data['DATE'] >= trip_counts.index.min()]
weather_data= weather_data[weather_data['DATE'] <= trip_counts.index.max()]
weather_data = weather_data.reset_index()

In [6]:
grouped_origin_cell_id = dockless_data.groupby(['Origin Cell ID']).agg('count')
grouped_origin_cell_id = grouped_origin_cell_id.sort_values(['Trip Duration'], ascending=False)
top_origin_cell_ids = grouped_origin_cell_id['Trip Duration']
# Rename the column Trip Duration to Trip Counts
top_origin_cell_ids = top_origin_cell_ids.to_frame().reset_index()
top_50_origin_cells = top_origin_cell_ids.head(50)
#top_50_origin_cells.to_csv("../../output/EDA/Top_50_Origin_Cells_Total_Rides.csv")
top_10_origin_cells = top_50_origin_cells[0:10]
top_10_origin_cells = top_10_origin_cells['Origin Cell ID'].values
top_10_origin_cells_list = top_10_origin_cells.tolist()

In [7]:
# Make the DateTime as index
dockless_data = dockless_data.reset_index()
dockless_data = dockless_data.set_index("Start Time")
origin_cell_data = dockless_data[['Origin Cell ID', 'count']]

In [8]:
top_10_origin_cell_data = origin_cell_data[origin_cell_data['Origin Cell ID'].isin(top_10_origin_cells_list)]

In [9]:
# Top 3 origin cell data
cell_014706_data = top_10_origin_cell_data[top_10_origin_cell_data['Origin Cell ID'] == '014706']
cell_013650_data = top_10_origin_cell_data[top_10_origin_cell_data['Origin Cell ID'] == '013650']
cell_013176_data = top_10_origin_cell_data[top_10_origin_cell_data['Origin Cell ID'] == '013176']

In [10]:
trip_counts_cell_014706 = cell_014706_data.groupby([cell_014706_data.index.get_level_values(0),'Origin Cell ID']).count()
trip_counts_cell_014706 = trip_counts_cell_014706.unstack(level=1)
trip_counts_cell_014706 = trip_counts_cell_014706.fillna(0)

# Make a regular dataframe for processing the Time Series
t2 = trip_counts_cell_014706.reset_index()['count']
counts = t2['014706'].values
data = {'Start Time':trip_counts_cell_014706.index.values, '014706':counts} 
trip_counts_cell_014706_new = pd.DataFrame(data)
data_014706 = trip_counts_cell_014706_new.set_index("Start Time")



In [12]:
# data after '2018-07-15' as the data is not consistent before that

data_014706 = data_014706[(data_014706.index > '2018-07-15')]
data_014706.head(5)

Unnamed: 0_level_0,014706
Start Time,Unnamed: 1_level_1
2018-07-15 00:30:00,1
2018-07-15 07:45:00,1
2018-07-15 09:15:00,1
2018-07-15 09:30:00,2
2018-07-15 10:45:00,1


In [13]:
# Resample for Hourly and fill with sum of the rides
data_014706_hourly = data_014706.resample('H', how='sum')
data_014706_hourly = data_014706_hourly.fillna(0)
data_014706_hourly.head(20)

the new syntax is .resample(...).sum()
  


Unnamed: 0_level_0,014706
Start Time,Unnamed: 1_level_1
2018-07-15 00:00:00,1
2018-07-15 01:00:00,0
2018-07-15 02:00:00,0
2018-07-15 03:00:00,0
2018-07-15 04:00:00,0
2018-07-15 05:00:00,0
2018-07-15 06:00:00,0
2018-07-15 07:00:00,1
2018-07-15 08:00:00,0
2018-07-15 09:00:00,3


In [20]:
data_014706_hourly.tail(20)

Unnamed: 0_level_0,014706
Start Time,Unnamed: 1_level_1
2019-02-07 22:00:00,0
2019-02-07 23:00:00,0
2019-02-08 00:00:00,1
2019-02-08 01:00:00,1
2019-02-08 02:00:00,0
2019-02-08 03:00:00,0
2019-02-08 04:00:00,0
2019-02-08 05:00:00,0
2019-02-08 06:00:00,0
2019-02-08 07:00:00,0


In [27]:
# Add the following features to the data 
'''
Month, Day of the week(M,T, W, T, F, S, Sun), Weekday, Weekend, Week, Hour, Year, Max Temp, Precipitation, 
The day of the week with Monday=0, Sunday=6.
'''
data_014706_hourly['MONTH'] = pd.DatetimeIndex(data_014706_hourly.index).month
data_014706_hourly['YEAR'] = pd.DatetimeIndex(data_014706_hourly.index).year
data_014706_hourly['HOUR'] = pd.DatetimeIndex(data_014706_hourly.index).hour
data_014706_hourly['DAY'] = pd.DatetimeIndex(data_014706_hourly.index).day
data_014706_hourly['WEEK'] = pd.DatetimeIndex(data_014706_hourly.index).week
data_014706_hourly['DAY_OF_WEEK'] = pd.DatetimeIndex(data_014706_hourly.index).weekday
data_014706_hourly['WEEKEND'] = ((pd.DatetimeIndex(data_014706_hourly.index).weekday) // 5 == 1).astype(float)
data_014706_hourly['WEEKDAY'] = ((pd.DatetimeIndex(data_014706_hourly.index).weekday) // 5 == 0).astype(float)
data_014706_hourly['DATE'] = pd.DatetimeIndex(data_014706_hourly.index).date

In [28]:
data_014706_hourly.head(5)

Unnamed: 0_level_0,014706,MONTH,YEAR,HOUR,DAY,WEEK,DAY_OF_WEEK,WEEKEND,WEEKDAY,DATE
Start Time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2018-07-15 00:00:00,1,7,2018,0,15,28,6,1.0,0.0,2018-07-15
2018-07-15 01:00:00,0,7,2018,1,15,28,6,1.0,0.0,2018-07-15
2018-07-15 02:00:00,0,7,2018,2,15,28,6,1.0,0.0,2018-07-15
2018-07-15 03:00:00,0,7,2018,3,15,28,6,1.0,0.0,2018-07-15
2018-07-15 04:00:00,0,7,2018,4,15,28,6,1.0,0.0,2018-07-15


In [32]:
# Merge the weather data into the hourly dataframe so you can add all the features

merged_data_014706_hourly = pd.merge(left=data_014706_hourly,right=weather_data, how='left',left_on='DATE', right_on='DATE')

merged_data_014706_hourly.head(5)


Unnamed: 0,014706,MONTH,YEAR,HOUR,DAY,WEEK,DAY_OF_WEEK,WEEKEND,WEEKDAY,DATE,index,AVG_DAILY_WIND_SPEED,PRECIPITATION,SNOW,AVG_TEMPERATURE,MAX_TEMPERATURE,MIN_TEMPERATURE
0,1,7,2018,0,15,28,6,1.0,0.0,2018-07-15,195,10.29,0.0,0.0,86.0,98.0,75.0
1,0,7,2018,1,15,28,6,1.0,0.0,2018-07-15,195,10.29,0.0,0.0,86.0,98.0,75.0
2,0,7,2018,2,15,28,6,1.0,0.0,2018-07-15,195,10.29,0.0,0.0,86.0,98.0,75.0
3,0,7,2018,3,15,28,6,1.0,0.0,2018-07-15,195,10.29,0.0,0.0,86.0,98.0,75.0
4,0,7,2018,4,15,28,6,1.0,0.0,2018-07-15,195,10.29,0.0,0.0,86.0,98.0,75.0


In [38]:
merged_data_014706_hourly = data_014706_hourly.reset_index().merge(weather_data, on='DATE', how="left").set_index('Start Time')


merged_data_014706_hourly.head(10)


Unnamed: 0_level_0,014706,MONTH,YEAR,HOUR,DAY,WEEK,DAY_OF_WEEK,WEEKEND,WEEKDAY,DATE,index,AVG_DAILY_WIND_SPEED,PRECIPITATION,SNOW,AVG_TEMPERATURE,MAX_TEMPERATURE,MIN_TEMPERATURE
Start Time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
2018-07-15 00:00:00,1,7,2018,0,15,28,6,1.0,0.0,2018-07-15,195,10.29,0.0,0.0,86.0,98.0,75.0
2018-07-15 01:00:00,0,7,2018,1,15,28,6,1.0,0.0,2018-07-15,195,10.29,0.0,0.0,86.0,98.0,75.0
2018-07-15 02:00:00,0,7,2018,2,15,28,6,1.0,0.0,2018-07-15,195,10.29,0.0,0.0,86.0,98.0,75.0
2018-07-15 03:00:00,0,7,2018,3,15,28,6,1.0,0.0,2018-07-15,195,10.29,0.0,0.0,86.0,98.0,75.0
2018-07-15 04:00:00,0,7,2018,4,15,28,6,1.0,0.0,2018-07-15,195,10.29,0.0,0.0,86.0,98.0,75.0
2018-07-15 05:00:00,0,7,2018,5,15,28,6,1.0,0.0,2018-07-15,195,10.29,0.0,0.0,86.0,98.0,75.0
2018-07-15 06:00:00,0,7,2018,6,15,28,6,1.0,0.0,2018-07-15,195,10.29,0.0,0.0,86.0,98.0,75.0
2018-07-15 07:00:00,1,7,2018,7,15,28,6,1.0,0.0,2018-07-15,195,10.29,0.0,0.0,86.0,98.0,75.0
2018-07-15 08:00:00,0,7,2018,8,15,28,6,1.0,0.0,2018-07-15,195,10.29,0.0,0.0,86.0,98.0,75.0
2018-07-15 09:00:00,3,7,2018,9,15,28,6,1.0,0.0,2018-07-15,195,10.29,0.0,0.0,86.0,98.0,75.0


In [29]:
weather_data.head(5)

Unnamed: 0,index,DATE,AVG_DAILY_WIND_SPEED,PRECIPITATION,SNOW,AVG_TEMPERATURE,MAX_TEMPERATURE,MIN_TEMPERATURE
0,93,2018-04-04,12.53,0.0,0.0,62.0,71.0,49.0
1,94,2018-04-05,9.17,0.0,0.0,64.0,78.0,53.0
2,95,2018-04-06,12.75,0.3,0.0,75.0,86.0,64.0
3,96,2018-04-07,17.22,0.01,0.0,57.0,64.0,41.0
4,97,2018-04-08,6.49,0.0,0.0,47.0,65.0,41.0


In [61]:
# Create the test train split the same way as the univariate time series so this is the same as ARIMA series
size = int(len(merged_data_014706_hourly) * 0.66)
train, test = merged_data_014706_hourly[0:size], merged_data_014706_hourly[size:len(merged_data_014706_hourly)]

In [62]:
train.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 3306 entries, 2018-07-15 00:00:00 to 2018-11-29 17:00:00
Data columns (total 17 columns):
014706                  3306 non-null int64
MONTH                   3306 non-null int64
YEAR                    3306 non-null int64
HOUR                    3306 non-null int64
DAY                     3306 non-null int64
WEEK                    3306 non-null int64
DAY_OF_WEEK             3306 non-null int64
WEEKEND                 3306 non-null float64
WEEKDAY                 3306 non-null float64
DATE                    3306 non-null object
index                   3306 non-null int64
AVG_DAILY_WIND_SPEED    3306 non-null float64
PRECIPITATION           3306 non-null float64
SNOW                    3306 non-null float64
AVG_TEMPERATURE         3306 non-null float64
MAX_TEMPERATURE         3306 non-null float64
MIN_TEMPERATURE         3306 non-null float64
dtypes: float64(8), int64(8), object(1)
memory usage: 464.9+ KB


In [63]:
test.head(5)

Unnamed: 0_level_0,014706,MONTH,YEAR,HOUR,DAY,WEEK,DAY_OF_WEEK,WEEKEND,WEEKDAY,DATE,index,AVG_DAILY_WIND_SPEED,PRECIPITATION,SNOW,AVG_TEMPERATURE,MAX_TEMPERATURE,MIN_TEMPERATURE
Start Time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
2018-11-29 18:00:00,16,11,2018,18,29,48,3,0.0,1.0,2018-11-29,332,10.51,0.0,0.0,61.0,76.0,47.0
2018-11-29 19:00:00,9,11,2018,19,29,48,3,0.0,1.0,2018-11-29,332,10.51,0.0,0.0,61.0,76.0,47.0
2018-11-29 20:00:00,2,11,2018,20,29,48,3,0.0,1.0,2018-11-29,332,10.51,0.0,0.0,61.0,76.0,47.0
2018-11-29 21:00:00,0,11,2018,21,29,48,3,0.0,1.0,2018-11-29,332,10.51,0.0,0.0,61.0,76.0,47.0
2018-11-29 22:00:00,2,11,2018,22,29,48,3,0.0,1.0,2018-11-29,332,10.51,0.0,0.0,61.0,76.0,47.0


In [64]:
train.pop('DATE')
train.pop('index')
train.head(5)

Unnamed: 0_level_0,014706,MONTH,YEAR,HOUR,DAY,WEEK,DAY_OF_WEEK,WEEKEND,WEEKDAY,AVG_DAILY_WIND_SPEED,PRECIPITATION,SNOW,AVG_TEMPERATURE,MAX_TEMPERATURE,MIN_TEMPERATURE
Start Time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2018-07-15 00:00:00,1,7,2018,0,15,28,6,1.0,0.0,10.29,0.0,0.0,86.0,98.0,75.0
2018-07-15 01:00:00,0,7,2018,1,15,28,6,1.0,0.0,10.29,0.0,0.0,86.0,98.0,75.0
2018-07-15 02:00:00,0,7,2018,2,15,28,6,1.0,0.0,10.29,0.0,0.0,86.0,98.0,75.0
2018-07-15 03:00:00,0,7,2018,3,15,28,6,1.0,0.0,10.29,0.0,0.0,86.0,98.0,75.0
2018-07-15 04:00:00,0,7,2018,4,15,28,6,1.0,0.0,10.29,0.0,0.0,86.0,98.0,75.0


In [65]:
test.pop('DATE')
test.pop('index')
test.head(5)

Unnamed: 0_level_0,014706,MONTH,YEAR,HOUR,DAY,WEEK,DAY_OF_WEEK,WEEKEND,WEEKDAY,AVG_DAILY_WIND_SPEED,PRECIPITATION,SNOW,AVG_TEMPERATURE,MAX_TEMPERATURE,MIN_TEMPERATURE
Start Time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2018-11-29 18:00:00,16,11,2018,18,29,48,3,0.0,1.0,10.51,0.0,0.0,61.0,76.0,47.0
2018-11-29 19:00:00,9,11,2018,19,29,48,3,0.0,1.0,10.51,0.0,0.0,61.0,76.0,47.0
2018-11-29 20:00:00,2,11,2018,20,29,48,3,0.0,1.0,10.51,0.0,0.0,61.0,76.0,47.0
2018-11-29 21:00:00,0,11,2018,21,29,48,3,0.0,1.0,10.51,0.0,0.0,61.0,76.0,47.0
2018-11-29 22:00:00,2,11,2018,22,29,48,3,0.0,1.0,10.51,0.0,0.0,61.0,76.0,47.0


In [66]:
y_train = train.pop('014706').values
X_train = train.values
y_test = test.pop('014706').values
X_test = test.values

In [67]:
train

Unnamed: 0_level_0,MONTH,YEAR,HOUR,DAY,WEEK,DAY_OF_WEEK,WEEKEND,WEEKDAY,AVG_DAILY_WIND_SPEED,PRECIPITATION,SNOW,AVG_TEMPERATURE,MAX_TEMPERATURE,MIN_TEMPERATURE
Start Time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2018-07-15 00:00:00,7,2018,0,15,28,6,1.0,0.0,10.29,0.0,0.0,86.0,98.0,75.0
2018-07-15 01:00:00,7,2018,1,15,28,6,1.0,0.0,10.29,0.0,0.0,86.0,98.0,75.0
2018-07-15 02:00:00,7,2018,2,15,28,6,1.0,0.0,10.29,0.0,0.0,86.0,98.0,75.0
2018-07-15 03:00:00,7,2018,3,15,28,6,1.0,0.0,10.29,0.0,0.0,86.0,98.0,75.0
2018-07-15 04:00:00,7,2018,4,15,28,6,1.0,0.0,10.29,0.0,0.0,86.0,98.0,75.0
2018-07-15 05:00:00,7,2018,5,15,28,6,1.0,0.0,10.29,0.0,0.0,86.0,98.0,75.0
2018-07-15 06:00:00,7,2018,6,15,28,6,1.0,0.0,10.29,0.0,0.0,86.0,98.0,75.0
2018-07-15 07:00:00,7,2018,7,15,28,6,1.0,0.0,10.29,0.0,0.0,86.0,98.0,75.0
2018-07-15 08:00:00,7,2018,8,15,28,6,1.0,0.0,10.29,0.0,0.0,86.0,98.0,75.0
2018-07-15 09:00:00,7,2018,9,15,28,6,1.0,0.0,10.29,0.0,0.0,86.0,98.0,75.0


In [81]:
rf = RandomForestRegressor(n_estimators=10)
rf.fit(X_train, y_train)
print("Random Forest score with 10 estimators:", rf.score(X_test, y_test))

Random Forest score: -0.12202940374322856


In [86]:
rf_100 = RandomForestRegressor(n_estimators=100, max_features="auto",random_state=0)
rf_100.fit(X_train, y_train)
print("Random Forest score with 100 estimators:", rf_100.score(X_test, y_test))

Random Forest score: 0.10727349378855322


In [88]:
#Fit a Decision Tree model as comparison
dt = DecisionTreeRegressor()
dt.fit(X_train, y_train)
print("Decision Tree:", dt.score(X_test, y_test))

Decision Tree: -1.4312494055621698


In [89]:
ada = AdaBoostRegressor(n_estimators=100)
ada.fit(X_train, y_train)
print("Ada Boost 100 estimators:", ada.score(X_test, y_test))

Ada Boost 100 estimators: -0.20642147832307933


In [90]:
gradient_boost = GradientBoostingClassifier(n_estimators=100)
gradient_boost.fit(X_train, y_train)
print("Gradient Boost 100 estimators:", gradient_boost.score(X_test, y_test))

Gradient Boost 100 estimators: 0.41960093896713613


In [101]:
rf = RandomForestRegressor(n_estimators=10)
rf.fit(X_train, y_train)
print("Random Forest score with 10 estimators:", rf.score(X_test, y_test))
y_pred = rf.predict(X_test)
print("Mean Square Error:", mean_squared_error(y_test, y_pred))


rf_100 = RandomForestRegressor(n_estimators=100, max_features="auto",random_state=0)
rf_100.fit(X_train, y_train)
print("Random Forest score with 100 estimators:", rf_100.score(X_test, y_test))
y_pred = rf_100.predict(X_test)
print("Mean Square Error:", mean_squared_error(y_test, y_pred))

#Fit a Decision Tree model as comparison
dt = DecisionTreeRegressor()
dt.fit(X_train, y_train)
print("Decision Tree:", dt.score(X_test, y_test))
y_pred = dt.predict(X_test)
print("Mean Square Error:", mean_squared_error(y_test, y_pred))

ada = AdaBoostRegressor(n_estimators=100)
ada.fit(X_train, y_train)
print("Ada Boost 100 estimators:", ada.score(X_test, y_test))
y_pred = ada.predict(X_test)
print("Mean Square Error:", mean_squared_error(y_test, y_pred))

gradient_boost = GradientBoostingClassifier(n_estimators=100)
gradient_boost.fit(X_train, y_train)
print("Gradient Boost 100 estimators:", gradient_boost.score(X_test, y_test))
y_pred = gradient_boost.predict(X_test)
print("Mean Square Error:", mean_squared_error(y_test, y_pred))


Random Forest score with 10 estimators: 0.013133119132654025
Mean Square Error: 12.246361502347419
Random Forest score with 100 estimators: 0.10727349378855322
Mean Square Error: 11.078142077464788
Decision Tree: -1.4986869012309438
Mean Square Error: 31.007042253521128
Ada Boost 100 estimators: -0.09100375889061074
Mean Square Error: 13.538630884088061
Gradient Boost 100 estimators: 0.4242957746478873
Mean Square Error: 30.45011737089202


In [94]:
train.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 3306 entries, 2018-07-15 00:00:00 to 2018-11-29 17:00:00
Data columns (total 14 columns):
MONTH                   3306 non-null int64
YEAR                    3306 non-null int64
HOUR                    3306 non-null int64
DAY                     3306 non-null int64
WEEK                    3306 non-null int64
DAY_OF_WEEK             3306 non-null int64
WEEKEND                 3306 non-null float64
WEEKDAY                 3306 non-null float64
AVG_DAILY_WIND_SPEED    3306 non-null float64
PRECIPITATION           3306 non-null float64
SNOW                    3306 non-null float64
AVG_TEMPERATURE         3306 non-null float64
MAX_TEMPERATURE         3306 non-null float64
MIN_TEMPERATURE         3306 non-null float64
dtypes: float64(8), int64(6)
memory usage: 387.4 KB


In [95]:
gradient_boost.feature_importances_

array([0.00579186, 0.        , 0.49579729, 0.08679623, 0.06769141,
       0.04167363, 0.00728383, 0.00503964, 0.06601608, 0.04659596,
       0.        , 0.04287712, 0.0663573 , 0.06807964])

In [96]:
gradient_boost.verbose

0

In [97]:
gradient_boost = GradientBoostingClassifier(n_estimators=100)
gradient_boost.fit(X_train, y_train)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False)

In [105]:

# Extract single tree
estimator = rf_100[5]

from sklearn.tree import export_graphviz
# Export as dot file
export_graphviz(estimator, out_file='tree.dot', 
                feature_names = ['MONTH', 'YEAR', 'HOUR', 'DAY', 'WEEK', 'DAY_OF_WEEK', 'WEEKEND',
       'WEEKDAY', 'AVG_DAILY_WIND_SPEED', 'PRECIPITATION', 'SNOW',
       'AVG_TEMPERATURE', 'MAX_TEMPERATURE', 'MIN_TEMPERATURE'],
                class_names = "output",
                rounded = True, proportion = False, 
                precision = 2, filled = True)



In [108]:
from pydotplus import graph_from_dot_data
from sklearn.tree import export_graphviz

estimator = rf_100[5]

dot_data = export_graphviz(estimator,
                           filled=True, 
                           rounded=True,
                           class_names=['Out'],
                           feature_names = ['MONTH', 'YEAR', 'HOUR', 'DAY', 'WEEK', 'DAY_OF_WEEK', 'WEEKEND',
       'WEEKDAY', 'AVG_DAILY_WIND_SPEED', 'PRECIPITATION', 'SNOW',
       'AVG_TEMPERATURE', 'MAX_TEMPERATURE', 'MIN_TEMPERATURE'],
                           out_file=None) 
graph = graph_from_dot_data(dot_data) 
graph.write_png('tree.png') 

True

In [109]:
rf_100.feature_importances_

array([0.01342757, 0.        , 0.59679626, 0.05741041, 0.04575851,
       0.07642321, 0.00373876, 0.00275543, 0.04705632, 0.03196791,
       0.        , 0.02864752, 0.05407512, 0.041943  ])

In [111]:
print(rf_100.get_params())

{'bootstrap': True, 'criterion': 'mse', 'max_depth': None, 'max_features': 'auto', 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 100, 'n_jobs': 1, 'oob_score': False, 'random_state': 0, 'verbose': 0, 'warm_start': False}


In [112]:
from sklearn.model_selection import RandomizedSearchCV
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
print(random_grid)

{'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000], 'max_features': ['auto', 'sqrt'], 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4], 'bootstrap': [True, False]}


In [113]:
def evaluate(model, test_features, test_labels):
    predictions = model.predict(test_features)
    errors = abs(predictions - test_labels)
    mape = 100 * np.mean(errors / test_labels)
    accuracy = 100 - mape
    
    
    
    print('Model Performance')
    print("Mean Square Error:", mean_squared_error(test_labels, predictions))
    print('Average Error: {:0.4f} degrees.'.format(np.mean(errors)))
    print('Accuracy = {:0.2f}%.'.format(accuracy))
    
    return accuracy

print("Base Model:")
rf_100 = RandomForestRegressor(n_estimators=100, max_features="auto",random_state=0)
rf_100.fit(X_train, y_train)
# print("Random Forest score with 100 estimators:", rf_100.score(X_test, y_test))
# y_pred = rf_100.predict(X_test)
# print("Mean Square Error:", mean_squared_error(y_test, y_pred))


# base_model = RandomForestRegressor(n_estimators = 10, random_state = 42)
# base_model.fit(train_features, train_labels)
base_accuracy = evaluate(rf_100, X_test, y_test)


Base Model:
Model Performance
Average Error: 2.0873 degrees.
Accuracy = nan%.


  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
