In [229]:
import pandas as pd
import pymysql
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score
from sklearn import metrics
import datetime
import calendar

### Data Cleaning

In [230]:
con = pymysql.connect('localhost', 'root', 
    'password', 'secret_summer_project_data')
    

In [231]:
query = "SELECT DAYOFSERVICE,ROUTEID,ACTUALTIME_ARR,ACTUALTIME_DEP FROM rt_trips WHERE ROUTEID = '68_80'AND ACTUALTIME_DEP is not null AND ACTUALTIME_ARR is not null;"
route_df = pd.read_sql(query, con)

In [232]:
query = "SELECT DISTINCT(weather_main) FROM weather_2018;"
weather_types = pd.read_sql(query, con)

In [233]:
weather_types['weather_main']

0       Rain
1     Clouds
2    Drizzle
3      Clear
4        Fog
5       Mist
6       Snow
7      Smoke
Name: weather_main, dtype: object

In [234]:
route_df

Unnamed: 0,DAYOFSERVICE,ROUTEID,ACTUALTIME_ARR,ACTUALTIME_DEP
0,07-FEB-18 00:00:00,68_80,87524.0,84600.0
1,18-FEB-18 00:00:00,68_80,43313.0,40528.0
2,11-MAR-18 00:00:00,68_80,52646.0,49245.0
3,11-MAR-18 00:00:00,68_80,50895.0,47191.0
4,11-MAR-18 00:00:00,68_80,53992.0,50391.0
...,...,...,...,...
841,27-MAY-18 00:00:00,68_80,59851.0,56718.0
842,27-MAY-18 00:00:00,68_80,65704.0,62726.0
843,27-MAY-18 00:00:00,68_80,87483.0,84587.0
844,27-MAY-18 00:00:00,68_80,43902.0,40600.0


In [235]:
route_df['DAYOFSERVICE']= pd.to_datetime(route_df['DAYOFSERVICE'])

In [236]:
route_df

Unnamed: 0,DAYOFSERVICE,ROUTEID,ACTUALTIME_ARR,ACTUALTIME_DEP
0,2018-02-07,68_80,87524.0,84600.0
1,2018-02-18,68_80,43313.0,40528.0
2,2018-03-11,68_80,52646.0,49245.0
3,2018-03-11,68_80,50895.0,47191.0
4,2018-03-11,68_80,53992.0,50391.0
...,...,...,...,...
841,2018-05-27,68_80,59851.0,56718.0
842,2018-05-27,68_80,65704.0,62726.0
843,2018-05-27,68_80,87483.0,84587.0
844,2018-05-27,68_80,43902.0,40600.0


In [237]:
route_df['DAYOFSERVICE'][0]

Timestamp('2018-02-07 00:00:00')

Getting weather df date column to match returned route df date column

In [238]:
query = "SELECT temp,wind_speed,weather_main,date,hour FROM weather_2018"
weather_df = pd.read_sql(query, con)

In [239]:
weather_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8800 entries, 0 to 8799
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   temp          8800 non-null   float64
 1   wind_speed    8800 non-null   float64
 2   weather_main  8800 non-null   object 
 3   date          8800 non-null   object 
 4   hour          8800 non-null   object 
dtypes: float64(2), object(3)
memory usage: 343.9+ KB


In [240]:
weather_df['date']= pd.to_datetime(weather_df['date'])

In [241]:
weather_df['date'].iloc[0]

Timestamp('2018-01-01 00:00:00')

Getting route df time column to match weather df time

In [242]:
route_df['hour'] = ((route_df['ACTUALTIME_DEP']/60/60).astype('int')).astype('str').str.zfill(2)

In [243]:
route_df

Unnamed: 0,DAYOFSERVICE,ROUTEID,ACTUALTIME_ARR,ACTUALTIME_DEP,hour
0,2018-02-07,68_80,87524.0,84600.0,23
1,2018-02-18,68_80,43313.0,40528.0,11
2,2018-03-11,68_80,52646.0,49245.0,13
3,2018-03-11,68_80,50895.0,47191.0,13
4,2018-03-11,68_80,53992.0,50391.0,13
...,...,...,...,...,...
841,2018-05-27,68_80,59851.0,56718.0,15
842,2018-05-27,68_80,65704.0,62726.0,17
843,2018-05-27,68_80,87483.0,84587.0,23
844,2018-05-27,68_80,43902.0,40600.0,11


In [244]:
weather_df

Unnamed: 0,temp,wind_speed,weather_main,date,hour
0,4.15,12.86,Rain,2018-01-01,00
1,4.14,11.83,Rain,2018-01-01,01
2,4.61,12.35,Clouds,2018-01-01,02
3,4.64,12.35,Clouds,2018-01-01,03
4,5.04,11.83,Clouds,2018-01-01,04
...,...,...,...,...,...
8795,9.65,4.10,Clouds,2018-12-31,19
8796,9.27,4.10,Clouds,2018-12-31,20
8797,9.31,5.10,Clouds,2018-12-31,21
8798,9.19,5.70,Clouds,2018-12-31,22


joining weather df to route df

In [245]:
res = route_df.merge(weather_df, how='inner', left_on=['DAYOFSERVICE', 'hour'], right_on=['date', 'hour'])

In [246]:
res

Unnamed: 0,DAYOFSERVICE,ROUTEID,ACTUALTIME_ARR,ACTUALTIME_DEP,hour,temp,wind_speed,weather_main,date
0,2018-02-07,68_80,87524.0,84600.0,23,5.44,7.20,Clouds,2018-02-07
1,2018-02-18,68_80,43313.0,40528.0,11,7.52,3.09,Rain,2018-02-18
2,2018-03-11,68_80,52646.0,49245.0,13,8.02,4.63,Clouds,2018-03-11
3,2018-03-11,68_80,50895.0,47191.0,13,8.02,4.63,Clouds,2018-03-11
4,2018-03-11,68_80,53992.0,50391.0,13,8.02,4.63,Clouds,2018-03-11
...,...,...,...,...,...,...,...,...,...
848,2018-05-27,68_80,59851.0,56718.0,15,14.17,5.10,Clouds,2018-05-27
849,2018-05-27,68_80,65704.0,62726.0,17,16.55,3.60,Clouds,2018-05-27
850,2018-05-27,68_80,87483.0,84587.0,23,12.93,1.03,Clouds,2018-05-27
851,2018-05-27,68_80,43902.0,40600.0,11,13.99,5.14,Clouds,2018-05-27


In [247]:
res.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 853 entries, 0 to 852
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   DAYOFSERVICE    853 non-null    datetime64[ns]
 1   ROUTEID         853 non-null    object        
 2   ACTUALTIME_ARR  853 non-null    float64       
 3   ACTUALTIME_DEP  853 non-null    float64       
 4   hour            853 non-null    object        
 5   temp            853 non-null    float64       
 6   wind_speed      853 non-null    float64       
 7   weather_main    853 non-null    object        
 8   date            853 non-null    datetime64[ns]
dtypes: datetime64[ns](2), float64(4), object(3)
memory usage: 66.6+ KB


In [248]:
res['journey_time'] = res['ACTUALTIME_ARR'] - res['ACTUALTIME_DEP']

In [249]:
res

Unnamed: 0,DAYOFSERVICE,ROUTEID,ACTUALTIME_ARR,ACTUALTIME_DEP,hour,temp,wind_speed,weather_main,date,journey_time
0,2018-02-07,68_80,87524.0,84600.0,23,5.44,7.20,Clouds,2018-02-07,2924.0
1,2018-02-18,68_80,43313.0,40528.0,11,7.52,3.09,Rain,2018-02-18,2785.0
2,2018-03-11,68_80,52646.0,49245.0,13,8.02,4.63,Clouds,2018-03-11,3401.0
3,2018-03-11,68_80,50895.0,47191.0,13,8.02,4.63,Clouds,2018-03-11,3704.0
4,2018-03-11,68_80,53992.0,50391.0,13,8.02,4.63,Clouds,2018-03-11,3601.0
...,...,...,...,...,...,...,...,...,...,...
848,2018-05-27,68_80,59851.0,56718.0,15,14.17,5.10,Clouds,2018-05-27,3133.0
849,2018-05-27,68_80,65704.0,62726.0,17,16.55,3.60,Clouds,2018-05-27,2978.0
850,2018-05-27,68_80,87483.0,84587.0,23,12.93,1.03,Clouds,2018-05-27,2896.0
851,2018-05-27,68_80,43902.0,40600.0,11,13.99,5.14,Clouds,2018-05-27,3302.0


In [250]:
res['weekday'] = res['date'].dt.day_name()

In [251]:
res

Unnamed: 0,DAYOFSERVICE,ROUTEID,ACTUALTIME_ARR,ACTUALTIME_DEP,hour,temp,wind_speed,weather_main,date,journey_time,weekday
0,2018-02-07,68_80,87524.0,84600.0,23,5.44,7.20,Clouds,2018-02-07,2924.0,Wednesday
1,2018-02-18,68_80,43313.0,40528.0,11,7.52,3.09,Rain,2018-02-18,2785.0,Sunday
2,2018-03-11,68_80,52646.0,49245.0,13,8.02,4.63,Clouds,2018-03-11,3401.0,Sunday
3,2018-03-11,68_80,50895.0,47191.0,13,8.02,4.63,Clouds,2018-03-11,3704.0,Sunday
4,2018-03-11,68_80,53992.0,50391.0,13,8.02,4.63,Clouds,2018-03-11,3601.0,Sunday
...,...,...,...,...,...,...,...,...,...,...,...
848,2018-05-27,68_80,59851.0,56718.0,15,14.17,5.10,Clouds,2018-05-27,3133.0,Sunday
849,2018-05-27,68_80,65704.0,62726.0,17,16.55,3.60,Clouds,2018-05-27,2978.0,Sunday
850,2018-05-27,68_80,87483.0,84587.0,23,12.93,1.03,Clouds,2018-05-27,2896.0,Sunday
851,2018-05-27,68_80,43902.0,40600.0,11,13.99,5.14,Clouds,2018-05-27,3302.0,Sunday


In [252]:
# dropping unnecessary columns for ML model and duplicated columns
res.drop('DAYOFSERVICE',1,inplace=True)
res.drop('date',1,inplace=True)
res.drop('ROUTEID',1,inplace=True)

In [253]:
res.drop('ACTUALTIME_ARR',1,inplace=True)

In [254]:
res.drop('hour',1,inplace=True)

In [255]:
res

Unnamed: 0,ACTUALTIME_DEP,temp,wind_speed,weather_main,journey_time,weekday
0,84600.0,5.44,7.20,Clouds,2924.0,Wednesday
1,40528.0,7.52,3.09,Rain,2785.0,Sunday
2,49245.0,8.02,4.63,Clouds,3401.0,Sunday
3,47191.0,8.02,4.63,Clouds,3704.0,Sunday
4,50391.0,8.02,4.63,Clouds,3601.0,Sunday
...,...,...,...,...,...,...
848,56718.0,14.17,5.10,Clouds,3133.0,Sunday
849,62726.0,16.55,3.60,Clouds,2978.0,Sunday
850,84587.0,12.93,1.03,Clouds,2896.0,Sunday
851,40600.0,13.99,5.14,Clouds,3302.0,Sunday


In [256]:
for weather in list(weather_df['weather_main']):
    if weather not in res['weather_main'].unique():
        added_column = res.iloc[0]
        added_column['weather_main'] = weather
        res = res.append(added_column,ignore_index=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


In [257]:
res

Unnamed: 0,ACTUALTIME_DEP,temp,wind_speed,weather_main,journey_time,weekday
0,84600.0,5.44,7.20,Clouds,2924.0,Wednesday
1,40528.0,7.52,3.09,Rain,2785.0,Sunday
2,49245.0,8.02,4.63,Clouds,3401.0,Sunday
3,47191.0,8.02,4.63,Clouds,3704.0,Sunday
4,50391.0,8.02,4.63,Clouds,3601.0,Sunday
...,...,...,...,...,...,...
849,62726.0,16.55,3.60,Clouds,2978.0,Sunday
850,84587.0,12.93,1.03,Clouds,2896.0,Sunday
851,40600.0,13.99,5.14,Clouds,3302.0,Sunday
852,84591.0,8.09,5.66,Clouds,2622.0,Tuesday


In [258]:
missing_weather

[]

In [191]:
missing_days = []
for day in list(calendar.day_name):
    if day not in res['weekday'].unique():
        added_column = res.iloc[0]
        added_column['weekday'] = day
        res = res.append(added_column,ignore_index=True)

In [192]:
missing_days

[]

In [None]:
for weather in missing_weather:
    

In [161]:
res

Unnamed: 0,ACTUALTIME_DEP,temp,wind_speed,weather_main,journey_time,weekday,Clouds,Drizzle,Fog,Mist,Rain,Snow,Monday,Saturday,Sunday,Thursday,Tuesday,Wednesday,Smoke
0,84600.0,5.44,7.20,Clouds,2924.0,Wednesday,1,0,0,0,0,0,0,0,0,0,0,1,0
1,40528.0,7.52,3.09,Rain,2785.0,Sunday,0,0,0,0,1,0,0,0,1,0,0,0,0
2,49245.0,8.02,4.63,Clouds,3401.0,Sunday,1,0,0,0,0,0,0,0,1,0,0,0,0
3,47191.0,8.02,4.63,Clouds,3704.0,Sunday,1,0,0,0,0,0,0,0,1,0,0,0,0
4,50391.0,8.02,4.63,Clouds,3601.0,Sunday,1,0,0,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
848,56718.0,14.17,5.10,Clouds,3133.0,Sunday,1,0,0,0,0,0,0,0,1,0,0,0,0
849,62726.0,16.55,3.60,Clouds,2978.0,Sunday,1,0,0,0,0,0,0,0,1,0,0,0,0
850,84587.0,12.93,1.03,Clouds,2896.0,Sunday,1,0,0,0,0,0,0,0,1,0,0,0,0
851,40600.0,13.99,5.14,Clouds,3302.0,Sunday,1,0,0,0,0,0,0,0,1,0,0,0,0


In [124]:
# getting dummies for object columns

res = pd.concat([res, pd.get_dummies(res['weather_main'],drop_first=True)], axis=1)

In [125]:
res = pd.concat([res, pd.get_dummies(res['weekday'],drop_first=True)], axis=1)

In [126]:
res

Unnamed: 0,ACTUALTIME_DEP,temp,wind_speed,weather_main,journey_time,weekday,Clouds,Drizzle,Fog,Mist,Rain,Snow,Monday,Saturday,Sunday,Thursday,Tuesday,Wednesday
0,84600.0,5.44,7.20,Clouds,2924.0,Wednesday,1,0,0,0,0,0,0,0,0,0,0,1
1,40528.0,7.52,3.09,Rain,2785.0,Sunday,0,0,0,0,1,0,0,0,1,0,0,0
2,49245.0,8.02,4.63,Clouds,3401.0,Sunday,1,0,0,0,0,0,0,0,1,0,0,0
3,47191.0,8.02,4.63,Clouds,3704.0,Sunday,1,0,0,0,0,0,0,0,1,0,0,0
4,50391.0,8.02,4.63,Clouds,3601.0,Sunday,1,0,0,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
848,56718.0,14.17,5.10,Clouds,3133.0,Sunday,1,0,0,0,0,0,0,0,1,0,0,0
849,62726.0,16.55,3.60,Clouds,2978.0,Sunday,1,0,0,0,0,0,0,0,1,0,0,0
850,84587.0,12.93,1.03,Clouds,2896.0,Sunday,1,0,0,0,0,0,0,0,1,0,0,0
851,40600.0,13.99,5.14,Clouds,3302.0,Sunday,1,0,0,0,0,0,0,0,1,0,0,0


In [127]:
res.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 853 entries, 0 to 852
Data columns (total 18 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   ACTUALTIME_DEP  853 non-null    float64
 1   temp            853 non-null    float64
 2   wind_speed      853 non-null    float64
 3   weather_main    853 non-null    object 
 4   journey_time    853 non-null    float64
 5   weekday         853 non-null    object 
 6   Clouds          853 non-null    uint8  
 7   Drizzle         853 non-null    uint8  
 8   Fog             853 non-null    uint8  
 9   Mist            853 non-null    uint8  
 10  Rain            853 non-null    uint8  
 11  Snow            853 non-null    uint8  
 12  Monday          853 non-null    uint8  
 13  Saturday        853 non-null    uint8  
 14  Sunday          853 non-null    uint8  
 15  Thursday        853 non-null    uint8  
 16  Tuesday         853 non-null    uint8  
 17  Wednesday       853 non-null    uin

In [35]:
# now dropping columns which we created dummies for 

res.drop('weather_main',1,inplace=True)
res.drop('weekday',1,inplace=True)

In [36]:
res

Unnamed: 0,ACTUALTIME_DEP,temp,wind_speed,journey_time,Clouds,Drizzle,Fog,Mist,Rain,Snow,Monday,Saturday,Sunday,Thursday,Tuesday,Wednesday
0,84600.0,5.44,7.20,2924.0,1,0,0,0,0,0,0,0,0,0,0,1
1,40528.0,7.52,3.09,2785.0,0,0,0,0,1,0,0,0,1,0,0,0
2,49245.0,8.02,4.63,3401.0,1,0,0,0,0,0,0,0,1,0,0,0
3,47191.0,8.02,4.63,3704.0,1,0,0,0,0,0,0,0,1,0,0,0
4,50391.0,8.02,4.63,3601.0,1,0,0,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
848,56718.0,14.17,5.10,3133.0,1,0,0,0,0,0,0,0,1,0,0,0
849,62726.0,16.55,3.60,2978.0,1,0,0,0,0,0,0,0,1,0,0,0
850,84587.0,12.93,1.03,2896.0,1,0,0,0,0,0,0,0,1,0,0,0
851,40600.0,13.99,5.14,3302.0,1,0,0,0,0,0,0,0,1,0,0,0


In [37]:
# y is the target
y = res["journey_time"]
# X is everything else
X = res.drop(["journey_time"],1)
# Split the dataset into two datasets: 70% training and 30% test
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1,  test_size=0.3)

### Linear Regression

In [38]:
mlr = LinearRegression().fit(X_train, y_train)

In [39]:
# first I will print the intercept
print("Intercept = ",mlr.intercept_)
print()

# next I will print each feature individually with it's associated coeficient 
print("\u0332".join("Features and Coeficients:"))
for feat,coef in list(zip(X_train.columns, mlr.coef_)):
    print(feat," = ",coef)

Intercept =  3710.5426155143846

F̲e̲a̲t̲u̲r̲e̲s̲ ̲a̲n̲d̲ ̲C̲o̲e̲f̲i̲c̲i̲e̲n̲t̲s̲:
ACTUALTIME_DEP  =  -0.006693640822934606
temp  =  -0.2780427923057895
wind_speed  =  10.842979579110434
Clouds  =  195.04944234535125
Drizzle  =  186.39805923697057
Fog  =  245.91267337929358
Mist  =  1.7053025658242404e-13
Rain  =  179.57744623842848
Snow  =  -55.335740080546756
Monday  =  -468.1309503522405
Saturday  =  142.22370582578412
Sunday  =  -235.8969060447619
Thursday  =  -242.82075140862787
Tuesday  =  -379.8162949997406
Wednesday  =  -269.1487169338931


In [40]:
# get predictions
mlr_test = (mlr.predict(X_test))

In [41]:
act_vs_pred_mlr = pd.DataFrame(data={'y_test':y_test,'result': mlr_test})

In [42]:
act_vs_pred_mlr

Unnamed: 0,y_test,result
711,3154.0,3398.593414
247,2851.0,2924.957410
480,2843.0,3461.721677
467,3070.0,3097.932256
320,3745.0,3239.365176
...,...,...
479,3300.0,3472.879365
349,2654.0,2909.640846
246,3916.0,3615.918156
255,3030.0,3509.302173


In [43]:
print("MAE: ", metrics.mean_absolute_error(y_test, mlr_test))
#print("MSE: ", metrics.mean_squared_error(testActualVal, predictions))
print("RMSE: ", metrics.mean_squared_error(y_test, mlr_test)**0.5)
print("R2: ", metrics.r2_score(y_test, mlr_test))

MAE:  304.9789438128288
RMSE:  377.4157220113722
R2:  0.1605729918557447


In [44]:
# comparing my model's metrics to always predicting the average journey time

y_test_mean = []
for i in range(len(y_test)):
    y_test_mean.append(y_test.mean())

In [45]:
print("MAE: ", metrics.mean_absolute_error(y_test, y_test_mean))
print("RMSE: ", metrics.mean_squared_error(y_test, y_test_mean)**0.5)
print("R2: ", metrics.r2_score(y_test, y_test_mean))

MAE:  336.8839416503906
RMSE:  411.93483416910584
R2:  0.0


cross validating 

In [46]:
print("MAE: ", cross_val_score(LinearRegression(),X, y, cv=10,scoring='neg_mean_absolute_error').mean()*-1)
print("RMSE: ", cross_val_score(LinearRegression(),X, y, cv=10,scoring='neg_root_mean_squared_error').mean()*-1)
print("R2: ", cross_val_score(LinearRegression(),X, y, cv=10,scoring='r2').mean())

MAE:  304.82354167099413
RMSE:  383.4181202193312
R2:  0.13811128117880964


### Random Forest Regressor

In [47]:
r_forest_reg = RandomForestRegressor(n_estimators=100)
r_forest_reg.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=None, oob_score=False,
                      random_state=None, verbose=0, warm_start=False)

In [48]:
r_forest_predictions = r_forest_reg.predict(X_test)

In [49]:
act_vs_pred_rforest = pd.DataFrame(data={'y_test':y_test,'result': r_forest_predictions})

In [50]:
act_vs_pred_rforest

Unnamed: 0,y_test,result
711,3154.0,3102.81
247,2851.0,3040.91
480,2843.0,3424.82
467,3070.0,3444.45
320,3745.0,3974.14
...,...,...
479,3300.0,3088.81
349,2654.0,2567.78
246,3916.0,3817.64
255,3030.0,3225.38


In [51]:
print("MAE: ", metrics.mean_absolute_error(y_test, r_forest_predictions))
print("RMSE: ", metrics.mean_squared_error(y_test, r_forest_predictions)**0.5)
print("R2: ", metrics.r2_score(y_test, r_forest_predictions))

MAE:  235.21707031250003
RMSE:  305.60995752667014
R2:  0.4496005843860491


In [52]:
print("MAE: ", cross_val_score(RandomForestRegressor(n_estimators=100, max_features='auto', oob_score=True), X, y, scoring='neg_mean_absolute_error', cv=10).mean()*-1)
print("RMSE: ", cross_val_score(RandomForestRegressor(n_estimators=100, max_features='auto', oob_score=True), X, y, scoring='neg_root_mean_squared_error', cv=10).mean()*-1)
print("R2: ", cross_val_score(RandomForestRegressor(n_estimators=100, max_features='auto', oob_score=True), X, y, scoring='r2', cv=10).mean())


MAE:  252.61706922024624
RMSE:  326.5206815044065
R2:  0.36281765146200107


### Gradient Boosting

In [53]:
gb = GradientBoostingRegressor(n_estimators=100)
gb.fit(X_train, y_train)

GradientBoostingRegressor(alpha=0.9, ccp_alpha=0.0, criterion='friedman_mse',
                          init=None, learning_rate=0.1, loss='ls', max_depth=3,
                          max_features=None, max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=1, min_samples_split=2,
                          min_weight_fraction_leaf=0.0, n_estimators=100,
                          n_iter_no_change=None, presort='deprecated',
                          random_state=None, subsample=1.0, tol=0.0001,
                          validation_fraction=0.1, verbose=0, warm_start=False)

In [54]:
gb_predictions = gb.predict(X_test)

In [55]:
act_vs_pred_gb = pd.DataFrame(data={'y_test':y_test,'result': gb_predictions})

In [56]:
act_vs_pred_gb

Unnamed: 0,y_test,result
711,3154.0,3138.971497
247,2851.0,3096.104098
480,2843.0,3211.890095
467,3070.0,3347.634211
320,3745.0,4083.277553
...,...,...
479,3300.0,3041.790445
349,2654.0,2757.513328
246,3916.0,3707.261686
255,3030.0,3222.997752


In [57]:
print("MAE: ", metrics.mean_absolute_error(y_test, gb_predictions))
print("RMSE: ", metrics.mean_squared_error(y_test, gb_predictions)**0.5)
print("R2: ", metrics.r2_score(y_test, gb_predictions))

MAE:  228.9290161709713
RMSE:  297.9099343319988
R2:  0.4769864571057735


In [58]:
print("MAE: ", cross_val_score(GradientBoostingRegressor(n_estimators=100), X, y, scoring='neg_mean_absolute_error', cv=10).mean()*-1)
print("RMSE: ", cross_val_score(GradientBoostingRegressor(n_estimators=100), X, y, scoring='neg_root_mean_squared_error', cv=10).mean()*-1)
print("R2: ", cross_val_score(GradientBoostingRegressor(n_estimators=100), X, y, scoring='r2', cv=10).mean())


MAE:  245.82196540067034
RMSE:  312.9873963296551
R2:  0.4031317867303487


In [59]:
importance = pd.DataFrame({'feature': X_train.columns, 'importance':gb.feature_importances_})
importance = importance.sort_values('importance', ascending=False)
importance.set_index('feature',1, inplace=True)

In [60]:
importance

Unnamed: 0_level_0,importance
feature,Unnamed: 1_level_1
ACTUALTIME_DEP,0.682115
temp,0.077526
Saturday,0.056753
Monday,0.054357
wind_speed,0.047248
Tuesday,0.027526
Rain,0.016143
Sunday,0.013966
Wednesday,0.009812
Snow,0.005584


TypeError: 'int' object is not subscriptable