In [135]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches

from patsy import dmatrices

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score
from sklearn.tree import export_graphviz

import graphviz
from graphviz import Source

# ignore warnings
import warnings
warnings.filterwarnings('ignore')

import pickle
import os 
import shutil


code adapted from https://github.com/jamie-reynolds-UCD/UCD-Dublin-Bus-App-Team9/blob/main/data/General_Files_Linreg/general_LinRegModelling.ipynb

In [136]:
# Folder holding the prepared modelling CSVs. It can be overridden through the
# DUBLIN_BUS_MODELLING_PREP_DIR environment variable so the notebook is not tied to one
# machine; the default is the location used originally.
MODELLING_PREP_DIR = os.environ.get(
    'DUBLIN_BUS_MODELLING_PREP_DIR',
    '/Users/rebeccadillon/git/dublin-bus-team-5/data/modelling/modelling_prep',
)

# One cleaned feature table per direction.
df_dir_1 = pd.read_csv(os.path.join(MODELLING_PREP_DIR, 'feature_pairwise_cleaned_dir1.csv'))
df_dir_2 = pd.read_csv(os.path.join(MODELLING_PREP_DIR, 'feature_pairwise_cleaned_dir2.csv'))

In [111]:
# Report (rows, columns) for each direction's dataframe.
for direction, frame in ((1, df_dir_1), (2, df_dir_2)):
    print(f"The shape of the direction {direction} dataframe is:", frame.shape)

The shape of the direction 1 dataframe is: (354155, 13)
The shape of the direction 2 dataframe is: (352480, 12)


In [112]:
# List the column dtypes of each direction's dataframe.
for direction, frame in ((1, df_dir_1), (2, df_dir_2)):
    print(f"The datatypes in the direction {direction} dataframe is:")
    print(frame.dtypes)

The datatypes in the direction 1 dataframe is:
temp            float64
humidity          int64
wind_speed      float64
weather_id        int64
heavy_precip      int64
HOUR              int64
TRIPID            int64
LINEID           object
ROUTEID          object
DIRECTION         int64
TRIPTIME        float64
WEEKDAY           int64
MONTH             int64
dtype: object
The datatypes in the direction 2 dataframe is:
humidity          int64
wind_speed      float64
weather_id        int64
heavy_precip      int64
HOUR              int64
TRIPID            int64
LINEID           object
ROUTEID          object
DIRECTION         int64
TRIPTIME        float64
WEEKDAY           int64
MONTH             int64
dtype: object


# Direction 1
Remembering from <i>feature_pairwise_interactions.ipynb</i> the following:<br>
* categorical_med_info_gain = ['MONTH','heavy_precip','weather_id']
* categorical_high_info_gain = ['HOUR','WEEKDAY']
<br>

A first attempt using all features gave low accuracy (R2 below 0.2 in most cases).

In [113]:
# Remove the two weather features that are not used for modelling.
# errors='ignore' keeps this cell idempotent: because it reassigns df_dir_1, running it a
# second time would otherwise raise a KeyError for the already-removed columns.
df_dir_1 = df_dir_1.drop(columns=['heavy_precip', 'weather_id'], errors='ignore')

In [103]:
# Distinct LINEID values of direction 1, sorted as strings.
lines_dir_1 = df_dir_1['LINEID'].unique().tolist()
lines_dir_1.sort()
lines_dir_1

['1',
 '102',
 '104',
 '11',
 '111',
 '114',
 '116',
 '120',
 '122',
 '123',
 '13',
 '130',
 '14',
 '140',
 '142',
 '145',
 '14C',
 '15',
 '150',
 '151',
 '15A',
 '15B',
 '15D',
 '16',
 '161',
 '16C',
 '16D',
 '17',
 '17A',
 '18',
 '184',
 '185',
 '220',
 '236',
 '238',
 '239',
 '25',
 '25A',
 '25B',
 '25D',
 '25X',
 '26',
 '27',
 '270',
 '27A',
 '27B',
 '27X',
 '29A',
 '31',
 '31A',
 '31B',
 '31D',
 '32',
 '32X',
 '33',
 '33A',
 '33B',
 '33D',
 '33E',
 '33X',
 '37',
 '38',
 '38A',
 '38B',
 '38D',
 '39',
 '39A',
 '39X',
 '4',
 '40',
 '40B',
 '40D',
 '40E',
 '41',
 '41B',
 '41C',
 '41D',
 '41X',
 '42',
 '42D',
 '43',
 '44',
 '44B',
 '45A',
 '46A',
 '47',
 '49',
 '51D',
 '53',
 '54A',
 '56A',
 '59',
 '61',
 '63',
 '65',
 '65B',
 '66',
 '66A',
 '66B',
 '66X',
 '67',
 '67X',
 '68',
 '68A',
 '69',
 '69X',
 '7',
 '70',
 '70D',
 '75',
 '76',
 '76A',
 '77A',
 '79',
 '79A',
 '7A',
 '7B',
 '7D',
 '83',
 '83A',
 '84',
 '84A',
 '84X',
 '9']

In [104]:
# Build one 70/30 train/test split per line and store the pieces in dictionaries keyed
# by LINEID.

X_train_dict_1 = {}
y_train_dict_1 = {}
X_test_dict_1 = {}
y_test_dict_1 = {}

# Lines that actually received a split; this replaces lines_dir_1 after the loop.
# (Calling lines_dir_1.remove() while iterating over lines_dir_1 made the loop skip the
# line directly after each removed one, so that line never got a split of its own.)
lines_with_split_1 = []

for line in lines_dir_1:

    df_line = df_dir_1[df_dir_1['LINEID'] == line]

    # check for df with low values
    if df_line.shape[0] < 3:
        print('Not enough values for line', str(line))
        continue

    # drop unneeded columns (non-inplace, so a filtered slice of df_dir_1 is never mutated)
    df_line = df_line.drop(columns=['TRIPID', 'ROUTEID', 'DIRECTION', 'LINEID'])

    X = df_line.drop(columns=['TRIPTIME'])
    y = df_line[['TRIPTIME']]

    # do test train split
    # Split the dataset into two datasets: 70% training and 30% test.
    # train_test_split shuffles the rows itself (reproducibly, via random_state), so the
    # earlier manual index permutation - whose result was never assigned - is not needed.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

    # reset the index so predictions can later be concatenated on matching positions
    X_train = X_train.reset_index(drop=True)
    y_train = y_train.reset_index(drop=True)
    X_test = X_test.reset_index(drop=True)
    y_test = y_test.reset_index(drop=True)

    X_train_dict_1[line] = X_train
    y_train_dict_1[line] = y_train
    X_test_dict_1[line] = X_test
    y_test_dict_1[line] = y_test

    lines_with_split_1.append(line)

# from here on lines_dir_1 only contains lines that can be modelled
lines_dir_1 = lines_with_split_1


Not enough values for line 41D


## Linear Regression

In [None]:
# Fit one LinearRegression per direction 1 line on that line's own training split and
# pickle the fitted model.
model_dict_1 = {}

# Only lines that have a stored train/test split can be modelled.
lines_dir_1 = [line for line in lines_dir_1 if line in X_train_dict_1]

for line in lines_dir_1:
    print(f'Line {line}')

    # Read this line's training split. These assignments used to be written the other way
    # round (`X_train_dict_1[line] = X_train`), which overwrote every stored split with
    # whatever X_train / y_train was left over from an earlier cell, so every line was
    # fitted on the same data.
    X_train = X_train_dict_1[line]
    y_train = y_train_dict_1[line]

    linReg = LinearRegression().fit(X_train, y_train)

    model_dict_1[line] = linReg

    # code from https://stackoverflow.com/questions/11660605/how-to-overwrite-a-folder-if-it-already-exists-when-creating-it-with-makedirs
    # start from an empty output folder for this line / direction
    model_dir = f'/Users/rebeccadillon/git/dublin-bus-team-5/data/modelling/linearregression/picklefiles/line_{line}_model/dir1'
    if os.path.exists(model_dir):
        shutil.rmtree(model_dir)
    os.makedirs(model_dir)

    filename = f'{model_dir}/line_{line}_linreg.sav'
    # the context manager closes the file handle even if pickling fails
    with open(filename, 'wb') as model_file:
        pickle.dump(linReg, model_file)

In [None]:
# Evaluate every direction 1 linear model on its train and test split and write the
# metrics to a text report next to the pickled model.
for line in lines_dir_1:

    # training data
    X_train = X_train_dict_1[line]
    y_train = y_train_dict_1[line]

    # test data
    X_test = X_test_dict_1[line]
    y_test = y_test_dict_1[line]

    linReg = model_dict_1[line]

    print('Now modelling for line', str(line))

    linReg_predictions_train = list(linReg.predict(X_train))

    # train metrics
    train_mae = metrics.mean_absolute_error(y_train, linReg_predictions_train)
    train_mape = metrics.mean_absolute_percentage_error(y_train, linReg_predictions_train)
    train_mse = metrics.mean_squared_error(y_train, linReg_predictions_train)
    train_rmse = train_mse ** 0.5
    train_r2 = metrics.r2_score(y_train, linReg_predictions_train)

    linReg_predictions_test = list(linReg.predict(X_test))

    # test metrics
    test_mae = metrics.mean_absolute_error(y_test, linReg_predictions_test)
    test_mape = metrics.mean_absolute_percentage_error(y_test, linReg_predictions_test)
    test_mse = metrics.mean_squared_error(y_test, linReg_predictions_test)
    test_rmse = test_mse ** 0.5
    test_r2 = metrics.r2_score(y_test, linReg_predictions_test)

    # train_rmse / test_rmse are already square roots of the MSE; they are written as-is
    # (taking the root a second time reported the fourth root of the MSE as "RMSE").
    with open(f'/Users/rebeccadillon/git/dublin-bus-team-5/data/modelling/linearregression/picklefiles/line_{line}_model/dir1/line_{line}_linreg_metrics.csv', 'w') as file:
        file.write(f'\nTrain metrics for line {line}:'
                   f'\nMAE: {train_mae}'
                   f'\nMAPE: {train_mape}'
                   f'\nMSE: {train_mse}'
                   f'\nRMSE: {train_rmse}'
                   f'\nR2: {train_r2}'
                   f'\nTest metrics for line {line}:'
                   f'\nMAE: {test_mae}'
                   f'\nMAPE: {test_mape}'
                   f'\nMSE: {test_mse}'
                   f'\nRMSE: {test_rmse}'
                   f'\nR2: {test_r2}')

In [90]:
# Number of entries currently in lines_dir_1.
len(lines_dir_1)

123

## Random Forest

In [146]:
# Fit one RandomForestRegressor per direction 1 line on that line's own training split
# and pickle the fitted model.
rf_model_dict_1 = {}

# Only lines that have a stored train/test split can be modelled.
lines_dir_1 = [line for line in lines_dir_1 if line in X_train_dict_1]

for line in lines_dir_1:
    print(f'Line {line}')

    # Read this line's training split. These assignments used to be written the other way
    # round (`X_train_dict_1[line] = X_train`), which overwrote every stored split with
    # whatever X_train / y_train was left over from an earlier cell, so every line was
    # fitted on the same data.
    X_train = X_train_dict_1[line]
    y_train = y_train_dict_1[line]

    # max_features=1.0 (use all features) is what 'auto' meant for regressors; the 'auto'
    # spelling is deprecated in recent scikit-learn releases.
    rfr = RandomForestRegressor(n_estimators=50, max_features=1.0, oob_score=True, random_state=1)
    # the target is passed as a 1-D array, the shape the regressor expects
    rfr.fit(X_train, np.ravel(y_train))
    rf_model_dict_1[line] = rfr

    # code from https://stackoverflow.com/questions/11660605/how-to-overwrite-a-folder-if-it-already-exists-when-creating-it-with-makedirs
    # start from an empty output folder for this line / direction
    model_dir = f'/Users/rebeccadillon/git/dublin-bus-team-5/data/modelling/randomforest/picklefiles/line_{line}_model/dir1'
    if os.path.exists(model_dir):
        shutil.rmtree(model_dir)
    os.makedirs(model_dir)

    filename = f'{model_dir}/line_{line}_rfr.sav'
    # the context manager closes the file handle even if pickling fails
    with open(filename, 'wb') as model_file:
        pickle.dump(rfr, model_file)

Line 1
Line 102
Line 104
Line 11
Line 111
Line 114
Line 116
Line 120
Line 122
Line 123
Line 13
Line 130
Line 14
Line 140
Line 142
Line 145
Line 14C
Line 15
Line 150
Line 151
Line 15A
Line 15B
Line 15D
Line 16
Line 161
Line 16C
Line 16D
Line 17
Line 17A
Line 18
Line 184
Line 185
Line 220
Line 236
Line 238
Line 239
Line 25
Line 25A
Line 25B
Line 25D
Line 25X
Line 26
Line 27
Line 270
Line 27A
Line 27B
Line 27X
Line 29A
Line 31
Line 31A
Line 31B
Line 31D
Line 32
Line 32X
Line 33
Line 33A
Line 33B
Line 33D
Line 33E
Line 33X
Line 37
Line 38
Line 38A
Line 38B
Line 38D
Line 39
Line 39A
Line 39X
Line 4
Line 40
Line 40B
Line 40D
Line 40E
Line 41
Line 41B
Line 41C
Line 41X
Line 42
Line 42D
Line 43
Line 44
Line 44B
Line 45A
Line 46A
Line 47
Line 49
Line 51D
Line 53
Line 54A
Line 56A
Line 59
Line 61
Line 63
Line 65
Line 65B
Line 66
Line 66A
Line 66B
Line 66X
Line 67
Line 67X
Line 68
Line 68A
Line 69
Line 69X
Line 7
Line 70
Line 70D
Line 75
Line 76
Line 76A
Line 77A
Line 79
Line 79A
Line 7A
Line 7B


In [148]:
# Evaluate every direction 1 random forest on its train and test split and write the
# metrics to a text report next to the pickled model.
for line in lines_dir_1:

    # training data
    X_train = X_train_dict_1[line]
    y_train = y_train_dict_1[line]

    # test data
    X_test = X_test_dict_1[line]
    y_test = y_test_dict_1[line]

    rfr = rf_model_dict_1[line]

    print('Now modelling for line', str(line))

    rfr_predictions_train = list(rfr.predict(X_train))
    rfr_predictions_test = list(rfr.predict(X_test))

    # train metrics
    train_mae = metrics.mean_absolute_error(y_train, rfr_predictions_train)
    train_mape = metrics.mean_absolute_percentage_error(y_train, rfr_predictions_train)
    train_mse = metrics.mean_squared_error(y_train, rfr_predictions_train)
    train_rmse = train_mse ** 0.5
    train_r2 = metrics.r2_score(y_train, rfr_predictions_train)

    # test metrics - the test predictions were previously computed but never reported,
    # so the file only ever contained scores on the data the forest was fitted on
    test_mae = metrics.mean_absolute_error(y_test, rfr_predictions_test)
    test_mape = metrics.mean_absolute_percentage_error(y_test, rfr_predictions_test)
    test_mse = metrics.mean_squared_error(y_test, rfr_predictions_test)
    test_rmse = test_mse ** 0.5
    test_r2 = metrics.r2_score(y_test, rfr_predictions_test)

    with open(f'/Users/rebeccadillon/git/dublin-bus-team-5/data/modelling/randomforest/picklefiles/line_{line}_model/dir1/line_{line}_rfr_metrics.csv', 'w') as file:
        file.write(f'\nTrain metrics for line {line}:'
                   f'\nMAE: {train_mae}'
                   f'\nMAPE: {train_mape}'
                   f'\nMSE: {train_mse}'
                   f'\nRMSE: {train_rmse}'
                   f'\nR2: {train_r2}'
                   f'\nTest metrics for line {line}:'
                   f'\nMAE: {test_mae}'
                   f'\nMAPE: {test_mape}'
                   f'\nMSE: {test_mse}'
                   f'\nRMSE: {test_rmse}'
                   f'\nR2: {test_r2}')

Now modelling for line 1
Now modelling for line 102
Now modelling for line 104
Now modelling for line 11
Now modelling for line 111
Now modelling for line 114
Now modelling for line 116
Now modelling for line 120
Now modelling for line 122
Now modelling for line 123
Now modelling for line 13
Now modelling for line 130
Now modelling for line 14
Now modelling for line 140
Now modelling for line 142
Now modelling for line 145
Now modelling for line 14C
Now modelling for line 15
Now modelling for line 150
Now modelling for line 151
Now modelling for line 15A
Now modelling for line 15B
Now modelling for line 15D
Now modelling for line 16
Now modelling for line 161
Now modelling for line 16C
Now modelling for line 16D
Now modelling for line 17
Now modelling for line 17A
Now modelling for line 18
Now modelling for line 184
Now modelling for line 185
Now modelling for line 220
Now modelling for line 236
Now modelling for line 238
Now modelling for line 239
Now modelling for line 25
Now modelli

In [149]:
# Feature importances of whichever forest `rfr` currently refers to (the last one bound
# in a loop), paired with the columns of the current X_train, highest first.
importance = pd.DataFrame(
    list(zip(X_train.columns, rfr.feature_importances_)),
    columns=['feature', 'importance'],
)
importance.sort_values(by='importance', ascending=False)

Unnamed: 0,feature,importance
2,HOUR,0.578939
3,WEEKDAY,0.134704
1,wind_speed,0.103197
0,humidity,0.096059
4,MONTH,0.087101


The feature importances were re-checked after each adjustment:
1. Will drop heavy_precip (0.001122) 
- model now at 0.85
2. Will drop weather_id (0.036889)
- model now at 0.858
3. Will now drop month (0.059965)
- r2 decreased to 0.857

In [150]:
def cross_val_RandomForest_DF(X, y, depth=None, estimators=100):
    """Cross-validate a RandomForestRegressor and return the mean scores.

    Runs 10-fold cross validation and averages each regression metric over the folds.

    Parameters
    ----------
    X : features passed to the regressor.
    y : target values; a single-column DataFrame or a 1-D sequence.
    depth : max_depth of each tree (None lets the trees grow fully).
    estimators : number of trees in the forest.

    Returns
    -------
    pandas.DataFrame indexed by metric name with one column, 'Random_Forests'.
    The 'neg_*' metrics follow scikit-learn's convention of negated errors, so
    values closer to zero are better.
    """
    # Regression scorers. The classification scorers used previously ('accuracy',
    # 'precision', 'recall', 'f1') are not defined for a continuous target.
    test_metrics = ['r2', 'neg_mean_absolute_error', 'neg_root_mean_squared_error']

    # max_features=1.0 is the non-deprecated spelling of 'auto' for regressors; the
    # out-of-bag score is not requested because cross validation never reads it.
    model = RandomForestRegressor(n_estimators=estimators, max_features=1.0,
                                  random_state=1, max_depth=depth)

    # A single cross_validate run scores every metric on the same fitted folds instead
    # of re-fitting the forests once per metric.
    scores = cross_validate(model, X, np.ravel(y), scoring=test_metrics, cv=10)

    # store the per-metric mean in a dict
    RandomForestResults = {metric: scores[f'test_{metric}'].mean() for metric in test_metrics}

    # create dataframe with results
    RandomForestDF = pd.DataFrame.from_dict(RandomForestResults, orient='index', columns=['Random_Forests'])

    return RandomForestDF

In [151]:
# X and y are the notebook-level names last assigned inside a train/test-split loop,
# i.e. the data of a single line.
RandomForestDF = cross_val_RandomForest_DF(X, y)
print("Mean results from 10 fold cross validation are:")
RandomForestDF

KeyboardInterrupt: 

In [None]:
# Out-of-bag score of whichever forest `rfr` currently refers to. For a
# RandomForestRegressor this is an R^2 value, not a classification accuracy.
rfr.oob_score_

0.5993010498200189

# Direction 2
Remembering from <i>feature_pairwise_interactions.ipynb</i> the following:
* categorical_med_info_gain = ['heavy_precip','weather_id','weather_main']
* categorical_high_info_gain = ['HOUR','WEEKDAY','MONTH']

In [128]:
# Preview the direction 2 dataframe (the rendered output shows only its first and last rows).
df_dir_2

Unnamed: 0,humidity,wind_speed,weather_id,heavy_precip,HOUR,TRIPID,LINEID,ROUTEID,DIRECTION,TRIPTIME,WEEKDAY,MONTH
0,75,5.10,500,0,5,5962263,40,40_31,2,4574.0,1,1
1,75,5.10,500,0,5,5963304,40,40_31,2,4106.0,1,1
2,81,3.10,500,0,6,5958381,15B,15B_61,2,3433.0,1,1
3,81,3.10,500,0,6,5971085,15B,15B_61,2,2976.0,1,1
4,81,3.10,500,0,6,5963272,7B,7B_93,2,4352.0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...
352475,79,0.89,803,0,21,8584834,145,145_105,2,4289.0,0,12
352476,80,1.79,803,0,22,8581813,40E,40E_91,2,1469.0,0,12
352477,80,1.79,803,0,22,8586917,27B,27B_34,2,2388.0,0,12
352478,80,1.79,803,0,22,8582081,32,32_58,2,2095.0,0,12


In [137]:
# Remove the two weather features that are not used for modelling.
# errors='ignore' keeps this cell idempotent: because it reassigns df_dir_2, running it a
# second time would otherwise raise a KeyError for the already-removed columns.
df_dir_2 = df_dir_2.drop(columns=['heavy_precip', 'weather_id'], errors='ignore')

In [138]:
# Distinct LINEID values of direction 2, sorted as strings.
lines_dir_2 = df_dir_2['LINEID'].unique().tolist()
lines_dir_2.sort()
lines_dir_2

['1',
 '102',
 '104',
 '11',
 '111',
 '114',
 '116',
 '118',
 '120',
 '122',
 '123',
 '13',
 '130',
 '14',
 '140',
 '142',
 '145',
 '14C',
 '15',
 '150',
 '151',
 '15A',
 '15B',
 '15D',
 '16',
 '161',
 '16C',
 '17',
 '17A',
 '18',
 '184',
 '185',
 '220',
 '236',
 '238',
 '239',
 '25',
 '25A',
 '25B',
 '25D',
 '25X',
 '26',
 '27',
 '270',
 '27A',
 '27B',
 '27X',
 '29A',
 '31',
 '31A',
 '31B',
 '31D',
 '32',
 '32X',
 '33',
 '33A',
 '33B',
 '33D',
 '33X',
 '37',
 '38',
 '38A',
 '38B',
 '38D',
 '39',
 '39A',
 '39X',
 '4',
 '40',
 '40B',
 '40D',
 '40E',
 '41',
 '41A',
 '41B',
 '41C',
 '41D',
 '41X',
 '42',
 '42D',
 '43',
 '44',
 '44B',
 '45A',
 '46A',
 '46E',
 '47',
 '49',
 '51D',
 '51X',
 '53',
 '54A',
 '56A',
 '59',
 '61',
 '63',
 '65',
 '65B',
 '66',
 '66A',
 '66B',
 '66X',
 '67',
 '67X',
 '68',
 '68A',
 '68X',
 '69',
 '69X',
 '7',
 '70',
 '70D',
 '75',
 '76',
 '76A',
 '77A',
 '77X',
 '79',
 '79A',
 '7A',
 '7B',
 '7D',
 '83',
 '83A',
 '84',
 '84A',
 '84X',
 '9']

In [139]:
# Build one 70/30 train/test split per line and store the pieces in dictionaries keyed
# by LINEID.

X_train_dict_2 = {}
y_train_dict_2 = {}
X_test_dict_2 = {}
y_test_dict_2 = {}

# Lines that actually received a split; this replaces lines_dir_2 after the loop.
# (Calling lines_dir_2.remove() while iterating over lines_dir_2 made the loop skip the
# line directly after each removed one, so that line never got a split of its own.)
lines_with_split_2 = []

for line in lines_dir_2:

    df_line = df_dir_2[df_dir_2['LINEID'] == line]

    # check for df with low values
    if df_line.shape[0] < 3:
        print('Not enough values for line', str(line))
        continue

    # drop unneeded columns (non-inplace, so a filtered slice of df_dir_2 is never mutated)
    df_line = df_line.drop(columns=['TRIPID', 'ROUTEID', 'DIRECTION', 'LINEID'])

    X = df_line.drop(columns=['TRIPTIME'])
    y = df_line[['TRIPTIME']]

    # do test train split
    # Split the dataset into two datasets: 70% training and 30% test.
    # train_test_split shuffles the rows itself (reproducibly, via random_state), so the
    # earlier manual index permutation - whose result was never assigned - is not needed.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

    # reset the index so predictions can later be concatenated on matching positions
    X_train = X_train.reset_index(drop=True)
    y_train = y_train.reset_index(drop=True)
    X_test = X_test.reset_index(drop=True)
    y_test = y_test.reset_index(drop=True)

    X_train_dict_2[line] = X_train
    y_train_dict_2[line] = y_train
    X_test_dict_2[line] = X_test
    y_test_dict_2[line] = y_test

    lines_with_split_2.append(line)

# from here on lines_dir_2 only contains lines that can be modelled
lines_dir_2 = lines_with_split_2


Not enough values for line 41D


## Linear Regression

In [None]:
# Fit one LinearRegression per direction 2 line on that line's own training split and
# pickle the fitted model.
model_dict_2 = {}

# Only lines that have a stored train/test split can be modelled.
lines_dir_2 = [line for line in lines_dir_2 if line in X_train_dict_2]

for line in lines_dir_2:
    print(f'Line {line}')

    # Read this line's training split. These assignments used to be written the other way
    # round (`X_train_dict_2[line] = X_train`), which overwrote every stored split with
    # whatever X_train / y_train was left over from an earlier cell, so every line was
    # fitted on the same data.
    X_train = X_train_dict_2[line]
    y_train = y_train_dict_2[line]

    linReg = LinearRegression().fit(X_train, y_train)

    model_dict_2[line] = linReg

    # code from https://stackoverflow.com/questions/11660605/how-to-overwrite-a-folder-if-it-already-exists-when-creating-it-with-makedirs
    # start from an empty output folder for this line / direction
    model_dir = f'/Users/rebeccadillon/git/dublin-bus-team-5/data/modelling/linearregression/picklefiles/line_{line}_model/dir2'
    if os.path.exists(model_dir):
        shutil.rmtree(model_dir)
    os.makedirs(model_dir)

    filename = f'{model_dir}/line_{line}_linreg.sav'
    # the context manager closes the file handle even if pickling fails
    with open(filename, 'wb') as model_file:
        pickle.dump(linReg, model_file)

In [None]:
# Score every direction 2 linear model on its train and test split and write the
# results to a text report next to the pickled model.
for line in lines_dir_2:

    # this line's split and fitted model
    X_train, y_train = X_train_dict_2[line], y_train_dict_2[line]
    linReg = model_dict_2[line]
    X_test, y_test = X_test_dict_2[line], y_test_dict_2[line]

    print('Now modelling for line', str(line))

    # scores on the data the model was fitted on
    linReg_predictions_train = list(linReg.predict(X_train))
    train_mae = metrics.mean_absolute_error(y_train, linReg_predictions_train)
    train_mape = metrics.mean_absolute_percentage_error(y_train, linReg_predictions_train)
    train_mse = metrics.mean_squared_error(y_train, linReg_predictions_train)
    train_rmse = train_mse ** 0.5
    train_r2 = metrics.r2_score(y_train, linReg_predictions_train)

    # scores on the held-out data
    linReg_predictions_test = list(linReg.predict(X_test))
    test_mae = metrics.mean_absolute_error(y_test, linReg_predictions_test)
    test_mape = metrics.mean_absolute_percentage_error(y_test, linReg_predictions_test)
    test_mse = metrics.mean_squared_error(y_test, linReg_predictions_test)
    test_rmse = test_mse ** 0.5
    test_r2 = metrics.r2_score(y_test, linReg_predictions_test)

    # the report reuses the values computed above rather than recomputing each metric
    report = (f'\nTrain metrics for line {line}:'
              f'\nMAE: {train_mae}'
              f'\nMAPE: {train_mape}'
              f'\nMSE: {train_mse}'
              f'\nRMSE: {train_rmse}'
              f'\nR2: {train_r2}'
              f'\nTest metrics for line {line}:'
              f'\nMAE: {test_mae}'
              f'\nMAPE: {test_mape}'
              f'\nMSE: {test_mse}'
              f'\nRMSE: {test_rmse}'
              f'\nR2: {test_r2}')

    with open(f'/Users/rebeccadillon/git/dublin-bus-team-5/data/modelling/linearregression/picklefiles/line_{line}_model/dir2/line_{line}_linreg_metrics.csv', 'w') as file:
        file.write(report)

## Random Forest Regressor

In [140]:
# Fit one RandomForestRegressor per direction 2 line on that line's own training split
# and pickle the fitted model.
rf_model_dict_2 = {}

# Only lines that have a stored train/test split can be modelled.
lines_dir_2 = [line for line in lines_dir_2 if line in X_train_dict_2]

for line in lines_dir_2:
    print(f'Line {line}')

    # Read this line's training split. These assignments used to be written the other way
    # round (`X_train_dict_2[line] = X_train`), which overwrote every stored split with
    # whatever X_train / y_train was left over from an earlier cell, so every line was
    # fitted on the same data.
    X_train = X_train_dict_2[line]
    y_train = y_train_dict_2[line]

    # max_features=1.0 (use all features) is what 'auto' meant for regressors; the 'auto'
    # spelling is deprecated in recent scikit-learn releases.
    rfr = RandomForestRegressor(n_estimators=50, max_features=1.0, oob_score=True, random_state=1)
    # the target is passed as a 1-D array, the shape the regressor expects
    rfr.fit(X_train, np.ravel(y_train))
    rf_model_dict_2[line] = rfr

    # code from https://stackoverflow.com/questions/11660605/how-to-overwrite-a-folder-if-it-already-exists-when-creating-it-with-makedirs
    # start from an empty output folder for this line / direction
    model_dir = f'/Users/rebeccadillon/git/dublin-bus-team-5/data/modelling/randomforest/picklefiles/line_{line}_model/dir2'
    if os.path.exists(model_dir):
        shutil.rmtree(model_dir)
    os.makedirs(model_dir)

    filename = f'{model_dir}/line_{line}_rfr.sav'
    # the context manager closes the file handle even if pickling fails
    with open(filename, 'wb') as model_file:
        pickle.dump(rfr, model_file)

Line 1


In [141]:
# Evaluate every direction 2 random forest on its train and test split and write the
# metrics to a text report next to the pickled model.
for line in lines_dir_2:

    # training data
    X_train = X_train_dict_2[line]
    y_train = y_train_dict_2[line]

    # test data
    X_test = X_test_dict_2[line]
    y_test = y_test_dict_2[line]

    rfr = rf_model_dict_2[line]

    print('Now modelling for line', str(line))

    rfr_predictions_train = list(rfr.predict(X_train))
    rfr_predictions_test = list(rfr.predict(X_test))

    # train metrics
    train_mae = metrics.mean_absolute_error(y_train, rfr_predictions_train)
    train_mape = metrics.mean_absolute_percentage_error(y_train, rfr_predictions_train)
    train_mse = metrics.mean_squared_error(y_train, rfr_predictions_train)
    train_rmse = train_mse ** 0.5
    train_r2 = metrics.r2_score(y_train, rfr_predictions_train)

    # test metrics - the test predictions were previously computed but never reported,
    # so the file only ever contained scores on the data the forest was fitted on
    test_mae = metrics.mean_absolute_error(y_test, rfr_predictions_test)
    test_mape = metrics.mean_absolute_percentage_error(y_test, rfr_predictions_test)
    test_mse = metrics.mean_squared_error(y_test, rfr_predictions_test)
    test_rmse = test_mse ** 0.5
    test_r2 = metrics.r2_score(y_test, rfr_predictions_test)

    with open(f'/Users/rebeccadillon/git/dublin-bus-team-5/data/modelling/randomforest/picklefiles/line_{line}_model/dir2/line_{line}_rfr_metrics.csv', 'w') as file:
        file.write(f'\nTrain metrics for line {line}:'
                   f'\nMAE: {train_mae}'
                   f'\nMAPE: {train_mape}'
                   f'\nMSE: {train_mse}'
                   f'\nRMSE: {train_rmse}'
                   f'\nR2: {train_r2}'
                   f'\nTest metrics for line {line}:'
                   f'\nMAE: {test_mae}'
                   f'\nMAPE: {test_mape}'
                   f'\nMSE: {test_mse}'
                   f'\nRMSE: {test_rmse}'
                   f'\nR2: {test_r2}')

Now modelling for line 1


In [142]:
# Feature importances of whichever forest `rfr` currently refers to (the last one bound
# in a loop), paired with the columns of the current X_train, highest first.
importance = pd.DataFrame(
    list(zip(X_train.columns, rfr.feature_importances_)),
    columns=['feature', 'importance'],
)
importance.sort_values(by='importance', ascending=False)

Unnamed: 0,feature,importance
2,HOUR,0.578939
3,WEEKDAY,0.134704
1,wind_speed,0.103197
0,humidity,0.096059
4,MONTH,0.087101


model currently at 0.82
* drop heavy_precip (0.02)

model currently at 0.828
* drop weather_id (0.046414)

model currently at 0.827