In [49]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn import metrics

import joblib 
import os
import shutil

In [72]:
def get_split(df_line):

    # shuffle data frame code from 'sample_solution_COMP47350_Task2_PredictiveModeling_Evaluation_CreditRiskPrediction.ipynb'
    # randomly generate sequence based on dataframe index and set to be new index
    df_line.set_index(np.random.permutation(df_line.index))
    # sort the resulting random index
    df_line.sort_index(inplace=True)

    # drop unneeded columns
    df_line.drop(columns=['LINEID'], inplace=True)

    X = df_line.drop(columns=["TRIPTIME"], axis=1)
    y = pd.DataFrame(df_line['TRIPTIME'])

    # do test train split
    # Split the dataset into two datasets, 70% training and 30% test
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=1)

    return X_train, y_train, X_test, y_train


In [74]:
def train_test_dicts(df_dir):
    # create directory with all the lines of a direction
    lines_dir = sorted(list(df_dir['LINEID'].unique()))
    
    # Make dictionaries with keys for each line with each train/test split value as its values.
    X_train_dict = {}
    y_train_dict = {}
    X_test_dict = {}
    y_test_dict = {}

    for line in lines_dir:

        # dataframe containing only rows with current lineid
        df_line = df_dir[df_dir['LINEID'] == line]

        # check for df with low values (where accurate predictions will not be possible as any one value will have too much influence on the overall model outcome)
        if df_line.shape[0] < 3:
            print('Not enough values for line', str(line))
            lines_dir.remove(line)

        else:
            split = get_split(df_line)

            X_train_dict[line] = split[0]
            y_train_dict[line] = split[1]
            X_test_dict[line] = split[2]
            y_test_dict[line] = split[3]

    return lines_dir, X_train_dict, y_train_dict, X_test_dict, y_test_dict


In [75]:
def get_rfr_model_dict(direction, lines_dir, X_train_dict, y_train_dict, X_test_dict, y_test_dict):
    
    rf_model_dict = {}

    # figure out how to deal with no assigned value for x_train, y_train
    X_train = X_train_dict['1']
    y_train = y_train_dict['1']
    X_test = X_test_dict['1']
    y_test = y_test_dict['1']
    
    for line in lines_dir:
        print(f'Line {line}')

        X_train = X_train_dict[line]
        y_train = y_train_dict[line]
        X_test = X_test_dict[line]
        y_test = y_test_dict[line]
    
        rfr = RandomForestRegressor(oob_score=True, random_state=1, max_depth=20, n_estimators=20)
        result = rfr.fit(X_train, y_train)
        rf_model_dict[line] = result

        # code from https://stackoverflow.com/questions/11660605/how-to-overwrite-a-folder-if-it-already-exists-when-creating-it-with-makedirs

        dir = f'/Users/rebeccadillon/git/dublin-bus-team-5/machinelearning/data/modelling/randomforest/joblibfiles/line_{line}_model/dir{direction}'
        if os.path.exists(dir):
            shutil.rmtree(dir)
        os.makedirs(dir)

        # save the model to a joblib file
        filename = f'/Users/rebeccadillon/git/dublin-bus-team-5/machinelearning/data/modelling/randomforest/joblibfiles/line_{line}_model/dir{direction}/line_{line}_rfr.joblib'
        joblib.dump(result, open(filename, 'wb'))

    return rf_model_dict

In [76]:
def get_metric_lists(direction, lines_dir, rf_model_dict, X_train_dict, y_train_dict, X_test_dict, y_test_dict):
    # sum for averages
    train_mae_sum = 0
    train_mape_sum = 0
    train_mse_sum = 0
    train_r2_sum = 0

    test_mae_sum = 0
    test_mape_sum = 0
    test_mse_sum = 0
    test_r2_sum = 0

    train_metric_list = []
    test_metric_list = []

    for line in lines_dir:  

        # training data
        X_train = X_train_dict[line]
        y_train = y_train_dict[line]    
        # test data
        X_test = X_test_dict[line]
        y_test = y_test_dict[line]
        rfr = rf_model_dict[line]
                
        print('Now modelling for line', str(line))
        
        rfr_predictions_train = list(rfr.predict(X_train))
        rfr_predictions_test = list(rfr.predict(X_test))

        # choice of metrics from https://medium.com/analytics-vidhya/evaluating-a-random-forest-model-9d165595ad56 and
        # https://towardsdatascience.com/random-forest-regression-5f605132d19d
        train_mae = metrics.mean_absolute_error(y_train, rfr_predictions_train) 
        train_mape = metrics.mean_absolute_percentage_error(y_train, rfr_predictions_train) # should be as close to 0 as possible (percentage error)
        train_mse = metrics.mean_squared_error(y_train, rfr_predictions_train) # as close to 0 as possible
        train_r2 = metrics.r2_score(y_train, rfr_predictions_train) #close to 1
        
        test_mae = metrics.mean_absolute_error(y_test, rfr_predictions_test) 
        test_mape = metrics.mean_absolute_percentage_error(y_test, rfr_predictions_test) 
        test_mse = metrics.mean_squared_error(y_test, rfr_predictions_test)
        test_r2 = metrics.r2_score(y_test, rfr_predictions_test)


        with open(f'/Users/rebeccadillon/git/dublin-bus-team-5/machinelearning/data/modelling/randomforest/joblibfiles/line_{line}_model/dir{direction}/line_{line}_rfr_metrics.csv', 'w') as file:
            file.write(f'\nTrain metrics for line {line}:'
                    f'\nMAE: {train_mae}'+\
                    f'\nMAPE: {train_mape}'+\
                    f'\nMSE: {train_mse}'+\
                    f'\nR2: {train_r2}' +\
                    f'\nTest metrics for line {line}:'
                    f'\nMAE: {test_mae}'+\
                    f'\nMAPE: {test_mape}'+\
                    f'\nMSE: {test_mse}'+\
                    f'\nR2: {test_r2}')

        # sum for averages
        train_mae_sum +=train_mae
        train_mape_sum +=train_mape
        train_mse_sum +=train_mse
        train_r2_sum +=train_r2

        test_mae_sum += test_mae
        test_mape_sum += test_mape
        test_mse_sum += test_mse
        test_r2_sum += test_r2

    train_metric_list.append([train_mae_sum, train_mape_sum, train_mse_sum, train_r2_sum])
    test_metric_list.append([test_mae_sum, test_mape_sum, test_mse_sum, test_r2_sum])

    return train_metric_list, test_metric_list

In [77]:
def print_metrics(lines_dir, train_metric_list, test_metric_list):
    print("============Train metrics=============")
    print("Mean MAE:", str(train_metric_list[0]/len(lines_dir)))
    print("Mean MAPE:", str(train_metric_list[1]/len(lines_dir)))
    print("Mean MSE:", str(train_metric_list[2]/len(lines_dir)))
    print("Mean R2:", str(train_metric_list[3]/len(lines_dir)))

    print("============Test metrics=============")
    print("Mean MAE:", str(test_metric_list[0]/len(lines_dir)))
    print("Mean MAPE:", str(test_metric_list[1]/len(lines_dir)))
    print("Mean MSE:", str(test_metric_list[2]/len(lines_dir)))
    print("Mean R2:", str(test_metric_list[3]/len(lines_dir)))


In [78]:
df_dir_1 = pd.read_csv(
    '/Users/rebeccadillon/git/dublin-bus-team-5/machinelearning/data/modelling/modelling_ready_dir1.csv')

In [79]:
df_dir_1 = df_dir_1.drop(columns=['heavy_precip', 'weather_id', 'HOUR'])


**train_test_to_dict(df_dir_1)**    
* return lines_dir, X_train_dict, y_train_dict, X_test_dict, y_test_dict


In [80]:
return_func_1 = train_test_dicts(df_dir_1)

Not enough values for line 41D


In [81]:
lines_dir = return_func_1[0]
X_train_dict = return_func_1[1]
y_train_dict = return_func_1[2]
X_test_dict = return_func_1[3]
y_test_dict = return_func_1[4]

def rfr_model_creator(direction, lines_dir, X_train_dict, y_train_dict, X_test_dict, y_test_dict):
    return rf_model_dict

In [82]:
X_train_dict

{'1':         humidity  wind_speed  PLANNEDTIME_DEP  WEEKDAY  MONTH
 298545        63         4.1          62400.0        1     10
 105874        93         6.7          44400.0        1      4
 168956        73         3.1          60000.0        1      6
 151230        63         7.2          73800.0        5      5
 302971        82         3.1          66600.0        5     10
 ...          ...         ...              ...      ...    ...
 56371         60        10.8          46800.0        4      2
 318040        76         6.7          62400.0        6     11
 246687        59         6.7          47400.0        6      9
 14435         93         3.6          50400.0        3      1
 316121        93         2.1          76800.0        3     11
 
 [4144 rows x 5 columns],
 '102':         humidity  wind_speed  PLANNEDTIME_DEP  WEEKDAY  MONTH
 40575         86         8.2          33600.0        0      2
 294813        87         2.1          28200.0        5     10
 322117        

In [83]:
return_func_2 = get_rfr_model_dict(1,lines_dir, X_train_dict, y_train_dict, X_test_dict, y_test_dict)

Line 1
        humidity  wind_speed  PLANNEDTIME_DEP  WEEKDAY  MONTH
298545        63         4.1          62400.0        1     10
105874        93         6.7          44400.0        1      4
168956        73         3.1          60000.0        1      6
151230        63         7.2          73800.0        5      5
302971        82         3.1          66600.0        5     10
...          ...         ...              ...      ...    ...
56371         60        10.8          46800.0        4      2
318040        76         6.7          62400.0        6     11
246687        59         6.7          47400.0        6      9
14435         93         3.6          50400.0        3      1
316121        93         2.1          76800.0        3     11

[4144 rows x 5 columns]
Line 102
        humidity  wind_speed  PLANNEDTIME_DEP  WEEKDAY  MONTH
298545        63         4.1          62400.0        1     10
105874        93         6.7          44400.0        1      4
168956        73         3.1 

In [84]:
rf_model_dict = return_func_2

def metric_writer(direction, lines_dir, rf_model_dict, X_train_dict, y_train_dict, X_test_dict, y_test_dict):
    return train_metric_list, test_metric_list

In [85]:
metric_list = get_metric_lists(1, lines_dir, rf_model_dict, X_train_dict, y_train_dict, X_test_dict, y_test_dict)

Now modelling for line 1


ValueError: Found input variables with inconsistent numbers of samples: [4144, 1777]

In [67]:
metric_list

([[14246.313677435515,
   5.218479330016565,
   3663849.9744228986,
   116.5086499510046]],
 [[32252.297503796086,
   12.089609267486242,
   16781873.613603,
   93.41447542545883]])

In [68]:
train_metric_list = metric_list[0][0]
test_metric_list = metric_list[1][0]

In [70]:
test_metric_list

[32252.297503796086, 12.089609267486242, 16781873.613603, 93.41447542545883]

In [71]:
print_metrics(lines_dir, train_metric_list, test_metric_list)

Mean MAE: 115.82368843443508
Mean MAPE: 0.04242666121964687
Mean MSE: 29787.398166039828
Mean R2: 0.9472247963496309
Mean MAE: 262.2138008438706
Mean MAPE: 0.09828950623972554
Mean MSE: 136437.99685856097
Mean R2: 0.7594672798817791
