# Prediction Model with Random Forest

In [1]:
# Library Imports.
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import math
# Allows plots to appear directly in the notebook.
%matplotlib inline
# filter warnings, we may want them but for now I removed them
import warnings
warnings.filterwarnings('ignore')

# Import the model we are using
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import TimeSeriesSplit
from sklearn import metrics

### Converting Datatypes and Preparing Features

In [2]:
# Load the station availability + weather feature table from CSV.
# This is the same frame as df_availability_and_weather_all_features in the V3 linear regression notebook.
df = pd.read_csv("stations_dataframe_all_features_index.csv")
# Show the first five rows as a quick sanity check.
df.head()

Unnamed: 0,stationId,totalStands,availableBikes,freeStands,lastUpdate,temperature,cloudiness,windSpeed,rain,snow,...,mainWeather_Drizzle,mainWeather_Fog,mainWeather_Mist,mainWeather_Rain,weekday_Monday,weekday_Saturday,weekday_Sunday,weekday_Thursday,weekday_Tuesday,weekday_Wednesday
0,2,20,10,10,2021-02-27 17:35:14,9.34,75.0,2.57,0.0,0.0,...,0,0,0,0,0,1,0,0,0,0
1,2,20,10,10,2021-02-27 17:45:23,9.2,75.0,1.03,0.0,0.0,...,0,0,0,0,0,1,0,0,0,0
2,2,20,10,10,2021-02-27 17:55:32,9.48,75.0,1.03,0.0,0.0,...,0,0,0,0,0,1,0,0,0,0
3,2,20,11,9,2021-02-27 18:00:42,9.22,75.0,1.03,0.0,0.0,...,0,0,0,0,0,1,0,0,0,0
4,2,20,11,9,2021-02-27 18:10:51,8.85,75.0,0.51,0.0,0.0,...,0,0,0,0,0,1,0,0,0,0


In [3]:
# Summarise the columns, non-null counts and dtypes of the frame as loaded.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 488289 entries, 0 to 488288
Data columns (total 24 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   stationId            488289 non-null  int64  
 1   totalStands          488289 non-null  int64  
 2   availableBikes       488289 non-null  int64  
 3   freeStands           488289 non-null  int64  
 4   lastUpdate           488289 non-null  object 
 5   temperature          488289 non-null  float64
 6   cloudiness           488289 non-null  float64
 7   windSpeed            488289 non-null  float64
 8   rain                 488289 non-null  float64
 9   snow                 488289 non-null  float64
 10  fullHour             488289 non-null  int64  
 11  hour_sin             488289 non-null  float64
 12  hour_cos             488289 non-null  float64
 13  mainWeather_Clouds   488289 non-null  int64  
 14  mainWeather_Drizzle  488289 non-null  int64  
 15  mainWeather_Fog  

In [4]:
# Convert datatypes in a single step:
#   stationId  -> object
#   lastUpdate -> datetime
df = df.assign(
    stationId=df['stationId'].astype('object'),
    lastUpdate=pd.to_datetime(df['lastUpdate']),
)

In [6]:
# Collect every distinct station id, in order of first appearance, as a plain list
station_id_list = df['stationId'].drop_duplicates().tolist()
print(station_id_list)

[2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 15, 16, 17, 18, 19, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 61, 62, 63, 64, 65, 66, 67, 68, 69, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117]


In [8]:
# Features used to train the model: every column except the target (availableBikes),
# the timestamp, the station id and the two stand-count columns.
all_features = df.drop(
    columns=['availableBikes', 'lastUpdate', 'stationId', 'totalStands', 'freeStands']
).columns.tolist()
all_features

['temperature',
 'cloudiness',
 'windSpeed',
 'rain',
 'snow',
 'fullHour',
 'hour_sin',
 'hour_cos',
 'mainWeather_Clouds',
 'mainWeather_Drizzle',
 'mainWeather_Fog',
 'mainWeather_Mist',
 'mainWeather_Rain',
 'weekday_Monday',
 'weekday_Saturday',
 'weekday_Sunday',
 'weekday_Thursday',
 'weekday_Tuesday',
 'weekday_Wednesday']

In [9]:
# Re-check the dtypes now that stationId and lastUpdate have been converted.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 488289 entries, 0 to 488288
Data columns (total 24 columns):
 #   Column               Non-Null Count   Dtype         
---  ------               --------------   -----         
 0   stationId            488289 non-null  object        
 1   totalStands          488289 non-null  int64         
 2   availableBikes       488289 non-null  int64         
 3   freeStands           488289 non-null  int64         
 4   lastUpdate           488289 non-null  datetime64[ns]
 5   temperature          488289 non-null  float64       
 6   cloudiness           488289 non-null  float64       
 7   windSpeed            488289 non-null  float64       
 8   rain                 488289 non-null  float64       
 9   snow                 488289 non-null  float64       
 10  fullHour             488289 non-null  int64         
 11  hour_sin             488289 non-null  float64       
 12  hour_cos             488289 non-null  float64       
 13  mainWeather_Cl

### Training and Testing Sets

In [10]:
def filter_station_data(df, station_id):
    """Return the rows of ``df`` whose ``stationId`` column equals ``station_id``.

    The original index labels of the selected rows are kept.
    """
    belongs_to_station = df['stationId'] == station_id
    return df.loc[belongs_to_station]

In [11]:
def order_df_by_lastUpdate(station_df):
    """Return ``station_df`` sorted by ``lastUpdate`` with a fresh 0..n-1 index.

    Our data should already be ordered by default; sorting here just makes
    sure everything is sequential by ``lastUpdate``. The index is reset
    (old labels dropped) because sorting leaves the original labels in place.
    """
    return station_df.sort_values(['lastUpdate']).reset_index(drop=True)

In [12]:
def get_station_features_and_target(station_dataframe_ordered, features):
    """Separate the descriptive features from the target feature.

    Returns a tuple ``(X, y)`` where ``X`` is ``station_dataframe_ordered``
    restricted to the ``features`` columns and ``y`` is its
    ``availableBikes`` column.
    """
    return station_dataframe_ordered[features], station_dataframe_ordered.availableBikes

**This function has changed compared with the linear regression notebook: it now fits a `RandomForestRegressor`.**

In [15]:
def apply_model_to_training_data(X_training_data, y_training_data, n_estimators=100, random_state=42):
    """Fit a random forest regressor to the training data and return it.

    Parameters
    ----------
    X_training_data : descriptive features of the training rows.
    y_training_data : target values of the training rows.
    n_estimators : number of decision trees in the forest (default 100).
    random_state : seed for the forest's randomness. A fixed default keeps
        the learned models, and therefore the metrics computed from them,
        reproducible between notebook runs. Pass ``None`` to get a
        differently seeded forest on every call.

    Returns
    -------
    The fitted ``RandomForestRegressor``.
    """
    model = RandomForestRegressor(n_estimators=n_estimators, random_state=random_state)
    # fit() returns the estimator itself, so the fitted model is returned directly
    return model.fit(X_training_data, y_training_data)

***The output of this function does not include coefficients and intercepts because this is not a linear model***

In [16]:
def split_time_series_data_and_train_model_for_each_split(X, y, number_splits, number_test_size, number_gap):
    """Split the station data ``number_splits`` times and train one model per split.

    The splitting is done with ``TimeSeriesSplit``; ``number_test_size`` and
    ``number_gap`` are passed through as its ``test_size`` and ``gap``, so
    different test sizes, split counts and train/test gaps can be tried.
    ``test_size`` defaults to ``n_samples // (n_splits + 1)``, so to keep at
    least 20% of the data for testing use at most 4 splits with
    ``test_size=None``.

    Returns a dict keyed ``'split_1'`` .. ``'split_<number_splits>'``; each
    value is a dict with ``'X_train'``, ``'X_test'``, ``'y_train'``,
    ``'y_test'`` and the ``'Model'`` fitted on that split's training data.
    """
    # one (initially empty) entry per split
    splits = {f'split_{position}': {} for position in range(1, number_splits + 1)}

    tscv = TimeSeriesSplit(n_splits=number_splits, test_size=number_test_size, gap=number_gap)

    for position, (train_index, test_index) in enumerate(tscv.split(X), start=1):
        # positional selection of the training and test rows for features and target
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        splits[f'split_{position}'].update({
            'X_train': X_train,
            'X_test': X_test,
            'y_train': y_train,
            'y_test': y_test,
            # model fitted on this split's training data only
            'Model': apply_model_to_training_data(X_train, y_train),
        })

    return splits

In [17]:
def train_model_per_station(df, station_id_list, features, number_splits, number_test_size, number_gap):
    """Train one model per time-series split for every station in ``station_id_list``.

    For each station the rows are filtered out of ``df``, ordered by
    ``lastUpdate``, separated into descriptive features and target, and then
    split and fitted via
    ``split_time_series_data_and_train_model_for_each_split``.

    Returns a dict keyed by station id. Each value is a dict whose first key,
    ``'Features'``, holds ``features``, followed by one entry per split
    containing ``'X_train'``, ``'X_test'``, ``'y_train'``, ``'y_test'`` and
    ``'Model'``.
    """
    per_split_fields = ('X_train', 'X_test', 'y_train', 'y_test', 'Model')

    stations_splits_and_models = {}
    for station_id in station_id_list:
        # 'Features' is inserted first; the split entries follow it
        station_entry = {'Features': features}
        stations_splits_and_models[station_id] = station_entry

        # rows of this station only, sequential in time
        station_rows = order_df_by_lastUpdate(filter_station_data(df, station_id))

        # descriptive features and target feature
        X, y = get_station_features_and_target(station_rows, features)

        # split with TimeSeriesSplit and fit a model on each split
        splits = split_time_series_data_and_train_model_for_each_split(
            X, y, number_splits, number_test_size, number_gap
        )

        # copy each split's data and model into this station's entry
        for split_name, split_content in splits.items():
            station_entry[split_name] = {field: split_content[field] for field in per_split_fields}

    return stations_splits_and_models

*Get the trained models for each station*

In [18]:
# Train the per-station models: 4 time-series splits, default test size, no gap between train and test.
stations_splits_and_models = train_model_per_station(
    df,
    station_id_list,
    all_features,
    number_splits=4,
    number_test_size=None,
    number_gap=0,
)

In [19]:
# stations_splits_and_models holds the features, the splits and the model learned
# for each split, for every station. Only the first n entries are printed here.
n = 1
first_n_pairs = list(stations_splits_and_models.items())[:n]
for station_key, station_content in first_n_pairs:
    print(station_key, ':', station_content)

2 : {'Features': ['temperature', 'cloudiness', 'windSpeed', 'rain', 'snow', 'fullHour', 'hour_sin', 'hour_cos', 'mainWeather_Clouds', 'mainWeather_Drizzle', 'mainWeather_Fog', 'mainWeather_Mist', 'mainWeather_Rain', 'weekday_Monday', 'weekday_Saturday', 'weekday_Sunday', 'weekday_Thursday', 'weekday_Tuesday', 'weekday_Wednesday'], 'split_1': {'X_train':      temperature  cloudiness  windSpeed  rain  snow  fullHour  hour_sin  \
0           9.34        75.0       2.57  0.00   0.0        17 -0.997669   
1           9.20        75.0       1.03  0.00   0.0        17 -0.997669   
2           9.48        75.0       1.03  0.00   0.0        17 -0.997669   
3           9.22        75.0       1.03  0.00   0.0        18 -0.979084   
4           8.85        75.0       0.51  0.00   0.0        18 -0.979084   
..           ...         ...        ...   ...   ...       ...       ...   
872         5.75        75.0       3.60  0.00   0.0         0  0.000000   
873         5.74        75.0       3.60  0.2

*Get the predictions (for each split) and metrics (for each split and the average over all splits) for every station*

In [20]:
def get_predictions(model_for_station, data):
    """Predict the target feature for ``data`` using ``model_for_station``.

    ``data`` is converted to a 2D NumPy array with ``to_numpy()`` before it
    is handed to the model's ``predict`` method; the result of ``predict``
    is returned unchanged.
    """
    return model_for_station.predict(data.to_numpy())

In [21]:
def get_predictions_and_metrics_for_each_split_all_stations(stations_splits_and_models_dict, X_train_or_test_data, y_train_or_test_data):
    """Predict availableBikes and compute metrics for each split of each station.

    Parameters
    ----------
    stations_splits_and_models_dict : dict keyed by station id. Each value
        holds a ``'Features'`` entry plus one entry per split containing the
        split data and the ``'Model'`` learned on it.
    X_train_or_test_data, y_train_or_test_data : which part of every split to
        predict on. Either ``'X_train'`` with ``'y_train'`` (training data)
        or ``'X_test'`` with ``'y_test'`` (test data).

    Returns
    -------
    dict keyed by station id. For every split it holds the selected X and y
    data, the ``'availability_predictions'`` and a ``'Metrics'`` dict
    (MAE, MSE, RMSE, R2); per station it also holds the ``'AVG Metrics'``
    added by ``add_avg_metrics_over_splits_to_station_dict``.

    Raises
    ------
    ValueError
        If the two selectors are not one of the supported pairs.
    """
    # Reject mixed or unknown selectors up front instead of failing later on unbound names.
    supported_pairs = {('X_train', 'y_train'), ('X_test', 'y_test')}
    if (X_train_or_test_data, y_train_or_test_data) not in supported_pairs:
        raise ValueError(
            "Expected ('X_train', 'y_train') or ('X_test', 'y_test'), got "
            f"({X_train_or_test_data!r}, {y_train_or_test_data!r})"
        )
    X_data, y_data = X_train_or_test_data, y_train_or_test_data

    station_predictions = {}
    for station_id, station_entry in stations_splits_and_models_dict.items():
        station_predictions[station_id] = {}

        for split_key, split_entry in station_entry.items():
            # 'Features' is metadata, not a split; skip it by name rather than by position
            if split_key == 'Features':
                continue

            X_split = split_entry[X_data]
            y_split = split_entry[y_data]

            # predict with the model that was learned on this very split
            predictions_on_X_split = get_predictions(split_entry['Model'], X_split)

            # metrics of the predictions against the actual values
            mae, mse, rmse, r2 = calculate_metrics_of_prediction(y_split, predictions_on_X_split)

            station_predictions[station_id][split_key] = {
                X_data: X_split,
                y_data: y_split,
                'availability_predictions': predictions_on_X_split,
                'Metrics': {'MAE': mae, 'MSE': mse, 'RMSE': rmse, 'R2': r2},
            }

    return add_avg_metrics_over_splits_to_station_dict(station_predictions)

In [22]:
def calculate_metrics_of_prediction(y_data_actual_values, availability_predictions):
    """Compute error metrics of the predictions against the actual values.

    Returns a tuple ``(mae, mse, rmse, r2)``: mean absolute error, mean
    squared error, root mean squared error and R2 score.
    """
    mae = metrics.mean_absolute_error(y_data_actual_values, availability_predictions)
    mse = metrics.mean_squared_error(y_data_actual_values, availability_predictions)
    # RMSE is the square root of the MSE computed above; no need to compute the MSE twice
    rmse = mse ** 0.5
    r2 = metrics.r2_score(y_data_actual_values, availability_predictions)
    return mae, mse, rmse, r2

In [23]:
def get_average_metric_value_over_splits(values_list):
    """Return the arithmetic mean of ``values_list``.

    An empty list has no mean, so NaN is returned in that case instead of
    raising ``ZeroDivisionError``.
    """
    if len(values_list) == 0:
        return float('nan')
    return sum(values_list) / len(values_list)

In [24]:
def add_avg_metrics_over_splits_to_station_dict(predictions_dict):
    """Add the per-station average of each metric over all splits.

    For every station in ``predictions_dict`` the MAE, MSE, RMSE and R2
    values found in each split's ``'Metrics'`` dict are gathered and
    averaged with ``get_average_metric_value_over_splits``. The averages are
    stored under the station's ``'AVG Metrics'`` key.

    ``predictions_dict`` is modified in place and also returned.
    """
    for station_key in predictions_dict.keys():
        station_entry = predictions_dict[station_key]

        # one list of per-split values for each metric that gets averaged
        gathered = {'MAE': [], 'MSE': [], 'RMSE': [], 'R2': []}

        for split_key in station_entry.keys():
            split_metrics_dict = station_entry[split_key]['Metrics']
            for metric_key, metric_value in split_metrics_dict.items():
                if metric_key in gathered:
                    gathered[metric_key].append(metric_value)

        station_entry['AVG Metrics'] = {
            'AVG MAE': get_average_metric_value_over_splits(gathered['MAE']),
            'AVG MSE': get_average_metric_value_over_splits(gathered['MSE']),
            'AVG RMSE': get_average_metric_value_over_splits(gathered['RMSE']),
            'AVG R2': get_average_metric_value_over_splits(gathered['R2']),
        }
    return predictions_dict

In [25]:
# get predicted availability values and metrics on training data for all stations for each data split
predictions_training_data = get_predictions_and_metrics_for_each_split_all_stations(stations_splits_and_models, 'X_train', 'y_train')
# this dict has the predictions for each station for each train data split, the metrics per split
    # and the avg across all splits
# predictions_training_data

In [26]:
# Print only the first n stations of the training-data predictions.
n = 1
first_n_pairs = list(predictions_training_data.items())[:n]
for station_key, station_content in first_n_pairs:
    print(station_key, ':', station_content)

2 : {'split_1': {'X_train':      temperature  cloudiness  windSpeed  rain  snow  fullHour  hour_sin  \
0           9.34        75.0       2.57  0.00   0.0        17 -0.997669   
1           9.20        75.0       1.03  0.00   0.0        17 -0.997669   
2           9.48        75.0       1.03  0.00   0.0        17 -0.997669   
3           9.22        75.0       1.03  0.00   0.0        18 -0.979084   
4           8.85        75.0       0.51  0.00   0.0        18 -0.979084   
..           ...         ...        ...   ...   ...       ...       ...   
872         5.75        75.0       3.60  0.00   0.0         0  0.000000   
873         5.74        75.0       3.60  0.26   0.0         1  0.269797   
874         5.72        75.0       3.60  0.11   0.0         1  0.269797   
875         5.82        75.0       3.09  1.33   0.0         1  0.269797   
876         5.68        75.0       3.09  1.13   0.0         1  0.269797   

     hour_cos  mainWeather_Clouds  mainWeather_Drizzle  mainWeather_Fog

In [27]:
# get predicted availability values and metrics on test data for all stations for each data split
predictions_test_data = get_predictions_and_metrics_for_each_split_all_stations(stations_splits_and_models, 'X_test', 'y_test')
# this dict has the predictions for each station for each test data split, the metrics per split
    # and the avg across all splits
# predictions_test_data

In [28]:
# Print only the first n stations of the test-data predictions.
n = 1
first_n_pairs = list(predictions_test_data.items())[:n]
for station_key, station_content in first_n_pairs:
    print(station_key, ':', station_content)

2 : {'split_1': {'X_test':       temperature  cloudiness  windSpeed  rain  snow  fullHour  hour_sin  \
877          5.68        75.0       3.09  0.65   0.0         1  0.269797   
878          5.68        75.0       2.57  0.24   0.0         1  0.269797   
879          5.68        75.0       2.57  0.00   0.0         2  0.519584   
880          5.68        75.0       2.57  0.00   0.0         2  0.519584   
881          5.77        75.0       3.09  0.00   0.0         2  0.519584   
...           ...         ...        ...   ...   ...       ...       ...   
1747         5.05        75.0      10.29  0.42   0.0        21 -0.519584   
1748         5.05        75.0      10.29  0.49   0.0        21 -0.519584   
1749         4.98        75.0       7.72  1.00   0.0        21 -0.519584   
1750         4.98        75.0       7.72  2.00   0.0        21 -0.519584   
1751         5.27        75.0      10.29  0.20   0.0        21 -0.519584   

      hour_cos  mainWeather_Clouds  mainWeather_Drizzle  mai