# Functions to train and test models

In [1]:
import math
from sklearn.model_selection import TimeSeriesSplit
from sklearn import metrics

*Compute the mean number of available bikes per station for each weekday and hour; this serves as a baseline to later check whether the model improves on it*

In [29]:
def get_available_bikes_mean_per_day_and_hour(station_id, full_hour_list, weekdays_list, df=None):
    """Compute the baseline mean bike availability of one station per weekday and hour.

    The mean is taken over every recorded observation of ``availableBikes``
    (missing values are ignored), not over the set of distinct values.

    Args:
        station_id: Value matched against the ``stationId`` column.
        full_hour_list: Hours (values of the ``fullHour`` column) to evaluate.
        weekdays_list: Weekdays (values of the ``weekday`` column) to evaluate.
        df: Optional dataframe holding the ``stationId``, ``weekday``,
            ``fullHour`` and ``availableBikes`` columns. When omitted, the
            notebook-level ``df_availability_and_weather`` is used.

    Returns:
        A dict ``{station_id: {weekday: [{hour: mean_availability}, ...]}}``.
        An hour without any observation is reported on stdout and left out of
        that weekday's list; the remaining hours are still processed.
    """
    if df is None:
        df = df_availability_and_weather

    # Filter the station once instead of re-scanning the full frame per hour.
    station_rows = df.loc[df['stationId'] == station_id]

    available_bikes_mean_per_day_and_hour = {}
    for weekday in weekdays_list:
        weekday_rows = station_rows.loc[station_rows['weekday'] == weekday]
        hourly_means = []
        for hour in full_hour_list:
            observations = weekday_rows.loc[weekday_rows['fullHour'] == hour, 'availableBikes'].dropna()

            # No data for this hour: report it and move on to the next hour
            # rather than abandoning the rest of the weekday.
            if observations.empty:
                print("Division by Zero Error:", station_id, weekday, hour)
                continue

            # one {hour: mean_availability} dict per hour, in the order of full_hour_list
            hourly_means.append({hour: float(observations.mean())})

        available_bikes_mean_per_day_and_hour[weekday] = hourly_means

    return {station_id: available_bikes_mean_per_day_and_hour}

In [30]:
def get_station_mean_availability_per_day_and_hour(station_id_list, full_hour_list, weekdays_list):
    """Collect the per-weekday, per-hour mean availability of several stations.

    Returns a list with one entry per station id, in the order of
    ``station_id_list``; each entry is the result of
    ``get_available_bikes_mean_per_day_and_hour`` for that station.
    """
    return [
        get_available_bikes_mean_per_day_and_hour(current_station, full_hour_list, weekdays_list)
        for current_station in station_id_list
    ]

### Training a model for each station for each split

*We are building one model per station, so each station's rows need to be filtered out of the original CSV, which contains data for all stations*

In [48]:
def filter_station_data(df, station_id):
    """Return the rows of ``df`` whose ``stationId`` column equals ``station_id``.

    The result is a dataframe that keeps the original index labels of the
    selected rows.
    """
    belongs_to_station = df['stationId'] == station_id
    return df.loc[belongs_to_station]

In [49]:
def order_df_by_lastUpdate(station_df):
    """Return a copy of ``station_df`` sorted by ``lastUpdate`` with a fresh index.

    The data is expected to be ordered already; sorting here is a safeguard.
    The index is rebuilt (0..n-1) because sorting keeps the old row labels.
    """
    return station_df.sort_values(['lastUpdate']).reset_index(drop=True)

In [1]:
def get_station_features_and_target(station_dataframe_ordered, features):
    """Split a station dataframe into descriptive features and target feature.

    Returns a tuple ``(X, y)``: ``X`` holds the columns listed in
    ``features`` and ``y`` is the ``availableBikes`` column.
    """
    descriptive_features = station_dataframe_ordered[features]
    target_feature = station_dataframe_ordered['availableBikes']
    return descriptive_features, target_feature

In [52]:
def split_time_series_data_and_train_model_for_each_split(X, y, number_splits, number_test_size, number_gap):
    """Split station data chronologically and train one model per split.

    The data is split ``number_splits`` times with ``TimeSeriesSplit``.
    ``number_test_size`` and ``number_gap`` are forwarded as ``test_size`` and
    ``gap`` (the gap separates the end of the training data from the start of
    the test data). With ``test_size=None`` the splitter defaults to
    ``n_samples // (n_splits + 1)``, so keeping at least 20% of the data for
    testing means using at most 4 splits.

    Returns a dict keyed ``'split_1'``, ``'split_2'``, ...; each value holds
    ``'X_train'``, ``'X_test'``, ``'y_train'``, ``'y_test'`` and the fitted
    ``'Model'`` of that split.
    """
    # one (initially empty) entry per expected split
    splits = {f'split_{i+1}': {} for i in range(0, number_splits)}

    time_series_splitter = TimeSeriesSplit(
        n_splits=number_splits,
        test_size=number_test_size,
        gap=number_gap,
    )

    for split_number, (train_rows, test_rows) in enumerate(time_series_splitter.split(X), start=1):
        current_split = splits[f'split_{split_number}']

        # positional selection of the training / test rows for features and target
        current_split['X_train'] = X.iloc[train_rows]
        current_split['X_test'] = X.iloc[test_rows]
        current_split['y_train'] = y.iloc[train_rows]
        current_split['y_test'] = y.iloc[test_rows]

        # the model of a split is learned from that split's training data only
        current_split['Model'] = apply_model_to_training_data(
            current_split['X_train'], current_split['y_train']
        )

    return splits

*Get the predictions (for each split) and metrics (for each split and the average over all splits) for every station*

In [60]:
def get_predictions(model_for_station, data):
    """Predict the target feature for ``data`` with ``model_for_station``.

    ``data`` is converted with ``to_numpy()`` before being handed to the
    model's ``predict`` method; the value returned by ``predict`` is passed
    back unchanged.
    """
    return model_for_station.predict(data.to_numpy())

In [61]:
def get_predictions_and_metrics_for_each_split_all_stations(stations_splits_and_models_dict, X_train_or_test_data, y_train_or_test_data):
    """Predict ``availableBikes`` for every split of every station and score the predictions.

    Args:
        stations_splits_and_models_dict: ``{station_id: {split_key: split}}``
            where each ``split`` holds the data under ``'X_train'``,
            ``'X_test'``, ``'y_train'``, ``'y_test'`` and the fitted model
            under ``'Model'``.
        X_train_or_test_data: ``'X_train'`` or ``'X_test'``.
        y_train_or_test_data: ``'y_train'`` or ``'y_test'``; must belong to
            the same set (training or test) as ``X_train_or_test_data``.

    Returns:
        ``{station_id: {split_key: {...}, ..., 'AVG Metrics': {...}}}``. Each
        split entry contains the selected X and y data, the
        ``'availability_predictions'`` and a ``'Metrics'`` dict with ``'MAE'``,
        ``'MSE'``, ``'RMSE'`` and ``'R2'``. The ``'AVG Metrics'`` entry is
        added by ``add_avg_metrics_over_splits_to_station_dict``.

    Raises:
        ValueError: if the two selectors are not a matching train/train or
            test/test pair.
    """
    selection = (X_train_or_test_data, y_train_or_test_data)
    if selection not in (('X_train', 'y_train'), ('X_test', 'y_test')):
        # Fail early with a clear message instead of hitting an unbound
        # variable further down.
        raise ValueError(
            "Expected ('X_train', 'y_train') or ('X_test', 'y_test'), got "
            f"({X_train_or_test_data!r}, {y_train_or_test_data!r})"
        )
    X_data, y_data = selection

    station_predictions = {}
    for station_id, station_splits in stations_splits_and_models_dict.items():
        predictions_per_split = {}

        # Every split is evaluated, including the first one.
        for split_key, split in station_splits.items():
            X_split = split[X_data]
            y_split = split[y_data]
            predictions_on_X_split = get_predictions(split['Model'], X_split)

            mae, mse, rmse, r2 = calculate_metrics_of_prediction(y_split, predictions_on_X_split)

            # data, predictions and metrics of the current split
            predictions_per_split[split_key] = {
                X_data: X_split,
                y_data: y_split,
                'availability_predictions': predictions_on_X_split,
                'Metrics': {'MAE': mae, 'MSE': mse, 'RMSE': rmse, 'R2': r2},
            }

        station_predictions[station_id] = predictions_per_split

    return add_avg_metrics_over_splits_to_station_dict(station_predictions)

In [62]:
def calculate_metrics_of_prediction(y_data_actual_values, availability_predictions):
    """Score predictions against the actual values.

    Returns the tuple ``(mae, mse, rmse, r2)``, where ``rmse`` is the square
    root of ``mse``.
    """
    actual, predicted = y_data_actual_values, availability_predictions
    mean_abs_error = metrics.mean_absolute_error(actual, predicted)
    mean_sq_error = metrics.mean_squared_error(actual, predicted)
    r2_value = metrics.r2_score(actual, predicted)
    return mean_abs_error, mean_sq_error, mean_sq_error**0.5, r2_value

In [63]:
def get_average_metric_value_over_splits(values_list):
    """Return the arithmetic mean of the metric values in ``values_list``.

    An empty list raises ``ZeroDivisionError``.
    """
    return sum(values_list) / len(values_list)

In [64]:
def add_avg_metrics_over_splits_to_station_dict(predictions_dict):
    """Add the metrics averaged over all splits to each station entry.

    For every station, the ``'MAE'``, ``'MSE'``, ``'RMSE'`` and ``'R2'`` values
    found in each split's ``'Metrics'`` dict are averaged and stored under the
    station's ``'AVG Metrics'`` key. ``predictions_dict`` is modified in place
    and also returned.
    """
    for station_entry in predictions_dict.values():
        # one list of per-split values for each metric that gets averaged
        values_per_metric = {'MAE': [], 'MSE': [], 'RMSE': [], 'R2': []}

        for split_entry in station_entry.values():
            for metric_name, metric_value in split_entry['Metrics'].items():
                if metric_name in values_per_metric:
                    values_per_metric[metric_name].append(metric_value)

        averages = {
            f'AVG {metric_name}': get_average_metric_value_over_splits(collected_values)
            for metric_name, collected_values in values_per_metric.items()
        }

        # added only after the splits were read, so it is never treated as a split itself
        station_entry['AVG Metrics'] = averages
    return predictions_dict