# Prediction Model with Random Forest

In [1]:
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics
import pickle
import pandas as pd
import matplotlib.pyplot as plt

### Import functions from "functions.ipynb"

In [2]:
# Execute functions.ipynb in this kernel so its definitions become available here.
# Presumably this is where filter_station_data, order_df_by_lastUpdate and
# get_station_features_and_target (called further down) come from - TODO confirm.
%run functions.ipynb

### Get lists from "data_prep.ipynb"
**To load the %stored variables run the cells in data_prep.ipynb first**

In [3]:
# Restore the all_features list that was saved with %store in another notebook.
%store -r all_features
# p = number of features; apply_model() uses it to derive max_features for
# RandomForestRegressor (a third of the features). Background on that choice:
# https://stats.stackexchange.com/questions/324370/references-on-number-of-features-to-use-in-random-forest-regression
p = len(all_features)

In [4]:
# Restore three more %stored lists. station_id_list drives the per-station
# training loop below; full_hour_list and weekdays_list are not referenced in
# the cells visible here.
%store -r station_id_list full_hour_list weekdays_list

### Loading dataframe from pickle file
**Before running the following cells, create the pickle file by running data_prep.ipynb**

In [5]:
# Load the prepared dataframe from its pickle file.
# NOTE: unpickling can execute arbitrary code - only load a file you created yourself.
filename = 'df_station_updates_all_features'
# The context manager closes the file even if pickle.load raises.
with open(filename, 'rb') as infile:
    df_station_updates_all_features = pickle.load(infile)

In [6]:
# Sanity check: show the type of the unpickled object.
print(type(df_station_updates_all_features))

<class 'pandas.core.frame.DataFrame'>


In [7]:
# Preview the first rows of the loaded dataframe.
df_station_updates_all_features.head()

Unnamed: 0,stationId,totalStands,availableBikes,freeStands,lastUpdate,temperature,cloudiness,windSpeed,rain,snow,...,mainWeather_Fog,mainWeather_Mist,mainWeather_Rain,mainWeather_Snow,weekday_Monday,weekday_Saturday,weekday_Sunday,weekday_Thursday,weekday_Tuesday,weekday_Wednesday
0,2,20,10,10,2021-02-27 17:35:14,9.34,75.0,2.57,0.0,0.0,...,0,0,0,0,0,1,0,0,0,0
1,2,20,10,10,2021-02-27 17:45:23,9.2,75.0,1.03,0.0,0.0,...,0,0,0,0,0,1,0,0,0,0
2,2,20,10,10,2021-02-27 17:55:32,9.48,75.0,1.03,0.0,0.0,...,0,0,0,0,0,1,0,0,0,0
3,2,20,11,9,2021-02-27 18:00:42,9.22,75.0,1.03,0.0,0.0,...,0,0,0,0,0,1,0,0,0,0
4,2,20,11,9,2021-02-27 18:10:51,8.85,75.0,0.51,0.0,0.0,...,0,0,0,0,0,1,0,0,0,0


In [8]:
# Column names, non-null counts and dtypes of the loaded dataframe.
df_station_updates_all_features.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 586132 entries, 0 to 586131
Data columns (total 25 columns):
 #   Column               Non-Null Count   Dtype         
---  ------               --------------   -----         
 0   stationId            586132 non-null  object        
 1   totalStands          586132 non-null  int64         
 2   availableBikes       586132 non-null  int64         
 3   freeStands           586132 non-null  int64         
 4   lastUpdate           586132 non-null  datetime64[ns]
 5   temperature          586132 non-null  float64       
 6   cloudiness           586132 non-null  float64       
 7   windSpeed            586132 non-null  float64       
 8   rain                 586132 non-null  float64       
 9   snow                 586132 non-null  float64       
 10  fullHour             586132 non-null  int64         
 11  hour_sin             586132 non-null  float64       
 12  hour_cos             586132 non-null  float64       
 13  mainWeather_Cl

### Training models with random forest regression

In [9]:
def apply_model(X, y, n_estimators=200, random_state=None):
    """Fit a random forest regressor on the complete data passed in.

    Parameters
    ----------
    X
        Descriptive features, handed unchanged to ``RandomForestRegressor.fit``.
    y
        Target values, handed unchanged to ``RandomForestRegressor.fit``.
    n_estimators : int, default 200
        Number of trees in the forest.
    random_state : int or None, default None
        Seed forwarded to the regressor. ``None`` keeps the unseeded behaviour;
        pass an integer to make the fitted model reproducible.

    Returns
    -------
    RandomForestRegressor
        The fitted model.
    """
    # Consider a third of the features at each split. `p` is the notebook-level
    # feature count. Clamp to 1 because max_features=0 (which int(p/3) yields
    # for p < 3) is rejected by scikit-learn.
    max_features = max(1, p // 3)
    model = RandomForestRegressor(
        n_estimators=n_estimators,
        max_features=max_features,
        random_state=random_state,
    )
    return model.fit(X, y)

In [10]:
def model_per_station(df, station_id_list, features):
    """Train one model per station and return them keyed by station id.

    For every id in ``station_id_list`` the station's rows are selected from
    ``df``, put in ``lastUpdate`` order, split into descriptive features and
    target, and passed to ``apply_model``. The selection, ordering and
    splitting helpers are defined outside this cell.

    Parameters
    ----------
    df
        Dataframe holding the updates of all stations.
    station_id_list
        Iterable of station ids to train a model for.
    features
        Descriptive features handed to ``get_station_features_and_target``.

    Returns
    -------
    dict
        Maps each station id to the model fitted on that station's data.
    """
    def _fit_station(station_id):
        # rows belonging to this station only
        station_rows = filter_station_data(df, station_id)
        # order by lastUpdate so the data is sequential in time
        chronological = order_df_by_lastUpdate(station_rows)
        # split into descriptive features and target feature
        X, y = get_station_features_and_target(chronological, features)
        return apply_model(X, y)

    return {station_id: _fit_station(station_id) for station_id in station_id_list}

In [11]:
model_per_station = model_per_station(df_station_updates_all_features, station_id_list, all_features)

In [13]:
num_stations = list(model_per_station.items())
for key,value in num_stations:
    filename = 'randomForest_stationID_{}'.format(key)
    outfile = open(filename, 'wb')
    pickle.dump((key,value),outfile)
    outfile.close()