# Data Preparation

In this notebook we show how process and filter the CARTA dataset for modeling purposes. The main idea is model board count considering a set of predictors that are already part the CARTA dataset and some that need to be created.

We consider three main approaches to model demand:

1. Generalized Linear Models.
2. Random Forest and Zero-Inflated Random Forest.
3. Neural Networks.

The following libraries are required for running this notebook:

In [10]:
import numpy as np
import pandas as pd
import pickle
import os
import timeit

In [2]:
carta = pd.read_csv('data/Transit_Data/carta.csv', low_memory = False)

In [3]:
carta.stop_id = carta['stop_id'].astype(str)
carta.trip_id = carta['trip_id'].astype(str)
carta.hour = carta['hour'].astype(str)
carta.stop_sequence = carta['stop_sequence'].astype(int)
carta.route_id = carta['route_id'].astype(str)
carta.direction_id = carta['direction_id'].astype(str)
carta.service_period = carta['service_period'].astype(str)
carta.actual_arrival_time = pd.to_datetime(carta['actual_arrival_time'], format = "%Y-%m-%d %H:%M:%S")

In [4]:
carta.scheduled_arrival_time = pd.to_datetime(carta['scheduled_arrival_time'], format = "%Y-%m-%d %H:%M:%S")

In [5]:
carta.date = pd.to_datetime(carta['date'], format = "%Y-%m-%d")

In [6]:
carta['year'] = carta['date'].dt.year
carta['month'] = carta['date'].dt.month

In [7]:
carta = carta.sort_values(by = 'date')

In [11]:
carta.head()

Unnamed: 0,trip_id,scheduled_arrival_time,actual_arrival_time,stop_id,stop_sequence,stop_lat,stop_lon,route_id,direction_id,board_count,...,actual_arrival_datetime,trip_start_time,day_of_week,trip_date,hour,year,month,Estimated_Temp,Estimated_Precip,service_kind
0,132994,1900-01-01 04:55:00,1900-01-01 04:52:00,1351,1,35.052658,-85.309722,1,0,1,...,2019-01-02 04:52:00,1900-01-01 04:55:00,2,2019-01-02,4,2019,1,8.35,0.0,weekday
1193477,134417,1900-01-01 20:35:04,1900-01-01 20:37:00,971,47,35.024347,-85.246812,4,1,0,...,2019-01-02 20:37:00,1900-01-01 20:02:53,2,2019-01-02,20,2019,1,10.0,0.590889,weekday
1193478,134417,1900-01-01 20:35:30,1900-01-01 20:38:00,972,48,35.025717,-85.248528,4,1,0,...,2019-01-02 20:38:00,1900-01-01 20:02:53,2,2019-01-02,20,2019,1,10.0,0.585833,weekday
1193479,134409,1900-01-01 20:35:32,1900-01-01 20:38:00,12,87,35.050703,-85.309532,4,1,0,...,2019-01-02 20:38:00,1900-01-01 19:40:00,2,2019-01-02,20,2019,1,10.0,0.585444,weekday
1193480,134329,1900-01-01 20:35:44,1900-01-01 20:43:00,806,10,35.042653,-85.306685,4,0,0,...,2019-01-02 20:43:00,1900-01-01 20:30:00,2,2019-01-02,20,2019,1,10.0,0.583111,weekday


In [9]:
carta.iloc[5820870,]

trip_id                                 140116
scheduled_arrival_time     1900-01-01 16:51:42
actual_arrival_time        1900-01-01 16:54:00
stop_id                                     17
stop_sequence                                5
stop_lat                              35.04425
stop_lon                             -85.30943
route_id                                     9
direction_id                                 0
board_count                                  1
alight_count                                 1
occupancy                                    9
direction_desc                        OUTBOUND
service_period                         Weekday
date                       2019-09-20 00:00:00
scheduled_datetime         2019-09-20 16:51:42
actual_arrival_datetime    2019-09-20 16:54:00
trip_start_time            1900-01-01 16:50:00
day_of_week                                  4
trip_date                           2019-09-20
hour                                        16
year         

In [10]:
carta[(carta['actual_arrival_datetime'] == '2019-01-02 06:00:00') & (carta['trip_id'] == '133184') & (carta['stop_id'] == '145')]

Unnamed: 0,trip_id,scheduled_arrival_time,actual_arrival_time,stop_id,stop_sequence,stop_lat,stop_lon,route_id,direction_id,board_count,...,actual_arrival_datetime,trip_start_time,day_of_week,trip_date,hour,year,month,Estimated_Temp,Estimated_Precip,service_kind
5820870,133184,1900-01-01 06:00:00,1900-01-01 06:00:00,145,1,35.041675,-85.30864,10A,0,0,...,2019-01-02 06:00:00,1900-01-01 06:00:00,2,2019-01-02,6,2019,1,8.3,0.0,weekday


In [11]:
carta.shape

(9705963, 26)

In [12]:
carta.dtypes

trip_id                            object
scheduled_arrival_time     datetime64[ns]
actual_arrival_time        datetime64[ns]
stop_id                            object
stop_sequence                       int32
stop_lat                          float64
stop_lon                          float64
route_id                           object
direction_id                       object
board_count                         int64
alight_count                        int64
occupancy                           int64
direction_desc                     object
service_period                     object
date                       datetime64[ns]
scheduled_datetime                 object
actual_arrival_datetime            object
trip_start_time                    object
day_of_week                         int64
trip_date                          object
hour                               object
year                                int64
month                               int64
Estimated_Temp                    

In [13]:
min(carta.scheduled_datetime)

'2019-01-02 04:21:00'

In [14]:
max(carta.scheduled_datetime)

'2020-05-31 21:10:00'

In [15]:
carta.hour.unique()

array(['4', '20', '19', '21', '22', '6', '7', '5', '8', '9', '23', '0',
       '18', '17', '11', '10', '12', '13', '16', '14', '15'], dtype=object)

In [9]:
new_carta = pd.read_csv('data/Transit_Data/new_carta.csv')
new_carta.head()

  exec(code_obj, self.user_global_ns, self.user_ns)


Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,survey_date,pattern_id,route_number,trip_id,stop_id,time_actual_arrive,stop_lon,stop_lat,...,hour,temp,precip,board_count,alight_count,occupancy,route_id,direction_id,day_of_week,service_kind
0,0,0,2020-06-01 05:00:00,910,4.0,152323,354,2020-06-01 09:41:16,-85.268713,35.056167,...,9.0,18.85,0.0,0,0,0,4,0,0.0,weekday
1,1,1,2020-06-01 05:00:00,915,1.4,152320,354,2020-06-01 09:47:36,-85.268713,35.056167,...,9.0,18.85,0.0,0,0,0,1,0,0.0,weekday
2,2,2,2020-06-01 05:00:00,915,1.4,151689,354,2020-06-01 10:54:05,-85.268713,35.056167,...,10.0,19.95,0.0,0,0,0,1,0,0.0,weekday
3,3,3,2020-06-01 05:00:00,889,9.0,151936,354,2020-06-01 10:54:11,-85.268713,35.056167,...,10.0,19.95,0.0,0,0,0,9,0,0.0,weekday
4,4,4,2020-06-01 05:00:00,674,10.1,151970,354,2020-06-01 11:05:51,-85.268713,35.056167,...,11.0,21.15,0.0,0,0,0,10A,0,0.0,weekday


In [19]:
new_carta.time_actual_arrive = pd.to_datetime(new_carta['time_actual_arrive'], format = "%Y-%m-%d %H:%M:%S")

In [20]:
new_carta['year'] = new_carta['time_actual_arrive'].dt.year

## Data partition according to lock-down and ridership decline

Chattanooga's steep decline started the week of **March 5th** before hitting a low also on the week of April 19 with a low of 8,077 weekly riders.

 - Pre-lockdown data: Before 2020-03-05.
     * `date < 2020-03-05`
 - Post-lockdown data: After 2020-03-05.
     * `date >= 2020-03-05`

In [16]:
Pre_lock = carta[carta['date'] < '2020-03-05']
#Pre_lock.to_csv('data/Transit_Data/Pre_lock.csv')

Post_lock = carta[carta['date'] >= '2020-03-05']
#Post_lock.to_csv('data/Transit_Data/Post_lock.csv')

### Pre-lockdown data:

In [17]:
Pre_lock.head()

Unnamed: 0,trip_id,scheduled_arrival_time,actual_arrival_time,stop_id,stop_sequence,stop_lat,stop_lon,route_id,direction_id,board_count,...,actual_arrival_datetime,trip_start_time,day_of_week,trip_date,hour,year,month,Estimated_Temp,Estimated_Precip,service_kind
0,132994,1900-01-01 04:55:00,1900-01-01 04:52:00,1351,1,35.052658,-85.309722,1,0,1,...,2019-01-02 04:52:00,1900-01-01 04:55:00,2,2019-01-02,4,2019,1,8.35,0.0,weekday
1193477,134417,1900-01-01 20:35:04,1900-01-01 20:37:00,971,47,35.024347,-85.246812,4,1,0,...,2019-01-02 20:37:00,1900-01-01 20:02:53,2,2019-01-02,20,2019,1,10.0,0.590889,weekday
1193478,134417,1900-01-01 20:35:30,1900-01-01 20:38:00,972,48,35.025717,-85.248528,4,1,0,...,2019-01-02 20:38:00,1900-01-01 20:02:53,2,2019-01-02,20,2019,1,10.0,0.585833,weekday
1193479,134409,1900-01-01 20:35:32,1900-01-01 20:38:00,12,87,35.050703,-85.309532,4,1,0,...,2019-01-02 20:38:00,1900-01-01 19:40:00,2,2019-01-02,20,2019,1,10.0,0.585444,weekday
1193480,134329,1900-01-01 20:35:44,1900-01-01 20:43:00,806,10,35.042653,-85.306685,4,0,0,...,2019-01-02 20:43:00,1900-01-01 20:30:00,2,2019-01-02,20,2019,1,10.0,0.583111,weekday


In [23]:
Pre_lock.shape

(8602818, 26)

### Post-lockdown data:

In [24]:
Post_lock.head()

Unnamed: 0,trip_id,scheduled_arrival_time,actual_arrival_time,stop_id,stop_sequence,stop_lat,stop_lon,route_id,direction_id,board_count,...,actual_arrival_datetime,trip_start_time,day_of_week,trip_date,hour,year,month,Estimated_Temp,Estimated_Precip,service_kind
878279,138360,1900-01-01 12:46:57,1900-01-01 12:47:00,22,14,35.03465,-85.30826,1,0,0,...,2020-03-05 12:47:00,1900-01-01 12:40:00,3,2020-03-05,12,2020,3,12.8,0.0,weekday
7369067,138665,1900-01-01 06:17:47,1900-01-01 06:16:00,386,30,35.02987,-85.268952,10G,1,0,...,2020-03-05 06:16:00,1900-01-01 06:14:14,3,2020-03-05,6,2020,3,9.822167,0.0,weekday
877188,138383,1900-01-01 05:49:42,1900-01-01 05:48:00,81,13,34.990302,-85.30489,1,1,0,...,2020-03-05 05:48:00,1900-01-01 05:45:00,3,2020-03-05,5,2020,3,10.0,0.0,weekday
877187,138383,1900-01-01 05:49:32,1900-01-01 05:48:00,146,12,34.989585,-85.30462,1,1,0,...,2020-03-05 05:48:00,1900-01-01 05:45:00,3,2020-03-05,5,2020,3,10.0,0.0,weekday
877186,138383,1900-01-01 05:49:19,1900-01-01 05:48:00,155,11,34.989323,-85.305493,1,1,0,...,2020-03-05 05:48:00,1900-01-01 05:45:00,3,2020-03-05,5,2020,3,10.0,0.0,weekday


## Subordinate Functions

The following functions are to used to transform the data. We transformed the data because the considered predictors have different ranges of magnitude, and that can be troublesome for model interpretation, particularly for the generalized linear models. We decided to normalize the data so the variables would be in the $[0, \:1]$ interval which can be useful to avoid negative outcomes.

In [28]:
def normalizer(x):
    
    x_min = min(x)
    x_max = max(x)
    x_norm = (x - x_min)/(x_max - x_min)
    
    return(x_norm)

In [29]:
def standardizer(x):
    x_std = x.std()
    x_mean = x.mean()
    x_standard = (x - x_mean)/x_std
    
    return(x_standard)

## Surrounding Board or Alight Counts

In this section we create a function called `radial_influence()` that takes three inputs: 

* `st`: bus stop.
* `radius`: length of the radius. It has a default value of 0.5 miles.
* `DT`: Data frame with the list of `stop_id`s and their coordinates.

and identifies all the bus stops (`stop_id`s) that are within 0.5 mile radius with respect to the bus stop of reference. The bus stops that are within the desired range will be labeled as `inside`, and we use the label `outside` for the opposite case.

In [24]:
from shapely.geometry import Point, MultiPoint
from shapely.ops import nearest_points
#import geopandas as gpd
import folium
from geopy import Point
from geopy import distance

The `Bus_Stop` data frame contains the spatial coordinates of all the bus stops in the `carta` data set.

In [23]:
Bus_Stops = pd.read_csv('data/Transit_Data/Bus_Stops.csv')
Bus_Stops.stop_id = Bus_Stops['stop_id'].astype(str)
Bus_Stops.head()

Unnamed: 0,stop_id,stop_lon,stop_lat
0,971,-85.246812,35.024347
1,146,-85.30462,34.989585
2,1545,-85.250863,35.026032
3,972,-85.248528,35.025717
4,81,-85.30489,34.990302


We can plot some of the bus stops as follows:

In [30]:
map = folium.Map(location = [Bus_Stops.stop_lat.mean(), Bus_Stops.stop_lon.mean()], tiles = "OpenStreetMap", zoom_start = 11)

for point in range(0, Bus_Stops.shape[0])[0:100]:
    folium.Marker(location = [Bus_Stops.stop_lat[point], Bus_Stops.stop_lon[point]], popup = str(Bus_Stops['stop_id'])).add_to(map)
map

map

### Radial Influence

In [25]:
def radial_influence(st, radius, DT):
    n_row = DT.shape[0]
    
    stopd_index = DT[DT['stop_id'] == st].index[0]
    x0 = DT.stop_lon.iloc[stopd_index]
    y0 = DT.stop_lat.iloc[stopd_index]
    
    radial_dist = []
    condition = []
    
    for i in range(n_row):
        
        center_point = (x0, y0)
        test_point = (DT.stop_lon.iloc[i], DT.stop_lat.iloc[i])
        dist = distance.distance(center_point, test_point).miles
        
        radial_dist.append(dist)
        
        if (dist <= radius):
            
            cond = 'inside'
            
        else:
            
            cond = 'outside'
            
        condition.append(cond)
        
    influence = {'stop_id': DT.stop_id, 'Radial_Distance': radial_dist, 'Influence': condition}
    Influence = pd.DataFrame(data = influence, columns = ['stop_id', 'Radial_Distance', 'Influence'])
    
    return(Influence)

### Example

Let's plot the half-mile radius from bus stop 12 (`stop_id == '12'`):

In [26]:
Surrounding_Stops_12 = radial_influence('12', 0.5, Bus_Stops)
Surrounding_Stops_12.head()

Unnamed: 0,stop_id,Radial_Distance,Influence
0,971,4.355288,outside
1,146,0.486455,inside
2,1545,4.073987,outside
3,972,4.236001,outside
4,81,0.47053,inside


## Data Extraction

The function `data_extraction()` requires five inputs:

* `route`: a `route_id` value.
* `direction`: a `direction_id` value.
* `bus_stop`: a `stop_id` value.
* `DTFRM`: a data frame with all the required variables like `carta`.
* `part`: `pre` or `post` according to the data partition.

The output is a data frame `*_lock_data.csv` with the required data.

In [36]:
def data_extraction(route, direction, bus_stop, DTFRM, part):
    import numpy as np
    import pandas as pd
    import datetime as dt
             
    dtfrm = DTFRM[(DTFRM['route_id'] == route) & (DTFRM['direction_id'] == direction) & (DTFRM['stop_id'] == bus_stop)]
    
    #return(carta)

    n_rows = dtfrm.shape[0]
    
    if (n_rows == 0):
        print('There are not rows in the data set with the required characteristics. Please change them.')
        
    else:
          
        vinit = dtfrm[['month', 'service_kind', 'hour', 'board_count', 'alight_count', 'Estimated_Temp', 'Estimated_Precip']].groupby(['month', 'service_kind', 'hour']).mean()
        Vinit = vinit.reset_index(level = ['month', 'service_kind', 'hour'])
        Vinit.columns = ['month', 'service_kind', 'hour', 'month_average_board_count', 'month_average_alight_count', 'mean_temp', 'mean_precip']
        Vinit = Vinit.drop_duplicates()

        #return(Vinit)
    
        V1 = dtfrm[[ 'month', 'service_kind', 'hour', 'board_count', 'alight_count']]
        V1.columns = ['month', 'service_kind', 'hour', 'board_count', 'alight_count']
     
        V1 = pd.merge(V1, Vinit, on = ['month', 'service_kind', 'hour'], how = 'left')

        #return(V1)
    
        Relevant_Bus_Stops = radial_influence(bus_stop, 0.5, Bus_Stops)
        bus_stops_ids = Relevant_Bus_Stops.stop_id.unique().astype(str)
        bus_stops_ids = pd.DataFrame(bus_stops_ids)
        bus_stops_ids.columns = ['stop_id']
        bus_stops_ids = bus_stops_ids[bus_stops_ids['stop_id'] != bus_stop]
        bus_stops_ids = bus_stops_ids.drop_duplicates()
               
        V3 = DTFRM[DTFRM['stop_id'].isin(bus_stops_ids.stop_id.astype(str))][['month', 'service_kind', 'hour', 'board_count', 'alight_count']]
        V3 = V3.drop_duplicates()
        V3a = V3.groupby(['month', 'service_kind', 'hour']).mean()
        V3b = V3a.reset_index(level = ['month', 'service_kind', 'hour'])
        V3b.columns = ['month', 'service_kind', 'hour', 'surrounding_board_count', 'surrounding_alight_count']
        V3b = V3b.drop_duplicates()

        #return(V3b)
    
        data = pd.merge(V1, V3b, how = 'left', on = [ 'month', 'service_kind', 'hour'])

        #return(data)
                
        data.month = data['month'].astype('category')
        data.service_kind = data['service_kind'].astype('category')
        data.hour = data['hour'].astype('category')
       
        data.columns = ['month', 'service_kind', 'hour', 'board_count', 'alight_count', 'mean_temp', 'mean_precip',
                        'month_average_board_count', 'month_average_alight_count', 'surrounding_board_count', 'surrounding_alight_count']
        data = data.drop_duplicates()

        #return(data)
        
        data.month_average_board_count = normalizer(data['month_average_board_count'])
        data.surrounding_board_count = normalizer(data['surrounding_board_count'])
        
        data.mean_temp = normalizer(data['mean_temp'])
        data.mean_precip = normalizer(data['mean_precip'])
        
        data = data[['month', 'service_kind', 'hour', 'board_count', 'mean_temp', 'mean_precip','month_average_board_count', 'surrounding_board_count']]

        #return(data)
        
        data = data.reset_index(drop = True)
        #data = data.dropna(axis = 0, how = 'any', inplace = True)
        
        #return(data)
        #--------------------------------------------------------------------------------------------------------------------
        # Export data:
        
        def paste0(ss,sep=None,na_rep=None,):
            ss = [pd.Series(s) for s in ss]
            ss = [s.astype(str) for s in ss]
            s = ss[0]
            res = s.str.cat(ss[1:],sep=sep,na_rep=na_rep)
            return res
               
        file_path = '/'.join(['data', 'Data_for_RF_Models', 'Board_Counts', '_'.join(['route', route]), ''.join(['direction', direction]), '_'.join(['bus_stop', bus_stop])])
        
        #file_path = paste0(paste0([paste0([paste0([paste0([paste0(['data'], sep = '/'), 
        #                                                paste0(['Data_for_RF_Models', 'Board_Counts'], sep ='/')], sep = '/'),
        #                                        paste0(['route', route], sep = '_')], sep = '/'),
        #                                paste0(['direction', direction])], sep = '/'),
        #                        paste0(['bus_stop', bus_stop], sep = '_')], sep = '/'), sep = '/')[0]
        
        #return(file_path)
        #os.mkdir(file_path)

        #return(data)
        
        if (part == 'all'):
            
            file_name = paste0(['data', '.csv'])
            complete_path = paste0([file_path, file_name], sep = '/')[0]
            data.to_csv(complete_path)
            
        elif (part == 'pre'):
            
            file_name = paste0(['pre_lock_data', '.csv'])
            complete_path = paste0([file_path, file_name], sep = '/')[0]
            data.to_csv(complete_path)
            
        elif (part == 'post'):
            
            file_name = paste0(['post_lock_data', '.csv'])
            complete_path = paste0([file_path, file_name], sep = '/')[0]
            data.to_csv(complete_path)

        return('Done!')
            
        

In [42]:
Post_lock[(Post_lock['route_id'] == '10A') & (Post_lock['direction_id'] == '1')]['stop_id'].unique()

array(['288', '233', '234', '235', '236', '237', '238', '505', '690',
       '253', '273', '274', '275', '276', '277', '278', '279', '280',
       '281', '282', '283', '284', '285', '286', '287', '1351', '272',
       '254', '271', '255', '256', '257', '219', '220', '311', '259',
       '260', '261', '262', '263', '264', '265', '266', '1584', '1583',
       '269', '270', '258', '12', '1555', '1579', '1354', '1353', '17',
       '742', '805', '806', '807', '1475', '1870', '211', '212', '213',
       '214', '215', '216', '1997', '2187', '1939', '2176', '2068',
       '2076', '2177', '2178', '2139', '2179', '2180', '1922', '2010',
       '217'], dtype=object)

### Example: 

Let's extract the data for modeling considering the following inputs:

* `route_id == '1'`
* `direction_id == 0`
* `stop_id == '12'`
* `DTFRM == carta`
* `part == 'pre'`

In [55]:
data_extraction('10A', '1', '217', Post_lock, 'post')

'Done!'

In [72]:
data1.shape

(3150, 8)

We can use a `for loop` to run this process for each bus stop. We declare the function `paste0()` to create the file paths from the input parameters:

In [12]:
def paste0(ss,sep=None,na_rep=None,):
            '''Analogy to R paste0'''
            ss = [pd.Series(s) for s in ss]
            ss = [s.astype(str) for s in ss]
            s = ss[0]
            res = s.str.cat(ss[1:],sep=sep,na_rep=na_rep)
            return res

## For Loop for all the bus stops in `route_id == '4'` and `direction_id == '1'`

In [14]:
def get_bus_stop_data(rt_id, dir_id, part):
    
    if (part == 'pre'):
        
        bus_stops_rX_dirY = Pre_lock[(Pre_lock['route_id'] == rt_id) & (Pre_lock['direction_id'] == dir_id)]['stop_id'].unique()
           
        for st in bus_stops_rX_dirY:
            print(st)
            data_extraction(rt_id, dir_id, st, Pre_lock, 'pre')
            print('Done!')
                        
    elif (part == 'post'):
        
        bus_stops_rX_dirY = Post_lock[(Post_lock['route_id'] == rt_id) & (Post_lock['direction_id'] == dir_id)]['stop_id'].unique()
           
        for st in bus_stops_rX_dirY:
            print(st)
            data_extraction(rt_id, dir_id, st, Post_lock, 'post')
            print('Done!')
            
    elif (part == 'all'):
        
        bus_stops_rX_dirY = carta[(carta['route_id'] == rt_id) & (carta['direction_id'] == dir_id)]['stop_id'].unique()
           
        for st in bus_stops_rX_dirY:
            print(st)
            data_extraction(rt_id, dir_id, st, carta, 'all')
            print('Done!')
        
    return('Data preparation is complete!')

In [22]:
Pre_lock[(Pre_lock['route_id'] == '2') & (Pre_lock['direction_id'] == '0')]['stop_id'].unique()

array(['1821', '1827', '166', '1361', '165', '164', '163', '145', '730',
       '729', '728', '727', '726', '724', '1694', '1825', '1458', '1589',
       '1560', '505', '354', '1713'], dtype=object)

In [53]:
Pre_lock.shape[0]

8602818

Finally, we do this process for each combination of `route_id`, `direciton_id`, and data partition:

In [59]:
get_bus_stop_data('2', '1', 'post')
#get_bus_stop_data('1', '0', 'post')

1837
Done!
1872
Done!
12
Done!
1838
Done!
1867
Done!
1839
Done!
1840
Done!
734
Done!
733
Done!
732
Done!
731
Done!
730
Done!
1555
Done!
1579
Done!
690
Done!
505
Done!
1475
Done!
805
Done!
742
Done!
17
Done!
1353
Done!
1354
Done!
806
Done!
807
Done!


'Data preparation is complete!'

In [47]:
get_bus_stop_data('8', '1', 'pre')

679
Done!
1043
Done!
1044
Done!
1047
Done!
1048
Done!
1049
Done!
1492
Done!
1051
Done!
1052
Done!
298
Done!
299
Done!
300
Done!
1054
Done!
1055
Done!
1057
Done!
1058
Done!
1059
Done!
1060
Done!
1061
Done!
1493
Done!
1064
Done!
1065
Done!
297
Done!
1846
Done!
629
Done!
630
Done!
631
Done!
632
Done!
1836
Done!
1834
Done!
671
Done!
672
Done!
673
Done!
675
Done!
1844
Done!
1070
Done!
1071
Done!
1072
Done!
1074
Done!
1075
Done!
1076
Done!
1077
Done!
1078
Done!
1494
Done!
1079
Done!
1080
Done!
1082
Done!
1551
Done!
1073
Done!
628
Done!
1845
Done!
1516
Done!
627
Done!
354
Done!
1835
Done!


'Data preparation is complete!'

In [84]:
get_bus_stop_data('1', '1', 'pre')
#get_bus_stop_data('1', '1', 'post')
#get_bus_stop_data('1', '1', 'all')

1731
Done!
148
Done!
1899
Done!
1486
Done!
142
Done!
81
Done!
141
Done!
140
Done!
139
Done!
138
Done!
1797
Done!
157
Done!
75
Done!
137
Done!
74
Done!
136
Done!
1892
Done!
72
Done!
1915
Done!
1891
Done!
134
Done!
133
Done!
1487
Done!
95
Done!
143
Done!
151
Done!
118
Done!
1912
Done!
116
Done!
115
Done!
114
Done!
113
Done!
112
Done!
111
Done!
110
Done!
109
Done!
771
Done!
108
Done!
145
Done!
106
Done!
105
Done!
104
Done!
103
Done!
102
Done!
101
Done!
100
Done!
144
Done!
152
Done!
107
Done!
1913
Done!
132
Done!
130
Done!
129
Done!
128
Done!
127
Done!
126
Done!
125
Done!
124
Done!
1358
Done!
1914
Done!
1582
Done!
166
Done!
1589
Done!
1361
Done!
1684
Done!
167
Done!
165
Done!
1488
Done!
163
Done!
1560
Done!
164
Done!
169
Done!
505
Done!
172
Done!
171
Done!
170
Done!
90
Done!
150
Done!
690
Done!
146
Done!
155
Done!
158
Done!
161
Done!
44
Done!


'Data preparation is complete!'

In [85]:
get_bus_stop_data('4', '0', 'pre')
#get_bus_stop_data('4', '0', 'post')
#get_bus_stop_data('4', '0', 'all')

806
Done!
807
Done!
1475
Done!
1474
Done!
811
Done!
812
Done!
805
Done!
1351
Done!
12
Done!
1555
Done!
813
Done!
1579
Done!
1354
Done!
1353
Done!
17
Done!
742
Done!
322
Done!
830
Done!
831
Done!
832
Done!
833
Done!
834
Done!
835
Done!
321
Done!
836
Done!
838
Done!
839
Done!
840
Done!
841
Done!
842
Done!
843
Done!
844
Done!
845
Done!
837
Done!
814
Done!
815
Done!
816
Done!
817
Done!
818
Done!
829
Done!
819
Done!
821
Done!
822
Done!
823
Done!
824
Done!
825
Done!
826
Done!
827
Done!
828
Done!
820
Done!
846
Done!
848
Done!
1875
Done!
894
Done!
1485
Done!
897
Done!
898
Done!
891
Done!
889
Done!
873
Done!
874
Done!
875
Done!
876
Done!
877
Done!
879
Done!
880
Done!
890
Done!
881
Done!
882
Done!
883
Done!
884
Done!
885
Done!
886
Done!
887
Done!
888
Done!
872
Done!
869
Done!
870
Done!
871
Done!
868
Done!
867
Done!
849
Done!
850
Done!
851
Done!
852
Done!
854
Done!
855
Done!
856
Done!
857
Done!
858
Done!
859
Done!
860
Done!
1873
Done!
1848
Done!
1849
Done!
1850
Done!
861
Done!
1819
Done!
1876
Don

'Data preparation is complete!'

In [40]:
#get_bus_stop_data('4', '1', 'pre')

In [51]:
get_bus_stop_data('4', '1', 'post')

'Done!'

In [31]:
get_bus_stop_data('9', '0', 'pre')
get_bus_stop_data('9', '0', 'post')

'Done!'

In [36]:
get_bus_stop_data('9', '1', 'pre')
get_bus_stop_data('9', '1', 'post')

'Done!'

## Data Partition into train and test sets

The function`data_partition_for_ml()` takes the extracted data (created by using the `get_bus_stop_data()` function) and creates a random partition for the train (80%) and test sets (20%).  

In [32]:
def data_partition_for_ml(rt, di, st, part):
    import numpy as np
    import pandas as pd
    from sklearn.model_selection import train_test_split
    
    def paste0(ss,sep=None,na_rep=None,):
            ss = [pd.Series(s) for s in ss]
            ss = [s.astype(str) for s in ss]
            s = ss[0]
            res = s.str.cat(ss[1:],sep=sep,na_rep=na_rep)
            return res
    
    if (part == 'all'):
          
        file_path = paste0([paste0([paste0([paste0([paste0([paste0(['data'], sep = '/'), 
                                                        paste0(['Data_for_RF_Models', 'Board_Counts'], sep ='/')], sep = '/'),
                                                paste0(['route', rt], sep = '_')], sep = '/'),
                                        paste0(['direction', di])], sep = '/'),
                                paste0(['bus_stop', st], sep = '_')], sep = '/'), 'data.csv'], sep = '/')[0]
          
        dtt = pd.read_csv(file_path)
    
        train_data, test_data = train_test_split(dtt, test_size=0.2)
    
        train_data = train_data[['month', 'service_kind', 'hour', 'board_count', 'mean_temp',
                                 'mean_precip', 'month_average_board_count', 'surrounding_board_count']]
        
        train_data.dropna(how = 'all', axis = 1, inplace = True)
    
        test_data = test_data[['month', 'service_kind', 'hour', 'board_count', 'mean_temp',
                               'mean_precip', 'month_average_board_count', 'surrounding_board_count']]
        
        test_data.dropna(how = 'all', axis = 1, inplace = True)
    
        train_data_path = paste0([paste0([paste0([paste0([paste0([paste0(['data'], sep = '/'), 
                                                                  paste0(['Data_for_RF_Models', 'Board_Counts'], sep ='/')], sep = '/'),
                                                          paste0(['route', rt], sep = '_')], sep = '/'),
                                                  paste0(['direction', di])], sep = '/'),
                                          paste0(['bus_stop', st], sep = '_')], sep = '/'), 'train_data.csv'], sep = '/')[0]
           
        test_data_path = paste0([paste0([paste0([paste0([paste0([paste0(['data'], sep = '/'), 
                                                            paste0(['Data_for_RF_Models', 'Board_Counts'], sep ='/')], sep = '/'),
                                                    paste0(['route', rt], sep = '_')], sep = '/'),
                                            paste0(['direction', di])], sep = '/'),
                                    paste0(['bus_stop', st], sep = '_')], sep = '/'), 'test_data.csv'], sep = '/')[0]
    
        train_data.to_csv(train_data_path, index = False)
        test_data.to_csv(test_data_path, index = False)
        
    elif (part == 'pre'):
        
        file_path = paste0([paste0([paste0([paste0([paste0([paste0(['data'], sep = '/'), 
                                                        paste0(['Data_for_RF_Models', 'Board_Counts'], sep ='/')], sep = '/'),
                                                paste0(['route', rt], sep = '_')], sep = '/'),
                                        paste0(['direction', di])], sep = '/'),
                                paste0(['bus_stop', st], sep = '_')], sep = '/'), 'pre_lock_data.csv'], sep = '/')[0]
          
        dtt = pd.read_csv(file_path)

        data_nrow = dtt.shape[0]

        #return(data_nrow)

        if (data_nrow > 60):

                train_data, test_data = train_test_split(dtt, test_size=0.2)

                train_data = train_data[['month', 'service_kind', 'hour', 'board_count', 'mean_temp',
                                'mean_precip', 'month_average_board_count', 'surrounding_board_count']]
        
                train_data.dropna(how = 'all', axis = 1, inplace = True)
    
                test_data = test_data[['month', 'service_kind', 'hour', 'board_count', 'mean_temp',
                                       'mean_precip', 'month_average_board_count', 'surrounding_board_count']]
        
                test_data.dropna(how = 'all', axis = 1, inplace = True)
    
                train_data_path = paste0([paste0([paste0([paste0([paste0([paste0(['data'], sep = '/'), 
                                                                  paste0(['Data_for_RF_Models', 'Board_Counts'], sep ='/')], sep = '/'),
                                                          paste0(['route', rt], sep = '_')], sep = '/'),
                                                  paste0(['direction', di])], sep = '/'),
                                          paste0(['bus_stop', st], sep = '_')], sep = '/'), 'pre_lock_train_data.csv'], sep = '/')[0]
           
                test_data_path = paste0([paste0([paste0([paste0([paste0([paste0(['data'], sep = '/'), 
                                                            paste0(['Data_for_RF_Models', 'Board_Counts'], sep ='/')], sep = '/'),
                                                    paste0(['route', rt], sep = '_')], sep = '/'),
                                            paste0(['direction', di])], sep = '/'),
                                    paste0(['bus_stop', st], sep = '_')], sep = '/'), 'pre_lock_test_data.csv'], sep = '/')[0]
    
                train_data.to_csv(train_data_path, index = False)
                test_data.to_csv(test_data_path, index = False)
        
        elif (data_nrow <= 60):
                return('There is not enough records to create train and test sets.')


    elif (part == 'post'):
        
        file_path = paste0([paste0([paste0([paste0([paste0([paste0(['data'], sep = '/'), 
                                                        paste0(['Data_for_RF_Models', 'Board_Counts'], sep ='/')], sep = '/'),
                                                paste0(['route', rt], sep = '_')], sep = '/'),
                                        paste0(['direction', di])], sep = '/'),
                                paste0(['bus_stop', st], sep = '_')], sep = '/'), 'post_lock_data.csv'], sep = '/')[0]
        
        file2_path = paste0([paste0([paste0([paste0([paste0([paste0(['data'], sep = '/'), 
                                                        paste0(['Data_for_RF_Models', 'Board_Counts'], sep ='/')], sep = '/'),
                                                paste0(['route', rt], sep = '_')], sep = '/'),
                                        paste0(['direction', di])], sep = '/'),
                                paste0(['bus_stop', st], sep = '_')], sep = '/'), 'new_data.csv'], sep = '/')[0]
          
        dtt = pd.read_csv(file_path)
        new_dtt = pd.read_csv(file2_path)
            
        train_data, test_data = train_test_split(dtt, test_size=0.2)

        train_data_nrow = train_data.shape[0]

        if train_data_nrow > 60:
     
                train_data = train_data[['month', 'service_kind', 'hour', 'board_count', 'mean_temp',
                                        'mean_precip', 'month_average_board_count', 'surrounding_board_count']]
        
                train_data.dropna(how = 'all', axis = 1, inplace = True)
    
                test_data = test_data[['month', 'service_kind', 'hour', 'board_count', 'mean_temp',
                                       'mean_precip', 'month_average_board_count', 'surrounding_board_count']]

                test_data.dropna(how = 'all', axis = 1, inplace = True)
        
                train_data_path = paste0([paste0([paste0([paste0([paste0([paste0(['data'], sep = '/'), 
                                                                  paste0(['Data_for_RF_Models', 'Board_Counts'], sep ='/')], sep = '/'),
                                                          paste0(['route', rt], sep = '_')], sep = '/'),
                                                  paste0(['direction', di])], sep = '/'),
                                          paste0(['bus_stop', st], sep = '_')], sep = '/'), 'post_lock_train_data.csv'], sep = '/')[0]
           
                test_data_path = paste0([paste0([paste0([paste0([paste0([paste0(['data'], sep = '/'), 
                                                            paste0(['Data_for_RF_Models', 'Board_Counts'], sep ='/')], sep = '/'),
                                                    paste0(['route', rt], sep = '_')], sep = '/'),
                                            paste0(['direction', di])], sep = '/'),
                                    paste0(['bus_stop', st], sep = '_')], sep = '/'), 'post_lock_test_data.csv'], sep = '/')[0]
    
                train_data.to_csv(train_data_path, index = False)
                test_data.to_csv(test_data_path, index = False)
    
    return('Done!')
        

### Example:

Let's create a data partition for `stop_id == 354` and the following data inputs:

* `route_id == '9'`
* `direction_id == '0'`
* `stop_id == '354'`
* `part == 'post'`

In [65]:
data_partition_for_ml('8', '0', '354', 'pre')

'There is not enough records to create train and test sets.'

Similarly, we can iterate this process by using the `data_partition_for_ml()` inside a  `for loop` whose counter changes according to the `route_id`, `direction_id`, data partition:

In [33]:
def get_data_partition_bus_stop_ml(rt_id, dir_id, part):
    
    if (part == 'all'):
            
        bus_stops_rX_dirY = carta[(carta['route_id'] == rt_id) & (carta['direction_id'] == dir_id)]['stop_id'].unique()
    
        for st in bus_stops_rX_dirY:
            data_partition_for_ml(rt_id, dir_id, st, 'all')
            
    elif (part == 'pre'):
        
        bus_stops_rX_dirY = Pre_lock[(Pre_lock['route_id'] == rt_id) & (Pre_lock['direction_id'] == dir_id)]['stop_id'].unique()
    
        for st in bus_stops_rX_dirY:
            data_partition_for_ml(rt_id, dir_id, st, 'pre')
            
    elif (part == 'post'):
        
        bus_stops_rX_dirY = Post_lock[(Post_lock['route_id'] == rt_id) & (Post_lock['direction_id'] == dir_id)]['stop_id'].unique()
    
        for st in bus_stops_rX_dirY:
            data_partition_for_ml(rt_id, dir_id, st, 'post')
        
    return('Done!')

Finally, we can automatize this process for the desired input values:

In [55]:
data_partition_for_ml('1', '0', '1351', 'pre')

'Done!'

In [67]:
get_data_partition_bus_stop_ml('8', '0', 'pre')

'Done!'

In [43]:
get_data_partition_bus_stop_ml('1', '1', 'pre')
get_data_partition_bus_stop_ml('1', '1', 'post')

'Done!'

In [50]:
get_data_partition_bus_stop_ml('4', '0', 'pre')
get_data_partition_bus_stop_ml('4', '0', 'post')

'Done!'

In [52]:
get_data_partition_bus_stop_ml('4', '1', 'pre')
get_data_partition_bus_stop_ml('4', '1', 'post')

'Done!'

In [35]:
get_data_partition_bus_stop_ml('9', '0', 'pre')
get_data_partition_bus_stop_ml('9', '0', 'post')

'Done!'

In [39]:
get_data_partition_bus_stop_ml('9', '1', 'pre')
get_data_partition_bus_stop_ml('9', '1', 'post')

'Done!'

In [34]:
get_data_partition_bus_stop_ml('10A', '0', 'pre')
get_data_partition_bus_stop_ml('10A', '1', 'pre')

'Done!'

In [57]:
get_data_partition_bus_stop_ml('10A', '0', 'post')
get_data_partition_bus_stop_ml('10A', '1', 'post')

FileNotFoundError: [Errno 2] No such file or directory: 'data/Data_for_RF_Models/Board_Counts/route_10A/direction0/bus_stop_216/new_data.csv'

In [158]:
get_data_partition_bus_stop('10C', '0')
get_data_partition_bus_stop('10C', '1')

'Done!'

In [160]:
get_data_partition_bus_stop('13', '0')
get_data_partition_bus_stop('13', '1')

'Done!'

In [161]:
get_data_partition_bus_stop('15', '0')
get_data_partition_bus_stop('15', '1')

'Done!'

In [163]:
get_data_partition_bus_stop('19', '0')
get_data_partition_bus_stop('19', '1')

'Done!'

In [164]:
get_data_partition_bus_stop('21', '0')
get_data_partition_bus_stop('21', '1')

'Done!'

In [165]:
get_data_partition_bus_stop('28', '0')
get_data_partition_bus_stop('28', '1')

'Done!'