In [1]:
import numpy as np
import pandas as pd

In [2]:
carta = pd.read_csv('Data/Carta_out.csv')

In [3]:
carta.stop_id = carta['stop_id'].astype(str)
carta.stop_sequence = carta['stop_sequence'].astype(int)
carta.route_id = carta['route_id'].astype(str)
carta.direction_id = carta['direction_id'].astype(str)
carta.service_period = carta['service_period'].astype(str)
carta.actual_arrival_time = pd.to_datetime(carta['actual_arrival_time'], format = "%Y-%m-%d %H:%M:%S")
carta.scheduled_arrival_time = pd.to_datetime(carta['scheduled_arrival_time'], format = "%Y-%m-%d %H:%M:%S")
carta.date = pd.to_datetime(carta['date'], format = "%Y-%m-%d")
carta.scheduled_datetime = pd.to_datetime(carta['scheduled_datetime'], format = "%Y-%m-%d %H:%M:%S")
carta.trip_date = pd.to_datetime(carta['trip_date'], format="%Y-%m-%d")
carta.trip_start_time = pd.to_datetime(carta['trip_start_time'], format="%Y-%m-%d %H:%M:%S")

In [4]:
carta['year'] = carta['date'].dt.year
carta['month'] = carta['date'].dt.month

In [5]:
service_period = carta['service_period']

service_kind = []

for i in service_period:
    if (i == 'Saturday'):
        kind = 'weekend'
    elif (i == 'Sunday'):
        kind = 'weekend'
    else:
        kind = 'weekday'
        
    service_kind.append(kind)

In [6]:
carta.head()

Unnamed: 0,trip_id,scheduled_arrival_time,actual_arrival_time,stop_id,stop_sequence,stop_lat,stop_lon,route_id,direction_id,board_count,...,scheduled_datetime,actual_arrival_datetime,trip_start_time,day_of_week,trip_date,hour,year,month,Estimated_Temp,Estimated_Precip
0,134313,1900-01-01 04:21:00,1900-01-01 04:22:00,354,1,35.056167,-85.268713,4,0,0,...,2019-01-02 04:21:00,2019-01-02 04:22:00,1900-01-01 04:21:00,2,2019-01-02,4,2019,1,8.69,0.0
1,134313,1900-01-01 04:23:51,1900-01-01 04:24:00,505,2,35.056017,-85.28108,4,0,0,...,2019-01-02 04:23:51,2019-01-02 04:24:00,1900-01-01 04:21:00,2,2019-01-02,4,2019,1,8.6615,0.0
2,134360,1900-01-01 04:26:00,1900-01-01 04:25:00,354,1,35.056167,-85.268713,4,0,0,...,2019-01-02 04:26:00,2019-01-02 04:25:00,1900-01-01 04:26:00,2,2019-01-02,4,2019,1,8.64,0.0
3,134313,1900-01-01 04:28:22,1900-01-01 04:27:00,284,5,35.052515,-85.302427,4,0,0,...,2019-01-02 04:28:22,2019-01-02 04:27:00,1900-01-01 04:21:00,2,2019-01-02,4,2019,1,8.616333,0.0
4,134313,1900-01-01 04:28:57,1900-01-01 04:27:00,285,6,35.053513,-85.305272,4,0,0,...,2019-01-02 04:28:57,2019-01-02 04:27:00,1900-01-01 04:21:00,2,2019-01-02,4,2019,1,8.6105,0.0


In [7]:
carta.shape

(9705963, 25)

In [8]:
carta.dtypes

trip_id                             int64
scheduled_arrival_time     datetime64[ns]
actual_arrival_time        datetime64[ns]
stop_id                            object
stop_sequence                       int64
stop_lat                          float64
stop_lon                          float64
route_id                           object
direction_id                       object
board_count                         int64
alight_count                        int64
occupancy                           int64
direction_desc                     object
service_period                     object
date                       datetime64[ns]
scheduled_datetime         datetime64[ns]
actual_arrival_datetime            object
trip_start_time            datetime64[ns]
day_of_week                         int64
trip_date                  datetime64[ns]
hour                                int64
year                                int64
month                               int64
Estimated_Temp                    

In [9]:
min(carta.scheduled_datetime)

Timestamp('2019-01-02 04:21:00')

## Subordinate Functions

In [10]:
def normalizer(x):
    
    x_min = min(x)
    x_max = max(x)
    x_norm = (x - x_min)/(x_max - x_min)
    
    return(x_norm)

In [11]:
def standardizer(x):
    x_std = x.std()
    x_mean = x.mean()
    x_standard = (x - x_mean)/x_std
    
    return(x_standard)

## Surrounding Board or Aligh Counts

In [15]:
from shapely.geometry import Point, MultiPoint
from shapely.ops import nearest_points
import geopandas as gpd
from geopy import Point
from geopy import distance

In [16]:
Bus_Stops = pd.read_csv('Bus_Stops.csv')
Bus_Stops.stop_id = Bus_Stops['stop_id'].astype(str)
Bus_Stops.head()

Unnamed: 0,stop_id,stop_lon,stop_lat
0,971,-85.246812,35.024347
1,146,-85.30462,34.989585
2,1545,-85.250863,35.026032
3,972,-85.248528,35.025717
4,81,-85.30489,34.990302


### Radial Influence

In [17]:
def radial_influence(st, radius, DT):
    n_row = DT.shape[0]
    
    stopd_index = DT[DT['stop_id'] == st].index[0]
    x0 = DT.stop_lon.iloc[stopd_index]
    y0 = DT.stop_lat.iloc[stopd_index]
    
    radial_dist = []
    condition = []
    
    for i in range(n_row):
        
        center_point = (x0, y0)
        test_point = (DT.stop_lon.iloc[i], DT.stop_lat.iloc[i])
        dist = distance.distance(center_point, test_point).miles
        
        radial_dist.append(dist)
        
        if (dist <= radius):
            
            cond = 'inside'
            
        else:
            
            cond = 'outside'
            
        condition.append(cond)
        
    influence = {'stop_id': DT.stop_id, 'Radial_Distance': radial_dist, 'Influence': condition}
    Influence = pd.DataFrame(data = influence, columns = ['stop_id', 'Radial_Distance', 'Influence'])
    
    return(Influence)

## Data Extraction

In [25]:
def data_extraction(route, direction, bus_stop, DTFRM):
    import numpy as np
    import pandas as pd
    import datetime as dt
             
    dtfrm = DTFRM[(DTFRM['route_id'] == route) & (DTFRM['direction_id'] == direction)]
    
    n_rows = DTFRM[(DTFRM['route_id'] == route) & (DTFRM['direction_id'] == direction) & (DTFRM['stop_id'] == bus_stop)].shape[0]
    
    if (n_rows == 0):
        print('There are not rows in the data set with the required characteristics. Please change them.')
        
    else:
          
        vinit = DTFRM[(DTFRM['route_id'] == route) & (DTFRM['direction_id'] == direction) & (DTFRM['stop_id'] == bus_stop)][['month', 'service_kind', 'hour', 'Estimated_temp', 'Estimated_precip']].groupby(['month', 'service_kind', 'hour']).mean()
        Vinit = vinit.reset_index(level = ['month', 'service_kind', 'hour'])
        Vinit.columns = ['month', 'service_kind', 'hour', 'mean_temp', 'mean_precip']
    
        V1 = dtfrm[(dtfrm['stop_id'] == bus_stop)][[ 'month', 'service_kind', 'hour', 'board_count']]
        V1 = V1.groupby(['month', 'service_kind', 'hour', 'board_count', 'alight_count']).count()
        V1 = V1.reset_index(level = ['month', 'service_kind', 'hour', 'board_count', 'alight_count'])
        V1.columns = ['month', 'service_kind', 'hour', 'board_count', 'alight_count']
     
        V1 = V1.merge(vinit, on = ['month', 'service_kind', 'hour'], how = 'left')
    
        Relevant_Bus_Stops = radial_influence(bus_stop, 0.5, Bus_Stops)
        bus_stops_ids = Relevant_Bus_Stops.stop_id.unique().astype(str)
    
        V2 = DTFRM[DTFRM['stop_id'].isin(bus_stops_ids)]
    
        V3 = DTFRM[DTFRM['stop_id'].isin(bus_stops_ids)][['month', 'service_kind', 'hour', 'board_count', 'alight_count']]
        V3a = V3.groupby(['month', 'service_kind', 'hour', 'board_count', 'alight_count']).count()
        V3b = V3a.reset_index(level = ['month', 'service_kind', 'hour', 'board_count', 'alight_count'])
        V3b.columns = ['month', 'service_kind', 'hour', 'surrounding_board_count', 'surrounding_alight_count']
    
        V = V1.merge(V3b, on = [ 'month', 'service_kind', 'hour'])
        
        V.month = V['month'].astype('category')
        V.service_kind = V['service_kind'].astype('category')
        V.hour = V['hour'].astype('category')
    
#        if (transf == 'Normalization'):
#                   
#            V.board_count = normalizer(V['board_count'])
#            V.alight_count = normalizer(V['alight_count'])
#            V.surrounding_board_count = normalizer(V['surrounding_board_count'])
#            V.surrounding_alight_count = normalizer(V['surrounding_alight_count'])
#            V.temp = normalizer(V['temp'])
#            V.precip = normalizer(V['precip'])
#        
#            return(V)
#        
#        elif (transf == 'Standardize'):
#        
#            V.board_count = standardizer(V['board_count'])
#            V.alight_count = standardizer(V['alight_count'])
#            V.surrounding_board_count = standardizer(V['surrounding_board_count'])
#            V.surrounding_alight_count = standardizer(V['surrounding_alight_count'])
#            V.temp = standardizer(V['temp'])
#            V.precip = standardizer(V['precip'])
#        
#            return(V)
#    
#        elif (transf == 'features'):
            
        V.surrounding_board_count = normalizer(V['surrounding_board_count'])
        V.surrounding_alight_count = normalizer(V['surrounding_alight_count'])
        V.mean_temp = normalizer(V['mean_temp'])
        V.mean_precip = normalizer(V['mean_precip'])
        
        return(V)
        
        
  

In [26]:
XDTX = data_extraction('4', '1', '12', carta)
#head(XDTX)

KeyError: "['service_kind', 'Estimated_precip', 'Estimated_temp'] not in index"

In [22]:
carta[(carta['route_id'] == '4') & (carta['direction_id'] == '1') & (carta['stop_id'] == '12')]

Unnamed: 0,trip_id,scheduled_arrival_time,actual_arrival_time,stop_id,stop_sequence,stop_lat,stop_lon,route_id,direction_id,board_count,...,scheduled_datetime,actual_arrival_datetime,trip_start_time,day_of_week,trip_date,hour,year,month,Estimated_Temp,Estimated_Precip
7312,134370,1900-01-01 09:15:32,1900-01-01 09:10:00,12,87,35.050703,-85.309532,4,1,0,...,2019-01-02 09:15:32,2019-01-02 09:10:00,1900-01-01 08:15:00,2,2019-01-02,9,2019,1,8.584778,0.000000
7845,134379,1900-01-01 09:30:32,1900-01-01 09:31:00,12,87,35.050703,-85.309532,4,1,0,...,2019-01-02 09:30:32,2019-01-02 09:31:00,1900-01-01 08:30:00,2,2019-01-02,9,2019,1,8.859778,0.000000
9213,134365,1900-01-01 10:15:32,1900-01-01 10:19:00,12,87,35.050703,-85.309532,4,1,1,...,2019-01-02 10:15:32,2019-01-02 10:19:00,1900-01-01 09:15:00,2,2019-01-02,10,2019,1,9.400000,0.000000
14579,134390,1900-01-01 13:25:32,1900-01-01 13:26:00,12,87,35.050703,-85.309532,4,1,0,...,2019-01-02 13:25:32,2019-01-02 13:26:00,1900-01-01 12:20:00,2,2019-01-02,13,2019,1,9.400000,0.212778
15330,134388,1900-01-01 13:55:32,1900-01-01 13:51:00,12,87,35.050703,-85.309532,4,1,0,...,2019-01-02 13:55:32,2019-01-02 13:51:00,1900-01-01 12:50:00,2,2019-01-02,13,2019,1,9.400000,0.462778
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9703609,150566,1900-01-01 15:29:08,1900-01-01 15:27:00,12,87,35.050703,-85.309532,4,1,0,...,2020-05-31 15:29:08,2020-05-31 15:27:00,1900-01-01 14:25:00,6,2020-05-31,15,2020,5,26.942778,0.000000
9704256,150567,1900-01-01 16:40:32,1900-01-01 16:41:00,12,88,35.050703,-85.309532,4,1,0,...,2020-05-31 16:40:32,2020-05-31 16:41:00,1900-01-01 15:35:00,6,2020-05-31,16,2020,5,27.200000,0.000000
9705133,150570,1900-01-01 18:29:08,1900-01-01 18:38:00,12,87,35.050703,-85.309532,4,1,4,...,2020-05-31 18:29:08,2020-05-31 18:38:00,1900-01-01 17:15:00,6,2020-05-31,18,2020,5,26.957222,0.000000
9705448,139858,1900-01-01 19:09:11,1900-01-01 19:05:00,12,87,35.050703,-85.309532,4,1,0,...,2020-05-31 19:09:11,2020-05-31 19:05:00,1900-01-01 18:05:00,6,2020-05-31,19,2020,5,26.531639,0.000000
