In [1]:
from __future__ import print_function, division

In [2]:
import pandas as pd
from datetime import datetime
import numpy as np

%matplotlib inline

In [3]:
import datetime

In [4]:
# Source: http://web.mta.info/developers/turnstile.html
def get_data(week_nums):
    """Download weekly MTA turnstile files and stack them into one frame.

    Parameters
    ----------
    week_nums : iterable
        Week identifiers; each one is substituted into the turnstile file
        URL template and the resulting URL is read with ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        Row-wise concatenation of the files in the order given. Each file
        keeps its own row index, so index labels repeat across weeks.
    """
    url_template = "http://web.mta.info/developers/data/nyct/turnstile/turnstile_{}.txt"
    weekly_frames = [pd.read_csv(url_template.format(week)) for week in week_nums]
    return pd.concat(weekly_frames)
        
# Week identifiers substituted into the turnstile file URL by get_data
# (they look like YYMMDD dates -- TODO confirm against the MTA file listing).
week_nums = [190629, 190622, 190615]
# Read every listed weekly file and concatenate them into one raw frame.
df = get_data(week_nums)

In [5]:
def get_daily_counts(row, max_counter,type_exit):
    """Return the count accumulated between two consecutive register readings.

    Parameters
    ----------
    row : mapping-like
        Must be indexable by ``type_exit`` (current register value) and by
        ``"PREV_" + type_exit`` (previous register value).
    max_counter : number
        Largest count accepted as plausible for one interval.
    type_exit : str
        Name of the register column, e.g. ``'ENTRIES'`` or ``'EXITS'``.

    Returns
    -------
    number
        The absolute difference between the two readings when it does not
        exceed ``max_counter``; otherwise the smaller of the two readings;
        and 0 when that fallback also exceeds ``max_counter``.
    """
    current = row[type_exit]
    previous = row["PREV_"+type_exit]

    delta = current - previous
    # A negative difference is taken as a register counting backwards.
    if delta < 0:
        delta = -delta
    # An implausibly large jump: fall back to the smaller raw reading.
    if delta > max_counter:
        delta = min(current, previous)
    # Still implausible: report nothing for this interval.
    return 0 if delta > max_counter else delta

In [6]:
def clean_data(df, max_counter=10000):
    """Turn raw MTA turnstile readings into per-interval entry/exit counts.

    Steps: strip whitespace from the column names, build a DATETIME column
    from DATE and TIME, drop duplicate readings, pair every reading with the
    previous reading of the same turnstile, and convert the cumulative
    ENTRIES / EXITS registers into per-interval counts via
    ``get_daily_counts``.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw turnstile data with (possibly whitespace-padded) columns C/A,
        UNIT, SCP, STATION, DATE, TIME, ENTRIES and EXITS. A DESC column, if
        present, is discarded. The frame is copied, not modified.
    max_counter : int, optional
        Largest count accepted as plausible for one interval; passed through
        to ``get_daily_counts``. Defaults to 10000, the value previously
        hard-coded here.

    Returns
    -------
    pandas.DataFrame
        One row per turnstile reading that has a predecessor, with columns
        C/A, UNIT, SCP, STATION, DATETIME, HOURLY_ENTRIES, HOURLY_EXITS and
        TOTAL_COUNT. Despite the HOURLY_ prefix, each count covers the span
        between two consecutive readings of a turnstile. Zero counts are
        stored as NaN, and rows for station 'LACKAWANNA' are removed.
    """
    turnstile_keys = ["C/A", "UNIT", "SCP", "STATION"]
    reading_keys = turnstile_keys + ["DATETIME"]

    # Work on a copy so the caller's raw frame is left exactly as loaded.
    raw = df.copy()

    # Clean column names
    raw.columns = raw.columns.str.strip()

    # Add datetime
    raw["DATETIME"] = pd.to_datetime(
        raw["DATE"] + " " + raw["TIME"], format="%m/%d/%Y %H:%M:%S"
    )

    # Get rid of duplicate entries (same sort / keep-first rule as before),
    # then drop the DESC column if it exists.
    deduped = (
        raw.sort_values(reading_keys, ascending=False)
        .drop_duplicates(subset=reading_keys)
        .drop(["DESC"], axis=1, errors="ignore")
    )

    # One row per (turnstile, timestamp). Columns are selected with a list:
    # tuple-style selection on a GroupBy raises on pandas >= 2.0. groupby
    # sorts its keys, so the readings come out in ascending time order.
    readings = deduped.groupby(reading_keys, as_index=False)[["ENTRIES", "EXITS"]].first()

    # Attach the previous reading of the same turnstile to every row.
    previous = readings.groupby(turnstile_keys)[["DATETIME", "ENTRIES", "EXITS"]].shift(1)
    readings["PREV_DATE"] = previous["DATETIME"]
    readings["PREV_ENTRIES"] = previous["ENTRIES"]
    readings["PREV_EXITS"] = previous["EXITS"]

    # The first reading of each turnstile has no predecessor to diff against.
    readings = readings.dropna(subset=["PREV_DATE"], axis=0).copy()

    # Counts above max_counter are treated as a register reset or glitch;
    # get_daily_counts falls back to the smaller raw reading or to 0.
    # A count of 0 is then stored as NaN. The replaced column is assigned
    # back explicitly because a chained in-place replace does not update the
    # frame under pandas Copy-on-Write.
    for register, count_col in (("ENTRIES", "HOURLY_ENTRIES"), ("EXITS", "HOURLY_EXITS")):
        counts = readings.apply(
            get_daily_counts, axis=1, max_counter=max_counter, type_exit=register
        )
        readings[count_col] = counts.replace(0, np.nan)

    # Creating a total count column.
    # NOTE(review): NaN propagates through '+', so a row with a NaN (zero)
    # entry or exit count gets a NaN total and drops out of later sums.
    # Kept as-is to preserve existing results -- confirm this is intended.
    readings["TOTAL_COUNT"] = readings["HOURLY_ENTRIES"] + readings["HOURLY_EXITS"]

    # Cleaning up helper and raw register columns
    cleaned = readings.drop(
        ["PREV_ENTRIES", "PREV_EXITS", "PREV_DATE", "EXITS", "ENTRIES"],
        axis=1,
        errors="ignore",
    )

    # Dropping Lackawanna (the reason is not recorded in this notebook)
    return cleaned[cleaned["STATION"] != "LACKAWANNA"]
    

In [7]:
# Run the cleaning pipeline on the raw turnstile frame; the result carries
# the HOURLY_ENTRIES, HOURLY_EXITS and TOTAL_COUNT columns used below.
truck_data = clean_data(df)

In [8]:
def timebin(element):
    """Map an hour of the day to a meal-window label.

    Windows: 6 through 11 -> "breakfast"; above 11 through 16 -> "lunch";
    above 16 through 23 -> "dinner"; anything else (including hours 0-5)
    -> "midnight snack".
    """
    if 6 <= element <= 11:
        return "breakfast"
    if 11 < element <= 16:
        return "lunch"
    if 16 < element <= 23:
        return "dinner"
    return "midnight snack"

# Label each reading with the meal window of its timestamp's hour. Note that
# the 'hours' column therefore holds labels such as "lunch", not numbers.
truck_data['hours']= truck_data['DATETIME'].dt.hour.apply(timebin)

In [9]:
# Preview the first rows of the cleaned, meal-labelled readings.
truck_data.head()

Unnamed: 0,C/A,UNIT,SCP,STATION,DATETIME,HOURLY_ENTRIES,HOURLY_EXITS,TOTAL_COUNT,hours
1,A002,R051,02-00-00,59 ST,2019-06-08 04:00:00,28.0,6.0,34.0,midnight snack
2,A002,R051,02-00-00,59 ST,2019-06-08 08:00:00,20.0,43.0,63.0,breakfast
3,A002,R051,02-00-00,59 ST,2019-06-08 12:00:00,87.0,82.0,169.0,lunch
4,A002,R051,02-00-00,59 ST,2019-06-08 16:00:00,203.0,57.0,260.0,lunch
5,A002,R051,02-00-00,59 ST,2019-06-08 20:00:00,314.0,32.0,346.0,dinner


In [11]:
# Total entries, exits and combined traffic per station and meal window.
# The count columns are selected explicitly: summing the whole frame would
# also try to aggregate the string and datetime columns, which pandas >= 2.0
# no longer drops silently (the DATETIME column makes sum() raise).
df1 = truck_data.groupby(['STATION', 'hours'])[
    ['HOURLY_ENTRIES', 'HOURLY_EXITS', 'TOTAL_COUNT']
].sum()
df1

Unnamed: 0_level_0,Unnamed: 1_level_0,HOURLY_ENTRIES,HOURLY_EXITS,TOTAL_COUNT
STATION,hours,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1 AV,breakfast,24020.0,46924.0,69047.0
1 AV,dinner,96752.0,107796.0,202814.0
1 AV,lunch,143937.0,150653.0,290664.0
1 AV,midnight snack,50543.0,48865.0,97650.0
103 ST,breakfast,119167.0,58953.0,177913.0
103 ST,dinner,252935.0,198992.0,451580.0
103 ST,lunch,121023.0,66945.0,187844.0
103 ST,midnight snack,28187.0,47704.0,75594.0
103 ST-CORONA,breakfast,124914.0,12248.0,137068.0
103 ST-CORONA,dinner,56302.0,105111.0,161413.0
