In [20]:
from __future__ import print_function, division

In [21]:
import pandas as pd
from datetime import datetime
import numpy as np

%matplotlib inline

In [22]:
import datetime

In [23]:
# Source: http://web.mta.info/developers/turnstile.html
def get_data(week_nums):
    """Download one turnstile file per entry of ``week_nums`` and stack them.

    Every element of ``week_nums`` is substituted into the file-name
    template below, the resulting URL is read with ``pd.read_csv``, and the
    per-file frames are concatenated in the order given.  Each file's own
    row index is kept, so index labels repeat across files.
    """
    template = "http://web.mta.info/developers/data/nyct/turnstile/turnstile_{}.txt"
    weekly_frames = [pd.read_csv(template.format(week)) for week in week_nums]
    return pd.concat(weekly_frames)
        
# Identifiers substituted into the turnstile file URL by get_data, one file each
# (presumably YYMMDD week-ending dates, i.e. three weeks of June 2019 — TODO confirm).
week_nums = [190629, 190622, 190615]
# Reads every listed file from its URL and concatenates them into a single frame.
df = get_data(week_nums)

In [24]:
def get_daily_counts(row, max_counter, type_exit):
    """Return the change in one counter between a reading and its predecessor.

    ``row`` must provide ``row[type_exit]`` (current reading) and
    ``row["PREV_" + type_exit]`` (previous reading).  The result is the
    absolute difference of the two.  If that exceeds ``max_counter`` the
    smaller of the two readings is used instead, and if even that exceeds
    ``max_counter`` the function returns 0.
    """
    current = row[type_exit]
    previous = row["PREV_" + type_exit]

    difference = current - previous
    # A negative difference is flipped to its magnitude.
    delta = -difference if difference < 0 else difference

    if delta > max_counter:
        # Implausibly large jump: fall back to the smaller raw reading ...
        delta = min(current, previous)
        if delta > max_counter:
            # ... and give up on this interval if that is still too large.
            return 0
    return delta

In [28]:
def clean_data(df, max_counter=10000):
    """Turn raw cumulative turnstile readings into per-interval counts.

    Steps: strip whitespace from the column names, build a ``DATETIME``
    column from ``DATE`` and ``TIME``, de-duplicate readings per turnstile
    and timestamp, difference every reading against the previous reading of
    the same turnstile (via ``get_daily_counts``), and drop helper columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw readings with (possibly whitespace-padded) columns ``C/A``,
        ``UNIT``, ``SCP``, ``STATION``, ``DATE``, ``TIME``, ``ENTRIES`` and
        ``EXITS``; ``DESC`` is ignored if present.  ``DATE`` + ``TIME`` must
        match ``%m/%d/%Y %H:%M:%S``.
        NOTE: ``df`` is modified in place -- its column names are stripped,
        a ``DATETIME`` column is added, and its rows are sorted (descending)
        and de-duplicated.  This side effect is kept on purpose because
        code outside this function may rely on it.
    max_counter : int, default 10000
        Threshold forwarded to ``get_daily_counts``; differences above it
        are treated as counter glitches.

    Returns
    -------
    pandas.DataFrame
        One row per reading that has a predecessor on the same turnstile,
        with columns ``C/A``, ``UNIT``, ``SCP``, ``STATION``, ``DATETIME``,
        ``HOURLY_ENTRIES``, ``HOURLY_EXITS`` and ``TOTAL_COUNT``.  Counts of
        0 are stored as NaN, so ``TOTAL_COUNT`` is NaN whenever either
        component is.  Rows whose ``STATION`` is ``'LACKAWANNA'`` are removed.
    """
    turnstile_keys = ["C/A", "UNIT", "SCP", "STATION"]
    reading_keys = turnstile_keys + ["DATETIME"]

    # Clean column names
    df.columns = df.columns.str.strip()

    # Add datetime
    df["DATETIME"] = pd.to_datetime(
        df["DATE"] + " " + df["TIME"], format="%m/%d/%Y %H:%M:%S"
    )

    # Get rid of duplicate entries
    df.sort_values(reading_keys, inplace=True, ascending=False)
    df.drop_duplicates(subset=reading_keys, inplace=True)

    # Drop DESC and keep one ENTRIES/EXITS reading per turnstile and timestamp.
    # Columns are selected with a list: tuple-style selection on a GroupBy
    # (``[...]['ENTRIES', 'EXITS']``) is rejected by pandas >= 2.0.
    # The groupby also leaves the rows sorted ascending by the keys, which is
    # what makes ``shift(1)`` below yield the chronologically previous reading.
    readings = (
        df.drop(["DESC"], axis=1, errors="ignore")
        .groupby(reading_keys, as_index=False)[["ENTRIES", "EXITS"]]
        .first()
    )

    # Attach the previous reading of the same turnstile to every row.
    previous = readings.groupby(turnstile_keys)[["DATETIME", "ENTRIES", "EXITS"]].shift(1)
    readings["PREV_DATE"] = previous["DATETIME"]
    readings["PREV_ENTRIES"] = previous["ENTRIES"]
    readings["PREV_EXITS"] = previous["EXITS"]

    # The first reading of each turnstile has no predecessor to diff against.
    readings.dropna(subset=["PREV_DATE"], axis=0, inplace=True)

    # Differences larger than ``max_counter`` are treated as counter glitches
    # or resets by get_daily_counts, which falls back to the smaller raw
    # reading or to 0.  Zeros are then stored as NaN.  The result of
    # ``replace`` is assigned back explicitly: calling it with
    # ``inplace=True`` on ``frame[col]`` is a chained assignment that leaves
    # the frame unchanged under pandas Copy-on-Write.
    for counter in ("ENTRIES", "EXITS"):
        interval_counts = readings.apply(
            get_daily_counts, axis=1, max_counter=max_counter, type_exit=counter
        )
        readings["HOURLY_" + counter] = interval_counts.replace(0, np.nan)

    # Creating a Total Count column
    readings["TOTAL_COUNT"] = readings["HOURLY_ENTRIES"] + readings["HOURLY_EXITS"]

    # Cleaning up helper columns
    cleaned = readings.drop(
        ["PREV_ENTRIES", "PREV_EXITS", "PREV_DATE", "EXITS", "ENTRIES"],
        axis=1,
        errors="ignore",
    )

    # Dropping Lackawanna
    return cleaned[cleaned.STATION != "LACKAWANNA"]
    

In [29]:
# Run the cleaning pipeline on the combined frame.  Note that clean_data also
# modifies `df` in place (column names stripped, DATETIME added, rows sorted
# and de-duplicated).
# NOTE(review): the name `truck_data` looks like a leftover — the frame holds
# turnstile counts; confirm before renaming, later cells may use it.
truck_data = clean_data(df)