In [1]:
from __future__ import print_function, division

In [2]:
import pandas as pd
from datetime import datetime
from collections import defaultdict
import numpy as np
import seaborn as sns
%matplotlib inline

In [3]:
import datetime

In [4]:
# Source: http://web.mta.info/developers/turnstile.html
def get_data(week_nums):
    url = "http://web.mta.info/developers/data/nyct/turnstile/turnstile_{}.txt"
    dfs = []
    for week_num in week_nums:
        file_url = url.format(week_num)
        dfs.append(pd.read_csv(file_url))
    return pd.concat(dfs)
        
week_nums = [190629, 190622, 190615]
df = get_data(week_nums)

In [5]:
def get_daily_counts(row, max_counter,type_exit):
    counter = row[type_exit] - row["PREV_"+type_exit]
    if counter < 0:
        counter = -counter
    if counter > max_counter:
        counter = min(row[type_exit], row["PREV_"+type_exit])
    if counter > max_counter:
        return 0
    return counter

In [6]:
def clean_data(df):
    
    #Clean column names
    df.columns = df.columns.str.strip()
    
    #Add datetime
    df['DATETIME'] = pd.to_datetime(df['DATE']+' '+df['TIME'],format="%m/%d/%Y %H:%M:%S")
    
    df.head()
    
    #Add DAY_OF_WEEK column
    #df['DAY_OF_WEEK'] = pd.to_datetime(df["DATETIME"], errors='coerce').dt.weekday_name
    
    #Get rid of duplicate entries
    df.sort_values(["C/A", "UNIT", "SCP", "STATION", "DATETIME"],inplace=True, ascending=False)
    df.drop_duplicates(subset=["C/A", "UNIT", "SCP", "STATION", "DATETIME"], inplace=True)
    
    #Drop DESC column
    turnstiles_df1 = df.drop(["DESC"], axis=1, errors="ignore")
    
    #Counting Entries and Exits
    turnstiles_daily_time = (turnstiles_df1.groupby(["C/A", "UNIT", "SCP", "STATION", "DATETIME","LINENAME"],as_index=False)['ENTRIES','EXITS'].first())
    turnstiles_daily_time[["PREV_DATE", "PREV_ENTRIES", "PREV_EXITS"]] = (turnstiles_daily_time
                                                       .groupby(["C/A", "UNIT", "SCP", "STATION","LINENAME"])["DATETIME","ENTRIES","EXITS"]
                                                       .transform(lambda grp: grp.shift(1)))
    turnstiles_daily_time.dropna(subset=["PREV_DATE"], axis=0, inplace=True)
    # If counter is > 1Million, then the counter might have been reset.  
    # Just set it to zero as different counters have different cycle limits
    turnstiles_daily_time["HOURLY_ENTRIES"] = turnstiles_daily_time.apply(get_daily_counts, axis=1, max_counter=10000, type_exit ='ENTRIES')
    turnstiles_daily_time["HOURLY_EXITS"] = turnstiles_daily_time.apply(get_daily_counts,axis = 1, max_counter = 10000, type_exit ='EXITS')
    
    #Clean '0' from Stations_Daily_Time
    turnstiles_daily_time['HOURLY_ENTRIES'].replace(0, np.nan, inplace=True)
    turnstiles_daily_time['HOURLY_EXITS'].replace(0,np.nan,inplace=True) 
    
    #Creating a Total Count Columns
    turnstiles_daily_time['TOTAL_COUNT'] = turnstiles_daily_time['HOURLY_ENTRIES']+turnstiles_daily_time['HOURLY_EXITS']
    
    #Cleaning up columns in turnstiles_daily
    turnstiles_daily_clean = turnstiles_daily_time.drop(["PREV_ENTRIES", "PREV_EXITS", "PREV_DATE", "EXITS", "ENTRIES"], axis=1, errors="ignore")
    
    #Dropping Stations outside of Manhattan
    turnstiles_daily_clean = turnstiles_daily_clean[(turnstiles_daily_clean.STATION != 'LACKAWANNA')&(turnstiles_daily_clean.STATION != 'KEW GARDENS')&
                                                    (turnstiles_daily_clean.STATION != 'JKSN HT-ROOSVLT')&(turnstiles_daily_clean.STATION != 'FLUSHING-MAIN')&
                                                   (turnstiles_daily_clean.STATION != 'JOURNAL SQUARE') & (turnstiles_daily_clean.STATION != 'JUNCTION BLVD')&
                                                   (turnstiles_daily_clean.STATION != 'CONEY IS-STILLW')]
    
    return turnstiles_daily_clean
    

In [7]:
truck_data = clean_data(df)

In [8]:
truck_data['DAY_OF_WEEK'] = pd.to_datetime(truck_data["DATETIME"], errors='coerce').dt.weekday_name

In [9]:
def timebin(element):
    if element >=6 and element <= 11:
        return "Breakfast"
    elif element >11 and element <=16:
        return "Lunch"
    elif element > 16 and element <=23:
        return "Dinner"
    else: 
        return "Late Night"

truck_data['HOURS']= truck_data['DATETIME'].dt.hour.apply(timebin)

In [10]:
def weekend(day):
    if day == 'Saturday' or day == 'Sunday':
        return 'Weekend'
    else:
        return 'Workweek'
truck_data['WEEK'] = truck_data['DAY_OF_WEEK'].apply(weekend)

In [11]:
truck_data.head()

Unnamed: 0,C/A,UNIT,SCP,STATION,DATETIME,LINENAME,HOURLY_ENTRIES,HOURLY_EXITS,TOTAL_COUNT,DAY_OF_WEEK,HOURS,WEEK
1,A002,R051,02-00-00,59 ST,2019-06-08 04:00:00,NQR456W,28.0,6.0,34.0,Saturday,Late Night,Weekend
2,A002,R051,02-00-00,59 ST,2019-06-08 08:00:00,NQR456W,20.0,43.0,63.0,Saturday,Breakfast,Weekend
3,A002,R051,02-00-00,59 ST,2019-06-08 12:00:00,NQR456W,87.0,82.0,169.0,Saturday,Lunch,Weekend
4,A002,R051,02-00-00,59 ST,2019-06-08 16:00:00,NQR456W,203.0,57.0,260.0,Saturday,Lunch,Weekend
5,A002,R051,02-00-00,59 ST,2019-06-08 20:00:00,NQR456W,314.0,32.0,346.0,Saturday,Dinner,Weekend


In [61]:
def getranking(df,positions, weektype, count = 0):
    #Create dictionary
    count = count
    df1 = (df.groupby(['STATION','LINENAME', 'WEEK', 'HOURS']).sum().reset_index())
    topx_dict = defaultdict(str)
    hours = ['Breakfast','Lunch','Dinner','Late Night']
    for k in hours:
        topx_dict[k] = (df1[(df1['WEEK'] == weektype)&(df1['HOURS'] == k)].reset_index().sort_values(by='TOTAL_COUNT',ascending=False).head(positions))
    #Replace NaN for top stations in dictionary
    if count == 0:
        df2 = filldata(df,topx_dict)
        topx_dict = getranking(df2,positions,weektype,count = 1)
    if count == 1:
        for k in hours:
            topx_dict[k] = topx_dict[k].drop(["index"], axis=1, errors="ignore")
    return topx_dict

In [62]:
#Fill Nan or 0's in top stations
def filldata(df, dictionary):
    for key in dictionary:
        length = len(dictionary[key])
    for k in range(length):
        station = dictionary[key].STATION
        df[(df.STATION == str(station))].fillna(df[(df.STATION == str(station))].TOTAL_COUNT.mean(), inplace=True)
    return df

In [67]:
#Get Top10
top10_weekend = getranking(truck_data,10,weektype ='Weekend')

In [69]:
top10_weekend['Lunch']

Unnamed: 0,STATION,LINENAME,WEEK,HOURS,HOURLY_ENTRIES,HOURLY_EXITS,TOTAL_COUNT
90,34 ST-HERALD SQ,BDFMNQRW,Weekend,Lunch,158180.0,192998.0,350967.0
310,GRD CNTRL-42 ST,4567S,Weekend,Lunch,121292.0,125728.0,246250.0
94,34 ST-PENN STA,ACE,Weekend,Lunch,111524.0,112104.0,221954.0
103,42 ST-PORT AUTH,ACENQRS1237W,Weekend,Lunch,116011.0,98225.0,214218.0
441,TIMES SQ-42 ST,1237ACENQRSW,Weekend,Lunch,69186.0,112870.0,181331.0
126,59 ST COLUMBUS,ABCD1,Weekend,Lunch,81100.0,96512.0,177593.0
157,86 ST,456,Weekend,Lunch,62228.0,84716.0,146944.0
122,59 ST,456NQRW,Weekend,Lunch,44919.0,71071.0,115216.0
19,125 ST,ACBD,Weekend,Lunch,59624.0,51610.0,111221.0
107,47-50 STS ROCK,BDFM,Weekend,Lunch,37034.0,62581.0,99534.0
