In [2]:
import sys
# Record the interpreter version in the notebook output for reproducibility
print("Python Version:", sys.version)

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime
import seaborn as sns
import folium
from folium.plugins import MarkerCluster
# Let pandas display up to 500 rows before truncating DataFrame output
pd.set_option('display.max_rows', 500)
# Render matplotlib figures inline in the notebook, using SVG output
%matplotlib inline
%config InlineBackend.figure_format = 'svg'

Python Version: 3.7.6 (default, Jan  8 2020, 13:42:34) 
[Clang 4.0.1 (tags/RELEASE_401/final)]


In [3]:
# Weekly turnstile files to load, identified by the yymmdd suffix of each file name
dates = ['190803',
        '190810',
        '190817',
        '190824',
        '190831',
        '190907',
        '190914',
        '190921',
        '190928']

# Download one frame per weekly file. The raw frames live in their own list so
# that `df_turns` only ever refers to the combined DataFrame.
weekly_frames = [
    pd.read_csv('http://web.mta.info/developers/data/nyct/turnstile/turnstile_'+date+'.txt')
    for date in dates
]

# Concatenate the weekly frames. Every file is read with its own 0-based index,
# so ignore_index=True is needed to give each row a unique label.
df_turns = pd.concat(weekly_frames, ignore_index=True)
# The per-week frames are no longer needed once combined
del weekly_frames

# Clean up column names (strip surrounding whitespace from the headers)
df_turns.columns = [name.strip() for name in df_turns.columns]

# Remove 'RECOVR AUD' rows since many are duplicates, then drop DESC, which is
# not used after this filter. `.drop` returns a new frame, so the column
# assignments below do not write into a filtered slice of another frame.
df_turns = df_turns[df_turns['DESC'] != 'RECOVR AUD'].drop(columns='DESC')

# Convert DATE to a proper datetime type
df_turns['DATE'] = pd.to_datetime(df_turns['DATE'], format='%m/%d/%Y')
# Convert TIME to datetime as well. The format has no date part, so only the
# time-of-day of the resulting timestamps is meaningful.
df_turns['TIME'] = pd.to_datetime(df_turns['TIME'], format='%H:%M:%S')

# Change in ENTRIES / EXITS between consecutive rows of the same
# (STATION, C/A, UNIT, SCP) group, i.e. per turnstile rather than per station.
# The first row of every group has no predecessor and therefore gets NaN.
# NOTE(review): this relies on the rows of each group already being in
# chronological order (weekly files loaded oldest-first and time-ordered
# internally) - confirm, or sort by DATE/TIME before differencing.
turnstile_keys = ['STATION', 'C/A', 'UNIT', 'SCP']
# Selecting the columns on a plain groupby and calling .diff() returns a frame
# aligned with df_turns, so the result needs no further re-indexing by column
# and the grouping is computed only once.
# Absolute value to deal with counters that run backward.
counter_diffs = df_turns.groupby(turnstile_keys)[['ENTRIES', 'EXITS']].diff().abs()
df_turns['Entry_Diff'] = counter_diffs['ENTRIES']
df_turns['Exit_Diff'] = counter_diffs['EXITS']

# Combined traffic (entries + exits) per reading
df_turns['Total_Traffic'] = df_turns['Entry_Diff'] + df_turns['Exit_Diff']

In [4]:
def convertTimeBuckets(time):
    """
    Map a time-like value to a named four-hour bucket of the day.

    Only the hour component (``time.hour``) is inspected; minutes and
    seconds are ignored, so 08:00 and 08:59 fall into the same bucket:

        hours 21-23 and 0-4  -> 'Late Night'
        hours 5-8            -> 'Early Morning'
        hours 9-12           -> 'Morning'
        hours 13-16          -> 'Afternoon'
        hours 17-20          -> 'Evening'

    Parameters
    ----------
    time : any object exposing an ``hour`` attribute, e.g. ``datetime.time``,
        ``datetime.datetime`` or ``pandas.Timestamp``.

    Returns
    -------
    str or None
        The bucket label, or ``None`` when ``time.hour`` is not a number
        in the range 0-23 (for example a NaN hour from a missing timestamp).
    """
    hour = time.hour

    # A NaN hour fails every comparison, so the chained test below is False
    # for it; report "no bucket" instead of falling through with no result.
    if not 0 <= hour <= 23:
        return None

    # Late night wraps around midnight: 21-23 plus 0-4
    if hour > 20 or hour <= 4:
        return 'Late Night'
    if hour <= 8:
        return 'Early Morning'
    if hour <= 12:
        return 'Morning'
    if hour <= 16:
        return 'Afternoon'
    return 'Evening'


def daytype(day):
    """Return 'Weekend' for the weekday names 'Sat' and 'Sun', else 'Weekday'."""
    return 'Weekend' if day in ('Sat', 'Sun') else 'Weekday'
def Add_Weekday(data_frame, column='Date'):
    """
    Add weekday information derived from a date column.

    The frame is modified in place and also returned:

    * ``column`` is converted with ``pd.to_datetime``;
    * ``Day_Number`` receives the day of the week as a number
      (0 = Monday ... 6 = Sunday);
    * ``Weekday`` receives the matching three-letter name ('Mon' ... 'Sun').

    Parameters
    ----------
    data_frame : pandas.DataFrame
        Frame containing ``column``.
    column : str, default 'Date'
        Name of the column holding the dates.

    Returns
    -------
    pandas.DataFrame
        The same ``data_frame`` object, with the columns above added.
    """
    day_names = {0: 'Mon', 1: 'Tue', 2: 'Wed', 3: 'Thu', 4: 'Fri', 5: 'Sat', 6: 'Sun'}

    # Make sure the column is datetime-typed so the .dt accessor is available
    data_frame[column] = pd.to_datetime(data_frame[column])
    # Vectorized day-of-week lookup instead of a per-row Python lambda
    data_frame['Day_Number'] = data_frame[column].dt.dayofweek
    # Translate the weekday number into its short name
    data_frame['Weekday'] = data_frame['Day_Number'].map(day_names)
    return data_frame
# Derive the Day_Number / Weekday columns from DATE
df_turns = Add_Weekday(df_turns, column='DATE')

# Time-of-day bucket for every reading, stored as an ordered categorical so
# the buckets keep this order instead of sorting alphabetically
df_turns['Day_Bucket'] = pd.Categorical(
    df_turns['TIME'].apply(convertTimeBuckets),
    categories=['Early Morning', 'Morning', 'Afternoon', 'Evening', 'Late Night'],
    ordered=True,
)

# Weekend / weekday flag based on the weekday name
df_turns['Day_Type'] = df_turns['Weekday'].apply(daytype)