In [1]:
import pandas as pd
import requests
import pickle
from datetime import datetime as dt

In [2]:
def get_data(week_nums,
             url_template="http://web.mta.info/developers/data/nyct/turnstile/turnstile_{}.txt"):
    """Read one turnstile file per week and stack them into a single DataFrame.

    Parameters
    ----------
    week_nums : iterable
        Week identifiers; each one is substituted into ``url_template`` with
        ``str.format`` to build the location of that week's file.
    url_template : str, optional
        Format string with a single ``{}`` placeholder. Defaults to the MTA
        turnstile URL used by this notebook. Any location ``pd.read_csv``
        can open (URL or local path) is accepted.

    Returns
    -------
    pandas.DataFrame
        The rows of every weekly file, in the order of ``week_nums``, with a
        fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If ``week_nums`` is empty.
    """
    weekly_frames = [pd.read_csv(url_template.format(week_num)) for week_num in week_nums]
    if not weekly_frames:
        raise ValueError("get_data needs at least one week number")
    # Every weekly file is read with its own 0..n-1 index. Without
    # ignore_index the combined frame would carry the same labels once per
    # week, and any later label-based operation (e.g. DataFrame.drop) would
    # hit rows from all weeks at once.
    return pd.concat(weekly_frames, ignore_index=True)

In [3]:
#Routine to clean and format the MTA data
def clean_n_format(turns_df, upper_lim):
    """Turn cumulative turnstile counters into per-interval station traffic.

    Steps:
      1. Strip whitespace from the column names.
      2. Within each (C/A, UNIT, SCP, STATION, DATE) group, take the
         difference between consecutive ENTRIES / EXITS readings
         (REAL_ENTRIES / REAL_EXITS). The first reading of every group has
         no predecessor and therefore gets NaN.
      3. Discard rows with a negative difference and rows whose combined
         difference (TOTAL_REAL) is greater than ``upper_lim``.
      4. Sum the numeric columns per (STATION, DATE, TIME).

    Parameters
    ----------
    turns_df : pandas.DataFrame
        Raw turnstile rows. Needs the columns C/A, UNIT, SCP, STATION, DATE,
        TIME, ENTRIES and EXITS (surrounding whitespace in the names is
        tolerated). The frame is not modified; its index may hold duplicates.
    upper_lim : number
        Largest accepted TOTAL_REAL for a single row.

    Returns
    -------
    pandas.DataFrame
        One row per (STATION, DATE, TIME) with the summed numeric columns,
        sorted by TOTAL_REAL in descending order, with a fresh integer index.
    """
    # NOTE(review): DATE is part of the grouping key, so the interval between
    # the last reading of one day and the first reading of the next is never
    # counted. Kept as in the original; confirm this is intended.
    turnstile_day_keys = ["C/A", "UNIT", "SCP", "STATION", "DATE"]

    # rename() returns a new frame, so the caller's data stays untouched.
    # Resetting the index makes every row label unique; the frame built by
    # concatenating weekly files can repeat labels, and filtering by label
    # would then remove unrelated rows.
    turns_stations = (turns_df
                      .rename(columns=lambda column: column.strip())
                      .reset_index(drop=True))

    #Getting entries and exits per time interval
    deltas = turns_stations.groupby(turnstile_day_keys)[["ENTRIES", "EXITS"]].diff()
    turns_stations["REAL_ENTRIES"] = deltas["ENTRIES"]
    turns_stations["REAL_EXITS"] = deltas["EXITS"]
    turns_stations["TOTAL_REAL"] = turns_stations["REAL_ENTRIES"] + turns_stations["REAL_EXITS"]

    #Dropping negative entries/exits and implausibly large totals.
    # Comparisons with NaN are False, so the first reading of each group is
    # kept here and contributes 0 to the sums below.
    negative = (turns_stations["REAL_EXITS"] < 0) | (turns_stations["REAL_ENTRIES"] < 0)
    too_large = turns_stations["TOTAL_REAL"] > upper_lim
    turns_stations = turns_stations.loc[~(negative | too_large)]

    #Adding up all entries+exits for each station at a given date and time interval
    # numeric_only=True keeps the text columns out of the sum.
    turns_stations_day_time = (turns_stations
                               .groupby(["STATION", "DATE", "TIME"])
                               .sum(numeric_only=True)
                               .sort_values("TOTAL_REAL", ascending=False)
                               .reset_index())
    return turns_stations_day_time

In [4]:
#Label a timestamp with its weekday and half of the day
def get_time_chunk(date_time_object):
    """Return the weekday name followed by ' Morning' or ' Afternoon'.

    Times of day before 12:00 count as morning; 12:00 and later count as
    afternoon. The weekday name comes from ``strftime('%A')``.
    """
    weekday_name = date_time_object.strftime('%A')
    # A time of day is earlier than noon exactly when its hour is below 12.
    half_of_day = ' Morning' if date_time_object.hour < 12 else ' Afternoon'
    return weekday_name + half_of_day

In [None]:
# Week identifiers passed to get_data, one per weekly turnstile file
# (they look like YYMMDD dates -- confirm against the data source).
week_ids = [
    180616, 180609, 180602, 180526,
    170617, 170610, 170603, 170527,
    160618, 160611, 160604, 160528,
]
# Alternative two-week selection kept from the original notebook:
#week_ids = [170624, 180609]

In [None]:
# Load every selected week into one raw frame.
turns_df = get_data(week_nums=week_ids)

In [None]:
# Per-interval station totals; rows whose entries+exits delta is above
# 100000 are discarded by clean_n_format.
turns_df_2 = clean_n_format(turns_df, upper_lim=100000)

In [None]:
#Combine the DATE and TIME text columns into one parsed DATE_TIME column
turns_df_2["DATE_TIME"] = pd.to_datetime(
    turns_df_2["DATE"] + " " + turns_df_2["TIME"],
    format="%m/%d/%Y %H:%M:%S",
)

In [None]:
#Tag every row with its weekday + Morning/Afternoon label
turns_df_2["DOW"] = turns_df_2["DATE_TIME"].map(get_time_chunk)
turns_df_2.head()

In [None]:
#Total traffic per station and weekday/half-of-day chunk
# numeric_only=True is required here: turns_df_2 also holds the text columns
# DATE and TIME and the datetime column DATE_TIME. Recent pandas versions no
# longer drop such columns silently and raise a TypeError when asked to sum
# datetimes.
turns_stations_timechunks = (turns_df_2
                             .groupby(["STATION", "DOW"])
                             .sum(numeric_only=True)
                             .reset_index())
turns_stations_timechunks.sort_values("TOTAL_REAL", ascending=False).head()

In [None]:
# Save the per-station, per-chunk totals next to the notebook.
turns_stations_timechunks.to_csv(path_or_buf="MTA_16_18.csv")