In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns             
import datetime
pd.set_option('display.max_rows', 500)
%matplotlib inline

In [None]:
# Source: http://web.mta.info/developers/turnstile.html
def get_data(week_nums, data_dir="~/mta_turnstile_data"):
    """Load weekly MTA turnstile files and stack them into one DataFrame.

    Parameters
    ----------
    week_nums : iterable
        Week identifiers as they appear in the file names
        (``turnstile_<week_num>.txt``), e.g. ``180519``.
    data_dir : str, optional
        Directory that holds the downloaded files. Defaults to the
        location this notebook has always read from.

    Returns
    -------
    pandas.DataFrame
        Rows of every requested file, in the order given, with a fresh
        0..n-1 index.

    Raises
    ------
    ValueError
        If ``week_nums`` is empty, since there is nothing to concatenate.
    """
    frames = [
        pd.read_csv(f"{data_dir}/turnstile_{week_num}.txt")
        for week_num in week_nums
    ]
    if not frames:
        raise ValueError("week_nums is empty: no turnstile files to load")
    # Each file is read with its own 0..n-1 index; renumber so row labels
    # are unique in the combined frame.
    return pd.concat(frames, ignore_index=True)

# Twelve weekly file identifiers per year, matching the
# turnstile_<id>.txt file names read by get_data.
week_nums_18 = [
    180519, 180526, 180602, 180609, 180616, 180623,
    180630, 180707, 180714, 180721, 180728, 180804,
]
week_nums_19 = [
    190518, 190525, 190601, 190608, 190615, 190622,
    190629, 190706, 190713, 190720, 190727, 190803,
]
week_nums_21 = [
    210515, 210522, 210529, 210605, 210612, 210619,
    210626, 210703, 210710, 210717, 210724, 210731,
]

# One raw frame per year.
df18 = get_data(week_nums_18)
df19 = get_data(week_nums_19)
df21 = get_data(week_nums_21)

In [None]:
# Data cleaning
def clean_data(df, stations=None):
    """Filter raw turnstile rows to selected stations and add a turnstile key.

    The input frame is left untouched: header clean-up happens on a renamed
    copy rather than by assigning to ``df.columns``.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw turnstile data. After spaces are stripped from the headers it
        must contain ``C/A``, ``UNIT``, ``SCP``, ``STATION``, ``LINENAME``,
        ``DIVISION`` and ``DESC``.
    stations : iterable of str, optional
        Station names to keep. Defaults to the seven stations this
        notebook analyses.

    Returns
    -------
    pandas.DataFrame
        Rows for the selected stations, without ``LINENAME``, ``DIVISION``
        and ``DESC``, plus a ``TURNSTILE`` column built by concatenating
        ``C/A``, ``UNIT``, ``SCP`` and ``STATION``.

    Raises
    ------
    KeyError
        If any of the required columns is missing.
    """
    if stations is None:
        stations = (
            "ATLANTIC AV",
            "FLATBUSH AV-B.C",
            "ATL AV-BARCLAY",
            "25 AV",
            "BERGEN ST",
            "FULTON ST",
            "LAFAYETTE AV",
        )

    # Remove every space from the header names (some arrive padded) without
    # modifying the caller's frame.
    tidy = df.rename(columns=lambda name: name.replace(" ", ""))

    ss = tidy[tidy["STATION"].isin(list(stations))]
    ss = ss.drop(columns=["LINENAME", "DIVISION", "DESC"])  # columns not needed downstream
    # Identifier for one physical turnstile.
    ss["TURNSTILE"] = ss["C/A"] + ss["UNIT"] + ss["SCP"] + ss["STATION"]

    return ss

In [None]:
# Keep only the selected stations for 2018 and add the TURNSTILE key.
# NOTE: df18 is rebound here, so running this cell a second time fails in
# clean_data (the dropped columns are already gone); reload the data first.
df18 = clean_data(df18)

In [None]:
# Keep only the selected stations for 2019 and add the TURNSTILE key.
# NOTE: df19 is rebound here, so running this cell a second time fails in
# clean_data (the dropped columns are already gone); reload the data first.
df19 = clean_data(df19)

In [None]:
# Keep only the selected stations for 2021 and add the TURNSTILE key.
# NOTE: df21 is rebound here, so running this cell a second time fails in
# clean_data (the dropped columns are already gone); reload the data first.
df21 = clean_data(df21)

In [None]:
def get_daily_counts(row, max_counter):
    """Return the absolute change in ENTRIES for one row, or 0 if implausible.

    Parameters
    ----------
    row : mapping
        Anything indexable by ``"ENTRIES"`` and ``"PREV_ENTRIES"`` (for
        example a DataFrame row passed by ``DataFrame.apply(axis=1)``).
    max_counter : number
        Largest difference accepted as a real count.

    Returns
    -------
    number
        ``abs(ENTRIES - PREV_ENTRIES)`` when it does not exceed
        ``max_counter``; otherwise 0, after printing the two raw readings
        so the discarded pair can be inspected.
    """
    # A negative difference is treated as the same magnitude of traffic.
    counter = abs(row["ENTRIES"] - row["PREV_ENTRIES"])
    if counter > max_counter:
        # Too large to be a genuine count: report the pair and discard it.
        print(row["ENTRIES"], row["PREV_ENTRIES"])
        return 0
    return counter

# If the difference exceeds max_counter (100,000 where calculate_daily_entries calls this),
# the counter was probably reset. The count is set to zero in that case, because
# different counters have different cycle limits.

In [None]:
def calculate_daily_entries(df, max_counter=100000):
    """Aggregate cleaned turnstile rows into weekday entry totals per ISO week.

    Parameters
    ----------
    df : pandas.DataFrame
        Cleaned turnstile data with ``TURNSTILE``, ``DATE``, ``STATION``
        and ``ENTRIES`` columns.
    max_counter : number, optional
        Day-to-day differences larger than this are discarded (counted as
        0) by ``get_daily_counts``. Defaults to 100000.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(VOLUME_PER_STATION, VOLUME_PER_WEEK)``: summed ``DAILY_ENTRIES``
        for Monday-Friday rows, grouped by ``WEEK`` and ``STATION``, and by
        ``WEEK`` alone.
    """
    # One ENTRIES value per turnstile and date: the first row of each group
    # in the order rows appear in df.
    # NOTE(review): if rows are chronological this is the day's earliest
    # reading, so the difference stored against date D would measure traffic
    # during the previous recorded date - confirm this labelling is intended.
    ss = df.groupby(["TURNSTILE", "DATE", "STATION"], as_index=False).ENTRIES.first()

    # Previous date/reading for the same turnstile. Columns are selected with
    # a list (tuple selection was removed from pandas), and shift() is called
    # directly so the result keeps ss's index and aligns on assignment.
    ss[["PREV_DATE", "PREV_ENTRIES"]] = ss.groupby("TURNSTILE")[["DATE", "ENTRIES"]].shift(1)

    # The earliest date of each turnstile has no predecessor; drop it.
    ss = ss.dropna(subset=["PREV_DATE"]).copy()

    ss["DAILY_ENTRIES"] = ss.apply(get_daily_counts, axis=1, max_counter=max_counter)
    ss["DATE_TIME"] = pd.to_datetime(ss["DATE"])
    ss["WEEK"] = ss["DATE_TIME"].dt.isocalendar().week
    ss["DAY"] = ss["DATE_TIME"].dt.dayofweek

    # dayofweek runs Monday=0 .. Sunday=6; keep Monday through Friday.
    ss = ss[ss["DAY"] < 5]

    VOLUME_PER_STATION = ss.groupby(["WEEK", "STATION"])["DAILY_ENTRIES"].sum().reset_index()
    VOLUME_PER_WEEK = ss.groupby(["WEEK"])["DAILY_ENTRIES"].sum().reset_index()
    return (VOLUME_PER_STATION, VOLUME_PER_WEEK)

In [None]:
# 2019 weekday totals: per week and station, and per week overall.
VOLUME_PER_STATION19, VOLUME_PER_WEEK19 = calculate_daily_entries(df19)


In [None]:
# 2021 weekday totals: per week and station, and per week overall.
VOLUME_PER_STATION21, VOLUME_PER_WEEK21 = calculate_daily_entries(df21)

In [None]:
# Tag both 2021 aggregates with their year (as a string label) so they can
# be told apart after being combined with another year.
VOLUME_PER_WEEK21["YEAR"] = "2021"
VOLUME_PER_STATION21["YEAR"] = "2021"

In [None]:
# Tag both 2019 aggregates with their year (as a string label) so they can
# be told apart after being combined with another year.
VOLUME_PER_WEEK19["YEAR"] = "2019"
VOLUME_PER_STATION19["YEAR"] = "2019"

In [None]:
# Stack the 2019 and 2021 weekly totals. ignore_index renumbers the rows so
# the combined frame has no duplicate index labels (each input carries its
# own 0..n-1 index from reset_index).
VOLUME_PER_WEEK = pd.concat([VOLUME_PER_WEEK19, VOLUME_PER_WEEK21], ignore_index=True)

In [None]:
# Stack the 2019 and 2021 per-station totals. ignore_index renumbers the rows
# so the combined frame has no duplicate index labels (each input carries its
# own 0..n-1 index from reset_index).
VOLUME_PER_STATION = pd.concat([VOLUME_PER_STATION19, VOLUME_PER_STATION21], ignore_index=True)

In [None]:
# Output format for inline figures; supported values include 'png', 'retina', 'jpeg', 'svg', 'pdf'.
%config InlineBackend.figure_format = 'svg'
# Render matplotlib figures inline in the notebook output.
%matplotlib inline 

In [None]:
# Create a single figure of the intended size. The earlier version called
# plt.figure() twice, which left an extra empty figure in the output.
fig, ax = plt.subplots(figsize=(16, 8))
# week_barplot is the Axes returned by seaborn, as before.
week_barplot = sns.barplot(x='WEEK', y='DAILY_ENTRIES', hue='YEAR', data=VOLUME_PER_WEEK, ax=ax)
# The title now describes what is plotted: weekday entry totals per ISO week,
# one bar per year.
ax.set_title("Weekday Entries per Week at Selected Stations: 2019 vs 2021", fontsize=18)
ax.set_xlabel("ISO week")
ax.set_ylabel("Total weekday entries")
plt.show()

In [None]:
# Point plot of the 2019 weekly weekday totals, one series per station.
sns.catplot(
    data=VOLUME_PER_STATION19,
    x='WEEK',
    y='DAILY_ENTRIES',
    hue='STATION',
    kind="point",
)