In [2]:
import pandas as pd
# Let pandas render up to 1000 rows of a frame before it truncates the display.
pd.options.display.max_rows = 1000


In [3]:
# Read the four weekly MTA turnstile files stamped 191026, 191019, 191012 and 191005
# (October 2019). Each file is fetched over HTTP and parsed into its own frame,
# in the same order as before: df1 is the newest file, df4 the oldest.
df1, df2, df3, df4 = (
    pd.read_csv(weekly_url)
    for weekly_url in (
        'http://web.mta.info/developers/data/nyct/turnstile/turnstile_191026.txt',
        'http://web.mta.info/developers/data/nyct/turnstile/turnstile_191019.txt',
        'http://web.mta.info/developers/data/nyct/turnstile/turnstile_191012.txt',
        'http://web.mta.info/developers/data/nyct/turnstile/turnstile_191005.txt',
    )
)

In [4]:
# Stack the four weekly frames into one; ignore_index=True discards the per-file
# row labels and gives the combined frame a fresh 0..n-1 index.
df = pd.concat([df1, df2, df3, df4], ignore_index=True)
# The raw header pads the exits column label with trailing spaces. Strip surrounding
# whitespace from every column label instead of matching one exact padded string:
# an exact-match rename silently does nothing if the amount of padding ever differs,
# which would leave later df['EXITS'] lookups failing.
df = df.rename(columns=str.strip)

In [5]:
# Summarise the combined frame (row count, columns, dtypes, non-null counts);
# we see there are over 800,000 rows.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 825876 entries, 0 to 825875
Data columns (total 11 columns):
C/A         825876 non-null object
UNIT        825876 non-null object
SCP         825876 non-null object
STATION     825876 non-null object
LINENAME    825876 non-null object
DIVISION    825876 non-null object
DATE        825876 non-null object
TIME        825876 non-null object
DESC        825876 non-null object
ENTRIES     825876 non-null int64
EXITS       825876 non-null int64
dtypes: int64(2), object(9)
memory usage: 69.3+ MB


In [6]:
# Preview the first rows of the combined frame to check the columns and values.
df.head()

Unnamed: 0,C/A,UNIT,SCP,STATION,LINENAME,DIVISION,DATE,TIME,DESC,ENTRIES,EXITS
0,A002,R051,02-00-00,59 ST,NQR456W,BMT,10/19/2019,00:00:00,REGULAR,7238905,2452500
1,A002,R051,02-00-00,59 ST,NQR456W,BMT,10/19/2019,04:00:00,REGULAR,7238924,2452505
2,A002,R051,02-00-00,59 ST,NQR456W,BMT,10/19/2019,08:00:00,REGULAR,7238945,2452536
3,A002,R051,02-00-00,59 ST,NQR456W,BMT,10/19/2019,12:00:00,REGULAR,7239029,2452602
4,A002,R051,02-00-00,59 ST,NQR456W,BMT,10/19/2019,16:00:00,REGULAR,7239280,2452651


In [7]:
# Count the distinct STATION labels (377 in this data). dropna=False keeps the
# count identical to len(unique()), which would also count a missing value.
df["STATION"].nunique(dropna=False)

377

In [8]:
# Inspect the Python type of the first DATE value: it is a plain string,
# whereas we want real datetime values to work with.
type(df.loc[0, "DATE"])

str

In [9]:
# Join the DATE and TIME strings (e.g. "10/19/2019" + " " + "04:00:00") and parse
# the result into a single datetime column.
# The format is given explicitly so pandas does not have to infer it across
# ~800k strings (much slower) and so month/day order is never left to inference;
# a row that does not match the layout raises instead of being mis-parsed.
df["DATE_TIME"] = pd.to_datetime(
    df["DATE"] + " " + df["TIME"],
    format="%m/%d/%Y %H:%M:%S",
)

In [10]:
# Derive the weekday number of every timestamp (Monday=0 ... Sunday=6).
df["DAY_INT"] = df["DATE_TIME"].dt.weekday

In [11]:
# Preview the frame again to confirm the new DATE_TIME and DAY_INT columns.
df.head()

Unnamed: 0,C/A,UNIT,SCP,STATION,LINENAME,DIVISION,DATE,TIME,DESC,ENTRIES,EXITS,DATE_TIME,DAY_INT
0,A002,R051,02-00-00,59 ST,NQR456W,BMT,10/19/2019,00:00:00,REGULAR,7238905,2452500,2019-10-19 00:00:00,5
1,A002,R051,02-00-00,59 ST,NQR456W,BMT,10/19/2019,04:00:00,REGULAR,7238924,2452505,2019-10-19 04:00:00,5
2,A002,R051,02-00-00,59 ST,NQR456W,BMT,10/19/2019,08:00:00,REGULAR,7238945,2452536,2019-10-19 08:00:00,5
3,A002,R051,02-00-00,59 ST,NQR456W,BMT,10/19/2019,12:00:00,REGULAR,7239029,2452602,2019-10-19 12:00:00,5
4,A002,R051,02-00-00,59 ST,NQR456W,BMT,10/19/2019,16:00:00,REGULAR,7239280,2452651,2019-10-19 16:00:00,5


In [12]:
# Lookup table from day-of-week numbers (0-6) to the day names,
# built by numbering the names in order starting at Monday = 0.
day_dict = dict(enumerate([
    'Monday',
    'Tuesday',
    'Wednesday',
    'Thursday',
    'Friday',
    'Saturday',
    'Sunday',
]))

In [13]:
# Translate each weekday number into its name through day_dict and store the
# result as a new string column, then preview the frame.
day_names = df["DAY_INT"].map(day_dict)
df["DAY_STR"] = day_names
df.head()

Unnamed: 0,C/A,UNIT,SCP,STATION,LINENAME,DIVISION,DATE,TIME,DESC,ENTRIES,EXITS,DATE_TIME,DAY_INT,DAY_STR
0,A002,R051,02-00-00,59 ST,NQR456W,BMT,10/19/2019,00:00:00,REGULAR,7238905,2452500,2019-10-19 00:00:00,5,Saturday
1,A002,R051,02-00-00,59 ST,NQR456W,BMT,10/19/2019,04:00:00,REGULAR,7238924,2452505,2019-10-19 04:00:00,5,Saturday
2,A002,R051,02-00-00,59 ST,NQR456W,BMT,10/19/2019,08:00:00,REGULAR,7238945,2452536,2019-10-19 08:00:00,5,Saturday
3,A002,R051,02-00-00,59 ST,NQR456W,BMT,10/19/2019,12:00:00,REGULAR,7239029,2452602,2019-10-19 12:00:00,5,Saturday
4,A002,R051,02-00-00,59 ST,NQR456W,BMT,10/19/2019,16:00:00,REGULAR,7239280,2452651,2019-10-19 16:00:00,5,Saturday


In [14]:
# Show the first 1000 rows (the display limit configured at the top of the notebook).
df.head(1000)

Unnamed: 0,C/A,UNIT,SCP,STATION,LINENAME,DIVISION,DATE,TIME,DESC,ENTRIES,EXITS,DATE_TIME,DAY_INT,DAY_STR
0,A002,R051,02-00-00,59 ST,NQR456W,BMT,10/19/2019,00:00:00,REGULAR,7238905,2452500,2019-10-19 00:00:00,5,Saturday
1,A002,R051,02-00-00,59 ST,NQR456W,BMT,10/19/2019,04:00:00,REGULAR,7238924,2452505,2019-10-19 04:00:00,5,Saturday
2,A002,R051,02-00-00,59 ST,NQR456W,BMT,10/19/2019,08:00:00,REGULAR,7238945,2452536,2019-10-19 08:00:00,5,Saturday
3,A002,R051,02-00-00,59 ST,NQR456W,BMT,10/19/2019,12:00:00,REGULAR,7239029,2452602,2019-10-19 12:00:00,5,Saturday
4,A002,R051,02-00-00,59 ST,NQR456W,BMT,10/19/2019,16:00:00,REGULAR,7239280,2452651,2019-10-19 16:00:00,5,Saturday
5,A002,R051,02-00-00,59 ST,NQR456W,BMT,10/19/2019,20:00:00,REGULAR,7239629,2452702,2019-10-19 20:00:00,5,Saturday
6,A002,R051,02-00-00,59 ST,NQR456W,BMT,10/20/2019,00:00:00,REGULAR,7239774,2452727,2019-10-20 00:00:00,6,Sunday
7,A002,R051,02-00-00,59 ST,NQR456W,BMT,10/20/2019,04:00:00,REGULAR,7239785,2452734,2019-10-20 04:00:00,6,Sunday
8,A002,R051,02-00-00,59 ST,NQR456W,BMT,10/20/2019,08:00:00,REGULAR,7239797,2452756,2019-10-20 08:00:00,6,Sunday
9,A002,R051,02-00-00,59 ST,NQR456W,BMT,10/20/2019,12:00:00,REGULAR,7239863,2452808,2019-10-20 12:00:00,6,Sunday


In [15]:
# Order every reading chronologically by its combined timestamp; sort_values
# returns a new frame, so df itself keeps its original row order.
ordered_date_df = df.sort_values("DATE_TIME")
ordered_date_df.head(1000)

Unnamed: 0,C/A,UNIT,SCP,STATION,LINENAME,DIVISION,DATE,TIME,DESC,ENTRIES,EXITS,DATE_TIME,DAY_INT,DAY_STR
778943,R237,R046,01-00-05,GRD CNTRL-42 ST,4567S,IRT,09/28/2019,00:00:00,REGULAR,6179894,4636444,2019-09-28,5,Saturday
640964,C012,R258,01-06-01,4AV-9 ST,DFGMNR,BMT,09/28/2019,00:00:00,REGULAR,2724169,1451400,2019-09-28,5,Saturday
749239,R102,R304,01-06-00,RECTOR ST,1,IRT,09/28/2019,00:00:00,REGULAR,47372,3558817,2019-09-28,5,Saturday
629685,A055,R227,00-00-03,RECTOR ST,NRW,BMT,09/28/2019,00:00:00,REGULAR,3265726,1082738,2019-09-28,5,Saturday
804160,R509,R121,00-00-03,QUEENSBORO PLZ,7NQW,IRT,09/28/2019,00:00:00,REGULAR,5958818,1379084,2019-09-28,5,Saturday
685441,N116,R198,00-03-01,NOSTRAND AV,AC,IND,09/28/2019,00:00:00,REGULAR,7286534,3621278,2019-09-28,5,Saturday
814412,R601A,R108,02-00-05,BOROUGH HALL,2345R,IRT,09/28/2019,00:00:00,REGULAR,4213736,6570765,2019-09-28,5,Saturday
633327,B013,R196,01-00-01,PROSPECT PARK,BQS,BMT,09/28/2019,00:00:00,REGULAR,13710455,10097193,2019-09-28,5,Saturday
671028,N057,R188,00-05-00,50 ST,CE,IND,09/28/2019,00:00:00,REGULAR,3539,0,2019-09-28,5,Saturday
779699,R238,R046,00-03-04,GRD CNTRL-42 ST,4567S,IRT,09/28/2019,00:00:00,REGULAR,37660023,9679136,2019-09-28,5,Saturday


In [16]:
# Group by station, scp, unit and c/a to isolate the individual counters, then
# subtract each counter's previous ENTRIES reading from the current one to get
# the change at every timestamp. A counter's first reading has no predecessor,
# so its (NaN) difference is replaced with 0.
counter_keys = ['STATION', 'SCP', 'UNIT', 'C/A']
entries_by_counter = ordered_date_df.groupby(counter_keys)['ENTRIES']
ordered_date_df['ENTRIES_DIFF'] = entries_by_counter.diff().fillna(0)
ordered_date_df.head()

Unnamed: 0,C/A,UNIT,SCP,STATION,LINENAME,DIVISION,DATE,TIME,DESC,ENTRIES,EXITS,DATE_TIME,DAY_INT,DAY_STR,ENTRIES_DIFF
778943,R237,R046,01-00-05,GRD CNTRL-42 ST,4567S,IRT,09/28/2019,00:00:00,REGULAR,6179894,4636444,2019-09-28,5,Saturday,0.0
640964,C012,R258,01-06-01,4AV-9 ST,DFGMNR,BMT,09/28/2019,00:00:00,REGULAR,2724169,1451400,2019-09-28,5,Saturday,0.0
749239,R102,R304,01-06-00,RECTOR ST,1,IRT,09/28/2019,00:00:00,REGULAR,47372,3558817,2019-09-28,5,Saturday,0.0
629685,A055,R227,00-00-03,RECTOR ST,NRW,BMT,09/28/2019,00:00:00,REGULAR,3265726,1082738,2019-09-28,5,Saturday,0.0
804160,R509,R121,00-00-03,QUEENSBORO PLZ,7NQW,IRT,09/28/2019,00:00:00,REGULAR,5958818,1379084,2019-09-28,5,Saturday,0.0


In [18]:
# Group by station, scp, unit and c/a to isolate the individual counters, then
# subtract each counter's previous EXITS reading from the current one to get
# the change at every timestamp. A counter's first reading has no predecessor,
# so its (NaN) difference is replaced with 0.
counter_keys = ['STATION', 'SCP', 'UNIT', 'C/A']
exits_by_counter = ordered_date_df.groupby(counter_keys)['EXITS']
ordered_date_df['EXIT_DIFF'] = exits_by_counter.diff().fillna(0)
ordered_date_df.head()

Unnamed: 0,C/A,UNIT,SCP,STATION,LINENAME,DIVISION,DATE,TIME,DESC,ENTRIES,EXITS,DATE_TIME,DAY_INT,DAY_STR,ENTRIES_DIFF,EXIT_DIFF
778943,R237,R046,01-00-05,GRD CNTRL-42 ST,4567S,IRT,09/28/2019,00:00:00,REGULAR,6179894,4636444,2019-09-28,5,Saturday,0.0,0.0
640964,C012,R258,01-06-01,4AV-9 ST,DFGMNR,BMT,09/28/2019,00:00:00,REGULAR,2724169,1451400,2019-09-28,5,Saturday,0.0,0.0
749239,R102,R304,01-06-00,RECTOR ST,1,IRT,09/28/2019,00:00:00,REGULAR,47372,3558817,2019-09-28,5,Saturday,0.0,0.0
629685,A055,R227,00-00-03,RECTOR ST,NRW,BMT,09/28/2019,00:00:00,REGULAR,3265726,1082738,2019-09-28,5,Saturday,0.0,0.0
804160,R509,R121,00-00-03,QUEENSBORO PLZ,7NQW,IRT,09/28/2019,00:00:00,REGULAR,5958818,1379084,2019-09-28,5,Saturday,0.0,0.0


In [19]:
# Look at the first 1000 chronologically ordered rows with both diff columns.
ordered_date_df.head(1000)

Unnamed: 0,C/A,UNIT,SCP,STATION,LINENAME,DIVISION,DATE,TIME,DESC,ENTRIES,EXITS,DATE_TIME,DAY_INT,DAY_STR,ENTRIES_DIFF,EXIT_DIFF
778943,R237,R046,01-00-05,GRD CNTRL-42 ST,4567S,IRT,09/28/2019,00:00:00,REGULAR,6179894,4636444,2019-09-28,5,Saturday,0.0,0.0
640964,C012,R258,01-06-01,4AV-9 ST,DFGMNR,BMT,09/28/2019,00:00:00,REGULAR,2724169,1451400,2019-09-28,5,Saturday,0.0,0.0
749239,R102,R304,01-06-00,RECTOR ST,1,IRT,09/28/2019,00:00:00,REGULAR,47372,3558817,2019-09-28,5,Saturday,0.0,0.0
629685,A055,R227,00-00-03,RECTOR ST,NRW,BMT,09/28/2019,00:00:00,REGULAR,3265726,1082738,2019-09-28,5,Saturday,0.0,0.0
804160,R509,R121,00-00-03,QUEENSBORO PLZ,7NQW,IRT,09/28/2019,00:00:00,REGULAR,5958818,1379084,2019-09-28,5,Saturday,0.0,0.0
685441,N116,R198,00-03-01,NOSTRAND AV,AC,IND,09/28/2019,00:00:00,REGULAR,7286534,3621278,2019-09-28,5,Saturday,0.0,0.0
814412,R601A,R108,02-00-05,BOROUGH HALL,2345R,IRT,09/28/2019,00:00:00,REGULAR,4213736,6570765,2019-09-28,5,Saturday,0.0,0.0
633327,B013,R196,01-00-01,PROSPECT PARK,BQS,BMT,09/28/2019,00:00:00,REGULAR,13710455,10097193,2019-09-28,5,Saturday,0.0,0.0
671028,N057,R188,00-05-00,50 ST,CE,IND,09/28/2019,00:00:00,REGULAR,3539,0,2019-09-28,5,Saturday,0.0,0.0
779699,R238,R046,00-03-04,GRD CNTRL-42 ST,4567S,IRT,09/28/2019,00:00:00,REGULAR,37660023,9679136,2019-09-28,5,Saturday,0.0,0.0


In [22]:
# Look at the totals for entries and exits: sum the per-reading changes
# within each STATION label.
# NOTE(review): rows sharing a STATION label can carry different LINENAME values
# (e.g. RECTOR ST appears with both "1" and "NRW"), and they are summed together
# here - confirm that merging them is intended.
diff_columns = ["ENTRIES_DIFF", "EXIT_DIFF"]
entries_exit_totals = ordered_date_df.groupby(["STATION"])[diff_columns].sum()
entries_exit_totals

Unnamed: 0_level_0,ENTRIES_DIFF,EXIT_DIFF
STATION,Unnamed: 1_level_1,Unnamed: 2_level_1
1 AV,408104.0,450796.0
103 ST,753748.0,545580.0
103 ST-CORONA,515401.0,381631.0
104 ST,-3158009000.0,-2539205000.0
110 ST,270234.0,241455.0
111 ST,452087.0,234678.0
116 ST,465680.0,232180.0
116 ST-COLUMBIA,413453.0,206005.0
121 ST,54766.0,35687.0
125 ST,1892728.0,1640675.0


In [29]:
# Stations whose summed entry change or summed exit change is below zero... fishy.
diff_totals = entries_exit_totals[["ENTRIES_DIFF", "EXIT_DIFF"]]
has_negative_total = diff_totals.lt(0.0).any(axis=1)
negative_totals = entries_exit_totals[has_negative_total]
negative_totals

Unnamed: 0_level_0,ENTRIES_DIFF,EXIT_DIFF
STATION,Unnamed: 1_level_1,Unnamed: 2_level_1
104 ST,-3158009000.0,-2539205000.0
190 ST,27752.0,-18906.0
207 ST,39456.0,-91562.0
23 ST,-150453500.0,1288486.0
34 ST-HUDSON YD,-2703343.0,6548.0
42 ST-BRYANT PK,-4429612.0,-6250974.0
47-50 STS ROCK,-812764.0,-1396884.0
59 ST,-701825800.0,-720479900.0
59 ST COLUMBUS,-47823040.0,-15181260.0
ASTOR PL,-16842460.0,-1512901.0


In [31]:
# Number of stations that have at least one negative total.
negative_totals["ENTRIES_DIFF"].shape[0]

21

In [34]:
# Combined traffic per station = summed entry change + summed exit change.
# Display the stations in ascending order, so the most negative totals come first.
entries_exit_totals["COMBINED"] = entries_exit_totals["ENTRIES_DIFF"].add(
    entries_exit_totals["EXIT_DIFF"]
)
entries_exit_totals.sort_values("COMBINED")

Unnamed: 0_level_0,ENTRIES_DIFF,EXIT_DIFF,COMBINED
STATION,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
104 ST,-3158009000.0,-2539205000.0,-5697214000.0
GUN HILL RD,-1186245000.0,-885473300.0,-2071719000.0
59 ST,-701825800.0,-720479900.0,-1422306000.0
23 ST,-150453500.0,1288486.0,-149165000.0
59 ST COLUMBUS,-47823040.0,-15181260.0,-63004290.0
CHAMBERS ST,-3985218.0,-15338660.0,-19323880.0
ASTOR PL,-16842460.0,-1512901.0,-18355360.0
FORDHAM RD,-8556864.0,-9192013.0,-17748880.0
42 ST-BRYANT PK,-4429612.0,-6250974.0,-10680590.0
AVENUE X,-5612119.0,-2957163.0,-8569282.0
