In [1]:
import io
import pandas as pd
import requests
from datetime import datetime as dt
import matplotlib.pyplot as plt

In [2]:
# Download the turnstile_180623 export and parse it as CSV.
file_url = "http://web.mta.info/developers/data/nyct/turnstile/turnstile_180623.txt"
# A timeout keeps the cell from hanging forever if the server never answers.
response = requests.get(file_url, timeout=60)
# Fail loudly on an HTTP error instead of parsing an error page as if it were data.
response.raise_for_status()
s = response.content
turns_df = pd.read_csv(io.StringIO(s.decode('utf-8')))

In [3]:
# Number of rows in the freshly loaded table.
turns_df.shape[0]

196686

In [4]:
# Inspect the raw column labels; in the output below the 'EXITS' label carries trailing spaces.
turns_df.columns

Index(['C/A', 'UNIT', 'SCP', 'STATION', 'LINENAME', 'DIVISION', 'DATE', 'TIME',
       'DESC', 'ENTRIES',
       'EXITS                                                               '],
      dtype='object')

In [5]:
# Clean the header: strip leading/trailing whitespace from every column label,
# then show the labels again to confirm the result.
turns_df.rename(columns=str.strip, inplace=True)
turns_df.columns

Index(['C/A', 'UNIT', 'SCP', 'STATION', 'LINENAME', 'DIVISION', 'DATE', 'TIME',
       'DESC', 'ENTRIES', 'EXITS'],
      dtype='object')

In [6]:
# Merge the DATE and TIME text columns into a single timestamp per reading.
turns_df["DATE_TIME"] = pd.to_datetime(turns_df.DATE + " " + turns_df.TIME, format="%m/%d/%Y %H:%M:%S")

# NOTE: this binds a second name to the same DataFrame (no copy is made), so the
# columns added below also show up on `turns_df`.
turns_stations = turns_df

# REAL_ENTRIES / REAL_EXITS are each reading minus the previous reading within the
# same (C/A, UNIT, SCP, STATION) group; the first reading of every group has no
# predecessor and therefore gets NaN.
# The columns are selected with a list: selecting several groupby columns with a
# bare tuple (`[...]["ENTRIES", "EXITS"]`) raises on pandas >= 2.0.
# `.diff()` is the built-in equivalent of `x - x.shift(1)`.
# NOTE(review): assumes rows are already in chronological order within each group -- confirm.
turns_stations[["REAL_ENTRIES", "REAL_EXITS"]] = (
    turns_stations
    .groupby(["C/A", "UNIT", "SCP", "STATION"])[["ENTRIES", "EXITS"]]
    .diff()
)
turns_stations.head()

Unnamed: 0,C/A,UNIT,SCP,STATION,LINENAME,DIVISION,DATE,TIME,DESC,ENTRIES,EXITS,DATE_TIME,REAL_ENTRIES,REAL_EXITS
0,A002,R051,02-00-00,59 ST,NQR456W,BMT,06/16/2018,00:00:00,REGULAR,6658725,2257032,2018-06-16 00:00:00,,
1,A002,R051,02-00-00,59 ST,NQR456W,BMT,06/16/2018,04:00:00,REGULAR,6658747,2257036,2018-06-16 04:00:00,22.0,4.0
2,A002,R051,02-00-00,59 ST,NQR456W,BMT,06/16/2018,08:00:00,REGULAR,6658766,2257068,2018-06-16 08:00:00,19.0,32.0
3,A002,R051,02-00-00,59 ST,NQR456W,BMT,06/16/2018,12:00:00,REGULAR,6658866,2257139,2018-06-16 12:00:00,100.0,71.0
4,A002,R051,02-00-00,59 ST,NQR456W,BMT,06/16/2018,16:00:00,REGULAR,6659043,2257188,2018-06-16 16:00:00,177.0,49.0


In [7]:
# Percentage of rows in which REAL_ENTRIES or REAL_EXITS is NaN
100 * int(turns_stations[["REAL_ENTRIES", "REAL_EXITS"]].isnull().any(axis=1).sum()) / len(turns_stations)

2.3946798450321833

In [8]:
# Discard rows that lack either delta, then store entries + exits per reading as TOTAL_REAL.
turns_stations.dropna(axis="index", subset=["REAL_ENTRIES", "REAL_EXITS"], inplace=True)
turns_stations["TOTAL_REAL"] = turns_stations["REAL_ENTRIES"].add(turns_stations["REAL_EXITS"])
turns_stations.head()

Unnamed: 0,C/A,UNIT,SCP,STATION,LINENAME,DIVISION,DATE,TIME,DESC,ENTRIES,EXITS,DATE_TIME,REAL_ENTRIES,REAL_EXITS,TOTAL_REAL
1,A002,R051,02-00-00,59 ST,NQR456W,BMT,06/16/2018,04:00:00,REGULAR,6658747,2257036,2018-06-16 04:00:00,22.0,4.0,26.0
2,A002,R051,02-00-00,59 ST,NQR456W,BMT,06/16/2018,08:00:00,REGULAR,6658766,2257068,2018-06-16 08:00:00,19.0,32.0,51.0
3,A002,R051,02-00-00,59 ST,NQR456W,BMT,06/16/2018,12:00:00,REGULAR,6658866,2257139,2018-06-16 12:00:00,100.0,71.0,171.0
4,A002,R051,02-00-00,59 ST,NQR456W,BMT,06/16/2018,16:00:00,REGULAR,6659043,2257188,2018-06-16 16:00:00,177.0,49.0,226.0
5,A002,R051,02-00-00,59 ST,NQR456W,BMT,06/16/2018,20:00:00,REGULAR,6659281,2257223,2018-06-16 20:00:00,238.0,35.0,273.0


In [9]:
# Percentage of all rows whose REAL_ENTRIES delta is negative
100 * int(turns_stations["REAL_ENTRIES"].lt(0).sum()) / len(turns_stations)

0.8855273575863649

In [10]:
# Percentage of all rows whose REAL_EXITS delta is negative
100 * int(turns_stations["REAL_EXITS"].lt(0).sum()) / len(turns_stations)

0.7381130974705171

In [11]:
# Ten stations with the highest percentage of rows whose REAL_ENTRIES delta is negative.
# Stations without any negative row come out of the division as NaN and are dropped.
(turns_stations.loc[turns_stations["REAL_ENTRIES"].lt(0)]
 .groupby("STATION").size()
 .div(turns_stations.groupby("STATION").size())
 .dropna()
 .sort_values(ascending=False)
 .head(10)
 .mul(100))

STATION
190 ST             29.166667
104 ST             22.222222
6 AV               20.000000
174 ST             16.666667
EASTCHSTER/DYRE    14.146341
BAY PKWY           13.750000
3 AV-149 ST        13.333333
57 ST               8.565310
GRAND-NEWTOWN       8.333333
METROPOLITAN AV     7.142857
dtype: float64

In [12]:
# Ten stations with the highest percentage of rows whose REAL_EXITS delta is negative.
# Stations without any negative row come out of the division as NaN and are dropped.
(turns_stations.loc[turns_stations["REAL_EXITS"].lt(0)]
 .groupby("STATION").size()
 .div(turns_stations.groupby("STATION").size())
 .dropna()
 .sort_values(ascending=False)
 .head(10)
 .mul(100))

STATION
225 ST           33.333333
190 ST           30.833333
AVENUE I         23.170732
104 ST           22.222222
6 AV             20.000000
174 ST           16.666667
HIGH ST          16.666667
57 ST-7 AV       14.285714
PROSPECT PARK    12.500000
75 AV            10.000000
dtype: float64

In [13]:
# Drop every row in which either delta (REAL_EXITS or REAL_ENTRIES) is negative.
turns_stations = turns_stations.drop(
    index=turns_stations.loc[
        turns_stations["REAL_EXITS"].lt(0) | turns_stations["REAL_ENTRIES"].lt(0)
    ].index
)

In [14]:
# Where and when the combined traffic (TOTAL_REAL) is highest: sum per station and timestamp,
# then rank in descending order.
# numeric_only=True makes the aggregation skip the text columns explicitly; on pandas >= 2.0
# a bare .sum() no longer drops them and would try to "sum" the string columns as well.
# NOTE(review): the top rows of the output below (e.g. ~2.7e9 for CANAL ST) look far too
# large to be real traffic -- presumably counter jumps/resets that survive the
# negative-delta filter; consider capping deltas before ranking.
(turns_stations
 .groupby(["STATION", "DATE_TIME"])
 .sum(numeric_only=True)
 .sort_values("TOTAL_REAL", ascending=False)
 .head(20))

Unnamed: 0_level_0,Unnamed: 1_level_0,ENTRIES,EXITS,REAL_ENTRIES,REAL_EXITS,TOTAL_REAL
STATION,DATE_TIME,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
CANAL ST,2018-06-18 09:00:00,5429112866,4273863454,2105703000.0,618378596.0,2724082000.0
LEXINGTON AV/53,2018-06-19 12:00:00,2355056998,1808673956,631933.0,404188.0,1036121.0
THIRTY THIRD ST,2018-06-18 06:25:59,337145,239224,337145.0,238191.0,575336.0
34 ST-HERALD SQ,2018-06-21 20:00:00,1552119904,1353681276,49789.0,34187.0,83976.0
34 ST-HERALD SQ,2018-06-20 20:00:00,1551998407,1353569505,51406.0,32124.0,83530.0
34 ST-HERALD SQ,2018-06-19 20:00:00,1551878180,1353459421,50683.0,32174.0,82857.0
34 ST-PENN STA,2018-06-21 20:00:00,1909588649,1709954720,34856.0,46701.0,81557.0
34 ST-PENN STA,2018-06-19 20:00:00,1909343179,1709729668,34708.0,46464.0,81172.0
34 ST-PENN STA,2018-06-20 20:00:00,1909464686,1709840091,35012.0,46007.0,81019.0
34 ST-HERALD SQ,2018-06-18 20:00:00,1551761983,1353352917,48044.0,29812.0,77856.0
