In [2]:
from selenium import webdriver
from selenium.webdriver.support import ui
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager
import time
from random import uniform
import sys
from bs4 import BeautifulSoup
import re
from fake_useragent import UserAgent

In [3]:
url = "http://web.mta.info/developers/turnstile.html"

In [4]:
response = requests.get(url)
dom = BeautifulSoup(response.content, 'lxml')

In [5]:
urls_list = []
urls = dom.find_all("div")[11].find_all("a")
for i, url in enumerate(urls):
    url_detail = str(urls[i]).split('"')[1]
    urls_list.append(url_detail)

In [6]:
urls_list[:5]

['data/nyct/turnstile/turnstile_201107.txt',
 'data/nyct/turnstile/turnstile_201031.txt',
 'data/nyct/turnstile/turnstile_201024.txt',
 'data/nyct/turnstile/turnstile_201017.txt',
 'data/nyct/turnstile/turnstile_201010.txt']

## Web crawling for the past 5 years (2015~2020)

**Field Description**

- C/A = Control Area (A002)
- UNIT = Remote Unit for a station (R051)
- SCP = Subunit Channel Position represents an specific address for a device (02-00-00)
- DATEn = Represents the date (MM-DD-YY)
- TIMEn = Represents the time (hh:mm:ss) for a scheduled audit event
- DEScn = Represent the "REGULAR" scheduled audit event (occurs every 4 hours)
- ENTRIESn = The comulative entry register value for a device
- EXISTn = The cumulative exit register value for a device

In [62]:
data = pd.read_csv('http://web.mta.info/developers/data/nyct/turnstile/turnstile_201107.txt')
data.head(10)

Unnamed: 0,C/A,UNIT,SCP,STATION,LINENAME,DIVISION,DATE,TIME,DESC,ENTRIES,EXITS
0,A002,R051,02-00-00,59 ST,NQR456W,BMT,10/31/2020,00:00:00,REGULAR,7481098,2545767
1,A002,R051,02-00-00,59 ST,NQR456W,BMT,10/31/2020,04:00:00,REGULAR,7481103,2545767
2,A002,R051,02-00-00,59 ST,NQR456W,BMT,10/31/2020,08:00:00,REGULAR,7481117,2545786
3,A002,R051,02-00-00,59 ST,NQR456W,BMT,10/31/2020,12:00:00,REGULAR,7481158,2545831
4,A002,R051,02-00-00,59 ST,NQR456W,BMT,10/31/2020,16:00:00,REGULAR,7481285,2545868
5,A002,R051,02-00-00,59 ST,NQR456W,BMT,10/31/2020,20:00:00,REGULAR,7481453,2545895
6,A002,R051,02-00-00,59 ST,NQR456W,BMT,11/01/2020,00:00:00,REGULAR,7481511,2545902
7,A002,R051,02-00-00,59 ST,NQR456W,BMT,11/01/2020,03:00:00,REGULAR,7481511,2545904
8,A002,R051,02-00-00,59 ST,NQR456W,BMT,11/01/2020,07:00:00,REGULAR,7481514,2545916
9,A002,R051,02-00-00,59 ST,NQR456W,BMT,11/01/2020,11:00:00,REGULAR,7481539,2545945


In [63]:
stations_df = pd.read_csv("https://raw.githubusercontent.com/toddwschneider/nyc-subway-turnstile-data/16d45bab6104b443bffb2f62bedad4ad587d3e96/lib/stations.csv")
stations_df.head(5)

Unnamed: 0,station,line_names,division,borough
0,1 AV,L,BMT,Manhattan
1,1 AVE,L,BMT,Manhattan
2,103 ST,1,IRT,Manhattan
3,103 ST,6,IRT,Manhattan
4,103 ST,BC,IND,Manhattan


In [64]:
len(stations_df["station"].unique()), len(data["STATION"].unique())

(572, 378)

### map station to boroughs

In [65]:
mapping = dict(stations_df[['station', 'borough']].values)
data['borough'] = data.STATION.map(mapping)
data.head(5)

Unnamed: 0,C/A,UNIT,SCP,STATION,LINENAME,DIVISION,DATE,TIME,DESC,ENTRIES,EXITS,borough
0,A002,R051,02-00-00,59 ST,NQR456W,BMT,10/31/2020,00:00:00,REGULAR,7481098,2545767,Brooklyn
1,A002,R051,02-00-00,59 ST,NQR456W,BMT,10/31/2020,04:00:00,REGULAR,7481103,2545767,Brooklyn
2,A002,R051,02-00-00,59 ST,NQR456W,BMT,10/31/2020,08:00:00,REGULAR,7481117,2545786,Brooklyn
3,A002,R051,02-00-00,59 ST,NQR456W,BMT,10/31/2020,12:00:00,REGULAR,7481158,2545831,Brooklyn
4,A002,R051,02-00-00,59 ST,NQR456W,BMT,10/31/2020,16:00:00,REGULAR,7481285,2545868,Brooklyn


In [66]:
data["borough"].unique()

array(['Brooklyn', 'Manhattan', 'Queens', 'Bronx', 'New Jersey',
       'Staten Island'], dtype=object)

### get daily ridership

In [67]:
#get rid of white spaces in column names
data.columns = ['C/A', 'UNIT', 'SCP', 'STATION', 'LINENAME', 'DIVISION', 'DATE', 'TIME','DESC', 'ENTRIES','EXITS', 'BOROUGH']

In [68]:
# get the lagged column to calculate net entry counts and net exit counts
data['entries_lagged'] = data.groupby(['STATION'])['ENTRIES'].shift(1)
data['exits_lagged'] = data.groupby(['STATION'])['EXITS'].shift(1)
data['net_exit_counts'] = abs(data["exits_lagged"] - data["EXITS"])
data['net_entry_counts'] = abs(data["entries_lagged"] - data["ENTRIES"])
data.head(2)

Unnamed: 0,C/A,UNIT,SCP,STATION,LINENAME,DIVISION,DATE,TIME,DESC,ENTRIES,EXITS,BOROUGH,entries_lagged,exits_lagged,net_exit_counts,net_entry_counts
0,A002,R051,02-00-00,59 ST,NQR456W,BMT,10/31/2020,00:00:00,REGULAR,7481098,2545767,Brooklyn,,,,
1,A002,R051,02-00-00,59 ST,NQR456W,BMT,10/31/2020,04:00:00,REGULAR,7481103,2545767,Brooklyn,7481098.0,2545767.0,0.0,5.0


In [69]:
data = data.drop(columns=['ENTRIES', 'EXITS', 'entries_lagged', 'exits_lagged'])
data.head(2)

Unnamed: 0,C/A,UNIT,SCP,STATION,LINENAME,DIVISION,DATE,TIME,DESC,BOROUGH,net_exit_counts,net_entry_counts
0,A002,R051,02-00-00,59 ST,NQR456W,BMT,10/31/2020,00:00:00,REGULAR,Brooklyn,,
1,A002,R051,02-00-00,59 ST,NQR456W,BMT,10/31/2020,04:00:00,REGULAR,Brooklyn,0.0,5.0


In [70]:
data_daily = data.groupby(by=['DATE']).sum()
data_daily

Unnamed: 0_level_0,net_exit_counts,net_entry_counts
DATE,Unnamed: 1_level_1,Unnamed: 2_level_1
10/31/2020,279118200000.0,333272500000.0
11/01/2020,8423823.0,154918300.0
11/02/2020,5041713.0,6204839.0
11/03/2020,1583897.0,1579394.0
11/04/2020,1643021.0,1721350.0
11/05/2020,10065830.0,8385331.0
11/06/2020,3404522.0,7080616.0


In [57]:
data_by_borough = data.groupby(by=['BOROUGH','DATE']).sum()
data_by_borough

Unnamed: 0_level_0,Unnamed: 1_level_0,net_exit_counts,net_entry_counts
BOROUGH,DATE,Unnamed: 2_level_1,Unnamed: 3_level_1
Bronx,10/31/2020,50783090000.0,69848610000.0
Bronx,11/01/2020,89037.0,82360.0
Bronx,11/02/2020,157290.0,185700.0
Bronx,11/03/2020,153381.0,174191.0
Bronx,11/04/2020,163090.0,188110.0
Bronx,11/05/2020,166470.0,194571.0
Bronx,11/06/2020,188697.0,655945.0
Brooklyn,10/31/2020,71185060000.0,82833650000.0
Brooklyn,11/01/2020,286213.0,253637.0
Brooklyn,11/02/2020,2339607.0,4074852.0


In [73]:
data_by_borough.loc["Bronx"]

Unnamed: 0_level_0,net_exit_counts,net_entry_counts
DATE,Unnamed: 1_level_1,Unnamed: 2_level_1
10/31/2020,50783090000.0,69848610000.0
11/01/2020,89037.0,82360.0
11/02/2020,157290.0,185700.0
11/03/2020,153381.0,174191.0
11/04/2020,163090.0,188110.0
11/05/2020,166470.0,194571.0
11/06/2020,188697.0,655945.0


In [61]:
data.groupby(["STATION"])["C/A"].value_counts()

STATION        C/A  
1 AV           H007A    590
               H007     215
               H008     215
103 ST         R252     252
               R170     251
                       ... 
WTC-CORTLANDT  R108     210
               R108A    126
               R109     126
YORK ST        N530     126
ZEREGA AV      R419     252
Name: C/A, Length: 750, dtype: int64

## Combine All together

In [101]:
base_url = "http://web.mta.info/developers/"

# initialize dataframes
daily_df = pd.DataFrame()

bronx_df = pd.DataFrame()
manhattan_df = pd.DataFrame()
queens_df = pd.DataFrame()
brooklyn_df = pd.DataFrame()
staten_island_df = pd.DataFrame()
new_jersey_df = pd.DataFrame()

for i in range(306):
    url = base_url + urls_list[i]
    data = pd.read_csv(url)
    
    # map station to borough
    mapping = dict(stations_df[['station', 'borough']].values)
    data['borough'] = data.STATION.map(mapping)
    data.columns = ['C/A', 'UNIT', 'SCP', 'STATION', 'LINENAME', 'DIVISION', 'DATE', 'TIME','DESC', 'ENTRIES','EXITS', 'BOROUGH']
    
    # get the lagged column to calculate net entry counts and net exit counts
    data['entries_lagged'] = data.groupby(['STATION'])['ENTRIES'].shift(1)
    data['exits_lagged'] = data.groupby(['STATION'])['EXITS'].shift(1)
    data['net_exit_counts'] = abs(data["exits_lagged"] - data["EXITS"])
    data['net_entry_counts'] = abs(data["entries_lagged"] - data["ENTRIES"])
    
    data.drop(columns=['ENTRIES', 'EXITS', 'entries_lagged', 'exits_lagged'], inplace=True)
    
    data_daily = data.groupby(by=['DATE']).sum()
    data_by_borough = data.groupby(by=['BOROUGH','DATE']).sum()
    data_daily.reset_index(inplace=True)
    data_by_borough.reset_index(inplace=True, level=['DATE'])
    
    #change date column to datetime to later sort by date
    data_daily['DATE'] = pd.to_datetime(data_daily['DATE'])
    data_by_borough['DATE'] = pd.to_datetime(data_by_borough['DATE'])
    
    #delete dates before 01/01/2015
    data_daily = data_daily[~(data_daily['DATE'] < '2015-01-01')]
    data_by_borough = data_by_borough[~(data_by_borough['DATE'] < '2015-01-01')]
    
    #update dataframes
    daily_df = daily_df.append(data_daily, ignore_index=True, sort=False)
    
    bronx_df = bronx_df.append(data_by_borough.loc["Bronx"], ignore_index=True, sort=False)
    manhattan_df = manhattan_df.append(data_by_borough.loc["Manhattan"], ignore_index=True, sort=False)
    queens_df = queens_df.append(data_by_borough.loc["Queens"], ignore_index=True, sort=False)
    brooklyn_df = brooklyn_df.append(data_by_borough.loc["Brooklyn"], ignore_index=True, sort=False)
    staten_island_df = staten_island_df.append(data_by_borough.loc["Staten Island"], ignore_index=True, sort=False)
    new_jersey_df = new_jersey_df.append(data_by_borough.loc["New Jersey"], ignore_index=True, sort=False)

# sort by date
daily_df.sort_values(by=['DATE'], inplace=True)
bronx_df.sort_values(by=['DATE'], inplace=True)
manhattan_df.sort_values(by=['DATE'], inplace=True)
queens_df.sort_values(by=['DATE'], inplace=True)
brooklyn_df.sort_values(by=['DATE'], inplace=True)
staten_island_df.sort_values(by=['DATE'], inplace=True)
new_jersey_df.sort_values(by=['DATE'], inplace=True)

In [106]:
# export csv
daily_df.reset_index(drop=True, inplace=True)
bronx_df.reset_index(drop=True, inplace=True)
manhattan_df.reset_index(drop=True, inplace=True)
queens_df.reset_index(drop=True, inplace=True)
brooklyn_df.reset_index(drop=True, inplace=True)
staten_island_df.reset_index(drop=True, inplace=True)
new_jersey_df.reset_index(drop=True, inplace=True)

daily_df.to_csv('daily_subway_count', index=False)
bronx_df.to_csv('bronx_subway_count', index=False)
manhattan_df.to_csv('manhattan_subway_count', index=False)
queens_df.to_csv('queens_subway_count', index=False)
brooklyn_df.to_csv('brooklyn_subway_count', index=False)
staten_island_df.to_csv('staten_island_subway_count', index=False)
new_jersey_df.to_csv('new_jersey_subway_count', index=False)