# COVID Testing and Vaccines and Health Equity 


In [1]:
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns


## 1. Import and clean MTA turnstile data
Source: http://web.mta.info/developers/turnstile.html

In [2]:
# Load data from April, May, and June 2019, data is loaded on Saturdays.

def get_data_parse_dt(week_nums):
    url = "http://web.mta.info/developers/data/nyct/turnstile/turnstile_{}.txt"
    dfs = []
    for week_num in week_nums:
        file_url = url.format(week_num)
        dfs.append(pd.read_csv(file_url, parse_dates=[['DATE','TIME']], usecols = [0,1,2,3,4,6,7,8,9], keep_date_col=True))
    return pd.concat(dfs)

# Create a datetime column using data from columns DATE and TIME during import
week_nums = [190406, 190413, 190420]
df = get_data_parse_dt(week_nums)
df.tail()
# , 190427, 190504, 190511, 190518, 190525, 190601, 190608, 190615, 190622, 190629

Unnamed: 0,DATE_TIME,C/A,UNIT,SCP,STATION,LINENAME,DATE,TIME,DESC,ENTRIES
202542,2019-04-19 05:00:00,TRAM2,R469,00-05-01,RIT-ROOSEVELT,R,04/19/2019,05:00:00,REGULAR,5554
202543,2019-04-19 09:00:00,TRAM2,R469,00-05-01,RIT-ROOSEVELT,R,04/19/2019,09:00:00,REGULAR,5554
202544,2019-04-19 13:00:00,TRAM2,R469,00-05-01,RIT-ROOSEVELT,R,04/19/2019,13:00:00,REGULAR,5554
202545,2019-04-19 17:00:00,TRAM2,R469,00-05-01,RIT-ROOSEVELT,R,04/19/2019,17:00:00,REGULAR,5554
202546,2019-04-19 21:00:00,TRAM2,R469,00-05-01,RIT-ROOSEVELT,R,04/19/2019,21:00:00,REGULAR,5554


In [3]:
df.columns

Index(['DATE_TIME', 'C/A', 'UNIT', 'SCP', 'STATION', 'LINENAME', 'DATE',
       'TIME', 'DESC', 'ENTRIES'],
      dtype='object')

In [4]:
# Strip whitespace from column names
df.columns = [column.strip() for column in df.columns]
df.columns

Index(['DATE_TIME', 'C/A', 'UNIT', 'SCP', 'STATION', 'LINENAME', 'DATE',
       'TIME', 'DESC', 'ENTRIES'],
      dtype='object')

## 2. Select late night entry data

In [5]:
late_night_mask = df.copy()

In [6]:
# Mask selects the interval between 12:00 AM and 4:00 AM 
late_night_mask = (df['TIME'] == '04:00:00')
late_night_df = df[late_night_mask]
late_night_df

Unnamed: 0,DATE_TIME,C/A,UNIT,SCP,STATION,LINENAME,DATE,TIME,DESC,ENTRIES
1,2019-03-30 04:00:00,A002,R051,02-00-00,59 ST,NQR456W,03/30/2019,04:00:00,REGULAR,6999084
7,2019-03-31 04:00:00,A002,R051,02-00-00,59 ST,NQR456W,03/31/2019,04:00:00,REGULAR,6999983
13,2019-04-01 04:00:00,A002,R051,02-00-00,59 ST,NQR456W,04/01/2019,04:00:00,REGULAR,7000539
19,2019-04-02 04:00:00,A002,R051,02-00-00,59 ST,NQR456W,04/02/2019,04:00:00,REGULAR,7002096
25,2019-04-03 04:00:00,A002,R051,02-00-00,59 ST,NQR456W,04/03/2019,04:00:00,REGULAR,7003693
...,...,...,...,...,...,...,...,...,...,...
201806,2019-04-15 04:00:00,S101A,R070,01-05-01,ST. GEORGE,1,04/15/2019,04:00:00,REGULAR,625
201812,2019-04-16 04:00:00,S101A,R070,01-05-01,ST. GEORGE,1,04/16/2019,04:00:00,REGULAR,626
201818,2019-04-17 04:00:00,S101A,R070,01-05-01,ST. GEORGE,1,04/17/2019,04:00:00,REGULAR,626
201824,2019-04-18 04:00:00,S101A,R070,01-05-01,ST. GEORGE,1,04/18/2019,04:00:00,REGULAR,626


In [None]:
# Confirm df only has data from the 4:00 AM interval
late_night_df.TIME.unique()

## 3. Calculate late night entries

In [None]:
late_night_entries_df = late_night_df.copy()
late_night_entries_df.head(2)

In [None]:
# What type of data are in the DESC column? REGULAR and RECOVR AUD
# How many of the entries are RECOVR AUD? Can we filter them out? Yes, small percentage. 
late_night_entries_df.DESC.value_counts()

In [None]:
# Mask selects the REGULAR entries in the DESC column
desc_mask = (late_night_entries_df['DESC'] == 'REGULAR')
late_night_entries_df = (late_night_entries_df[desc_mask])

In [None]:
# Confirm that "C/A", "UNIT", "SCP", "STATION", "DATE_TIME" is unique
(late_night_entries_df
 .groupby(["C/A", "UNIT", "SCP", "STATION", "DATE_TIME"])
 .ENTRIES.count()
 .reset_index()
 .sort_values("ENTRIES", ascending=False)).head(5)

In [None]:
# Create new columns for the previous date and entries. 
# Apply a shift to calculate the previous day's entries. 

v = (late_night_entries_df[["DATE", "ENTRIES"]].apply(lambda grp: grp.shift(1)))
late_night_entries_df['PREV_DATE'] = v['DATE']
late_night_entries_df['PREV_ENTRIES'] = v['ENTRIES']

late_night_entries_df.head(2)

In [None]:
# Drop row with the March 31, 2019 data, used it to calculate the previous entries, no longer neeeded. 
late_night_entries_df.dropna(subset=["PREV_DATE"], axis=0, inplace=True)
late_night_entries_df.head(2)


In [None]:
# How many stations have a counter going in reverse? 
(late_night_entries_df[late_night_entries_df['ENTRIES'] < late_night_entries_df["PREV_ENTRIES"]]
    .groupby(["C/A", "UNIT", "SCP", "STATION"])
    .size())


In [None]:
# Adjust counter before calculating daily entries
def get_daily_counts(row, max_counter):
    if row['ENTRIES'] > row['PREV_ENTRIES']:
        counter = row['ENTRIES']
    else:
        counter = row["ENTRIES"] - row["PREV_ENTRIES"]
    return counter

late_night_entries_df["LATE_NIGHT_ENTRIES"] = late_night_entries_df.apply(get_daily_counts, axis=1, max_counter=1000000)


In [None]:
# Add up all turnstiles per station and reset index 
late_night_entries_df = (late_night_entries_df
                         .groupby(['STATION', 'DATE', 'LINENAME', 'TIME'])
                         [['LATE_NIGHT_ENTRIES']]
                         .sum()).reset_index()
late_night_entries_df.head(2)

In [None]:
# Add column DAY_OF_WEEK to display day of week (Monday = 0, Sunday = 6)
late_night_entries_df['DAY_OF_WEEK'] = pd.to_datetime(late_night_entries_df['DATE']).dt.dayofweek
late_night_entries_df.head(2)


## 4. Calculate rolling mean of late night entries

In [None]:
rolling_mean_df = late_night_entries_df.copy()
rolling_mean_df.head(2)

In [None]:
# Add column ROLLING_MEAN to capture the weekly rolling mean for each station 
rolling_mean_df['ROLLING_MEAN'] = (rolling_mean_df
                        .groupby('STATION')['LATE_NIGHT_ENTRIES']
                        .transform(lambda x: x.rolling(7,1).mean()))
# rolling_mean_df[['STATION','DATE','LATE_NIGHT_ENTRIES','ROLLING_MEAN','DAY_OF_WEEK']].head(2) 
rolling_mean_df

In [None]:
# Mask selects the last day of each station's rolling mean; Sunday=6
last_rolling_day_mask = (rolling_mean_df['DAY_OF_WEEK'] == 6)
rolling_mean_df = rolling_mean_df[last_rolling_day_mask]
rolling_mean_df

In [None]:
# Comfirm df only has Sunday
rolling_mean_df.DAY_OF_WEEK.unique()

In [None]:
# Confirm one row per station: 486 stations and 486 rows
print('Num stations:',rolling_mean_df.STATION.count())
print('\n')
rolling_mean_df.info()

In [None]:
# Sort by ROLLING_MEAN and reset index 
rolling_mean_df = rolling_mean_df.sort_values('ROLLING_MEAN', ascending=False).reset_index()
rolling_mean_df

In [None]:
rolling_mean_top_ten = rolling_mean_df.loc[:20]
rolling_mean_top_ten.head(20)

### What stations outside of Manhattan have high late night entries on average?

In [None]:
# # Plot top ten stations with the highest weekly rolling mean on Sunday vs stations

# plt.figure(figsize = (15,8))
# ax = sns.barplot(x = 'STATION', y = 'ROLLING_MEAN',data = rolling_mean_top_ten, color='purple')
# ax.set_xticklabels(ax.get_xticklabels(),rotation = 90)
# ax.set(title='Rolling mean number of entries vs. Stations');
# ax.set(xlabel=('Station'))
# ax.set(ylabel=('Rolling mean number of entries on last reporting day (Sunday)'))
# plt.grid();


The stations outside of Manhattan with high late night entries on average:
* Atlantic Avenue-Barclay (Brooklyn, 2345BDNQR trains)
* Flushing-Main Street (Queens, 7 train)
* Jackson Heights-Roosevelt (Queens, EFMR7 trains)

Two stations are both in Queens and on the 7 train. 
Which 7 train stations in Queens have high late night traffic? 

# 7 Train: From Flushing Main St to 34 St Hudson Yards

In [None]:
seven_line_df = rolling_mean_df.copy() # Using df with rolling mean, before sliced for top 10
seven_line_df.head(2)

In [None]:
# Looking for linename of interest: 7 line 
seven_line_df = seven_line_df[seven_line_df['LINENAME'].str.contains('7', na=False)] 
seven_line_df.head(10)
# The 7 train shares stations with other train lines. Will omit multi-train stations in this analysis. 
# Line name as recorded in dataset: 7


In [None]:
seven_mask = (seven_line_df['LINENAME'] == '7')
seven_line_df = seven_line_df[seven_mask]
seven_line_df.head(2)

In [None]:
# Comfirm df has only 7 line data
seven_line_df.LINENAME.unique()

In [None]:
# How many 7 train stations in this dataset? 12
len(seven_line_df.STATION.unique())

In [None]:
# How many stations in Queens? All but one, 34 ST-HUDSON YD.   
seven_line_df.STATION.unique()

In [None]:
# Maks selects only the Queens stations
queens_mask = (seven_line_df['STATION'] != '34 ST-HUDSON YD')
seven_line_df = seven_line_df[queens_mask]
seven_line_df.head(20)

### Which 7 train stations in Queens have high late night traffic? 

In [None]:
# Plot 7 line in Queens: Rolling average late night entries vs Queens 7 train stations

plt.figure(figsize = (15,8))
ax = sns.barplot(x = 'STATION', y = 'ROLLING_MEAN',data = seven_line_df.sort_values('ROLLING_MEAN'), color='purple')
ax.set_xticklabels(ax.get_xticklabels(),rotation = 45)
ax.set(title='Rolling Average Late Night Entries vs. Queens 7 Train Stations')
ax.set(xlabel=('Queens 7 Train Stations'))
ax.set(ylabel=('Rolling average late night entries'))
plt.grid();


# DOUBLE CHECK NUMBERS
The range of people entering a station on the 7 train during late-night hours is ~8,000 to ~48,000 

* Flushing-Main Street: ~40,000
* Junction Boulevard: ~16,000
* 103 Street-Corona: ~15,000

One station has high late night traffic. What days are the busiest at Flushing Station? 



## Flushing Station

In [None]:
flushing_df = late_night_entries_df.copy() #Use df before rolling mean and extract Flushing. 

In [None]:
# Mask selects FLUSHING-MAIN
flushing_mask = (flushing_df['STATION'] == 'FLUSHING-MAIN')

In [None]:
flushing_df = flushing_df[flushing_mask]
flushing_df.STATION.unique()
# confirm df has data from station of interest

In [None]:
flushing_df.head(20)

In [None]:
# Calculate Flushing WEEK_DAY_MEAN
flushing_df['WEEK_DAY_MEAN'] = (flushing_df
                                .groupby('DAY_OF_WEEK')['LATE_NIGHT_ENTRIES']
                                .transform(lambda x: x.mean()))
flushing_df.head(2)

### What days are the busiest at Flushing Station?

In [None]:
# Plot Flushing : Weekday average late night entries vs day of week

plt.figure(figsize = (15,8))
ax = sns.barplot(x = 'DAY_OF_WEEK', y = 'WEEK_DAY_MEAN',data = flushing_df, color='purple')
ax.set(title='Weekday Average Late Night Entries vs Day of Week')
ax.set(xlabel=('Day of week'))
ax.set_xticklabels(['Mon','Tue','Wed','Thu','Fri','Sat','Sun'],rotation = 0)
ax.set(ylabel=('Weekday average late night entries'))

plt.grid();


Two week days have the highest late-night traffic at the Flushing station:
* Thursdays: ~60,000
* Friday: ~60,000