# MTA Analysis: Data Acquisition & Cleaning
---

### Setup
#### Import required libraries.

In [1]:
import pandas as pd
import numpy as np
import os

#### Define helper functions (TODO: document these and move them into a `.py` module)

In [35]:
def convert_timestamp_to_mta_format(timestamp):
    """
    Formats a date as the ``YYMMDD`` string used in MTA turnstile file names.

    :param timestamp: Object exposing ``year``, ``month`` and ``day`` attributes
                      (e.g. ``pandas.Timestamp``).
    :return: Two-digit year, zero-padded month and zero-padded day, concatenated.
    :rtype: str
    """
    # Characters 2-3 of the year are its last two digits for any four-digit year.
    two_digit_year = str(timestamp.year)[2:4]
    return f"{two_digit_year}{timestamp.month:02d}{timestamp.day:02d}"

def clean_hourly_turnstile_traffic(turnstile_data_row, reset_limit, entries=True):
    """
    Cleans the traffic count (entries or exits) of one turnstile reading interval.

    The raw count is read from the row's ``HOURLY_ENTRIES`` / ``HOURLY_EXITS`` value.
    Negative counts (turnstile counting in reverse) are made positive. Counts above
    ``reset_limit`` (possible counter reset) are replaced by the absolute value of the
    row's ``ENTRIES_MEDIAN`` / ``EXITS_MEDIAN``; if that is also above the limit the
    result is NaN.

    :param pandas.Series turnstile_data_row: Row holding the ``HOURLY_*`` and ``*_MEDIAN``
                                             values for a single turnstile reading.
    :param int reset_limit: Largest count accepted as plausible for one interval.
    :param bool entries: True to clean entries, False to clean exits.
    :return: Cleaned traffic count, or NaN when no plausible value is available.
    :rtype: float
    """
    traffic_type = 'ENTRIES' if entries else 'EXITS'

    # a negative difference means the counter ran backwards; keep its magnitude
    hourly_traffic = abs(turnstile_data_row["HOURLY_" + traffic_type])

    if hourly_traffic > reset_limit:
        # implausibly large jump: fall back to the median supplied with the row
        hourly_traffic = abs(turnstile_data_row[traffic_type + "_MEDIAN"])
        if hourly_traffic > reset_limit:
            # even the median is implausible, so mark the reading as missing
            hourly_traffic = np.nan

    return hourly_traffic

### Data Acquisition
Load [MTA Turnstile Data](http://web.mta.info/developers/turnstile.html) and combine into a single dataframe.
_Note: The files already exist in the repository. This code block can be skipped over._

In [3]:
# Calendar months (1-12) whose weekly files should be downloaded.
months_of_interest = [5, 6]

# Date of the first weekly file to consider, as a string and as a Timestamp.
start_date = '2015-05-02'
start_ts = pd.to_datetime(start_date)

In [None]:
# Step through the weekly files from start_ts up to today, downloading only the
# weeks that fall in months_of_interest. The end-date test is the loop condition,
# so it also applies to skipped weeks and the loop can never run past today.
current_timestamp = start_ts
download_cutoff = pd.to_datetime("now")
weekly_frames = []
while current_timestamp <= download_cutoff:
    if current_timestamp.month in months_of_interest:
        print(f'Downloading data for {current_timestamp}...')
        date_formatted = convert_timestamp_to_mta_format(current_timestamp)

        # load this week's file straight from the MTA site
        url = f'http://web.mta.info/developers/data/nyct/turnstile/turnstile_{date_formatted}.txt'
        df_turnstile_data = pd.read_csv(url)
        weekly_frames.append(df_turnstile_data)

    # files are published weekly, so the next candidate is 7 days later
    current_timestamp += pd.DateOffset(days=7)

# Concatenate once at the end (row labels of each weekly file are kept as-is);
# growing a dataframe inside the loop copies all previous rows on every iteration.
raw_master_df = pd.concat(weekly_frames) if weekly_frames else pd.DataFrame()

Export combined dataframe to pickle.

In [None]:
# Persist the combined raw download so the acquisition step can be skipped later.
raw_pickle_path = 'raw_mta_turnstile_data_mayjune_20152020.pkl'
raw_master_df.to_pickle(raw_pickle_path)

### Data Cleaning
Load pickled data.

In [4]:
# Reload the raw data written by the acquisition step and preview the first rows.
df_mta_raw = pd.read_pickle('raw_mta_turnstile_data_mayjune_20152020.pkl')
df_mta_raw.head(5)

Unnamed: 0,C/A,UNIT,SCP,STATION,LINENAME,DIVISION,DATE,TIME,DESC,ENTRIES,EXITS
0,A002,R051,02-00-00,LEXINGTON AVE,NQR456,BMT,04/25/2015,00:00:00,REGULAR,5106770,1729635
1,A002,R051,02-00-00,LEXINGTON AVE,NQR456,BMT,04/25/2015,04:00:00,REGULAR,5106810,1729649
2,A002,R051,02-00-00,LEXINGTON AVE,NQR456,BMT,04/25/2015,08:00:00,REGULAR,5106835,1729680
3,A002,R051,02-00-00,LEXINGTON AVE,NQR456,BMT,04/25/2015,12:00:00,REGULAR,5106961,1729784
4,A002,R051,02-00-00,LEXINGTON AVE,NQR456,BMT,04/25/2015,16:00:00,REGULAR,5107250,1729858


Remove any whitespace from column names and initialize a new dataframe to store cleaned data.

In [5]:
# Build the working frame as a separate object with blanks removed from the column
# names. `rename` returns a new dataframe, so later in-place cleaning steps no longer
# modify df_mta_raw (plain assignment would only create a second name for it).
df_mta_clean = df_mta_raw.rename(columns=lambda column_name: column_name.replace(' ', ''))

Determine if there are any NaN values.

In [6]:
# Number of missing values per column. Masking a frame with its own isna() keeps only
# the NaN cells, so counting the result is always 0; summing the mask counts them.
df_mta_clean.isna().sum()

C/A         0
UNIT        0
SCP         0
STATION     0
LINENAME    0
DIVISION    0
DATE        0
TIME        0
DESC        0
ENTRIES     0
EXITS       0
dtype: int64

No NaN values to filter.

Add `DATETIME` column to dataset.

In [7]:
# DATE looks like 04/25/2015 and TIME like 04:00:00 in the data previewed above.
# Passing the format explicitly avoids per-value format inference and fails loudly
# if a row deviates from it.
df_mta_clean['DATETIME'] = pd.to_datetime(
    df_mta_clean['DATE'] + ' ' + df_mta_clean['TIME'],
    format='%m/%d/%Y %H:%M:%S',
)

Check if there are duplicates by determining if there are multiple readings for a given turnstile at a given time.

In [8]:
# Count readings per turnstile and timestamp; any count above 1 is a duplicate.
readings_per_turnstile_time = (
    df_mta_clean
    .groupby(["C/A", "UNIT", "SCP", "STATION", "DATETIME"])["ENTRIES"]
    .count()
)
df_checking_duplicates = readings_per_turnstile_time.reset_index().sort_values(by='ENTRIES')
df_checking_duplicates

Unnamed: 0,C/A,UNIT,SCP,STATION,DATETIME,ENTRIES
0,A002,R051,02-00-00,59 ST,2016-04-30 00:00:00,1
6896126,R145,R032,00-00-00,TIMES SQ-42 ST,2018-05-26 04:00:00,1
6896127,R145,R032,00-00-00,TIMES SQ-42 ST,2018-05-26 08:00:00,1
6896128,R145,R032,00-00-00,TIMES SQ-42 ST,2018-05-26 12:00:00,1
6896129,R145,R032,00-00-00,TIMES SQ-42 ST,2018-05-26 16:00:00,1
...,...,...,...,...,...,...
6410336,R101,R001,02-00-00,SOUTH FERRY,2018-05-23 09:00:00,2
2073816,JFK03,R536,00-00-05,JFK JAMAICA CT1,2015-05-14 05:00:00,2
4774408,N418,R269,01-05-00,BEDFORD-NOSTRAN,2016-05-16 16:00:00,2
4774415,N418,R269,01-05-00,BEDFORD-NOSTRAN,2016-05-17 08:00:00,2


In [9]:
# Keep every turnstile/timestamp that has more than one reading (not only exactly two).
df_checking_duplicates = df_checking_duplicates[df_checking_duplicates['ENTRIES'] > 1]
df_checking_duplicates

Unnamed: 0,C/A,UNIT,SCP,STATION,DATETIME,ENTRIES
9359628,R518,R261,00-03-02,40 ST LOWERY ST,2017-06-05 08:00:00,2
5189785,N525,R142,01-00-03,DELANCEY/ESSEX,2019-05-11 05:00:00,2
811622,B020,R263,00-06-01,AVENUE H,2016-05-30 12:00:00,2
4290068,N329,R201,00-03-04,WOODHAVEN BLVD,2015-05-20 05:00:00,2
8513239,R290,R161,00-00-00,KINGSBRIDGE RD,2019-06-07 05:00:00,2
...,...,...,...,...,...,...
6410336,R101,R001,02-00-00,SOUTH FERRY,2018-05-23 09:00:00,2
2073816,JFK03,R536,00-00-05,JFK JAMAICA CT1,2015-05-14 05:00:00,2
4774408,N418,R269,01-05-00,BEDFORD-NOSTRAN,2016-05-16 16:00:00,2
4774415,N418,R269,01-05-00,BEDFORD-NOSTRAN,2016-05-17 08:00:00,2


In [10]:
# TODO use a merge to get relevant rows from original df

Check to see what is causing the duplicate.

In [11]:
# Show all readings of one duplicated turnstile/timestamp taken from the list above.
mask = (
    df_mta_clean['C/A'].eq('R518')
    & df_mta_clean['UNIT'].eq('R261')
    & df_mta_clean['SCP'].eq('00-03-02')
    & df_mta_clean['STATION'].eq('40 ST LOWERY ST')
    & df_mta_clean['DATETIME'].eq(pd.Timestamp('2017-06-05 08:00:00'))
)
df_mta_clean.loc[mask]

Unnamed: 0,C/A,UNIT,SCP,STATION,LINENAME,DIVISION,DATE,TIME,DESC,ENTRIES,EXITS,DATETIME
178845,R518,R261,00-03-02,40 ST LOWERY ST,7,IRT,06/05/2017,08:00:00,REGULAR,18778966,7453831,2017-06-05 08:00:00
178846,R518,R261,00-03-02,40 ST LOWERY ST,7,IRT,06/05/2017,08:00:00,RECOVR AUD,18778965,7453831,2017-06-05 08:00:00


Spot check randomly to see if REGULAR and RECOVR AUD readings are different.

In [12]:
# Second spot check: readings recorded for the duplicated Kingsbridge Rd timestamp.
mask = (
    df_mta_clean['C/A'].eq('R290')
    & df_mta_clean['UNIT'].eq('R161')
    & df_mta_clean['SCP'].eq('00-00-00')
    & df_mta_clean['STATION'].eq('KINGSBRIDGE RD')
    & df_mta_clean['DATETIME'].eq(pd.Timestamp('2019-06-07 05:00:00'))
)
df_mta_clean.loc[mask]

Unnamed: 0,C/A,UNIT,SCP,STATION,LINENAME,DIVISION,DATE,TIME,DESC,ENTRIES,EXITS,DATETIME
169384,R290,R161,00-00-00,KINGSBRIDGE RD,4,IRT,06/07/2019,05:00:00,REGULAR,183445,164296,2019-06-07 05:00:00
169385,R290,R161,00-00-00,KINGSBRIDGE RD,4,IRT,06/07/2019,05:00:00,RECOVR AUD,183444,164296,2019-06-07 05:00:00


In [13]:
# Third spot check: readings recorded for the duplicated 81 St-Museum timestamp.
mask = (
    df_mta_clean['C/A'].eq('N045')
    & df_mta_clean['UNIT'].eq('R187')
    & df_mta_clean['SCP'].eq('01-00-02')
    & df_mta_clean['STATION'].eq('81 ST-MUSEUM')
    & df_mta_clean['DATETIME'].eq(pd.Timestamp('2019-06-17 09:00:00'))
)
df_mta_clean.loc[mask]

Unnamed: 0,C/A,UNIT,SCP,STATION,LINENAME,DIVISION,DATE,TIME,DESC,ENTRIES,EXITS,DATETIME
49121,N045,R187,01-00-02,81 ST-MUSEUM,BC,IND,06/17/2019,09:00:00,REGULAR,4343886,994027,2019-06-17 09:00:00
49122,N045,R187,01-00-02,81 ST-MUSEUM,BC,IND,06/17/2019,09:00:00,RECOVR AUD,4343885,994027,2019-06-17 09:00:00


In [14]:
# How many readings carry each DESC label.
df_mta_clean['DESC'].value_counts()

REGULAR       10303603
RECOVR AUD       40700
Name: DESC, dtype: int64

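Based on a spot check, it appears that where there are duplicates, the REGULAR and RECOVR AUD readings are nearly identical (in the rows above, ENTRIES differs by a single count and EXITS matches).
We will assume that we can drop any duplicate entries (regardless of the type), keeping the first reading for each turnstile and timestamp.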

In [15]:
# Keep only the first reading recorded for each turnstile and timestamp.
turnstile_reading_key = ["C/A", "UNIT", "SCP", "STATION", "DATETIME"]
df_mta_clean.drop_duplicates(subset=turnstile_reading_key, keep="first", inplace=True)

Confirm no duplicates.

In [16]:
# Recount readings per turnstile and timestamp; every count should now be 1.
readings_after_dedup = (
    df_mta_clean
    .groupby(["C/A", "UNIT", "SCP", "STATION", "DATETIME"])["ENTRIES"]
    .count()
    .reset_index()
)
readings_after_dedup.sort_values(by='ENTRIES')

Unnamed: 0,C/A,UNIT,SCP,STATION,DATETIME,ENTRIES
0,A002,R051,02-00-00,59 ST,2016-04-30 00:00:00,1
6896138,R145,R032,00-00-00,TIMES SQ-42 ST,2018-05-28 04:00:00,1
6896139,R145,R032,00-00-00,TIMES SQ-42 ST,2018-05-28 08:00:00,1
6896140,R145,R032,00-00-00,TIMES SQ-42 ST,2018-05-28 12:00:00,1
6896141,R145,R032,00-00-00,TIMES SQ-42 ST,2018-05-28 16:00:00,1
...,...,...,...,...,...,...
3448070,N135,R385,01-03-01,ROCKAWAY BLVD,2015-06-25 17:00:00,1
3448071,N135,R385,01-03-01,ROCKAWAY BLVD,2015-06-25 21:00:00,1
3448072,N135,R385,01-03-01,ROCKAWAY BLVD,2015-06-26 01:00:00,1
3448065,N135,R385,01-03-01,ROCKAWAY BLVD,2015-06-24 21:00:00,1


Drop the `DESC` column. Suppress the error if the column has already been removed.

In [17]:
# DESC is no longer needed once duplicates are resolved; ignore it if already gone.
df_mta_clean = df_mta_clean.drop(columns=["DESC"], errors="ignore")
df_mta_clean.head(5)

Unnamed: 0,C/A,UNIT,SCP,STATION,LINENAME,DIVISION,DATE,TIME,ENTRIES,EXITS,DATETIME
0,A002,R051,02-00-00,LEXINGTON AVE,NQR456,BMT,04/25/2015,00:00:00,5106770,1729635,2015-04-25 00:00:00
1,A002,R051,02-00-00,LEXINGTON AVE,NQR456,BMT,04/25/2015,04:00:00,5106810,1729649,2015-04-25 04:00:00
2,A002,R051,02-00-00,LEXINGTON AVE,NQR456,BMT,04/25/2015,08:00:00,5106835,1729680,2015-04-25 08:00:00
3,A002,R051,02-00-00,LEXINGTON AVE,NQR456,BMT,04/25/2015,12:00:00,5106961,1729784,2015-04-25 12:00:00
4,A002,R051,02-00-00,LEXINGTON AVE,NQR456,BMT,04/25/2015,16:00:00,5107250,1729858,2015-04-25 16:00:00


Turnstile counts (entries and exits) are cumulative. In order to actually calculate the number of entries/exits per timestamp, some calculations need to be performed.

In [18]:
# Give every row a unique row label before assigning the shifted columns (the
# assignment aligns on row labels). The old labels are kept in an 'index' column.
df_mta_clean.reset_index(inplace=True)

# Pair each reading with the preceding row of the same turnstile. Columns are
# selected with a list (tuple selection on a groupby is no longer supported) and
# shifted with the built-in groupby shift instead of a per-group Python lambda.
# NOTE(review): this relies on rows already being in chronological order within
# each turnstile - confirm, or sort by DATETIME first.
df_mta_clean[["PREV_DATETIME", "PREV_ENTRIES", "PREV_EXITS"]] = (
    df_mta_clean
    .groupby(["C/A", "UNIT", "SCP", "STATION"])[["DATETIME", "ENTRIES", "EXITS"]]
    .shift(1))

df_mta_clean.head()

  df_mta_clean


Unnamed: 0,index,C/A,UNIT,SCP,STATION,LINENAME,DIVISION,DATE,TIME,ENTRIES,EXITS,DATETIME,PREV_DATETIME,PREV_ENTRIES,PREV_EXITS
0,0,A002,R051,02-00-00,LEXINGTON AVE,NQR456,BMT,04/25/2015,00:00:00,5106770,1729635,2015-04-25 00:00:00,NaT,,
1,1,A002,R051,02-00-00,LEXINGTON AVE,NQR456,BMT,04/25/2015,04:00:00,5106810,1729649,2015-04-25 04:00:00,2015-04-25 00:00:00,5106770.0,1729635.0
2,2,A002,R051,02-00-00,LEXINGTON AVE,NQR456,BMT,04/25/2015,08:00:00,5106835,1729680,2015-04-25 08:00:00,2015-04-25 04:00:00,5106810.0,1729649.0
3,3,A002,R051,02-00-00,LEXINGTON AVE,NQR456,BMT,04/25/2015,12:00:00,5106961,1729784,2015-04-25 12:00:00,2015-04-25 08:00:00,5106835.0,1729680.0
4,4,A002,R051,02-00-00,LEXINGTON AVE,NQR456,BMT,04/25/2015,16:00:00,5107250,1729858,2015-04-25 16:00:00,2015-04-25 12:00:00,5106961.0,1729784.0


In [19]:
# Remove the helper 'index' column left behind by reset_index.
df_mta_clean.drop(columns=['index'], inplace=True)

Since there is no reading before the first reading of each turnstile, the `PREV_*` columns are `NaT`/`NaN` for that first row. Those rows should be dropped. This will also take care of any other `NaN` data.

In [20]:
# Discard every row that has a missing value in any column.
df_mta_clean.dropna(axis="index", how="any", inplace=True)
df_mta_clean.head(5)

Unnamed: 0,C/A,UNIT,SCP,STATION,LINENAME,DIVISION,DATE,TIME,ENTRIES,EXITS,DATETIME,PREV_DATETIME,PREV_ENTRIES,PREV_EXITS
1,A002,R051,02-00-00,LEXINGTON AVE,NQR456,BMT,04/25/2015,04:00:00,5106810,1729649,2015-04-25 04:00:00,2015-04-25 00:00:00,5106770.0,1729635.0
2,A002,R051,02-00-00,LEXINGTON AVE,NQR456,BMT,04/25/2015,08:00:00,5106835,1729680,2015-04-25 08:00:00,2015-04-25 04:00:00,5106810.0,1729649.0
3,A002,R051,02-00-00,LEXINGTON AVE,NQR456,BMT,04/25/2015,12:00:00,5106961,1729784,2015-04-25 12:00:00,2015-04-25 08:00:00,5106835.0,1729680.0
4,A002,R051,02-00-00,LEXINGTON AVE,NQR456,BMT,04/25/2015,16:00:00,5107250,1729858,2015-04-25 16:00:00,2015-04-25 12:00:00,5106961.0,1729784.0
5,A002,R051,02-00-00,LEXINGTON AVE,NQR456,BMT,04/25/2015,20:00:00,5107620,1729914,2015-04-25 20:00:00,2015-04-25 16:00:00,5107250.0,1729858.0


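Calculate the initial entry and exit counts for each reading interval (typically 4 hours). Despite the `HOURLY_` prefix, these are totals per interval, not per-hour rates.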

In [21]:
# Counters are cumulative, so traffic in an interval is current minus previous reading.
df_mta_clean['HOURLY_ENTRIES'] = df_mta_clean["ENTRIES"].sub(df_mta_clean["PREV_ENTRIES"])
df_mta_clean['HOURLY_EXITS'] = df_mta_clean["EXITS"].sub(df_mta_clean["PREV_EXITS"])
df_mta_clean.head(5)

Unnamed: 0,C/A,UNIT,SCP,STATION,LINENAME,DIVISION,DATE,TIME,ENTRIES,EXITS,DATETIME,PREV_DATETIME,PREV_ENTRIES,PREV_EXITS,HOURLY_ENTRIES,HOURLY_EXITS
1,A002,R051,02-00-00,LEXINGTON AVE,NQR456,BMT,04/25/2015,04:00:00,5106810,1729649,2015-04-25 04:00:00,2015-04-25 00:00:00,5106770.0,1729635.0,40.0,14.0
2,A002,R051,02-00-00,LEXINGTON AVE,NQR456,BMT,04/25/2015,08:00:00,5106835,1729680,2015-04-25 08:00:00,2015-04-25 04:00:00,5106810.0,1729649.0,25.0,31.0
3,A002,R051,02-00-00,LEXINGTON AVE,NQR456,BMT,04/25/2015,12:00:00,5106961,1729784,2015-04-25 12:00:00,2015-04-25 08:00:00,5106835.0,1729680.0,126.0,104.0
4,A002,R051,02-00-00,LEXINGTON AVE,NQR456,BMT,04/25/2015,16:00:00,5107250,1729858,2015-04-25 16:00:00,2015-04-25 12:00:00,5106961.0,1729784.0,289.0,74.0
5,A002,R051,02-00-00,LEXINGTON AVE,NQR456,BMT,04/25/2015,20:00:00,5107620,1729914,2015-04-25 20:00:00,2015-04-25 16:00:00,5107250.0,1729858.0,370.0,56.0


Calculate median entry and exit values to replace outlier data issues

In [24]:
# Medians are grouped by day of week and time of day (rather than by calendar date)
# because turnstile traffic varies with the day of the week.
df_mta_clean['DAYOFWEEK'] = df_mta_clean['DATETIME'].dt.dayofweek

median_group_key = ["C/A", "UNIT", "SCP", "STATION", "DAYOFWEEK", "TIME"]
mta_medians = (
    df_mta_clean
    .groupby(median_group_key)[['HOURLY_ENTRIES', 'HOURLY_EXITS']]
    .median()
    .rename(columns={'HOURLY_ENTRIES': 'ENTRIES_MEDIAN', 'HOURLY_EXITS': 'EXITS_MEDIAN'})
    .reset_index()
)

In [25]:
# Attach the matching median values to every reading (left join keeps all readings).
df_mta_clean = pd.merge(
    df_mta_clean,
    mta_medians,
    how="left",
    on=["C/A", "UNIT", "SCP", "STATION", "DAYOFWEEK", "TIME"],
)

In [36]:
# Clean the entry counts row by row; counts above 4000 are treated as counter resets.
df_mta_clean['HOURLY_ENTRIES'] = df_mta_clean.apply(
    lambda reading: clean_hourly_turnstile_traffic(reading, reset_limit=4000),
    axis=1,
)
df_mta_clean.head(5)

Unnamed: 0,C/A,UNIT,SCP,STATION,LINENAME,DIVISION,DATE,TIME,ENTRIES,EXITS,DATETIME,PREV_DATETIME,PREV_ENTRIES,PREV_EXITS,HOURLY_ENTRIES,HOURLY_EXITS,DAYOFWEEK,ENTRIES_MEDIAN,EXITS_MEDIAN,HOURLY_ENTRIES_v2
0,A002,R051,02-00-00,LEXINGTON AVE,NQR456,BMT,04/25/2015,04:00:00,5106810,1729649,2015-04-25 04:00:00,2015-04-25 00:00:00,5106770.0,1729635.0,40.0,14.0,5,31.0,7.0,40.0
1,A002,R051,02-00-00,LEXINGTON AVE,NQR456,BMT,04/25/2015,08:00:00,5106835,1729680,2015-04-25 08:00:00,2015-04-25 04:00:00,5106810.0,1729649.0,25.0,31.0,5,21.0,26.0,25.0
2,A002,R051,02-00-00,LEXINGTON AVE,NQR456,BMT,04/25/2015,12:00:00,5106961,1729784,2015-04-25 12:00:00,2015-04-25 08:00:00,5106835.0,1729680.0,126.0,104.0,5,110.0,99.0,126.0
3,A002,R051,02-00-00,LEXINGTON AVE,NQR456,BMT,04/25/2015,16:00:00,5107250,1729858,2015-04-25 16:00:00,2015-04-25 12:00:00,5106961.0,1729784.0,289.0,74.0,5,258.0,71.0,289.0
4,A002,R051,02-00-00,LEXINGTON AVE,NQR456,BMT,04/25/2015,20:00:00,5107620,1729914,2015-04-25 20:00:00,2015-04-25 16:00:00,5107250.0,1729858.0,370.0,56.0,5,371.0,54.0,370.0


In [37]:
# Clean the exit counts row by row; counts above 4000 are treated as counter resets.
df_mta_clean['HOURLY_EXITS'] = df_mta_clean.apply(
    lambda reading: clean_hourly_turnstile_traffic(reading, reset_limit=4000, entries=False),
    axis=1,
)
df_mta_clean.head(5)

Unnamed: 0,C/A,UNIT,SCP,STATION,LINENAME,DIVISION,DATE,TIME,ENTRIES,EXITS,DATETIME,PREV_DATETIME,PREV_ENTRIES,PREV_EXITS,HOURLY_ENTRIES,HOURLY_EXITS,DAYOFWEEK,ENTRIES_MEDIAN,EXITS_MEDIAN,HOURLY_ENTRIES_v2
0,A002,R051,02-00-00,LEXINGTON AVE,NQR456,BMT,04/25/2015,04:00:00,5106810,1729649,2015-04-25 04:00:00,2015-04-25 00:00:00,5106770.0,1729635.0,40.0,14.0,5,31.0,7.0,40.0
1,A002,R051,02-00-00,LEXINGTON AVE,NQR456,BMT,04/25/2015,08:00:00,5106835,1729680,2015-04-25 08:00:00,2015-04-25 04:00:00,5106810.0,1729649.0,25.0,31.0,5,21.0,26.0,25.0
2,A002,R051,02-00-00,LEXINGTON AVE,NQR456,BMT,04/25/2015,12:00:00,5106961,1729784,2015-04-25 12:00:00,2015-04-25 08:00:00,5106835.0,1729680.0,126.0,104.0,5,110.0,99.0,126.0
3,A002,R051,02-00-00,LEXINGTON AVE,NQR456,BMT,04/25/2015,16:00:00,5107250,1729858,2015-04-25 16:00:00,2015-04-25 12:00:00,5106961.0,1729784.0,289.0,74.0,5,258.0,71.0,289.0
4,A002,R051,02-00-00,LEXINGTON AVE,NQR456,BMT,04/25/2015,20:00:00,5107620,1729914,2015-04-25 20:00:00,2015-04-25 16:00:00,5107250.0,1729858.0,370.0,56.0,5,371.0,54.0,370.0


Outlier values that could not be repaired with the median were set to `NaN`, so we need to drop rows containing `NaN` values again.

In [40]:
# Discard every row that still has a missing value in any column.
df_mta_clean.dropna(axis="index", how="any", inplace=True)

In [41]:
# Summary statistics for every column, numeric and non-numeric alike.
cleaned_summary = df_mta_clean.describe(include='all')
cleaned_summary

Unnamed: 0,C/A,UNIT,SCP,STATION,LINENAME,DIVISION,DATE,TIME,ENTRIES,EXITS,DATETIME,PREV_DATETIME,PREV_ENTRIES,PREV_EXITS,HOURLY_ENTRIES,HOURLY_EXITS,DAYOFWEEK,ENTRIES_MEDIAN,EXITS_MEDIAN,HOURLY_ENTRIES_v2
count,10334848,10334848,10334848,10334848,10334848.0,10334848,10334848,10334848,10334850.0,10334850.0,10334848,10334848,10334850.0,10334850.0,10334850.0,10334850.0,10334850.0,10334850.0,10334850.0,10334850.0
unique,752,471,241,572,128.0,6,364,85783,,,656130,656118,,,,,,,,
top,PTH22,R549,00-00-00,34 ST-PENN STA,1.0,IRT,04/29/2019,04:00:00,,,2020-06-03 04:00:00,2020-06-03 04:00:00,,,,,,,,
freq,62110,140205,1003570,219221,1243509.0,3763381,30961,885494,,,2538,2538,,,,,,,,
first,,,,,,,,,,,2015-04-25 04:00:00,2015-04-25 00:00:00,,,,,,,,
last,,,,,,,,,,,2020-06-26 23:58:45,2020-06-26 19:46:45,,,,,,,,
mean,,,,,,,,,38258870.0,31107810.0,,,38251940.0,31102060.0,154.9109,120.3064,2.990846,167.3644,129.0236,154.8991
std,,,,,,,,,204236200.0,185566700.0,,,204216300.0,185548000.0,246.6164,219.1696,1.999328,435867.3,3398.498,246.6238
min,,,,,,,,,0.0,0.0,,,0.0,0.0,0.0,0.0,0.0,-990806000.0,-7045118.0,-3428.5
25%,,,,,,,,,446601.0,207074.0,,,446575.0,207063.0,6.0,5.0,1.0,10.5,8.0,6.0


In [42]:
# Persist the cleaned dataset.
cleaned_pickle_path = 'cleaned_mta_turnstile_data_mayjune_20152020_median.pkl'
df_mta_clean.to_pickle(cleaned_pickle_path)