In [1]:
# Imports
import pandas as pd
from glob import glob
from datetime import datetime, date, time, timedelta

In [2]:
# Read data files
list_of_data_files = glob('../data/*.txt')
list_of_data_files

['../data/turnstile_190330.txt',
 '../data/turnstile_190518.txt',
 '../data/turnstile_190525.txt',
 '../data/turnstile_190323.txt',
 '../data/turnstile_190309.txt',
 '../data/turnstile_190420.txt',
 '../data/turnstile_190427.txt',
 '../data/turnstile_190601.txt',
 '../data/turnstile_190406.txt',
 '../data/turnstile_190413.txt',
 '../data/turnstile_190511.txt',
 '../data/turnstile_190504.txt',
 '../data/turnstile_190316.txt',
 '../data/turnstile_190302.txt']

In [3]:
# Read data files into pandas dataframes
list_of_dataframes = [pd.read_csv(file, parse_dates=['DATE']) for file in list_of_data_files]
df = pd.concat(list_of_dataframes)
df.head()

Unnamed: 0,C/A,UNIT,SCP,STATION,LINENAME,DIVISION,DATE,TIME,DESC,ENTRIES,EXITS
0,A002,R051,02-00-00,59 ST,NQR456W,BMT,2019-03-23,00:00:00,REGULAR,6989774,2370411
1,A002,R051,02-00-00,59 ST,NQR456W,BMT,2019-03-23,04:00:00,REGULAR,6989795,2370413
2,A002,R051,02-00-00,59 ST,NQR456W,BMT,2019-03-23,08:00:00,REGULAR,6989813,2370436
3,A002,R051,02-00-00,59 ST,NQR456W,BMT,2019-03-23,12:00:00,REGULAR,6989924,2370512
4,A002,R051,02-00-00,59 ST,NQR456W,BMT,2019-03-23,16:00:00,REGULAR,6990200,2370573


In [6]:
df.columns = [x.strip() for x in df.columns]
df.columns

Index(['C/A', 'UNIT', 'SCP', 'STATION', 'LINENAME', 'DIVISION', 'DATE', 'TIME',
       'DESC', 'ENTRIES', 'EXITS'],
      dtype='object')

In [7]:
# Filter data to daylight hours
times = ['12:00:00', '16:00:00', '20:00:00']
df = df[df['TIME'].isin(times)]

In [8]:
df.head(10)

Unnamed: 0,C/A,UNIT,SCP,STATION,LINENAME,DIVISION,DATE,TIME,DESC,ENTRIES,EXITS
3,A002,R051,02-00-00,59 ST,NQR456W,BMT,2019-03-23,12:00:00,REGULAR,6989924,2370512
4,A002,R051,02-00-00,59 ST,NQR456W,BMT,2019-03-23,16:00:00,REGULAR,6990200,2370573
5,A002,R051,02-00-00,59 ST,NQR456W,BMT,2019-03-23,20:00:00,REGULAR,6990562,2370623
9,A002,R051,02-00-00,59 ST,NQR456W,BMT,2019-03-24,12:00:00,REGULAR,6990860,2370731
10,A002,R051,02-00-00,59 ST,NQR456W,BMT,2019-03-24,16:00:00,REGULAR,6991054,2370788
11,A002,R051,02-00-00,59 ST,NQR456W,BMT,2019-03-24,20:00:00,REGULAR,6991295,2370839
15,A002,R051,02-00-00,59 ST,NQR456W,BMT,2019-03-25,12:00:00,REGULAR,6991651,2371197
16,A002,R051,02-00-00,59 ST,NQR456W,BMT,2019-03-25,16:00:00,REGULAR,6991935,2371230
17,A002,R051,02-00-00,59 ST,NQR456W,BMT,2019-03-25,20:00:00,REGULAR,6992684,2371283
21,A002,R051,02-00-00,59 ST,NQR456W,BMT,2019-03-26,12:00:00,REGULAR,6993111,2371709


In [9]:
# Add a weekday column
df['WEEKDAY'] = df.apply(lambda row: row['DATE'].weekday(), axis=1)
df.head()

Unnamed: 0,C/A,UNIT,SCP,STATION,LINENAME,DIVISION,DATE,TIME,DESC,ENTRIES,EXITS,WEEKDAY
3,A002,R051,02-00-00,59 ST,NQR456W,BMT,2019-03-23,12:00:00,REGULAR,6989924,2370512,5
4,A002,R051,02-00-00,59 ST,NQR456W,BMT,2019-03-23,16:00:00,REGULAR,6990200,2370573,5
5,A002,R051,02-00-00,59 ST,NQR456W,BMT,2019-03-23,20:00:00,REGULAR,6990562,2370623,5
9,A002,R051,02-00-00,59 ST,NQR456W,BMT,2019-03-24,12:00:00,REGULAR,6990860,2370731,6
10,A002,R051,02-00-00,59 ST,NQR456W,BMT,2019-03-24,16:00:00,REGULAR,6991054,2370788,6


In [11]:
# Calculate the differences between rows for entries and exits
df['EXIT_COUNT'] = df['EXITS'] - df['EXITS'].shift(1) 

In [12]:
df.head()

Unnamed: 0,C/A,UNIT,SCP,STATION,LINENAME,DIVISION,DATE,TIME,DESC,ENTRIES,EXITS,WEEKDAY,EXIT_COUNT
3,A002,R051,02-00-00,59 ST,NQR456W,BMT,2019-03-23,12:00:00,REGULAR,6989924,2370512,5,
4,A002,R051,02-00-00,59 ST,NQR456W,BMT,2019-03-23,16:00:00,REGULAR,6990200,2370573,5,61.0
5,A002,R051,02-00-00,59 ST,NQR456W,BMT,2019-03-23,20:00:00,REGULAR,6990562,2370623,5,50.0
9,A002,R051,02-00-00,59 ST,NQR456W,BMT,2019-03-24,12:00:00,REGULAR,6990860,2370731,6,108.0
10,A002,R051,02-00-00,59 ST,NQR456W,BMT,2019-03-24,16:00:00,REGULAR,6991054,2370788,6,57.0


In [None]:
gd = df.groupby(['STATION', 'WEEKDAY', 'TIME'], as_index=False).count()

In [None]:
gd.head()