### In this file, I just want to add some feature columns to the data that could be useful in our analysis in the future: 
1. Day of week
1. Time of day (Morning, Afternoon, Evening, Late)
1. Create a better column that represents the actual number of entries (Difficult!!)

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
df_raw = pd.read_csv('Turnstile_Usage_Data__2018.csv')
df_raw.head()

Unnamed: 0,C/A,Unit,SCP,Station,Line Name,Division,Date,Time,Description,Entries,Exits
0,R232A,R176,03-06-00,33 ST,6,IRT,12/23/2017,20:00:00,REGULAR,102417472,68072550
1,R409,R449,01-00-02,E 149 ST,6,IRT,12/23/2017,08:00:00,REGULAR,1985376,6042304
2,R256,R182,00-00-02,116 ST,6,IRT,12/23/2017,04:00:00,REGULAR,765700,471312
3,R246,R177,00-03-02,68ST-HUNTER CO,6,IRT,12/23/2017,20:00:00,REGULAR,14882041,10581878
4,R227,R131,00-00-05,23 ST,6,IRT,12/23/2017,08:00:00,REGULAR,1524615,537740


In [4]:
df_working = df_raw.copy()

### 1. Add a weekday Column

In [5]:
def Add_Weekday(data_frame, column):
    dmap = {0:'Mon', 1: 'Tue', 2: 'Wed', 3:'Thu', 4:'Fri', 5:'Sat', 6:'Sun'}
    #Convert column to date
    data_frame[column] = pd.to_datetime(data_frame[column])
    #Add column which shows the Weekday in integer
    data_frame['Day_Number'] = data_frame[column].apply(lambda x: x.dayofweek)
    #Add column which shows the Weekday in words
    data_frame['Weekday'] = data_frame['Day_Number'].map(dmap)
    return data_frame

In [6]:
df_working = Add_Weekday(df_working, 'Date')

In [176]:
df_working.sample(5)

Unnamed: 0,C/A,Unit,SCP,Station,Line Name,Division,Date,Time,Description,Entries,Exits,Day_Number,Weekday,Hour,Time_of_Day,unique_ident
221128,R231,R176,00-00-00,33 ST,6,IRT,2018-05-10,09:00:00,REGULAR,5014550,3820571,3,Thu,9,Worktime,R231R17600-00-00
514457,R423,R429,00-03-01,PELHAM BAY PARK,6,IRT,2018-11-21,15:00:00,REGULAR,16849878,9718802,2,Wed,15,Worktime,R423R42900-03-01
393131,R407,R448,01-00-01,E 143/ST MARY'S,6,IRT,2018-08-22,09:00:00,REGULAR,206540,532465,2,Wed,9,Worktime,R407R44801-00-01
269137,R406,R448,00-00-00,E 143/ST MARY'S,6,IRT,2018-06-08,13:00:00,REGULAR,2520421,1317154,4,Fri,13,Worktime,R406R44800-00-00
406843,R252,R180,00-03-00,103 ST,6,IRT,2018-08-30,09:00:00,REGULAR,244513,317397,3,Thu,9,Worktime,R252R18000-03-00


### 2. Add a Time of Day column

In [181]:
print(type(df_working['Time'][0]))
#print(df_working['Time'].unique()) #There are several times of day present

<class 'str'>


In [178]:
def DayTime(data_frame, column):
    #Extract the Hour from the Time Column
    data_frame['Hour'] = data_frame[column].apply(lambda x: int(x.split(':')[0]))
    #Use the Hour column to create a Column which distinguishes time of day
    data_frame.loc[(data_frame['Hour'] >= 8) & (data_frame['Hour'] <= 18), 'Time_of_Day'] = 'Worktime'
    data_frame.loc[(data_frame['Hour'] > 18) & (data_frame['Hour'] <= 24), 'Time_of_Day'] = 'Evening'
    data_frame.loc[(data_frame['Hour'] < 8), 'Time_of_Day'] = 'Late'
    
    return data_frame

In [179]:
df_working = DayTime(df_working, 'Time')

In [180]:
df_working.sample(5)

Unnamed: 0,C/A,Unit,SCP,Station,Line Name,Division,Date,Time,Description,Entries,Exits,Day_Number,Weekday,Hour,Time_of_Day,unique_ident
158620,R405,R447,01-00-01,CYPRESS AV,6,IRT,2018-04-03,05:00:00,REGULAR,679602,955625,1,Tue,5,Late,R405R44701-00-01
120280,R421,R427,00-06-01,MIDDLETOWN RD,6,IRT,2018-03-04,16:00:00,REGULAR,0,196,6,Sun,16,Worktime,R421R42700-06-01
365867,R408,R449,00-00-02,E 149 ST,6,IRT,2018-08-05,21:00:00,REGULAR,4890859,2796869,6,Sun,21,Evening,R408R44900-00-02
323385,R226,R131,02-00-00,23 ST,6,IRT,2018-07-10,17:00:00,REGULAR,1998541024,1545580931,1,Tue,17,Worktime,R226R13102-00-00
562573,R227A,R131,01-00-00,23 ST,6,IRT,2018-12-21,04:00:00,REGULAR,273988,161113,4,Fri,4,Late,R227AR13101-00-00


### 3: Figure out how to get a starting point for Entries (Unsure of this method!):
1. Get starting point at August 1st 00:00:00 for every station
1. Create a unique Identifier column which is basically combo of CA, Unit and SCP
1. Create a new DF (df_init) with just the columns of Entries and that unique Identifier
1. Merge that df_init and the main df_working dataframe
1. Create a column which takes the difference between the Cumulative Entries column and the Initial Entries column from

In [155]:
df_aug = df_working.copy()
df_working['unique_ident']= df_working['C/A'] + df_working['Unit']+df_working['SCP'] 

In [156]:
#Create a unique identifier column
df_aug['unique_ident'] = df_aug['C/A'] + df_aug['Unit']+df_aug['SCP'] 

In [157]:
df_aug['unique_ident'].value_counts().sort_values

<bound method Series.sort_values of R417R22200-00-01    2192
R417R22200-03-03    2191
R417R22200-03-02    2191
R417R22200-00-02    2191
R417R22200-03-01    2191
                    ... 
OB01R45900-00-00     691
OB01R45900-00-01     691
OB01R45900-03-01     691
OB01R45900-00-02     690
OB01R45900-03-00     686
Name: unique_ident, Length: 283, dtype: int64>

In [158]:
df_aug.head()

Unnamed: 0,C/A,Unit,SCP,Station,Line Name,Division,Date,Time,Description,Entries,Exits,Day_Number,Weekday,Hour,Time_of_Day,unique_ident
0,R232A,R176,03-06-00,33 ST,6,IRT,2017-12-23,20:00:00,REGULAR,102417472,68072550,5,Sat,20,Evening,R232AR17603-06-00
1,R409,R449,01-00-02,E 149 ST,6,IRT,2017-12-23,08:00:00,REGULAR,1985376,6042304,5,Sat,8,Worktime,R409R44901-00-02
2,R256,R182,00-00-02,116 ST,6,IRT,2017-12-23,04:00:00,REGULAR,765700,471312,5,Sat,4,Late,R256R18200-00-02
3,R246,R177,00-03-02,68ST-HUNTER CO,6,IRT,2017-12-23,20:00:00,REGULAR,14882041,10581878,5,Sat,20,Evening,R246R17700-03-02
4,R227,R131,00-00-05,23 ST,6,IRT,2017-12-23,08:00:00,REGULAR,1524615,537740,5,Sat,8,Worktime,R227R13100-00-05


In [159]:
df_aug = df_aug[(df_aug['Date'] == '2018-08-01')]

In [160]:
#It looks like we lost a few stations, For now we can assume that the lost ones were low 
#traffic since they did not have any data collected on 8/1/2018 
df_aug['unique_ident'].value_counts().sort_values

<bound method Series.sort_values of R252R18000-03-02    26
R252R18000-00-00    26
R252R18000-03-01    26
R252R18000-00-02    26
R252R18000-00-01    26
                    ..
R409R44901-00-02     6
R246R17700-03-02     6
R248R17800-00-03     6
R419R32600-03-00     6
R418R10600-00-00     5
Name: unique_ident, Length: 264, dtype: int64>

In [161]:
df_aug = df_aug.drop_duplicates(subset=['unique_ident'], keep='first')

In [162]:
#Good to go!
df_aug['unique_ident'].value_counts().sort_values

<bound method Series.sort_values of R420R10700-00-00    1
R406R44800-00-00    1
R414R16200-03-00    1
R242R04901-03-03    1
R412R14600-06-01    1
                   ..
R416R24500-03-01    1
R409R44901-00-02    1
R246R17700-03-02    1
R248R17800-00-03    1
R419R32600-03-00    1
Name: unique_ident, Length: 264, dtype: int64>

In [163]:
df_init = df_aug[['Entries', 'unique_ident']].copy()

In [165]:
df_join = df_working.merge(df_init, on='unique_ident')

In [171]:
df_join['Entries_Difference'] = df_join['Entries_x'] - df_join['Entries_y']

In [175]:
df_join.sample(10)

Unnamed: 0,C/A,Unit,SCP,Station,Line Name,Division,Date,Time,Description,Entries_x,Exits,Day_Number,Weekday,Hour,Time_of_Day,unique_ident,Entries_y,Entries_Difference
511239,R401,R445,00-00-01,3 AV 138 ST,6,IRT,2018-09-23,17:00:00,REGULAR,1144227,1097591,6,Sun,17,Worktime,R401R44500-00-01,1068987,75240
544576,R227,R131,00-00-01,23 ST,6,IRT,2018-02-24,20:00:00,REGULAR,15382122,9803861,5,Sat,20,Evening,R227R13100-00-01,15623211,-241089
541089,R416,R245,00-03-00,ST LAWRENCE AV,6,IRT,2018-07-22,00:00:00,REGULAR,4803087,2204916,6,Sun,0,Late,R416R24500-03-00,4813335,-10248
540397,R416,R245,00-03-00,ST LAWRENCE AV,6,IRT,2018-03-29,12:00:00,REGULAR,4684826,2140617,3,Thu,12,Worktime,R416R24500-03-00,4813335,-128509
198387,R231,R176,00-00-02,33 ST,6,IRT,2018-04-20,01:00:00,REGULAR,11945282,5577797,4,Fri,1,Late,R231R17600-00-02,12064556,-119274
527473,R409,R449,01-00-01,E 149 ST,6,IRT,2018-03-31,13:00:00,REGULAR,2288201,3012386,5,Sat,13,Worktime,R409R44901-00-01,2333096,-44895
506068,R414,R162,00-00-01,ELDER AV,6,IRT,2018-04-26,17:00:00,REGULAR,669250,861384,3,Thu,17,Worktime,R414R16200-00-01,770201,-100951
514165,R220,R160,01-00-00,ASTOR PL,6,IRT,2018-01-24,20:00:00,REGULAR,880506,2743370,2,Wed,20,Evening,R220R16001-00-00,945253,-64747
422239,R417,R222,00-00-01,PARKCHESTER,6,IRT,2018-04-28,05:00:00,REGULAR,2058998,225491,5,Sat,5,Late,R417R22200-00-01,2380963,-321965
559217,OB01,R459,00-00-02,ORCHARD BEACH,6,IND,2018-06-02,12:00:00,REGULAR,95,8,5,Sat,12,Worktime,OB01R45900-00-02,121,-26
