In [106]:
import pandas as pd
import glob
import os
from datetime import datetime, timedelta

In [169]:
# Zero-padded month numbers to process; each entry is substituted into the
# "2018_{}" folder and file names used by the per-month processing loop.
# Alternative (currently disabled) selection:
#   months = ["03", "04", "05", "06", "07", "08", "09"]
months = ["04"]

In [170]:
def clean_delay_to_minutes(sec):
    """Convert a delay given in seconds to minutes, clamping negatives to zero.

    Args:
        sec: Delay in seconds (any numeric value supporting ``<`` and ``/``).

    Returns:
        ``0.0`` when ``sec`` is negative, otherwise ``sec / 60.0``.
    """
    # Negative delays are reported as no delay at all rather than as a
    # negative number of minutes.
    return 0.0 if sec < 0 else sec / 60.0

In [171]:
def adjust_actual_times(row):
    """Normalise a record's delay and undo 12h/24h shifts in ``actual_time``.

    On entry ``row["delay_minutes"]`` holds the delay in *seconds*; it is
    converted in place to clamped minutes via ``clean_delay_to_minutes``.
    If the resulting delay exceeds 700 minutes, ``row["actual_time"]`` is
    treated as mis-parsed and shifted back (24 hours when the delay exceeds
    1400 minutes, 12 hours otherwise), and the delay is recomputed from the
    corrected ``actual_time`` and ``row["scheduled_time"]``.

    Args:
        row: Mutable mapping with ``"delay_minutes"``, ``"actual_time"`` and
            ``"scheduled_time"`` entries. It is modified in place.

    Returns:
        The same ``row`` object when ``actual_time`` was corrected, otherwise
        ``None`` (the in-place minutes conversion has still been applied).
    """
    minutes_late = clean_delay_to_minutes(row["delay_minutes"])
    row["delay_minutes"] = minutes_late

    # Plausible delay: nothing to correct. Written as "not >" so that a
    # missing (NaN) delay also takes this path, as any comparison with NaN
    # is false.
    if not (minutes_late > 700):
        return None

    # A delay of roughly a full day means actual_time is off by 24 hours;
    # otherwise (roughly half a day) it is off by 12 hours.
    correction = timedelta(hours=24) if minutes_late > 1400 else timedelta(hours=12)
    row["actual_time"] = row["actual_time"] - correction

    # Recompute the delay from the corrected timestamp.
    elapsed = row["actual_time"] - row["scheduled_time"]
    row["delay_minutes"] = clean_delay_to_minutes(elapsed.total_seconds())
    return row

In [172]:
# For every selected month: load all raw CSVs from that month's folder, compute
# per-row delays, correct 12h/24h timestamp shifts via adjust_actual_times, and
# write one cleaned CSV per month. After the loop, `df` holds the cleaned frame
# of the last month processed.
for month in months:

    # read in CSVs for month
    # NOTE(review): machine-specific absolute path; adjust when running elsewhere.
    path = "/Users/pranav/Documents/development/njtransit/csv/2018_{}".format(month)
    monthFiles = sorted(glob.glob(os.path.join(path, "*.csv")))
    if not monthFiles:
        # Without this check pd.concat fails with the opaque message
        # "No objects to concatenate"; report which folder came up empty.
        raise ValueError("no CSV files found in {}".format(path))
    # ignore_index gives the combined frame one unique running index instead of
    # repeating each file's own 0..n-1 index.
    df = pd.concat((pd.read_csv(f) for f in monthFiles), ignore_index=True)
    print("done reading csv {}".format(month))

    # series-wide operations
    # The raw header is replaced positionally, so the files are assumed to have
    # exactly these 12 columns in this order -- TODO confirm against the source CSVs.
    df.columns = ["train_id", "date", "stop_sequence", "from", "from_id", "to", "to_id", "scheduled_time", "actual_time", "status", "line", "type"]
    df['scheduled_time'] = pd.to_datetime(df['scheduled_time'])
    df['actual_time'] = pd.to_datetime(df['actual_time'])
    # Despite the column name, this holds SECONDS at this point;
    # adjust_actual_times converts it to minutes below.
    df['delay_minutes']=(df['actual_time']-df['scheduled_time']).dt.total_seconds()
    print("done series-wide {}".format(month))

    # adjust/clean times
    df_records = df.to_dict('records')
    for idx, record in enumerate(df_records):
        # adjust_actual_times mutates the record in place and returns it only
        # when actual_time was corrected (None otherwise).
        record = adjust_actual_times(record)
        if record is not None:
            df_records[idx] = record
    print("done adjust/clean {}".format(month))

    # generate cleaned df
    df = pd.DataFrame(df_records)
    # Output column order: date first, delay_minutes placed after the timestamps.
    columns = ["date", "train_id", "stop_sequence", "from", "from_id", "to", "to_id", "scheduled_time", "actual_time", "delay_minutes", "status", "line", "type"]
    df = df[columns]
    df.to_csv("/Users/pranav/Documents/development/njtransit/csv/monthly/2018_{}.csv".format(month), index=False)
    print("done generating df {}".format(month))

done reading csv 04
done series-wide 04
done adjust/clean 04
done generating df 04


In [173]:
# Preview the first rows of `df`, which at this point holds the cleaned frame
# of the last month processed by the loop above.
df.head()

Unnamed: 0,date,train_id,stop_sequence,from,from_id,to,to_id,scheduled_time,actual_time,delay_minutes,status,line,type
0,2018-04-01,7837,1.0,New York Penn Station,105,New York Penn Station,105,2018-04-01 11:14:00,2018-04-01 11:14:10,0.166667,departed,Northeast Corrdr,NJ Transit
1,2018-04-01,7837,2.0,New York Penn Station,105,Secaucus Upper Lvl,38187,2018-04-01 11:23:00,2018-04-01 11:26:05,3.083333,departed,Northeast Corrdr,NJ Transit
2,2018-04-01,7837,3.0,Secaucus Upper Lvl,38187,Newark Penn Station,107,2018-04-01 11:32:00,2018-04-01 11:36:07,4.116667,departed,Northeast Corrdr,NJ Transit
3,2018-04-01,7837,4.0,Newark Penn Station,107,Newark Airport,37953,2018-04-01 11:37:00,2018-04-01 11:40:08,3.133333,departed,Northeast Corrdr,NJ Transit
4,2018-04-01,7837,5.0,Newark Airport,37953,Elizabeth,41,2018-04-01 11:42:00,2018-04-01 11:46:04,4.066667,departed,Northeast Corrdr,NJ Transit
