In [1]:
import os
import pandas as pd
import numpy as np
import re

In [10]:
def prep_csv(input_path, output_path = None):
    """Rename the columns of every CSV in a folder and save the cleaned copies.

    Each file directly inside ``input_path`` whose name ends in ``.csv``
    (sub-folders are not searched) is read, its columns are renamed to the
    snake_case names below, the ``earliness`` column is dropped when present,
    and the result is written under the same file name in ``output_path``.

    Parameters
    ----------
    input_path : str or path-like
        Folder containing the CSV files to convert.
    output_path : str or path-like, optional
        Folder that receives the converted files; it is created when it does
        not exist yet. When omitted (``None``) the files in ``input_path``
        are overwritten in place.

    Returns
    -------
    None
    """
    column_names = {
        'ServiceDate': 'service_date',
        'Route': 'route_id',
        'Direction': 'direction_id',
        'HalfTripId': 'half_trip_id',
        'Stop': 'stop_id',
        'Timepoint': 'time_point_id',
        'TimepointOrder': 'time_point_order',
        'PointType': 'point_type',
        'StandardType': 'standard_type',
        'Scheduled': 'scheduled',
        'Actual': 'actual',
        'ScheduledHeadway': 'scheduled_headway',
        'Headway': 'headway',
        'direction': 'direction_id'  # in case it's lowercase in some files
    }

    # os.path.join cannot take None, so the default has to resolve to a real
    # folder before any path is built.
    if output_path is None:
        output_path = input_path
    # An empty string means "current directory" to os.path.join but is
    # rejected by os.makedirs, hence the truthiness check.
    if output_path:
        os.makedirs(output_path, exist_ok=True)

    for filename in os.listdir(input_path):
        if not filename.endswith('.csv'):
            continue

        df = pd.read_csv(os.path.join(input_path, filename))

        # Headers that are not listed in the mapping are left unchanged.
        df = df.rename(columns=column_names)

        # errors='ignore' makes the drop a no-op for files without the column.
        df = df.drop(columns='earliness', errors='ignore')

        # Save to new location
        df.to_csv(os.path.join(output_path, filename), index=False)
            

In [None]:
# Folder holding the raw CSV files and folder that receives the converted
# copies. Both are placeholders: fill them in before running this cell.
inp_path = None
out_path = None

# prep_csv builds file paths with os.path.join, which cannot handle None, so
# the call is skipped until both placeholders have been replaced. This keeps
# a top-to-bottom run of the notebook from stopping here.
if inp_path is None or out_path is None:
    print("Set inp_path and out_path before running prep_csv.")
else:
    prep_csv(inp_path, out_path)

In [251]:
# Combine every CSV found under root_directory (sub-folders included) into a
# single file.
# NOTE(review): path_to_saved_csvs is not defined in the visible cells — it
# must be set before this cell runs.
root_directory = path_to_saved_csvs
combined_csv_path = 'data/Arrival_Departure/Working/covid.csv'
combined_csv_abspath = os.path.abspath(combined_csv_path)

combined_df = []

# Walk through all the folders and subfolders in the root directory
for foldername, subfolders, filenames in os.walk(root_directory):
    for filename in filenames:
        # Only CSV files are combined
        if not filename.endswith('.csv'):
            continue

        # Construct the full file path (including subfolders)
        file_path = os.path.join(foldername, filename)

        # If the output of an earlier run lives under root_directory, reading
        # it back would duplicate every row on each re-run of this cell.
        if os.path.abspath(file_path) == combined_csv_abspath:
            continue

        # Read the CSV file into a DataFrame and collect it
        df = pd.read_csv(file_path)
        combined_df.append(df)

# pd.concat rejects an empty list with a generic message; name the folder so
# a wrong path is easy to spot.
if not combined_df:
    raise ValueError(f"No CSV files found under {root_directory!r}")

# Concatenate all DataFrames into one
final_df = pd.concat(combined_df, ignore_index=True)

# Save the combined DataFrame to a new CSV file, creating the target folder
# first so a fresh checkout does not fail on the write.
os.makedirs(os.path.dirname(combined_csv_path), exist_ok=True)
final_df.to_csv(combined_csv_path, index=False)

print("CSV files combined successfully!")

CSV files combined successfully!


In [2]:
# Load the combined file. The identifier-like columns are declared as text at
# read time: casting them only after loading is too late, because by then
# pandas has already inferred a type and values that look numeric may have
# been parsed as numbers.
d = 'data/Arrival_Departure/Working/covid.csv'
text_columns = ['direction_id', 'time_point_id', 'route_id', 'point_type', 'standard_type']
df = pd.read_csv(d, dtype={name: str for name in text_columns})
print(df.dtypes)

# Kept from the original cell so every entry in these columns is a Python str.
# NOTE(review): on object columns astype(str) also turns missing values into
# the text 'nan' — confirm that downstream cells expect that.
df[text_columns] = df[text_columns].astype(str)
print(df.dtypes)


service_date          object
route_id              object
direction_id          object
half_trip_id         float64
stop_id              float64
time_point_id         object
time_point_order     float64
point_type            object
standard_type         object
scheduled             object
actual                object
scheduled_headway    float64
headway              float64
dtype: object
service_date          object
route_id              object
direction_id          object
half_trip_id         float64
stop_id              float64
time_point_id         object
time_point_order     float64
point_type            object
standard_type         object
scheduled             object
actual                object
scheduled_headway    float64
headway              float64
dtype: object


In [None]:
# Work on a copy of the frame loaded from the combined CSV: `data` is not
# defined by any earlier cell, and copying keeps `df` untouched so this cell
# can be re-run safely.
data = df.copy()

# Parse the timestamps; values that cannot be parsed become NaT.
data['scheduled'] = pd.to_datetime(data['scheduled'], errors='coerce')
data['actual'] = pd.to_datetime(data['actual'], errors='coerce')

# Delay in seconds: actual time minus scheduled time for the same row
data['delay'] = (data['actual'] - data['scheduled']).dt.total_seconds()

# Order the rows so that, within one half trip, time points follow each other
data = data.sort_values(by=['route_id', 'service_date', 'half_trip_id', 'time_point_order'])

# Scheduled time of the preceding time point within the same
# (route, service date, half trip). The first time point of each half trip
# has no predecessor and gets NaT.
data['previous_scheduled'] = data.groupby(['route_id', 'service_date', 'half_trip_id'])['scheduled'].shift(1)

# Seconds between this row's actual time and the preceding time point's
# scheduled time.
# Rows without a predecessor, or without a parsable actual time, stay NaN:
# filling them with 0 would record a missing observation as a zero-second wait.
data['wait_time'] = (data['actual'] - data['previous_scheduled']).dt.total_seconds()

# Check the data after preprocessing
data[['service_date', 'route_id', 'scheduled', 'actual', 'delay', 'wait_time']].head()