# Initial Setup

In [1]:
import pandas as pd

# Raw tab-separated forecast log. The file is read with header=None, so the
# column names below are assigned positionally.
path = r"C:\Users\athen\Documents\GitHub\TCD_Dissertation\archive\raw_forecast_logs.tsv"
chunksize = 500000
cols = ['DateTime', 'Line', 'Origin', 'Direction', 'Destination', 'Due in', 'Minutes', 'Status Message']

# Cleaned chunks are collected here and concatenated once at the end.
snapshot_counts_chunks = []

for raw_chunk in pd.read_csv(
    path,
    sep='\t',
    header=None,
    names=cols,
    chunksize=chunksize,
    on_bad_lines='skip',
    low_memory=False,
):
    # Values that fail to parse are coerced to NaT / NaN rather than raising.
    parsed_time = pd.to_datetime(raw_chunk['DateTime'], format='%m/%d/%Y %H:%M:%S', errors='coerce')
    parsed_minutes = pd.to_numeric(raw_chunk['Minutes'], errors='coerce')
    raw_chunk['DateTime'] = parsed_time
    raw_chunk['Minutes'] = parsed_minutes

    # Keep only rows where both the timestamp and the minutes value parsed.
    is_valid = parsed_time.notna() & parsed_minutes.notna()
    clean_chunk = raw_chunk[is_valid]

    # Skip chunks that were filtered down to nothing.
    if len(clean_chunk) > 0:
        snapshot_counts_chunks.append(clean_chunk)

# Single concatenation with a fresh 0..n-1 index.
df = pd.concat(snapshot_counts_chunks, ignore_index=True)


In [None]:
# Column subset of the loaded data; note the cross-tabulation below is built
# from `df` directly, not from `counts`.
counts = df[["Origin", "Destination", "Direction", "Status Message"]]

# Number of rows for every (Direction, Destination) pair ...
pair_sizes = df.groupby(["Direction", "Destination"]).size()
# ... pivoted so destinations become columns; pairs that never occur show 0.
direction_counts = pair_sizes.unstack(fill_value=0)
print(direction_counts)


Destination      BEL     BLA       BRI       BRO      CON  DOM     HEU  \
Direction                                                                
Inbound to   1941473  148501         0  23485371  8897689    0       0   
Outbound to        0       0  27205800         0        0  465  249911   

Destination       PAR     RED       SAG       SAN   STS       TAL       TPT  
Direction                                                                    
Inbound to   16348033       0         0         0  3851         0  28771267  
Outbound to         0  342470  17067873  13025550     0  23399466         0  


In [None]:
# NOTE(review): no visible cell creates the 'ServiceDay' column, so on a fresh
# kernel this line raises KeyError. It presumably came from a cell that was
# deleted or run out of order — confirm and restore its derivation upstream.
df['ServiceDay'] = pd.to_datetime(df['ServiceDay'])

# Split the data into four consecutive study periods using half-open
# [start, next_start) intervals. Comparing with `<= 'YYYY-MM-DD'` would test
# against midnight of the end date and silently drop any row carrying a later
# time on that day; `< next day` keeps the periods gap-free either way and is
# identical when ServiceDay holds dates only.
service_day = df['ServiceDay']

pre_covid = df[(service_day >= '2020-01-20') & (service_day < '2020-03-21')]
covid_lockdown = df[(service_day >= '2020-03-21') & (service_day < '2021-06-01')]
covid_recovery = df[(service_day >= '2021-06-01') & (service_day < '2022-01-22')]
post_covid = df[(service_day >= '2022-01-22') & (service_day < '2022-10-23')]


In [4]:
# Report how many rows landed in each study period (thousands-separated).
period_summary = [
    f"Pre-COVID: {len(pre_covid):,} rows",
    f"COVID Lockdown: {len(covid_lockdown):,} rows",
    f"COVID Recovery: {len(covid_recovery):,} rows",
    f"Post-COVID: {len(post_covid):,} rows",
]
print("\n".join(period_summary))


Pre-COVID: 10,536,052 rows
COVID Lockdown: 70,119,473 rows
COVID Recovery: 35,917,583 rows
Post-COVID: 43,612,616 rows


In [None]:
import os
# Directory that receives one CSV per study period. The name must be spelled
# `output_dir`: every line below reads it, and a misspelled assignment would
# leave them using an undefined (or stale) variable.
# NOTE(review): the later month-splitting cells read these files from
# per-period subfolders (periods\precovid, periods\lockdown, ...), not from
# this directory — confirm whether the files are moved by hand in between.
output_dir = r"C:\Users\athen\Documents\GitHub\TCD_Dissertation\archive\periods"
os.makedirs(output_dir, exist_ok=True)

# Write each period frame to <output_dir>/<name>.csv without the index column.
pre_covid.to_csv(os.path.join(output_dir, "pre_covid.csv"), index=False)
covid_lockdown.to_csv(os.path.join(output_dir, "covid_lockdown.csv"), index=False)
covid_recovery.to_csv(os.path.join(output_dir, "covid_recovery.csv"), index=False)
post_covid.to_csv(os.path.join(output_dir, "post_covid.csv"), index=False)


In [None]:
import os
import pandas as pd

# Split the pre-COVID period CSV into one CSV per calendar month.
# File path and output directory
file_path = r"C:\Users\athen\Documents\GitHub\TCD_Dissertation\archive\periods\precovid\pre_covid.csv"
output_dir = r"C:\Users\athen\Documents\GitHub\TCD_Dissertation\archive\periods\precovid\months"
os.makedirs(output_dir, exist_ok=True)

chunksize = 500_000
# Monthly files already started during THIS run of the cell.
written_files = set()

# Process and save each chunk individually so the full period never has to fit in memory.
for chunk in pd.read_csv(file_path, chunksize=chunksize, parse_dates=["DateTime", "ServiceDay"]):
    # Helper columns used for grouping; they are also written to the output files.
    chunk['Month'] = chunk['ServiceDay'].dt.strftime('%B').str.lower()
    chunk['Year'] = chunk['ServiceDay'].dt.year

    # Rows whose ServiceDay is missing get no (Year, Month) key and are skipped
    # by groupby's default NaN handling.
    for (year, month), group in chunk.groupby(['Year', 'Month']):
        filename = f"pre_covid_{month}_{year}.csv"
        filepath = os.path.join(output_dir, filename)

        # First write to a file in this run truncates it and emits the header;
        # later writes append. Always appending would duplicate every row (and
        # insert a second header line) whenever the cell is re-run.
        is_first_write = filename not in written_files
        group.to_csv(
            filepath,
            mode='w' if is_first_write else 'a',
            header=is_first_write,
            index=False,
        )
        written_files.add(filename)

        print(f"Appended {len(group):,} rows to {filename}")


Appended 500,000 rows to pre_covid_january_2020.csv
Appended 500,000 rows to pre_covid_january_2020.csv
Appended 500,000 rows to pre_covid_january_2020.csv
Appended 500,000 rows to pre_covid_january_2020.csv
Appended 337,732 rows to pre_covid_february_2020.csv
Appended 162,268 rows to pre_covid_january_2020.csv
Appended 500,000 rows to pre_covid_february_2020.csv
Appended 500,000 rows to pre_covid_february_2020.csv
Appended 500,000 rows to pre_covid_february_2020.csv
Appended 500,000 rows to pre_covid_february_2020.csv
Appended 500,000 rows to pre_covid_february_2020.csv
Appended 500,000 rows to pre_covid_february_2020.csv
Appended 500,000 rows to pre_covid_february_2020.csv
Appended 500,000 rows to pre_covid_february_2020.csv
Appended 500,000 rows to pre_covid_february_2020.csv
Appended 177,209 rows to pre_covid_february_2020.csv
Appended 322,791 rows to pre_covid_march_2020.csv
Appended 500,000 rows to pre_covid_march_2020.csv
Appended 500,000 rows to pre_covid_march_2020.csv
Appende

In [None]:
import os
import pandas as pd

# Split the COVID-lockdown period CSV into one CSV per calendar month.
# File path and output directory
file_path = r"C:\Users\athen\Documents\GitHub\TCD_Dissertation\archive\periods\lockdown\covid_lockdown.csv"
output_dir = r"C:\Users\athen\Documents\GitHub\TCD_Dissertation\archive\periods\lockdown\months"
os.makedirs(output_dir, exist_ok=True)

chunksize = 500_000
# Monthly files already started during THIS run of the cell.
written_files = set()

# Process and save each chunk individually so the full period never has to fit in memory.
for chunk in pd.read_csv(file_path, chunksize=chunksize, parse_dates=["DateTime", "ServiceDay"]):
    # Helper columns used for grouping; they are also written to the output files.
    chunk['Month'] = chunk['ServiceDay'].dt.strftime('%B').str.lower()
    chunk['Year'] = chunk['ServiceDay'].dt.year

    # Rows whose ServiceDay is missing get no (Year, Month) key and are skipped
    # by groupby's default NaN handling.
    for (year, month), group in chunk.groupby(['Year', 'Month']):
        filename = f"lockdown_{month}_{year}.csv"
        filepath = os.path.join(output_dir, filename)

        # First write to a file in this run truncates it and emits the header;
        # later writes append. Always appending would duplicate every row (and
        # insert a second header line) whenever the cell is re-run.
        is_first_write = filename not in written_files
        group.to_csv(
            filepath,
            mode='w' if is_first_write else 'a',
            header=is_first_write,
            index=False,
        )
        written_files.add(filename)

        print(f"Appended {len(group):,} rows to {filename}")


Appended 500,000 rows to lockdown_march_2020.csv
Appended 500,000 rows to lockdown_march_2020.csv
Appended 500,000 rows to lockdown_march_2020.csv
Appended 219,207 rows to lockdown_april_2020.csv
Appended 280,793 rows to lockdown_march_2020.csv
Appended 500,000 rows to lockdown_april_2020.csv
Appended 500,000 rows to lockdown_april_2020.csv
Appended 500,000 rows to lockdown_april_2020.csv
Appended 500,000 rows to lockdown_april_2020.csv
Appended 500,000 rows to lockdown_april_2020.csv
Appended 500,000 rows to lockdown_april_2020.csv
Appended 500,000 rows to lockdown_april_2020.csv
Appended 500,000 rows to lockdown_april_2020.csv
Appended 500,000 rows to lockdown_april_2020.csv
Appended 145,279 rows to lockdown_april_2020.csv
Appended 354,721 rows to lockdown_may_2020.csv
Appended 500,000 rows to lockdown_may_2020.csv
Appended 500,000 rows to lockdown_may_2020.csv
Appended 500,000 rows to lockdown_may_2020.csv
Appended 500,000 rows to lockdown_may_2020.csv
Appended 500,000 rows to lockd

In [None]:
import os
import pandas as pd

# Split the COVID-recovery period CSV into one CSV per calendar month.
# File path and output directory
file_path = r"C:\Users\athen\Documents\GitHub\TCD_Dissertation\archive\periods\recovery\covid_recovery.csv"
output_dir = r"C:\Users\athen\Documents\GitHub\TCD_Dissertation\archive\periods\recovery\months"
os.makedirs(output_dir, exist_ok=True)

chunksize = 500_000
# Monthly files already started during THIS run of the cell.
written_files = set()

# Process and save each chunk individually so the full period never has to fit in memory.
for chunk in pd.read_csv(file_path, chunksize=chunksize, parse_dates=["DateTime", "ServiceDay"]):
    # Helper columns used for grouping; they are also written to the output files.
    chunk['Month'] = chunk['ServiceDay'].dt.strftime('%B').str.lower()
    chunk['Year'] = chunk['ServiceDay'].dt.year

    # Rows whose ServiceDay is missing get no (Year, Month) key and are skipped
    # by groupby's default NaN handling.
    for (year, month), group in chunk.groupby(['Year', 'Month']):
        filename = f"recovery_{month}_{year}.csv"
        filepath = os.path.join(output_dir, filename)

        # First write to a file in this run truncates it and emits the header;
        # later writes append. Always appending would duplicate every row (and
        # insert a second header line) whenever the cell is re-run.
        is_first_write = filename not in written_files
        group.to_csv(
            filepath,
            mode='w' if is_first_write else 'a',
            header=is_first_write,
            index=False,
        )
        written_files.add(filename)

        print(f"Appended {len(group):,} rows to {filename}")


Appended 500,000 rows to recovery_june_2021.csv
Appended 500,000 rows to recovery_june_2021.csv
Appended 500,000 rows to recovery_june_2021.csv
Appended 500,000 rows to recovery_june_2021.csv
Appended 500,000 rows to recovery_june_2021.csv
Appended 500,000 rows to recovery_june_2021.csv
Appended 500,000 rows to recovery_june_2021.csv
Appended 500,000 rows to recovery_june_2021.csv
Appended 500,000 rows to recovery_june_2021.csv
Appended 270,064 rows to recovery_july_2021.csv
Appended 229,936 rows to recovery_june_2021.csv
Appended 500,000 rows to recovery_july_2021.csv
Appended 500,000 rows to recovery_july_2021.csv
Appended 500,000 rows to recovery_july_2021.csv
Appended 500,000 rows to recovery_july_2021.csv
Appended 500,000 rows to recovery_july_2021.csv
Appended 500,000 rows to recovery_july_2021.csv
Appended 500,000 rows to recovery_july_2021.csv
Appended 500,000 rows to recovery_july_2021.csv
Appended 500,000 rows to recovery_july_2021.csv
Appended 419,498 rows to recovery_august

In [None]:
import os
import pandas as pd

# Split the post-COVID period CSV into one CSV per calendar month.
# File path and output directory
file_path = r"C:\Users\athen\Documents\GitHub\TCD_Dissertation\archive\periods\postcovid\post_covid.csv"
output_dir = r"C:\Users\athen\Documents\GitHub\TCD_Dissertation\archive\periods\postcovid\months"
os.makedirs(output_dir, exist_ok=True)

chunksize = 500_000
# Monthly files already started during THIS run of the cell.
written_files = set()

# Process and save each chunk individually so the full period never has to fit in memory.
for chunk in pd.read_csv(file_path, chunksize=chunksize, parse_dates=["DateTime", "ServiceDay"]):
    # Helper columns used for grouping; they are also written to the output files.
    chunk['Month'] = chunk['ServiceDay'].dt.strftime('%B').str.lower()
    chunk['Year'] = chunk['ServiceDay'].dt.year

    # Rows whose ServiceDay is missing get no (Year, Month) key and are skipped
    # by groupby's default NaN handling.
    for (year, month), group in chunk.groupby(['Year', 'Month']):
        filename = f"postcovid_{month}_{year}.csv"
        filepath = os.path.join(output_dir, filename)

        # First write to a file in this run truncates it and emits the header;
        # later writes append. Always appending would duplicate every row (and
        # insert a second header line) whenever the cell is re-run.
        is_first_write = filename not in written_files
        group.to_csv(
            filepath,
            mode='w' if is_first_write else 'a',
            header=is_first_write,
            index=False,
        )
        written_files.add(filename)

        print(f"Appended {len(group):,} rows to {filename}")


Appended 500,000 rows to postcovid_january_2022.csv
Appended 500,000 rows to postcovid_january_2022.csv
Appended 48,765 rows to postcovid_february_2022.csv
Appended 451,235 rows to postcovid_january_2022.csv
Appended 500,000 rows to postcovid_february_2022.csv
Appended 500,000 rows to postcovid_february_2022.csv
Appended 500,000 rows to postcovid_february_2022.csv
Appended 500,000 rows to postcovid_february_2022.csv
Appended 500,000 rows to postcovid_february_2022.csv
Appended 500,000 rows to postcovid_february_2022.csv
Appended 500,000 rows to postcovid_february_2022.csv
Appended 500,000 rows to postcovid_february_2022.csv
Appended 280,967 rows to postcovid_february_2022.csv
Appended 219,033 rows to postcovid_march_2022.csv
Appended 500,000 rows to postcovid_march_2022.csv
Appended 500,000 rows to postcovid_march_2022.csv
Appended 500,000 rows to postcovid_march_2022.csv
Appended 500,000 rows to postcovid_march_2022.csv
Appended 500,000 rows to postcovid_march_2022.csv
Appended 500,00