In [1]:
import ScrapeEmail as sel
import pandas as pd

# Pulling Line 1 data (HTN → GBJ)

In [3]:
def threshold_merge(new_df, old_df, cutoff_date):
    """Merge freshly pulled rows with previously stored rows around a cutoff date.

    Stored rows dated strictly before ``cutoff_date`` are kept as they are;
    from ``cutoff_date`` onwards only the rows of ``new_df`` are used.

    Parameters
    ----------
    new_df : pandas.DataFrame
        Newly pulled records. Needs a ``'Date'`` column unless it is empty.
    old_df : pandas.DataFrame
        Previously stored records. Needs a ``'Date'`` column unless it is empty.
    cutoff_date : str or datetime-like
        Any value accepted by ``pd.to_datetime``.

    Returns
    -------
    pandas.DataFrame
        If either input is empty, an unmodified copy of the other one.
        Otherwise the combined rows with ``'Date'`` converted to datetime,
        sorted newest first, with a fresh 0..n-1 index.
    """
    if old_df.empty:
        return new_df.copy()

    # An empty pull must not wipe the stored rows on/after the cutoff (and an
    # empty frame may not even have a 'Date' column), so keep the old data.
    if new_df.empty:
        return old_df.copy()

    # Parse the cutoff once instead of once per comparison.
    cutoff = pd.to_datetime(cutoff_date)

    # Keep old data before cutoff unchanged
    old_keep = old_df[pd.to_datetime(old_df['Date']) < cutoff]

    # Only use new data on or after cutoff
    new_after_cutoff = new_df[pd.to_datetime(new_df['Date']) >= cutoff]

    # Combine and sort, newest first
    combined = pd.concat([new_after_cutoff, old_keep], ignore_index=True)
    combined['Date'] = pd.to_datetime(combined['Date'])

    return combined.sort_values('Date', ascending=False).reset_index(drop=True)

def keep_latest_cycle_per_year(df, date_col="Date", cycle_col="Cycle"):
    """Keep only the latest record for each cycle within each calendar year.

    Also adds a ``"Gas Transit Days"`` column computed as
    ``"Gas Days" + "Gas Hours" / 24``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``"Gas Days"``, ``"Gas Hours"``, ``date_col`` and
        ``cycle_col``. The caller's frame is not modified.
    date_col : str
        Name of the date column; values are parsed with ``pd.to_datetime``.
    cycle_col : str
        Name of the cycle column.

    Returns
    -------
    pandas.DataFrame
        One row per (year, cycle) pair, sorted by ``date_col`` ascending with
        a fresh 0..n-1 index. Unparseable dates are coerced to ``NaT``; with
        groupby's default handling of missing keys those rows are left out.
    """
    # idxmax() returns index *labels*, so labels must be unique for the .loc
    # lookup below to pick exactly one row per group. reset_index also hands
    # back a new frame, which keeps the caller's frame untouched.
    df = df.reset_index(drop=True)
    df["Gas Transit Days"] = df["Gas Days"] + df["Gas Hours"] / 24
    df[date_col] = pd.to_datetime(df[date_col], errors="coerce")
    # Group by year and cycle, then find the index of the latest date for each group
    idx = df.groupby([df[date_col].dt.year, df[cycle_col]])[date_col].idxmax()
    return df.loc[idx].sort_values(date_col).reset_index(drop=True)

# New HTN -> GBJ rows come from the ScrapeEmail helper (its internals are not
# visible in this notebook).
line1_df_new = sel.extract_colonial_transit_times(
    from_location='HTN',
    to_location='GBJ',
    target_subject='T4 Bulletin: Colonial - TRANSIT ',
)
# Previously stored rows for this route live on the 'line1' sheet.
line1_df_old = pd.read_excel('colonial_transit_time.xlsx', sheet_name='line1')
# Merge around the cutoff, then keep the latest row per (year, cycle).
line1_df = keep_latest_cycle_per_year(
    df=threshold_merge(line1_df_new, line1_df_old, cutoff_date='2025-08-01'),
    date_col='Date',
    cycle_col='Cycle',
)
line1_df

Unnamed: 0,Date,From,To,Cycle,Gas Days,Gas Hours,Distillates Days,Distillates Hours,Gas Transit Days
0,2024-08-22,HTN,GBJ,46,8,7,7.0,7.0,8.291667
1,2024-08-26,HTN,GBJ,47,7,6,7.0,10.0,7.250000
2,2024-08-29,HTN,GBJ,48,8,18,9.0,10.0,8.750000
3,2024-09-04,HTN,GBJ,49,7,6,9.0,10.0,7.250000
4,2024-09-08,HTN,GBJ,50,8,23,9.0,18.0,8.958333
...,...,...,...,...,...,...,...,...,...
73,2025-08-17,HTN,GBJ,46,9,6,9,14,9.250000
74,2025-08-24,HTN,GBJ,47,8,20,7,8,8.833333
75,2025-08-27,HTN,GBJ,48,8,8,6,22,8.333333
76,2025-09-01,HTN,GBJ,49,9,6,10,8,9.250000


# Pulling Line 3 data (GBJ → LNJ)

In [17]:
# New GBJ -> LNJ rows come from the ScrapeEmail helper (its internals are not
# visible in this notebook).
line3_df_new = sel.extract_colonial_transit_times(
    from_location='GBJ',
    to_location='LNJ',
    target_subject='T4 Bulletin: Colonial - TRANSIT ',
)
# Previously stored rows for this route live on the 'line3' sheet.
line3_df_old = pd.read_excel('colonial_transit_time.xlsx', sheet_name='line3')
# Merge around the cutoff, then keep the latest row per (year, cycle).
line3_df = keep_latest_cycle_per_year(
    df=threshold_merge(line3_df_new, line3_df_old, cutoff_date='2025-08-01'),
    date_col='Date',
    cycle_col='Cycle',
)
line3_df


Unnamed: 0,Date,From,To,Cycle,Gas Days,Gas Hours,Distillates Days,Distillates Hours,Gas Transit Days
0,2025-01-15,GBJ,HTN,3,7.0,17.0,3.0,11.0,7.708333
1,2025-01-21,GBJ,HTN,4,8.0,6.0,3.0,4.0,8.25
2,2025-01-26,GBJ,HTN,5,8.0,5.0,,,8.208333
3,2025-02-02,GBJ,HTN,6,7.0,3.0,3.0,5.0,7.125
4,2025-02-05,GBJ,HTN,7,5.0,8.0,2.0,18.0,5.333333
5,2025-02-10,GBJ,HTN,8,3.0,5.0,3.0,11.0,3.208333
6,2025-02-16,GBJ,HTN,9,3.0,12.0,2.0,23.0,3.5
7,2025-02-18,GBJ,HTN,10,2.0,9.0,2.0,17.0,2.375
8,2025-02-23,GBJ,HTN,11,3.0,8.0,3.0,19.0,3.333333
9,2025-02-28,GBJ,HTN,12,4.0,23.0,1.0,22.0,4.958333


# Pulling combined Line 1 & 3 data (HTN → LNJ)

In [15]:
# New HTN -> LNJ rows come from the ScrapeEmail helper (its internals are not
# visible in this notebook).
line13_df_new = sel.extract_colonial_transit_times(
    from_location='HTN',
    to_location='LNJ',
    target_subject='T4 Bulletin: Colonial - TRANSIT ',
)
# Previously stored rows for this route live on the 'line13' sheet.
line13_df_old = pd.read_excel('colonial_transit_time.xlsx', sheet_name='line13')
# Merge around the cutoff, then keep the latest row per (year, cycle).
line13_df = keep_latest_cycle_per_year(
    df=threshold_merge(line13_df_new, line13_df_old, cutoff_date='2025-08-01'),
    date_col='Date',
    cycle_col='Cycle',
)
line13_df


Unnamed: 0,Date,From,To,Cycle,Gas Days,Gas Hours,Distillates Days,Distillates Hours,Gas Transit Days
0,2020-07-01,HTN,LNJ,38,18,8,,,18.333333
1,2020-07-01,HTN,LNJ,39,18,5,,,18.208333
2,2020-07-01,HTN,LNJ,40,17,5,,,17.208333
3,2020-07-01,HTN,LNJ,41,17,22,,,17.916667
4,2020-07-01,HTN,LNJ,42,20,5,,,20.208333
...,...,...,...,...,...,...,...,...,...
368,2025-08-17,HTN,LNJ,46,14,11,13.0,17.0,14.458333
369,2025-08-24,HTN,LNJ,47,13,18,10.0,19.0,13.750000
370,2025-08-27,HTN,LNJ,48,13,13,7.0,20.0,13.541667
371,2025-09-01,HTN,LNJ,49,15,10,11.0,5.0,15.416667


In [18]:
# Resolve every frame BEFORE opening the workbook. The file is rewritten from
# scratch and the context manager closes (and so saves) it even when the body
# raises, so a NameError from a cell that was never run would otherwise leave
# a partial workbook in place of the stored history.
sheets_to_write = {
    'line1': line1_df,
    'line3': line3_df,
    'line13': line13_df,
}

# One sheet per route; index=False keeps the positional index out of the file.
with pd.ExcelWriter('colonial_transit_time.xlsx') as writer:
    for sheet_name, sheet_df in sheets_to_write.items():
        sheet_df.to_excel(writer, sheet_name=sheet_name, index=False)