In [4]:
import ScrapeEmail as sel
import pandas as pd

# Pulling Line 1 data

In [13]:
def threshold_merge(new_df, old_df, cutoff_date):
    """Merge new data with old data. Only update records after cutoff_date"""
    if old_df.empty:
        return new_df.copy()
    
    # Keep old data before cutoff unchanged
    old_keep = old_df[pd.to_datetime(old_df['Date']) < pd.to_datetime(cutoff_date)]
    
    # Only use new data after cutoff
    new_after_cutoff = new_df[pd.to_datetime(new_df['Date']) >= pd.to_datetime(cutoff_date)]
    
    # Combine and sort
    combined = pd.concat([new_after_cutoff, old_keep], ignore_index=True)
    combined['Date'] = pd.to_datetime(combined['Date'])
    
    return combined.sort_values('Date', ascending=False).reset_index(drop=True)

def keep_latest_cycle_per_year(df, date_col="Date", cycle_col="Cycle"):
    """Keep only the latest record for each cycle within each year"""
    df = df.copy()
    df["Gas Transit Days"] = df["Gas Days"] + df["Gas Hours"] / 24
    df[date_col] = pd.to_datetime(df[date_col], errors="coerce")
    # Group by year and cycle, then find the index of the latest date for each group
    idx = df.groupby([df[date_col].dt.year, df[cycle_col]])[date_col].idxmax()
    return df.loc[idx].sort_values(date_col).reset_index(drop=True)

line1_df_new = sel.extract_colonial_transit_times(from_location='HTN', to_location='GBJ', target_subject='T4 Bulletin: Colonial - TRANSIT ')
line1_df_old = pd.read_excel('colonial_transit_time.xlsx', sheet_name='line1')
line1_df = threshold_merge(line1_df_new, line1_df_old, cutoff_date='2025-08-01')
line1_df = keep_latest_cycle_per_year(df=line1_df, date_col='Date', cycle_col = 'Cycle')


# Pulling Line 3 data

In [15]:
line3_df_new = sel.extract_colonial_transit_times(from_location='GBJ', to_location='LNJ', target_subject='T4 Bulletin: Colonial - TRANSIT ')
line3_df_old = pd.read_excel('colonial_transit_time.xlsx', sheet_name='line3')
line3_df = threshold_merge(line1_df_new, line1_df_old, cutoff_date='2025-08-01')
line3_df = keep_latest_cycle_per_year(df=line3_df, date_col='Date', cycle_col = 'Cycle')
line3_df


Unnamed: 0,Date,From,To,Cycle,Gas Days,Gas Hours,Distillates Days,Distillates Hours,Gas Transit Days
0,2024-08-22,HTN,GBJ,46,8,7,7.0,7.0,8.291667
1,2024-08-26,HTN,GBJ,47,7,6,7.0,10.0,7.250000
2,2024-08-29,HTN,GBJ,48,8,18,9.0,10.0,8.750000
3,2024-09-04,HTN,GBJ,49,7,6,9.0,10.0,7.250000
4,2024-09-08,HTN,GBJ,50,8,23,9.0,18.0,8.958333
...,...,...,...,...,...,...,...,...,...
72,2025-08-10,HTN,GBJ,45,9,3,10,7,9.125000
73,2025-08-17,HTN,GBJ,46,9,6,9,14,9.250000
74,2025-08-24,HTN,GBJ,47,8,20,7,8,8.833333
75,2025-08-26,HTN,GBJ,48,8,3,9,1,8.125000


# Pulling Line 1 & 3 data

In [22]:
line13_df_new = sel.extract_colonial_transit_times(from_location='HTN', to_location='LNJ', target_subject='T4 Bulletin: Colonial - TRANSIT ')
line13_df_new = keep_latest_cycle_per_year(df=line13_df_new, date_col='Date', cycle_col = 'Cycle')
line13_df_old = pd.read_excel('colonial_transit_time.xlsx', sheet_name='line13')

# line13_df_old = pd.read_excel('colonial_transit_time.xlsx', sheet_name='line13')


In [30]:
line13_df_old = pd.read_excel('colonial_transit_time.xlsx', sheet_name='line13')
line13_df_old

Unnamed: 0,Date,From,To,Cycle,Gas Days,Gas Hours,Distillates Hours,Distillates Hours.1,Gas Transit Days,year
0,NaT,HTN,LNJ,38,,,,,18.317845,2020
1,NaT,HTN,LNJ,39,,,,,18.194784,2020
2,NaT,HTN,LNJ,40,,,,,17.215023,2020
3,NaT,HTN,LNJ,41,,,,,17.929748,2020
4,NaT,HTN,LNJ,42,,,,,20.228789,2020
...,...,...,...,...,...,...,...,...,...,...
365,2025-08-03,HTN,LNJ,43,13.0,8.0,11.0,5.0,13.333333,2025
366,2025-08-06,HTN,LNJ,44,13.0,5.0,11.0,3.0,13.208333,2025
367,2025-08-10,HTN,LNJ,45,14.0,2.0,14.0,9.0,14.083333,2025
368,2025-08-17,HTN,LNJ,46,14.0,11.0,13.0,17.0,14.458333,2025


In [33]:
def fill_missing_dates_and_gas_data(df):
    """Fill missing dates based on cycle/year and backfill gas days/hours"""
    
    df = df.copy()
    
    # Fill missing dates based on cycle and year
    # 72 cycles per year = 6 cycles per month
    # Calculate month from cycle: month = ceil(cycle / 6)
    df['month'] = ((df['Cycle'] - 1) // 6) + 1
    df['month'] = df['month'].clip(1, 12)  # Ensure month is 1-12
    
    # Create date for missing Date values (NaT)
    mask_missing_date = df['Date'].isna()
    df.loc[mask_missing_date, 'Date'] = pd.to_datetime(
        df.loc[mask_missing_date, 'year'].astype(str) + '-' + 
        df.loc[mask_missing_date, 'month'].astype(str) + '-01'
    )
    
    # Backfill Gas Days and Gas Hours using Gas Transit Days
    # Gas Transit Days = Gas Days + (Gas Hours / 24)
    # So: Gas Days = floor(Gas Transit Days), Gas Hours = (Gas Transit Days - Gas Days) * 24
    
    mask_missing_gas = df['Gas Days'].isna() | df['Gas Hours'].isna()
    
    if mask_missing_gas.any():
        df.loc[mask_missing_gas, 'Gas Days'] = df.loc[mask_missing_gas, 'Gas Transit Days'].apply(
            lambda x: int(x) if pd.notna(x) else pd.NA
        )
        df.loc[mask_missing_gas, 'Gas Hours'] = df.loc[mask_missing_gas].apply(
            lambda row: (row['Gas Transit Days'] - int(row['Gas Transit Days'])) * 24 
            if pd.notna(row['Gas Transit Days']) else pd.NA, axis=1
        )
    
    # Drop the temporary month column
    df = df.drop('month', axis=1)
    
    return df
line13_df_old_v2 = fill_missing_dates_and_gas_data(line13_df_old)
line13_df_old_v2

Unnamed: 0,Date,From,To,Cycle,Gas Days,Gas Hours,Distillates Hours,Distillates Hours.1,Gas Transit Days,year
0,2020-07-01,HTN,LNJ,38,18.0,7.628272,,,18.317845,2020
1,2020-07-01,HTN,LNJ,39,18.0,4.674825,,,18.194784,2020
2,2020-07-01,HTN,LNJ,40,17.0,5.160556,,,17.215023,2020
3,2020-07-01,HTN,LNJ,41,17.0,22.313956,,,17.929748,2020
4,2020-07-01,HTN,LNJ,42,20.0,5.490937,,,20.228789,2020
...,...,...,...,...,...,...,...,...,...,...
365,2025-08-03,HTN,LNJ,43,13.0,8.000000,11.0,5.0,13.333333,2025
366,2025-08-06,HTN,LNJ,44,13.0,5.000000,11.0,3.0,13.208333,2025
367,2025-08-10,HTN,LNJ,45,14.0,2.000000,14.0,9.0,14.083333,2025
368,2025-08-17,HTN,LNJ,46,14.0,11.000000,13.0,17.0,14.458333,2025


In [35]:
with pd.ExcelWriter('filename.xlsx') as writer:
   df1.to_excel(writer, sheet_name='Sheet1', index=False)
   df2.to_excel(writer, sheet_name='Sheet2', index=False)
   df3.to_excel(writer, sheet_name='Sheet3', index=False)