## Scrape Inbox

## Scrape Individual Email with Forwarded Emails

In [5]:
import re, warnings
import pandas as pd
import win32com.client
from io import StringIO
from datetime import timedelta
from dateutil import parser
from bs4 import BeautifulSoup

# Text that a message's Subject must contain for extract_colonial_data() to process it.
TARGET_SUBJ = "T4 Bulletin: Colonial - TRANSIT TIMES sent to sto_susan"

def _soup(html):
    """Parse ``html`` into a BeautifulSoup tree.

    The "lxml" parser is tried first; if constructing the tree with it fails,
    the "html.parser" backend is used instead.
    """
    try:
        return BeautifulSoup(html, "lxml")
    except Exception:
        # Best-effort fallback. ``Exception`` (rather than a bare ``except``)
        # keeps KeyboardInterrupt/SystemExit from being swallowed here.
        return BeautifulSoup(html, "html.parser")

def _date_from_text(text, fallback):
    """Return the date found in ``text`` shifted back by one day.

    When ``text`` contains "Date:", only the remainder of that line is parsed;
    otherwise the whole text is handed to the fuzzy date parser. If anything
    goes wrong, ``fallback`` (also shifted back one day) is returned instead.
    """
    one_day = timedelta(days=1)
    try:
        _, marker, after_marker = text.partition("Date:")
        # With a "Date:" marker, parse just the rest of that line.
        candidate = after_marker.partition("\n")[0] if marker else text
        return parser.parse(candidate, fuzzy=True).date() - one_day
    except Exception:
        return fallback - one_day

def _promote_header(df: pd.DataFrame, max_scan_rows: int = 8) -> pd.DataFrame:
    """Promote the first row containing both "From" and "To" to column headers.

    Only the first ``max_scan_rows`` rows are inspected. Cell text is compared
    after stripping whitespace and lower-casing.

    Args:
        df: Table as parsed from HTML. It is never modified.
        max_scan_rows: How many leading rows to search for the header row.

    Returns:
        A new frame holding the rows below the header row, re-indexed from 0
        and labelled with the stripped header text. If no header row is found,
        ``df`` itself is returned unchanged.
    """
    for pos in range(min(max_scan_rows, len(df))):
        labels = [str(value).strip() for value in df.iloc[pos].tolist()]
        lowered = {label.lower() for label in labels}
        if "from" in lowered and "to" in lowered:
            # reset_index() yields a new frame, so relabelling it leaves the
            # caller's frame (and its column labels) untouched.
            body = df.iloc[pos + 1:].reset_index(drop=True)
            body.columns = labels
            return body
    return df

def _norm_code(s):
    """Upper-case ``str(s)`` and keep only the ASCII letters A-Z."""
    upper = str(s).upper()
    return "".join(ch for ch in upper if "A" <= ch <= "Z")

def _first_four_numbers_right_of(df, row_idx, start_col_idx):
    """Return the first four whole numbers found to the right of a column.

    Cells of row ``row_idx`` are read from column ``start_col_idx + 1`` to the
    end of the row. Cells that are not numeric, or are numeric but not whole,
    are skipped. The result always has four entries (used by the caller as
    gas days, gas hours, distillates days, distillates hours); missing ones
    are filled with ``pd.NA``.
    """
    wanted = 4

    def _whole_number(cell):
        # Returns the cell as an int, or None when it is not a whole number.
        try:
            parsed = pd.to_numeric(str(cell).strip(), errors="coerce")
        except Exception:
            return None
        if pd.isna(parsed) or not float(parsed).is_integer():
            return None
        return int(parsed)

    picked = []
    for cell in df.iloc[row_idx, start_col_idx + 1:].tolist():
        whole = _whole_number(cell)
        if whole is None:
            continue
        picked.append(whole)
        if len(picked) == wanted:
            break

    picked.extend([pd.NA] * (wanted - len(picked)))
    return picked

def extract_colonial_data(from_code="HTN", to_code="GBJ"):
    """Collect transit-time rows for one route from matching Outlook emails.

    Every message in the default Inbox whose subject contains ``TARGET_SUBJ``
    and that has an HTML body is parsed. Each HTML table in it is read with
    ``pd.read_html``, its header row is promoted, and every row whose
    normalized From/To codes equal the requested route becomes one record.

    Args:
        from_code: Origin code to match; normalized with ``_norm_code``.
        to_code: Destination code to match; normalized with ``_norm_code``.

    Returns:
        DataFrame with columns Date, From, To, Cycle, Gas Days, Gas Hours,
        Distillates Days, Distillates Hours, sorted by Date descending.
        When nothing matched, an empty frame with those columns.
    """
    columns = ["Date", "From", "To", "Cycle", "Gas Days", "Gas Hours",
               "Distillates Days", "Distillates Hours"]
    from_code = _norm_code(from_code)
    to_code = _norm_code(to_code)

    ns = win32com.client.Dispatch("Outlook.Application").GetNamespace("MAPI")
    inbox = ns.GetDefaultFolder(6)  # 6 = Inbox in Outlook's default-folder enum
    items = inbox.Items
    items.Sort("[ReceivedTime]", True)  # newest first

    out_rows = []

    for msg in items:
        subj = getattr(msg, "Subject", "") or ""
        if TARGET_SUBJ not in subj:
            continue

        html = getattr(msg, "HTMLBody", "") or ""
        if not html:
            continue

        soup = _soup(html)
        text = soup.get_text("\n", strip=True)
        day = _date_from_text(text, msg.ReceivedTime.date())

        for t in soup.find_all("table"):
            with warnings.catch_warnings():
                warnings.simplefilter("ignore", FutureWarning)
                try:
                    dfs = pd.read_html(StringIO(str(t)))
                except Exception:
                    # Best effort: a table pandas cannot parse is skipped.
                    continue

            for df in dfs:
                if df.empty:
                    continue

                df = _promote_header(df)

                # Locate From/To (exact name) and Cycle (name contains "cycle").
                low = [str(c).strip().lower() for c in df.columns]
                i_from = next((i for i, c in enumerate(low) if c == "from"), None)
                i_to = next((i for i, c in enumerate(low) if c == "to"), None)
                i_cycle = next((i for i, c in enumerate(low) if "cycle" in c), None)
                if i_from is None or i_to is None:
                    continue

                # Normalized codes are kept in local Series instead of being
                # written back as df["From"]/df["To"], so the parsed table is
                # not altered and duplicate column labels cannot interfere.
                from_codes = df.iloc[:, i_from].map(_norm_code)
                to_codes = df.iloc[:, i_to].map(_norm_code)

                # Numbers are scanned to the right of Cycle when present,
                # otherwise to the right of the later of From/To.
                start_idx = i_cycle if i_cycle is not None else max(i_from, i_to)

                # Matching rows are addressed by position because the helpers
                # below index with iloc/iat.
                mask = ((from_codes == from_code) & (to_codes == to_code)).to_numpy()
                for ridx in map(int, mask.nonzero()[0]):
                    gd, gh, dd, dh = _first_four_numbers_right_of(df, ridx, start_idx)

                    # Cycle becomes an int when the cell is a whole number.
                    cyc = pd.NA
                    if i_cycle is not None:
                        try:
                            cyc = pd.to_numeric(df.iat[ridx, i_cycle], errors="coerce")
                            if pd.notna(cyc) and float(cyc).is_integer():
                                cyc = int(cyc)
                        except Exception:
                            pass

                    out_rows.append({
                        "Date": day,
                        "From": from_code,
                        "To": to_code,
                        "Cycle": cyc,
                        "Gas Days": gd, "Gas Hours": gh,
                        "Distillates Days": dd, "Distillates Hours": dh
                    })

    if not out_rows:
        return pd.DataFrame(columns=columns)

    out = (pd.DataFrame(out_rows)
             .sort_values("Date", ascending=False)
             .reset_index(drop=True))
    return out

if __name__ == "__main__":
    # Run the extraction, then show the first rows followed by the row count.
    df = extract_colonial_data()
    # Optional: uncomment to save the result to an Excel workbook.
    # df.to_excel("ColonialTransitTimes_Line3.xlsx", index=False)
    print(df.head(), f"\n[rows: {len(df)}]")


         Date From   To  Cycle  Gas Days  Gas Hours Distillates Days  \
0  2025-08-26  HTN  GBJ     48         8          3                9   
1  2025-08-26  HTN  GBJ     49         8         20               10   
2  2025-08-25  HTN  GBJ     48         8          1                9   
3  2025-08-24  HTN  GBJ     47         8         20                7   
4  2025-08-24  HTN  GBJ     48         8          1               10   

  Distillates Hours  
0                 1  
1                12  
2                13  
3                 8  
4                21   
[rows: 354]


In [1]:
#!/usr/bin/env python3
"""
Colonial Pipeline Transit Times Data Extractor

Extracts transit time data from Outlook emails containing Colonial Pipeline bulletins.
Supports configurable From/To location parameters.
"""

import re
import warnings
import pandas as pd
import win32com.client
from io import StringIO
from datetime import timedelta
from dateutil import parser
from bs4 import BeautifulSoup
from typing import Optional, Tuple, List, Dict, Any


class ColonialTransitExtractor:
    """Extracts Colonial Pipeline transit time data from Outlook emails.

    Messages in the default Outlook Inbox whose subject contains
    ``target_subject`` are scanned; every HTML table in their body is parsed
    and rows matching a requested From/To route are collected.
    """

    # Columns of the frame returned by extract_transit_data(), in order.
    OUTPUT_COLUMNS = (
        "Date", "From", "To", "Cycle", "Gas Days", "Gas Hours",
        "Distillates Days", "Distillates Hours",
    )
    # Only this many leading rows of a table are searched for the header row.
    HEADER_SCAN_ROWS = 8
    # Values collected per row: gas days/hours and distillates days/hours.
    VALUES_PER_ROW = 4
    # Identifier passed to GetDefaultFolder() to obtain the Inbox.
    INBOX_FOLDER_ID = 6

    def __init__(self, target_subject: str = "T4 Bulletin: Colonial - TRANSIT TIMES sent to sto_susan"):
        """
        Initialize the extractor.

        Args:
            target_subject: Text a message subject must contain to be processed
        """
        self.target_subject = target_subject

    def _create_soup(self, html: str) -> "BeautifulSoup":
        """Parse ``html``, preferring the lxml parser and falling back to html.parser."""
        try:
            return BeautifulSoup(html, "lxml")
        except Exception:
            # Best-effort fallback; ``Exception`` instead of a bare ``except``
            # so KeyboardInterrupt/SystemExit are not swallowed.
            return BeautifulSoup(html, "html.parser")

    def _extract_date_from_text(self, text: str, fallback_date) -> Any:
        """
        Return the date found in ``text`` shifted back by one day.

        When ``text`` contains "Date:", only the rest of that line is parsed;
        otherwise the whole text goes to the fuzzy date parser. On any
        failure, ``fallback_date`` minus one day is returned.
        """
        one_day = timedelta(days=1)
        try:
            if "Date:" in text:
                date_part = text.split("Date:", 1)[1].split("\n", 1)[0]
                return parser.parse(date_part, fuzzy=True).date() - one_day
            return parser.parse(text, fuzzy=True).date() - one_day
        except Exception:
            return fallback_date - one_day

    def _promote_header_row(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Promote the first row containing 'From' and 'To' to column headers.

        Only the first ``HEADER_SCAN_ROWS`` rows are inspected. The input
        frame is never modified.

        Returns:
            A new frame with the rows below the header row (re-indexed from 0)
            and the stripped header text as column labels, or ``df`` itself
            when no header row is found.
        """
        for pos in range(min(self.HEADER_SCAN_ROWS, len(df))):
            labels = [str(v).strip() for v in df.iloc[pos].tolist()]
            lowered = {label.lower() for label in labels}
            if "from" in lowered and "to" in lowered:
                # reset_index() returns a new frame, so relabelling it does
                # not touch the caller's frame.
                body = df.iloc[pos + 1:].reset_index(drop=True)
                body.columns = labels
                return body
        return df

    def _normalize_location_code(self, code: str) -> str:
        """Upper-case ``str(code)`` and drop every character outside A-Z."""
        return re.sub(r"[^A-Z]", "", str(code).upper())

    def _extract_first_four_numbers(self, df: pd.DataFrame, row_idx: int, start_col_idx: int) -> List[Any]:
        """
        Extract the first four whole numbers from a row, starting at start_col_idx+1.

        Cells that are not numeric, or numeric but not whole, are skipped.

        Returns:
            List of four values: [gas_days, gas_hours, distillates_days,
            distillates_hours], padded with ``pd.NA`` when fewer are found.
        """
        numbers: List[Any] = []
        for value in df.iloc[row_idx, start_col_idx + 1:].tolist():
            try:
                numeric_val = pd.to_numeric(str(value).strip(), errors="coerce")
            except Exception:
                numeric_val = pd.NA

            if pd.notna(numeric_val) and float(numeric_val).is_integer():
                numbers.append(int(numeric_val))

            if len(numbers) >= self.VALUES_PER_ROW:
                break

        # Pad with NA so callers can always unpack a fixed number of values.
        numbers.extend([pd.NA] * (self.VALUES_PER_ROW - len(numbers)))
        return numbers

    def _find_column_indices(self, df: pd.DataFrame) -> Tuple[Optional[int], Optional[int], Optional[int]]:
        """
        Find the positions of the From, To, and Cycle columns.

        From/To must match exactly (case-insensitive, stripped); Cycle is the
        first column whose name contains "cycle". Missing columns yield None.
        """
        lowercase_cols = [str(c).strip().lower() for c in df.columns]

        def first_index(predicate) -> Optional[int]:
            # next() with a default never raises StopIteration.
            return next((i for i, c in enumerate(lowercase_cols) if predicate(c)), None)

        from_idx = first_index(lambda c: c == "from")
        to_idx = first_index(lambda c: c == "to")
        cycle_idx = first_index(lambda c: "cycle" in c)
        return from_idx, to_idx, cycle_idx

    def _extract_cycle_value(self, df: pd.DataFrame, row_idx: int, cycle_col_idx: Optional[int]) -> Any:
        """Return the cycle cell as an int, or ``pd.NA`` when absent or not a whole number."""
        if cycle_col_idx is None:
            return pd.NA

        try:
            cycle_val = pd.to_numeric(df.iat[row_idx, cycle_col_idx], errors="coerce")
            if pd.notna(cycle_val) and float(cycle_val).is_integer():
                return int(cycle_val)
        except Exception:
            # Best effort: an unreadable cell is reported as missing.
            pass

        return pd.NA

    def _read_table_frames(self, table: Any) -> List[pd.DataFrame]:
        """Parse one HTML table element into DataFrames; return [] if pandas cannot read it."""
        with warnings.catch_warnings():
            warnings.simplefilter("ignore", FutureWarning)
            try:
                return pd.read_html(StringIO(str(table)))
            except Exception:
                # Best effort: tables pandas cannot parse are skipped.
                return []

    def _extract_route_rows(self, df: pd.DataFrame, from_code: str, to_code: str,
                            email_date: Any) -> List[Dict[str, Any]]:
        """
        Build output records for the rows of ``df`` that match a route.

        Args:
            df: One parsed table (header row not yet promoted).
            from_code: Normalized origin code to match.
            to_code: Normalized destination code to match.
            email_date: Value stored in the "Date" field of every record.

        Returns:
            One dict per matching row, keyed by ``OUTPUT_COLUMNS``; an empty
            list when the table is empty or has no From/To columns.
        """
        if df.empty:
            return []

        df = self._promote_header_row(df)
        from_idx, to_idx, cycle_idx = self._find_column_indices(df)
        if from_idx is None or to_idx is None:
            return []

        # Normalized codes stay in local Series rather than being written
        # back into the frame, so the parsed table is left untouched and
        # duplicate column labels cannot interfere with the comparison.
        from_codes = df.iloc[:, from_idx].map(self._normalize_location_code)
        to_codes = df.iloc[:, to_idx].map(self._normalize_location_code)

        # Numbers are read to the right of Cycle when present, otherwise to
        # the right of the later of From/To.
        start_col_idx = cycle_idx if cycle_idx is not None else max(from_idx, to_idx)

        # Rows are addressed by position because the helpers use iloc/iat.
        route_mask = ((from_codes == from_code) & (to_codes == to_code)).to_numpy()

        records: List[Dict[str, Any]] = []
        for row_idx in map(int, route_mask.nonzero()[0]):
            gas_days, gas_hours, dist_days, dist_hours = self._extract_first_four_numbers(
                df, row_idx, start_col_idx
            )
            records.append({
                "Date": email_date,
                "From": from_code,
                "To": to_code,
                "Cycle": self._extract_cycle_value(df, row_idx, cycle_idx),
                "Gas Days": gas_days,
                "Gas Hours": gas_hours,
                "Distillates Days": dist_days,
                "Distillates Hours": dist_hours,
            })
        return records

    def extract_transit_data(self, from_location: str = "HTN", to_location: str = "GBJ") -> pd.DataFrame:
        """
        Extract Colonial Pipeline transit time data from Outlook emails.

        Args:
            from_location: Source location code (default: "HTN")
            to_location: Destination location code (default: "GBJ")

        Returns:
            DataFrame with columns: Date, From, To, Cycle, Gas Days, Gas Hours,
                                  Distillates Days, Distillates Hours,
            sorted by Date descending (empty with those columns if no match).

        Raises:
            RuntimeError: If the Outlook inbox cannot be opened; the original
                error is attached as the cause.
        """
        from_code = self._normalize_location_code(from_location)
        to_code = self._normalize_location_code(to_location)

        try:
            outlook = win32com.client.Dispatch("Outlook.Application")
            namespace = outlook.GetNamespace("MAPI")
            inbox = namespace.GetDefaultFolder(self.INBOX_FOLDER_ID)
            items = inbox.Items
            items.Sort("[ReceivedTime]", True)  # Sort by received time, descending
        except Exception as e:
            raise RuntimeError(f"Failed to connect to Outlook: {e}") from e

        extracted_data: List[Dict[str, Any]] = []

        for message in items:
            subject = getattr(message, "Subject", "") or ""
            if self.target_subject not in subject:
                continue

            html_body = getattr(message, "HTMLBody", "") or ""
            if not html_body:
                continue

            soup = self._create_soup(html_body)
            text_content = soup.get_text("\n", strip=True)
            email_date = self._extract_date_from_text(text_content, message.ReceivedTime.date())

            for table in soup.find_all("table"):
                for df in self._read_table_frames(table):
                    extracted_data.extend(
                        self._extract_route_rows(df, from_code, to_code, email_date)
                    )

        if not extracted_data:
            return pd.DataFrame(columns=list(self.OUTPUT_COLUMNS))

        result_df = (pd.DataFrame(extracted_data)
                    .sort_values("Date", ascending=False)
                    .reset_index(drop=True))

        return result_df


def extract_colonial_transit_times(
    from_location: str = "HTN",
    to_location: str = "GBJ",
    target_subject: str = "T4 Bulletin: Colonial - TRANSIT TIMES sent to sto_susan",
) -> pd.DataFrame:
    """
    One-call wrapper around ColonialTransitExtractor.

    Builds an extractor for ``target_subject`` and returns whatever its
    ``extract_transit_data`` produces for the given route.

    Args:
        from_location: Source location code (default: "HTN")
        to_location: Destination location code (default: "GBJ")
        target_subject: Email subject line to search for

    Returns:
        DataFrame with transit time data
    """
    return ColonialTransitExtractor(target_subject=target_subject).extract_transit_data(
        from_location=from_location,
        to_location=to_location,
    )


def main():
    """Run the default extraction and print a short summary of the result."""
    print("Extracting Colonial Pipeline transit times (HTN -> GBJ)...")
    transit_times = extract_colonial_transit_times()

    # Nothing matched: say so and stop.
    if transit_times.empty:
        print("No data found matching the criteria.")
        return

    record_count = len(transit_times)
    print(f"\nExtracted {record_count} records:")
    print(transit_times.head())

    # To keep a copy, the frame can be written out with
    # transit_times.to_excel("ColonialTransitTimes_HTN_to_GBJ.xlsx", index=False)
    

# Run the example extraction only when executed as a script, not on import.
if __name__ == "__main__":
    main()

Extracting Colonial Pipeline transit times (HTN -> GBJ)...

Extracted 354 records:
         Date From   To  Cycle  Gas Days  Gas Hours Distillates Days  \
0  2025-08-26  HTN  GBJ     48         8          3                9   
1  2025-08-26  HTN  GBJ     49         8         20               10   
2  2025-08-25  HTN  GBJ     48         8          1                9   
3  2025-08-24  HTN  GBJ     47         8         20                7   
4  2025-08-24  HTN  GBJ     48         8          1               10   

  Distillates Hours  
0                 1  
1                12  
2                13  
3                 8  
4                21  

Example with custom locations (you can modify these):
df = extract_colonial_transit_times('ABC', 'XYZ')
