<a href="https://colab.research.google.com/github/shravanxd/hedgeletter-pipeline/blob/main/hedge_fund_letters_metadata_extracter_summariser.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Run this cell first to install the third-party libraries imported by the main script.
# The leading "!" is the notebook shell escape, so this line only works inside
# Colab/Jupyter; from a terminal, run the same `pip install` command without it.
!pip install pdfplumber yfinance openai pandas

In [None]:
# This Colab setup is kept flexible: pick ONE of the options below so that
# `folder` points at the directory holding the Eriksen letter PDFs.
# `folder` is what gets passed to process_folder() at the end of the script.
# The letters can be pulled from:
#   (a) Google Drive, or
#   (b) GitHub, or
#   (c) a direct upload into Colab

# Option A — Google Drive (uncomment these lines to use it)
# from google.colab import drive
# drive.mount('/content/drive')
# folder = "/content/drive/MyDrive/eriksen"

# Option B — GitHub (the option that is active as written)
!git clone https://github.com/shravanxd/hedgeletter-pipeline.git
folder = "/content/hedgeletter-pipeline/eriksen"

# Option C — Direct upload to Colab (uncomment this line to use it)
# folder = "/content/eriksen"


In [None]:
import os
import re
import datetime as dt
import pandas as pd
import yfinance as yf
import pdfplumber
from openai import OpenAI
import json
from getpass import getpass
from google.colab import drive

# The OpenAI API key is requested interactively instead of being hard-coded here.
# You can use the key that was shared with you (with $5 credit) or generate your own at:
# https://platform.openai.com/api-keys
# getpass gives a secure input prompt (the key is not visible while typing).
# The key is stored in the OPENAI_API_KEY environment variable before the client
# is created; OpenAI() is called without arguments, so it presumably picks the
# key up from that variable — confirm against the openai client documentation.
os.environ["OPENAI_API_KEY"] = getpass("Enter your OpenAI API key: ")

client = OpenAI()


def extract_text_from_pdf(path):
    """
    Open the PDF at `path` with pdfplumber and return its text as one string.

    Each page is extracted in order; pages that yield no text are skipped and
    the remaining page texts are joined with newline characters.
    """
    with pdfplumber.open(path) as document:
        # Extraction has to happen while the file is still open.
        extracted = [page.extract_text() for page in document.pages]

    return "\n".join(chunk for chunk in extracted if chunk)


def detect_hedge_clause(text):
    """
    Identify and separate hedge/legal disclaimer text from the end of a letter.

    The split point is the right-most position at which any of the disclaimer
    keywords starts (case-insensitive). Everything from that position onward
    is treated as the hedge clause, everything before it as the main text.

    Returns (hedge_clause, main_text), both stripped of surrounding whitespace.
    If no keyword is found, or `text` is empty/None, hedge_clause is None and
    `text` is returned unchanged as the main text.
    """
    if not text:
        return None, text

    keywords = (
        "disclaimer", "hedge clause", "forward-looking",
        "not investment advice", "no guarantee",
        "risk", "past performance", "material adverse",
    )

    # Search the ORIGINAL string case-insensitively instead of searching
    # text.lower(): lower-casing can change the string length (for example
    # "İ" becomes two code points), which would shift every offset after it
    # and slice the original text in the wrong place.
    # The zero-width lookahead reports every start position, so the last match
    # is the right-most keyword occurrence. re.ASCII limits case folding to
    # A-Z, which is all the ASCII keywords need.
    alternatives = "|".join(re.escape(kw) for kw in keywords)
    finder = re.compile(f"(?={alternatives})", re.IGNORECASE | re.ASCII)

    last_found = -1
    for match in finder.finditer(text):
        last_found = match.start()

    if last_found == -1:
        return None, text

    hedge = text[last_found:].strip()
    main = text[:last_found].strip()
    return hedge, main


def call_llm_extract(text):
    """
    Ask the chat model for structured metadata about a letter.

    The letter text is embedded in a fixed instruction prompt that lists the
    required fields, and the request asks for a JSON-object response. The
    reply content of the first choice is decoded with json.loads and returned;
    json.loads raises if that content is not valid JSON.
    """

    instructions = f"""
Extract structured metadata from the hedge fund letter text below.
Only return valid JSON. Do not include commentary.

Required fields:
- fund_name: string or null
- manager_name: string or null
- letter_date: date string if found (any format)
- quarterly_performance_pct: number if found (fund quarterly return), else null
- sp500_quarter_pct: number or null
- russell2000_quarter_pct: number or null
- summary: brief objective summary of the letter
- key_themes_holdings: important themes, ideas, or holdings
- sentiment: overall qualitative sentiment (positive, negative, neutral)

Instructions:
- Extract only information that is actually present. Do not fabricate values.
- If any field is missing in the text, set it to null.
- Provide only the JSON object.

Text:
{text}
"""

    # Single-turn conversation: the whole prompt goes in one user message.
    conversation = [{"role": "user", "content": instructions}]

    completion = client.chat.completions.create(
        model="gpt-4o-mini",
        response_format={"type": "json_object"},
        messages=conversation,
    )

    reply = completion.choices[0].message.content
    return json.loads(reply)


def guess_year_from_date(date_str):
    """
    Convert a date string to a pandas Timestamp if possible.

    Returns (year, timestamp) on success and (None, None) when the value is
    missing, blank, unparseable, or parses to NaT.
    """
    if date_str is None or (isinstance(date_str, str) and not date_str.strip()):
        return None, None

    try:
        parsed = pd.to_datetime(date_str)
    except (ValueError, TypeError, OverflowError):
        # pandas/dateutil parse failures and out-of-range dates are all
        # ValueError/OverflowError subclasses; unsupported types raise TypeError.
        return None, None

    # pd.to_datetime does not raise for null-like input ("", "nan", NaN); it
    # returns NaT, whose .year is NaN. Passing that on would break later
    # int(year) conversions, so treat it as "no date". Non-scalar results
    # (e.g. an index built from a list) are rejected the same way.
    if not isinstance(parsed, pd.Timestamp) or pd.isna(parsed):
        return None, None

    return parsed.year, parsed


def normalize_quarter(dt_obj):
    """
    Map the month of a datetime-like object onto a quarter label.

    Returns "Q1".."Q4" based on `dt_obj.month`, or None when `dt_obj` is None.
    """
    if dt_obj is None:
        return None

    month = dt_obj.month
    # (last month of the quarter, label), checked in calendar order.
    for last_month, label in ((3, "Q1"), (6, "Q2"), (9, "Q3")):
        if month <= last_month:
            return label
    return "Q4"

def detect_quarter_text(text):
    """
    Detect a quarter label ("Q1".."Q4") from raw text using keyword patterns.

    Quarters are checked in order Q1 to Q4 and the first quarter with any
    matching pattern wins. Returns None when nothing matches or when `text`
    is empty/None.
    """
    if not text:
        return None

    lowered = text.lower()

    # "qN(?![0-9])" means "qN not followed by a digit". Unlike "qN[^0-9]" it
    # also matches when the shorthand is the very last thing in the text.
    patterns = {
        "Q1": [r"q1(?![0-9])", r"first quarter", r"1st quarter", r"q-1"],
        "Q2": [r"q2(?![0-9])", r"second quarter", r"2nd quarter", r"q-2"],
        "Q3": [r"q3(?![0-9])", r"third quarter", r"3rd quarter", r"q-3"],
        # The last Q4 pattern keeps support for the compact "q424" / "q4'24" form.
        "Q4": [r"q4(?![0-9])", r"fourth quarter", r"4th quarter", r"q-4", r"q4\s*[’']?\s*24"],
    }

    for label, alternatives in patterns.items():
        if any(re.search(pattern, lowered) for pattern in alternatives):
            return label
    return None

def extract_quarter_year_from_subject(text):
    """
    Extract quarter and year from the "Subject:" line of the letter.

    Explicit quarter mentions take priority, either spelled out
    ("third quarter", "3rd quarter") or as shorthand ("Q3", "Q-3"). Only when
    no explicit quarter is present does "year end" or "results" in the subject
    default the quarter to Q4. The year is the first 20xx number in the subject.

    Returns (quarter_label, year); either element is None when it cannot be
    determined, and (None, None) when there is no subject line or no text.
    """
    if not text:
        return None, None

    # Capture the rest of the line that follows "Subject:".
    subject_match = re.search(r"Subject:\s*(.*)", text, re.IGNORECASE)
    if not subject_match:
        return None, None

    subj = subject_match.group(1).lower()

    # Explicit mentions, checked in calendar order. Without the shorthand
    # alternative, a subject such as "Q3 2024 Results" would fall through to
    # the "results" default below and be mislabelled as Q4.
    explicit_patterns = (
        ("Q1", r"first quarter|1st quarter|\bq-?1\b"),
        ("Q2", r"second quarter|2nd quarter|\bq-?2\b"),
        ("Q3", r"third quarter|3rd quarter|\bq-?3\b"),
        ("Q4", r"fourth quarter|4th quarter|\bq-?4\b"),
    )

    q = None
    for label, pattern in explicit_patterns:
        if re.search(pattern, subj):
            q = label
            break

    # Year-end / generic "results" subjects are treated as Q4 letters.
    if q is None and re.search(r"year end|results", subj):
        q = "Q4"

    # Extract year from subject
    year = None
    year_match = re.search(r"(20\d{2})", subj)
    if year_match:
        year = int(year_match.group(1))

    return q, year


def quarter_bounds(year, quarter_label):
    """
    Give the first and last calendar day of a quarter as datetime.date objects.

    `year` is converted with int(); `quarter_label` must be "Q1".."Q4".
    Returns (None, None) when either argument is None or the label is unknown.
    """
    if quarter_label is None or year is None:
        return None, None

    yr = int(year)

    # (label, (start month, start day), (end month, end day))
    calendar = (
        ("Q1", (1, 1), (3, 31)),
        ("Q2", (4, 1), (6, 30)),
        ("Q3", (7, 1), (9, 30)),
        ("Q4", (10, 1), (12, 31)),
    )
    for label, first_day, last_day in calendar:
        if quarter_label == label:
            return dt.date(yr, *first_day), dt.date(yr, *last_day)

    return None, None


def qtr_return_yf(ticker, start, end):
    """
    Measure the price return of `ticker` over a date range using yfinance.

    Prices are downloaded for [start, end] (the yfinance `end` argument is
    exclusive, hence the extra day) with auto-adjusted closes. The return is
    measured from the first to the last available close in that window.

    Returns the percent return as a float, or None when the bounds are
    missing, fewer than two closes are available, or the download fails.
    """
    if start is None or end is None:
        return None

    try:
        px = yf.download(
            ticker,
            start=start,
            end=end + dt.timedelta(days=1),
            progress=False,
            auto_adjust=True
        )
        if px is None or px.empty:
            return None

        close = px["Close"]
        # Depending on the yfinance version the columns can be a
        # (field, ticker) MultiIndex even for a single ticker, in which case
        # px["Close"] is a one-column DataFrame rather than a Series. Reduce
        # it to a Series so the float() conversions below get real scalars.
        if isinstance(close, pd.DataFrame):
            close = close.iloc[:, 0]

        s = close.dropna()
        if len(s) < 2:
            return None

        first = float(s.iloc[0])
        last = float(s.iloc[-1])
        return (last / first - 1.0) * 100.0

    except Exception:
        # Best-effort lookup: network errors, unknown tickers or unexpected
        # frame layouts all mean "no benchmark value" rather than a crash.
        return None


# Compiled once at import time. The ranges are limited to emoji/pictograph
# blocks. A single span from U+24C2 to U+1F251 would also delete CJK, Hangul
# and the "fi"/"fl" ligature characters that PDF extraction commonly emits.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F900-\U0001F9FF"  # supplemental symbols & pictographs
    "\U0001FA70-\U0001FAFF"  # symbols & pictographs extended-A
    "\U0001F000-\U0001F251"  # tiles, cards, enclosed alphanumerics/ideographs, flags
    "\u2600-\u26FF"          # miscellaneous symbols
    "\u2702-\u27B0"          # dingbats
    "\u2B00-\u2BFF"          # miscellaneous symbols and arrows
    "\u24C2"                 # circled latin capital M
    "\uFE0F"                 # variation selector-16 (emoji presentation)
    "]+"
)


def remove_emojis(text):
    """
    Remove emoji and pictograph characters from text.

    Ordinary letters in any script, punctuation and typographic ligatures are
    left untouched. Returns None when `text` is None.
    """
    if text is None:
        return None

    return _EMOJI_PATTERN.sub("", text)


def _is_missing(value):
    """Return True for None and for NaN/NaT scalars."""
    if value is None:
        return True
    try:
        return bool(pd.isna(value))
    except (TypeError, ValueError):
        # Non-scalar values have no single truth value; they are not "missing".
        return False


def _resolve_quarter_year(raw, date_str, filename):
    """Pick (quarter_label, year) with priority subject > letter date > filename."""
    q_label, year = extract_quarter_year_from_subject(raw)

    if q_label is None or year is None:
        # Fall back to the letter date, but only for the part that is still
        # missing so a value found in the subject is never overwritten.
        year_from_date, dt_obj = guess_year_from_date(date_str)
        if year is None and not _is_missing(year_from_date):
            year = year_from_date
        if q_label is None and not _is_missing(dt_obj):
            q_label = normalize_quarter(dt_obj)

    if q_label is None or year is None:
        # Last resort: look for "q1".."q4" and a 20xx year in the file name,
        # again filling only what is still unknown.
        filename_lower = filename.lower()
        if q_label is None:
            for candidate in ("q1", "q2", "q3", "q4"):
                if candidate in filename_lower:
                    q_label = candidate.upper()
                    break
        if year is None:
            year_match = re.search(r"(20\d{2})", filename_lower)
            if year_match:
                year = int(year_match.group(1))

    return q_label, year


def _benchmark_return(tickers, start, end):
    """Return the first available quarterly return among `tickers`, else None."""
    for ticker in tickers:
        value = qtr_return_yf(ticker, start, end)
        # Compare against None explicitly: a genuine 0.0 return is falsy and
        # must not trigger the fallback ticker.
        if value is not None:
            return value
    return None


def _alpha(fund_return, benchmark_return):
    """Return fund minus benchmark return, or None if either is unusable."""
    if fund_return is None or benchmark_return is None:
        return None
    try:
        return float(fund_return) - float(benchmark_return)
    except (TypeError, ValueError):
        # The LLM may hand back a non-numeric value (e.g. "n/a").
        return None


def process_folder(pdf_folder, output_csv="letters_processed.csv"):
    """
    Process all PDF letters in a folder and build a structured dataframe.

    For every "*.pdf" file (case-insensitive, in sorted name order) this:
    - extracts the PDF text and strips emojis
    - separates the trailing hedge clause from the main text
    - asks the LLM for metadata about the main text
    - resolves quarter and year (subject line, then letter date, then filename)
    - fills missing S&P 500 / Russell 2000 quarterly returns from yfinance,
      trying the index ticker first and the ETF as a fallback
    - computes alpha versus both benchmarks

    The resulting dataframe is written to `output_csv` (no index column) and
    also returned. `output_csv` defaults to "letters_processed.csv" in the
    current working directory.
    """

    cols = [
        "filename",
        "fund_name",
        "manager_name",
        "letter_date",
        "quarter_label",
        "year",
        "quarterly_performance_pct",
        "sp500_quarter_return_pct",
        "russell2000_quarter_return_pct",
        "alpha_vs_sp500_pct",
        "alpha_vs_russell2000_pct",
        "summary",
        "key_themes_holdings",
        "sentiment",
        "hedge_clause",
        "clean_text"
    ]

    rows = []

    # os.listdir order is arbitrary; sorting keeps the output reproducible.
    for filename in sorted(os.listdir(pdf_folder)):
        if not filename.lower().endswith(".pdf"):
            continue

        pdf_path = os.path.join(pdf_folder, filename)
        print(f"Processing: {pdf_path}")

        raw = remove_emojis(extract_text_from_pdf(pdf_path))
        hedge_clause, main_text = detect_hedge_clause(raw)

        llm_data = call_llm_extract(main_text)

        date_str = llm_data.get("letter_date")
        fund_q = llm_data.get("quarterly_performance_pct")
        sp_q = llm_data.get("sp500_quarter_pct")
        r2k_q = llm_data.get("russell2000_quarter_pct")

        # The subject line is searched in the full text, not only the main part.
        q_label, year = _resolve_quarter_year(raw, date_str, filename)

        if year is not None and q_label is not None:
            start, end = quarter_bounds(year, q_label)

            # Only look up benchmarks the letter itself did not report.
            if sp_q is None:
                sp_q = _benchmark_return(("^GSPC", "SPY"), start, end)
            if r2k_q is None:
                r2k_q = _benchmark_return(("^RUT", "IWM"), start, end)

        rows.append([
            filename,
            llm_data.get("fund_name"),
            llm_data.get("manager_name"),
            date_str,
            q_label,
            year,
            fund_q,
            sp_q,
            r2k_q,
            _alpha(fund_q, sp_q),
            _alpha(fund_q, r2k_q),
            llm_data.get("summary"),
            llm_data.get("key_themes_holdings"),
            llm_data.get("sentiment"),
            hedge_clause,
            main_text
        ])

    df = pd.DataFrame(rows, columns=cols)
    df.to_csv(output_csv, index=False)
    print(f"Completed: {output_csv} generated")

    return df


# Script entry point: process the letters found in `folder`, the module-level
# path assigned in the data-source setup cell near the top of the file.
if __name__ == "__main__":
    process_folder(folder)