In [6]:
import os
import json
import pandas as pd
from pathlib import Path
from datetime import datetime
import nltk
from tqdm import tqdm
import subprocess

In [11]:
# Fetch NLTK's 'punkt' resource up front; quiet=True suppresses the download log.
nltk.download('punkt', quiet=True)

# Locations below are written relative to the directory the notebook runs from.
RAW_CSV = "../../data/raw_speeches.csv"
CACHE_FILE = "../../data/incremental_cache.json"
FED_SCRAPER_DIR = "../../Fed-Scraper-main/fed_scraper"
FED_SCRAPER_MAIN_DIR = "../../Fed-Scraper-main/"
DOCUMENTS_DIR = os.path.join(FED_SCRAPER_MAIN_DIR, "data", "documents_by_type")

# Absolute counterparts, anchored at the working directory captured here so
# later code does not depend on the process staying in the same directory.
CURRENT_DIR = Path.cwd()
FED_SCRAPER_ROOT_ABS = CURRENT_DIR.joinpath(FED_SCRAPER_DIR).resolve()
print(FED_SCRAPER_ROOT_ABS)
FED_SCRAPER_MAIN_DIR_ROOT = CURRENT_DIR.joinpath(FED_SCRAPER_MAIN_DIR).resolve()
print(FED_SCRAPER_MAIN_DIR_ROOT)
RAW_CSV_ABS = CURRENT_DIR.joinpath(RAW_CSV).resolve()
CACHE_FILE_ABS = CURRENT_DIR.joinpath(CACHE_FILE).resolve()

# Document kinds (CSV file stems) that are loaded; any other CSV is skipped.
INCLUDED_KINDS = {
    "fomc",
    "minutes",
    "press_conferences",
    "speeches",
    "transcripts",
    "projections",
    "beige_book",
    "redbooks",
    "teal_book",
}

class FedSpeechFetcher:
    """Collect Fed-Scraper CSV outputs and append unseen rows to the raw CSV.

    The class relies on the module-level path constants
    (``FED_SCRAPER_ROOT_ABS``, ``FED_SCRAPER_MAIN_DIR_ROOT``, ``RAW_CSV_ABS``,
    ``CACHE_FILE_ABS``) and on ``INCLUDED_KINDS``, which selects the document
    kinds that are loaded.
    """

    def __init__(self):
        # NOTE(review): this creates ``data`` under the current working
        # directory, which is not where RAW_CSV_ABS / CACHE_FILE_ABS point
        # (``../../data``) -- confirm it is still needed.
        if not os.path.exists("data"):
            os.makedirs("data")

    def run_spiders(self):
        """Run every Fed-Scraper spider in sequence (the full scrape).

        Each spider is started with ``scrapy crawl <name>`` from
        ``FED_SCRAPER_ROOT_ABS``. Output is captured so that, on failure, the
        spider's stdout and stderr can be printed before the error is raised.

        Raises:
            subprocess.CalledProcessError: if a spider exits with a non-zero
                status. Remaining spiders are not run.
        """
        spiders = [
            "beige_book_archive",
            "beige_book_current",
            "fomc_calendar",
            "historical_materials",
        ]
        scrapy_cwd = str(FED_SCRAPER_ROOT_ABS)

        for spider in spiders:
            print(scrapy_cwd)
            result = subprocess.run(
                ["scrapy", "crawl", spider],
                cwd=scrapy_cwd,
                capture_output=True,
                text=True,
                # The exit status is checked below so the captured output can
                # be shown before the exception propagates.
                check=False,
            )

            if result.returncode != 0:
                print(f"ERROR: Scrapy spider '{spider}' failed with exit status {result.returncode}")
                print("\n--- Scrapy STDOUT ---")
                print(result.stdout)
                print("\n--- Scrapy STDERR (Actual Error Message) ---")
                print(result.stderr)

                # Re-raise as CalledProcessError so the caller sees the failure.
                result.check_returncode()

    def load_fed_scraper_outputs(self):
        """Load the per-kind CSVs written by Fed-Scraper into one DataFrame.

        Only ``*.csv`` files whose stem is in ``INCLUDED_KINDS`` are read. Each
        frame gets a ``document_kind`` column holding that stem.

        Returns:
            pandas.DataFrame: all matching files concatenated with a fresh
            index, or an empty DataFrame when no file matches.

        Raises:
            FileNotFoundError: if the ``documents_by_type`` directory is missing.
        """
        frames = []
        DOCUMENTS_DIR_ABS = FED_SCRAPER_MAIN_DIR_ROOT / "data" / "documents_by_type"
        print("Documents dir exists:", DOCUMENTS_DIR_ABS ,DOCUMENTS_DIR_ABS.exists())
        if not DOCUMENTS_DIR_ABS.exists():
            raise FileNotFoundError(
                f"Fed-Scraper output directory not found: {DOCUMENTS_DIR_ABS}\n"
                "Make sure spiders completed successfully."
            )

        # iterdir() yields entries in arbitrary order; sorting keeps the row
        # order of the combined frame (and of the raw CSV) reproducible.
        for fname in sorted(DOCUMENTS_DIR_ABS.iterdir()):
            if not fname.name.endswith(".csv"):
                continue

            kind = fname.stem
            if kind not in INCLUDED_KINDS:
                continue

            df = pd.read_csv(fname)
            df["document_kind"] = kind
            frames.append(df)

        if not frames:
            return pd.DataFrame()

        return pd.concat(frames, ignore_index=True)

    def load_cache(self):
        """Return the uid -> release-date cache, or ``{}`` if no file exists."""
        if not os.path.exists(CACHE_FILE_ABS):
            return {}
        with open(CACHE_FILE_ABS, "r", encoding="utf-8") as f:
            return json.load(f)

    def save_cache(self, cache):
        """Write ``cache`` to ``CACHE_FILE_ABS`` as JSON, creating parent dirs."""
        Path(CACHE_FILE_ABS).parent.mkdir(parents=True, exist_ok=True)
        with open(CACHE_FILE_ABS, "w", encoding="utf-8") as f:
            json.dump(cache, f, indent=2)

    def append_new_speeches(self, df):
        """Append rows not yet recorded in the cache to ``RAW_CSV_ABS``.

        A row is identified by ``"<document_kind>_<url>"``. It is written when
        that id is not cached, or when it is cached with a different,
        non-empty release date. The cache is saved afterwards in every case.

        Args:
            df: frame with ``document_kind`` and ``url`` columns;
                ``release_date`` is optional.
        """
        cache = self.load_cache()
        new_positions = []

        for position, (_, row) in enumerate(df.iterrows()):
            uid = f"{row['document_kind']}_{row['url']}"

            release = row.get("release_date", None)
            # Missing dates (NaN/None/NaT) become None. Everything else is
            # stored as text so it is JSON-serialisable and compares equal to
            # the value read back from the cache file on the next run.
            release = None if pd.isna(release) else str(release)

            if uid not in cache:
                new_positions.append(position)
                cache[uid] = release
            elif release and release != cache[uid]:
                new_positions.append(position)
                cache[uid] = release

        if new_positions:
            # Select rows from the frame itself so column dtypes are preserved.
            new_df = df.iloc[new_positions]
            Path(RAW_CSV_ABS).parent.mkdir(parents=True, exist_ok=True)

            if not os.path.exists(RAW_CSV_ABS):
                new_df.to_csv(RAW_CSV_ABS, index=False)
            else:
                # Re-read and rewrite instead of appending in place, so a
                # change in columns between runs still yields a valid CSV.
                old = pd.read_csv(RAW_CSV_ABS)
                full = pd.concat([old, new_df], ignore_index=True)
                full.to_csv(RAW_CSV_ABS, index=False)

        # Saved only after the CSV write so a failed write is retried next run.
        self.save_cache(cache)

    def run_full(self):
        """Run all spiders, then load their outputs and append new rows.

        Returns:
            pandas.DataFrame: everything loaded from the Fed-Scraper outputs.
        """
        self.run_spiders()
        df = self.load_fed_scraper_outputs()
        self.append_new_speeches(df)
        return df

    def run_incremental(self):
        """Load existing Fed-Scraper outputs (no crawl) and append new rows.

        Returns:
            pandas.DataFrame: everything loaded from the Fed-Scraper outputs.
        """
        df = self.load_fed_scraper_outputs()
        self.append_new_speeches(df)
        return df
    
if __name__ == "__main__":
    fetcher = FedSpeechFetcher()

    # The whole branch lives under the guard: `fetcher` only exists here, so
    # running it at module level would raise NameError on import.
    # RAW_CSV_ABS is the file append_new_speeches writes, so its absence means
    # nothing has been collected yet and the spiders must run first.
    if not os.path.exists(RAW_CSV_ABS):
        print("Performing full scrape...")
        fetcher.run_full()
    else:
        print("Performing incremental scrape...")
        fetcher.run_incremental()

C:\Users\windows\OneDrive - VietNam National University - HCM INTERNATIONAL UNIVERSITY\Desktop\Programming\Project\Price-Prediction-AI\Fed-Scraper-main\fed_scraper
C:\Users\windows\OneDrive - VietNam National University - HCM INTERNATIONAL UNIVERSITY\Desktop\Programming\Project\Price-Prediction-AI\Fed-Scraper-main
Performing incremental scrape...
Documents dir exists: C:\Users\windows\OneDrive - VietNam National University - HCM INTERNATIONAL UNIVERSITY\Desktop\Programming\Project\Price-Prediction-AI\Fed-Scraper-main\data\documents_by_type True


In [None]:
import os
import re
import math
import json
import numpy as np
import pandas as pd
from pathlib import Path
from datetime import datetime, timedelta

import nltk
from nltk.tokenize import sent_tokenize

from sklearn.feature_extraction.text import TfidfVectorizer

# optional transformer backend
try:
    from sentence_transformers import SentenceTransformer
    TRANSFORMERS_AVAILABLE = True
except ImportError:
    TRANSFORMERS_AVAILABLE = False


In [None]:

# ---------------------------
# Paths
# ---------------------------
# Input: the CSV that SpeechDecayBuilder reads.
RAW_CSV = Path("../../Fed-Scraper-main/data/fomc_documents.csv").resolve()

# Outputs: one folder for the processed artefacts, with a sub-folder that
# receives one file per day.
OUT_DIR = Path("../../data/processed").resolve()
DAILY_DIR = OUT_DIR.joinpath("daily_embeddings")

# Make sure both folders exist before anything is written into them.
OUT_DIR.mkdir(exist_ok=True, parents=True)
DAILY_DIR.mkdir(exist_ok=True, parents=True)


# ---------------------------
# Document priors
# ---------------------------
# One (kind, weight, half_life) row per document kind, expanded into the
# {"weight": ..., "half_life": ...} mapping that is looked up by kind.
DOC_PRIORS = {
    kind: {"weight": weight, "half_life": half_life}
    for kind, weight, half_life in (
        ("fomc",              1.00, 10),
        ("minutes",           0.95, 12),
        ("press_conferences", 0.90, 7),
        ("transcripts",       0.85, 20),
        ("projections",       1.00, 30),
        ("speeches",          0.60, 5),
        ("beige_book",        0.45, 4),
        ("redbooks",          0.35, 3),
        ("teal_book",         0.50, 15),
    )
}


# ---------------------------
# Text utilities
# ---------------------------
def clean_text(text: str) -> str:
    """Return ``text`` lower-cased, without URLs, with whitespace collapsed.

    Args:
        text: raw document text. Any non-string value (for example the NaN
            pandas uses for a missing cell) is treated as empty.

    Returns:
        The cleaned text, or ``""`` for non-string input.
    """
    if not isinstance(text, str):
        return ""
    # URLs are removed before whitespace is collapsed; the other order leaves
    # a double space wherever a URL stood between two words.
    text = re.sub(r"http\S+", "", text)
    text = re.sub(r"\s+", " ", text)
    return text.strip().lower()


def half_life_from_text(text: str, base: float) -> float:
    """Scale ``base`` up by 15% for each listed keyword found in ``text``.

    Matching is a plain, case-sensitive substring test, and each keyword
    counts at most once however often it occurs.

    Args:
        text: the text to scan.
        base: the half-life before any adjustment.

    Returns:
        ``base * (1 + 0.15 * hits)``, where ``hits`` is the number of keywords
        present.
    """
    terms = (
        "uncertainty",
        "outlook",
        "forecast",
        "risks",
        "expected",
        "projected",
        "anticipate",
    )
    hits = 0
    for term in terms:
        if term in text:
            hits += 1
    return base * (1 + 0.15 * hits)


def decay_weight(days: int, half_life: float) -> float:
    """Return the exponential-decay factor after ``days`` for ``half_life``.

    The factor is 1.0 at ``days == 0`` and halves every ``half_life`` days.
    """
    exponent = -math.log(2) * days / half_life
    return math.exp(exponent)


# ---------------------------
# Builder
# ---------------------------
class SpeechDecayBuilder:
    """Embed every document and build one decay-weighted vector per day.

    Documents are read from ``RAW_CSV``. Each is embedded with
    sentence-transformers when that package is importable, otherwise with a
    TF-IDF vectoriser. For every calendar day the embeddings of all documents
    released on or before that day are summed, weighted by the per-kind prior
    in ``DOC_PRIORS`` and by an exponential decay on the document's age.

    Attributes:
        df: the loaded documents; rows without a parseable ``release_date``
            are dropped.
        embedder: the SentenceTransformer model or the TfidfVectorizer.
        backend: ``"sentence-transformers"`` or ``"tfidf"``.
        embed_dim: embedding width. For TF-IDF it is ``None`` until
            ``build_embeddings`` has fitted the vectoriser.
    """

    def __init__(self):
        self.df = pd.read_csv(RAW_CSV)

        self.df["release_date"] = pd.to_datetime(
            self.df["release_date"], errors="coerce"
        )
        # A document without a usable date cannot be aged, so it is dropped.
        self.df = self.df.dropna(subset=["release_date"])

        self.df["clean_text"] = self.df["text"].apply(clean_text)

        if TRANSFORMERS_AVAILABLE:
            self.embedder = SentenceTransformer("all-MiniLM-L6-v2")
            self.embed_dim = self.embedder.get_sentence_embedding_dimension()
            self.backend = "sentence-transformers"
        else:
            self.embedder = TfidfVectorizer(
                max_features=512,
                stop_words="english"
            )
            # Defined on both branches so readers of the attribute never hit
            # AttributeError; the real width is known only after fitting.
            self.embed_dim = None
            self.backend = "tfidf"

    # -----------------------
    # Embeddings
    # -----------------------
    def build_embeddings(self):
        """Embed all documents and persist the matrix and its metadata.

        Writes ``embeddings.npy`` and ``speech_metadata.csv`` to ``OUT_DIR``
        and adds a ``doc_index`` column mapping each row of ``self.df`` to its
        row in the embedding matrix.

        Returns:
            numpy.ndarray: one embedding row per document, in ``self.df`` order.
        """
        texts = self.df["clean_text"].tolist()

        if self.backend == "sentence-transformers":
            embeddings = self.embedder.encode(
                texts, batch_size=32, show_progress_bar=True
            )
        else:
            embeddings = self.embedder.fit_transform(texts).toarray()
            self.embed_dim = embeddings.shape[1]

        np.save(OUT_DIR / "embeddings.npy", embeddings)

        self.df["doc_index"] = np.arange(len(self.df))
        self.df.to_csv(OUT_DIR / "speech_metadata.csv", index=False)

        return embeddings

    # -----------------------
    # Daily aggregation
    # -----------------------
    def build_daily_embeddings(self, embeddings: np.ndarray):
        """Write one normalised, decay-weighted embedding file per day.

        Covers every day from the earliest release date through today (UTC).
        Documents whose ``document_kind`` has no entry in ``DOC_PRIORS`` are
        ignored. Each day is saved to ``DAILY_DIR`` as
        ``<YYYY-MM-DD>_embeddings.npz`` with keys ``embedding``, ``date`` and
        ``backend``. The vector is all zeros when no document contributes.

        Args:
            embeddings: matrix returned by ``build_embeddings``; row ``i``
                belongs to the document whose ``doc_index`` is ``i``.
        """
        # Function-scope import: the file only imports datetime and timedelta.
        from datetime import timezone

        if self.df.empty:
            # No dated documents: there is no start date to iterate from.
            return

        start = self.df["release_date"].min().date()
        # Timezone-aware replacement for the deprecated datetime.utcnow().
        end = datetime.now(timezone.utc).date()

        # Everything below is constant per document, so it is computed once
        # here instead of once per (day, document) pair.
        docs = self.df[self.df["document_kind"].isin(list(DOC_PRIORS))]
        kinds = docs["document_kind"].tolist()
        release_ordinals = np.array(
            [stamp.date().toordinal() for stamp in docs["release_date"]],
            dtype=np.int64,
        )
        prior_weights = np.array(
            [DOC_PRIORS[kind]["weight"] for kind in kinds], dtype=float
        )
        half_lives = np.array(
            [
                half_life_from_text(text, DOC_PRIORS[kind]["half_life"])
                for text, kind in zip(docs["clean_text"], kinds)
            ],
            dtype=float,
        )
        doc_vectors = np.asarray(embeddings)[docs["doc_index"].to_numpy(dtype=int)]

        for day in pd.date_range(start, end):
            age = day.date().toordinal() - release_ordinals
            released = age >= 0
            # Same formula as decay_weight(), applied to all documents at
            # once. Ages are clamped at 0 so not-yet-released documents cannot
            # overflow exp(); the mask then zeroes their weight.
            decay = np.exp(-math.log(2) * np.maximum(age, 0) / half_lives)
            weights = np.where(released, prior_weights * decay, 0.0)

            vec = np.asarray(weights @ doc_vectors, dtype=float)

            norm = np.linalg.norm(vec)
            if norm > 0:
                vec /= norm

            out = DAILY_DIR / f"{day.date()}_embeddings.npz"
            np.savez_compressed(
                out,
                embedding=vec,
                date=str(day.date()),
                backend=self.backend
            )

    # -----------------------
    # Full run
    # -----------------------
    def run(self):
        """Build the document embeddings, then the per-day aggregates."""
        embeddings = self.build_embeddings()
        self.build_daily_embeddings(embeddings)


# ---------------------------
# CLI
# ---------------------------
if __name__ == "__main__":
    # Constructing the builder reads RAW_CSV and selects the embedding backend;
    # run() then writes the embeddings, the metadata CSV and the per-day files.
    builder = SpeechDecayBuilder()
    builder.run()
