In [1]:
import os
import re

import pandas as pd
import spacy

# Only token text and lemmas are read from the processed docs in this cell,
# so the dependency parser and named-entity recognizer are switched off to
# avoid running them on every row.
nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])

def clean(t):
    """Normalize one piece of text for tokenization.

    The value is converted to ``str`` and lower-cased, then three regex
    substitutions are applied in order:

    1. runs of non-space characters starting with ``http`` or ``www`` are
       removed;
    2. every character other than ``a-z``, ``0-9``, ``$``, ``€``, ``%``,
       ``.``, ``-``, ``/`` and space is replaced by a space;
    3. runs of whitespace are collapsed to a single space.

    Returns the result with leading/trailing whitespace stripped.
    """
    substitutions = (
        (r"http\S+|www\S+", ""),
        (r"[^a-z0-9$€%\.\-/ ]", " "),
        (r"\s+", " "),
    )
    normalized = str(t).lower()
    for pattern, replacement in substitutions:
        normalized = re.sub(pattern, replacement, normalized)
    return normalized.strip()

# Load the raw FiQA question/answer table.
df = pd.read_csv("data/raw/fiqa_dataset.csv", encoding="utf-8")

# Missing question/answer cells are NaN; astype(str) alone would turn them
# into the literal word "nan" inside the text, so replace them with "" first.
df["text"] = (
    df["question"].fillna("").astype(str)
    + " "
    + df["answer"].fillna("").astype(str)
)
df["clean_text"] = df["text"].apply(clean)

# Stream the cleaned rows through spaCy in batches and keep, per row, the
# space-joined token texts and the space-joined lemmas.
tokens, lemmas = [], []
for doc in nlp.pipe(df["clean_text"], batch_size=64):
    tokens.append(" ".join([t.text for t in doc]))
    lemmas.append(" ".join([t.lemma_ for t in doc]))

df["tokens"] = tokens
df["lemmas"] = lemmas

# to_csv does not create missing directories, so make sure the target exists.
os.makedirs("data/processed", exist_ok=True)
df.to_csv("data/processed/fiqa_spacy.csv", index=False)
print("FIQA processed!")


FIQA processed!


In [2]:
import pandas as pd
import re
import spacy

# Only token text and lemmas are read from the processed docs in this cell,
# so the dependency parser and named-entity recognizer are switched off to
# avoid running them on every row.
nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])

def clean(t):
    """Normalize one sentence while keeping its original letter case.

    The value is converted to ``str``; runs of non-space characters starting
    with ``http`` or ``www`` are removed; every character other than ASCII
    letters, digits, ``$``, ``€``, ``%``, ``.``, ``-``, ``/`` and space is
    replaced by a space; whitespace runs are collapsed to one space and the
    result is stripped.
    """
    without_urls = re.sub(r"http\S+|www\S+", "", str(t))
    allowed_only = re.sub(r"[^A-Za-z0-9$€%\.\-/ ]", " ", without_urls)
    single_spaced = re.sub(r"\s+", " ", allowed_only)
    return single_spaced.strip()

# Load the headerless PhraseBank CSV (latin1-encoded) and name its two columns.
df = pd.read_csv(
    "data/raw/financial_phrasebank/all-data.csv",
    encoding="latin1",
    header=None,
)
df.columns = ["label", "text"]

# Clean every sentence with the cell's clean() helper.
df["clean_text"] = df["text"].map(clean)

# Run spaCy over the cleaned sentences in batches, collecting one
# space-joined string of token texts and one of lemmas per row.
tokens = []
lemmas = []
for doc in nlp.pipe(df["clean_text"], batch_size=64):
    tokens.append(" ".join(tok.text for tok in doc))
    lemmas.append(" ".join(tok.lemma_ for tok in doc))

df["tokens"], df["lemmas"] = tokens, lemmas

df.to_csv("data/processed/phrasebank_spacy.csv", index=False)
print("PhraseBank processed!")


PhraseBank processed!


In [3]:
import pandas as pd
import re
import spacy

# Only token text and lemmas are read from the processed docs in this cell,
# so the dependency parser and named-entity recognizer are switched off to
# avoid running them on every row.
nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])

def clean(t):
    """Reduce a value to letters, digits, dots, hyphens and single spaces.

    The value is converted to ``str``; every character other than ASCII
    letters, digits, ``.``, ``-`` and space is replaced by a space; whitespace
    runs are then collapsed to one space and the result is stripped.
    """
    return re.sub(
        r"\s+",
        " ",
        re.sub(r"[^A-Za-z0-9\.\- ]", " ", str(t)),
    ).strip()

# Load the Yahoo AAPL price table.
df = pd.read_csv("data/raw/yahoo_AAPL_stock_data.csv")

# Build one sentence per row of the form
# "Open <o> High <h> Low <l> Close <c>" from the stringified price columns.
df["text"] = "Open " + df["Open"].astype(str)
for field in ("High", "Low", "Close"):
    df["text"] = df["text"] + f" {field} " + df[field].astype(str)

df["clean_text"] = df["text"].map(clean)

# Run spaCy over the cleaned sentences in batches, collecting one
# space-joined string of token texts and one of lemmas per row.
tokens = []
lemmas = []
for doc in nlp.pipe(df["clean_text"], batch_size=64):
    tokens.append(" ".join(tok.text for tok in doc))
    lemmas.append(" ".join(tok.lemma_ for tok in doc))

df["tokens"], df["lemmas"] = tokens, lemmas

df.to_csv("data/processed/yahoo_spacy.csv", index=False)
print("Yahoo dataset processed!")


Yahoo dataset processed!


In [6]:
import os
import re
import pandas as pd
import spacy

# -----------------------------
# 1. Load spaCy safely
# -----------------------------
# Only token text, lemmas and coarse POS tags are read from the docs in this
# cell, so the dependency parser and named-entity recognizer are switched off
# to avoid running them on every chunk.
nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])
# Raise spaCy's maximum accepted text length for a single document.
nlp.max_length = 2_000_000


# -----------------------------
# 2. Text cleaning function
# -----------------------------
def clean(text):
    """Strip URLs and normalize whitespace in ``text``.

    Runs of non-space characters starting with ``http`` or ``www`` are
    removed, every whitespace run is collapsed to a single space, and the
    result is returned without leading/trailing whitespace.

    ``text`` must already be a string; no conversion is performed.
    """
    url_free = re.sub(r"http\S+|www\S+", "", text)
    return re.sub(r"\s+", " ", url_free).strip()


# -----------------------------
# 3. Chunking function (FIXED)
# -----------------------------
def chunk_text(text, chunk_size=1500):
    """
    Lazily yield consecutive pieces of ``text``.

    The text is split on whitespace and regrouped into strings of at most
    ``chunk_size`` words, each joined with single spaces. Text without any
    words yields nothing. Used to keep each spaCy input small.
    """
    words = text.split()
    starts = range(0, len(words), chunk_size)
    yield from (" ".join(words[start:start + chunk_size]) for start in starts)


# -----------------------------
# 4. File paths
# -----------------------------
input_file = "data/raw/sec_statement_q2_2025/sub.txt"
output_folder = "data/processed/sec"
os.makedirs(output_folder, exist_ok=True)


# -----------------------------
# 5. Read & clean text
# -----------------------------
# errors="ignore" drops bytes that are not valid UTF-8 instead of raising
# (best-effort read of the raw file).
with open(input_file, "r", encoding="utf-8", errors="ignore") as f:
    text = clean(f.read())


# -----------------------------
# 6. spaCy processing
# -----------------------------
# Materialize the non-empty chunks once so they can be batched through
# nlp.pipe (as the other dataset cells do) instead of calling nlp() per chunk.
chunks = [chunk for chunk in chunk_text(text, chunk_size=1500) if chunk.strip()]

rows = []

# nlp.pipe yields docs in input order, so zip pairs each chunk with its doc.
for chunk, doc in zip(chunks, nlp.pipe(chunks, batch_size=16)):
    rows.append({
        "text": chunk,
        "tokens": " ".join(token.text for token in doc),
        "lemmas": " ".join(token.lemma_ for token in doc),
        "pos": " ".join(token.pos_ for token in doc)
    })


# -----------------------------
# 7. Save output
# -----------------------------
# Explicit columns keep the CSV header even when no chunks were produced.
df = pd.DataFrame(rows, columns=["text", "tokens", "lemmas", "pos"])
output_path = f"{output_folder}/sec_sub_processed.csv"
df.to_csv(output_path, index=False)

print(f"✅ SEC processed safely → {len(df)} chunks saved to {output_path}")


✅ SEC processed safely → 213 chunks saved to data/processed/sec/sec_sub_processed.csv
