In [1]:
import os
os.environ["OPENAI_API_KEY"] = "MY API KEY"

In [3]:
import csv
import json
from openai import OpenAI
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

In [5]:
HEADERS = {"User-Agent": "Rafael Trotter ra494491@ucf.edu"}

In [7]:
INPUT_DIR = "8K_Filings"
OUTPUT_CSV = "8-k_newProducts.csv"

In [9]:
import requests
from bs4 import BeautifulSoup

In [15]:
# Acquire ticker → name/cik mapping
def load_ticker_data():
    url = "https://www.sec.gov/files/company_tickers.json"
    response = requests.get(url, headers=HEADERS)
    return {
        v["ticker"].upper(): {
            "cik": str(v["cik_str"]).zfill(10),
            "title": v["title"]
        } for v in response.json().values()
    }

# Store ticker data 
TICKER_DATA = load_ticker_data()

# Extract clean, relevant text from 8-K file
def extract_clean_text(file_path):
    with open(file_path, "r", encoding="utf-8") as f:
        raw_html = f.read()

    # starting marker
    start_marker = "Copyright 2024 Workiva"
    if start_marker in raw_html:
        raw_html = raw_html.split(start_marker, 1)[-1]

    soup = BeautifulSoup(raw_html, "html.parser")
    return soup.get_text(separator=" ", strip=True)

# Activate LLM
def extract_product_info(text):
    prompt = f"""
You are an expert in SEC filings. Analyze, and search the following 8-K text and identify any new product announcements.

Return ONLY JSON in this format:
{{
  "new_product": "Product Name",
  "product_description": "Brief explanation of the product"
}}

If no new product is mentioned, return:
{{
  "new_product": null,
  "product_description": null
}}

Here is the 8-K text:
{text[:10000]}
"""
    response = client.chat.completions.create(
        model="gpt-4",
        messages=[{"role": "user", "content": prompt}],
        temperature=0.3
    )
    return response.choices[0].message.content

# look for ticker and date from filename
def parse_filename(filename):
    parts = filename.split("_")
    stock = parts[0]
    date = parts[-1].replace(".txt", "")
    return stock.upper(), date

# Main 
def process_all_files():
    results = []
    txt_files = sorted([file for file in os.listdir(INPUT_DIR) if file.endswith(".txt")])[:]

    for file in txt_files:
        file_path = os.path.join(INPUT_DIR, file)
        print(f" Processing: {file}")

        try:
            clean_text = extract_clean_text(file_path)
            json_text = extract_product_info(clean_text)
            response_json = json.loads(json_text)

            stock, filing_date = parse_filename(file)
            company_info = TICKER_DATA.get(stock, {"title": "Unknown", "cik": "N/A"})

            new_product = response_json.get("new_product")
            product_description = response_json.get("product_description")

            results.append({
                "company_name": company_info["title"],
                "stock_name": stock,
                "filing_time": filing_date,
                "new_product": new_product if new_product else "not mentioned",
                "product_description": product_description if product_description else "not mentioned"
            })

        except Exception as e:
            print(f" Error processing {file}: {e}")
            stock, filing_date = parse_filename(file)
            company_info = TICKER_DATA.get(stock, {"title": "Unknown", "cik": "N/A"})
            results.append({
                "company_name": company_info["title"],
                "stock_name": stock,
                "filing_time": filing_date,
                "new_product": "not mentioned",
                "product_description": "not mentioned"
            })

    # Save output to CSV
    with open(OUTPUT_CSV, "w", newline='', encoding="utf-8") as csvfile:
        fieldnames = ["company_name", "stock_name", "filing_time", "new_product", "product_description"]
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        for row in results:
            writer.writerow(row)

    print(f"\n Done! Processed {len(results)} filings. Saved to {OUTPUT_CSV}.")

# Run the script
if __name__ == "__main__":
    process_all_files()

 Processing: AAPL_8K_10_2024-05-02.txt
 Processing: AAPL_8K_1_2025-02-25.txt
 Processing: AAPL_8K_2_2025-01-30.txt
 Processing: AAPL_8K_3_2025-01-03.txt
 Processing: AAPL_8K_4_2024-10-31.txt
 Processing: AAPL_8K_5_2024-09-10.txt
 Processing: AAPL_8K_6_2024-08-26.txt
 Processing: AAPL_8K_7_2024-08-23.txt
 Processing: AAPL_8K_8_2024-08-01.txt
 Processing: AAPL_8K_9_2024-05-03.txt
 Processing: AMZN_8K_10_2023-11-01.txt
 Processing: AMZN_8K_1_2025-02-06.txt
 Processing: AMZN_8K_2_2024-10-31.txt
 Error processing AMZN_8K_2_2024-10-31.txt: Extra data: line 4 column 2 (char 103)
 Processing: AMZN_8K_3_2024-08-01.txt
 Processing: AMZN_8K_4_2024-05-24.txt
 Processing: AMZN_8K_5_2024-05-14.txt
 Error processing AMZN_8K_5_2024-05-14.txt: The markup you provided was rejected by the parser. Trying a different parser or a different encoding may help.

Original exception(s) from parser:
 AssertionError: expected name token at '<![(V#1(2\\Y32.RXI/Y('
 Processing: AMZN_8K_6_2024-05-03.txt
 Processing: 