# Test the Environment

In [2]:
import sys
print(sys.executable)

/kellogg/software/envs/llm-test-env/bin/python


In [3]:
import torch
print(torch.cuda.is_available())

True


# Load and Clean 10-K documents

In [4]:
from datasets import Dataset
from pathlib import Path
import re

In [5]:
def clean_html(html):
    # First we remove inline JavaScript/CSS:
    cleaned = re.sub(r"(?is)<(script|style).*?>.*?(</\1>)", "", html.strip())
    
    # Then we remove html comments. This has to be done before removing regular
    # tags since comments can contain '>' characters.
    cleaned = re.sub(r"(?s)<!--(.*?)-->[\n]?", "", cleaned)
    
    # Next we can remove the remaining tags:
    cleaned = re.sub(r"(?s)<.*?>", " ", cleaned)
    
    # Finally, we deal with whitespace
    cleaned = re.sub(r"&nbsp;", " ", cleaned)
    cleaned = re.sub(r"  ", " ", cleaned)
    cleaned = re.sub(r"  ", " ", cleaned)
    return cleaned.strip()

In [6]:
def extract_mda(text):
    mda_text = None
    
    # obtain the second occurrence of "Discussion and Analysis of Financial Condition" with wildcards
    pattern = r"Discussion[\s,.-]*and[\s,.-]*Analysis[\s,.-]*of[\s,.-]*Financial[\s,.-]*Condition"
    mda_matches = list(re.finditer(pattern, text, re.IGNORECASE))
    if len(mda_matches) >= 2:
        m = mda_matches[1]
        mda_text = text[m.end():]
        return " ".join(mda_text.split()[:250])
    return mda_text

In [7]:
path = Path("/kellogg/data/EDGAR/10-K/2023")
docs = list(path.glob("*.txt"))
docs.sort()

data_dict = {'doc': [], 'text': []}
for d in docs[:10]:
    print(f"processing: {d.name}")
    text = clean_html(d.read_text())
    mda_text = extract_mda(text)
    data_dict['doc'].append(d.name)
    data_dict['text'].append(mda_text)

dataset_10k = Dataset.from_dict(data_dict)

processing: 1000045_2_0000950170-23-030037.txt
processing: 1000209_1_0000950170-23-007273.txt
processing: 1000228_1_0001000228-23-000011.txt
processing: 1000229_1_0000950170-23-002412.txt
processing: 1000230_4_0001437749-23-034978.txt
processing: 1000298_1_0001558370-23-004051.txt
processing: 1000623_1_0001000623-23-000021.txt
processing: 1000683_2_0001213900-23-030312.txt
processing: 1000694_1_0001000694-23-000005.txt
processing: 1000697_1_0001193125-23-050827.txt


# Load Summarization Pipeline

In [8]:
import os
os.environ['HF_HOME'] = "/projects/kellogg/.cache"

In [9]:
import huggingface_hub as hub

from dotenv import dotenv_values
config = dotenv_values("/home/wkt406/.env")
hub.login(token=config["HUGGINGFACE_TOKEN"])

Token has not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /tmp/wkt406-jupyter//xdg_cache_home/huggingface/token
Login successful


In [10]:
from transformers import pipeline

summarizer = pipeline("summarization", model="Falconsai/text_summarization")

# Process Dataset

In [11]:
dataset_10k = dataset_10k.map(
    lambda batch: {"summary": 
                   summarizer(batch['text'], 
                              max_length=50, 
                              min_length=30, 
                              do_sample=False,
                              truncation=True,
                             )
                  }, 
    batched=True,
)

Map:   0%|          | 0/10 [00:00<?, ? examples/s]

In [12]:
dataset_10k.to_dict()['text'][0]

'and Results of Operations.&#x201d; &#160; In addition, the spread of COVID-19 has caused us to modify our business practices (including restricting employee travel, developing social distancing plans for our employees and cancelling physical participation in meetings, events and conferences), and we may take further actions as may be required by government authorities or as we determine is in the best interests of our employees, partners and customers. The outbreak has adversely impacted and may further adversely impact our workforce and operations and the operations of our partners, customers, suppliers and third-party vendors, throughout the time period during which the spread of COVID-19 continues and related restrictions remain in place, and even after the COVID-19 outbreak has subsided. &#160; Even after the COVID-19 outbreak has subsided and despite the formal declaration of the end of the COVID-19 global health emergency by the World Health Organization in May 2023, our busines

In [13]:
dataset_10k.to_dict()['summary'][0]

{'summary_text': 'the spread of COVID-19 has caused us to modify our business practices . The outbreak has adversely impacted and may further adversely impact our workforce and operations and the operations of our partners, customers, suppliers and third-party vendors'}