In [1]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [2]:
import json
from google.colab import drive

# Mount Drive
drive.mount('/content/drive')

# Function to clean JSONL
def clean_jsonl(input_file, output_file):
    clean_data = []
    with open(input_file, "r") as f:
        for line in f:
            doc = json.loads(line)
            clean_data.append({
                "text": doc["text"],
                "labels": doc["labels"]
            })
    with open(output_file, "w") as f:
        for item in clean_data:
            f.write(json.dumps(item) + "\n")
    print(f"Cleaned file saved to {output_file}")

# Corrected file paths
fiqa_path = "/content/drive/MyDrive/Financial_Insight/fiqa_annotated.jsonl"  # space after fiqa
phrasebank_path = "/content/drive/MyDrive/Financial_Insight/phrasebank_annotated.jsonl"

# Clean files
clean_jsonl(fiqa_path, "/content/fiqa_clean.jsonl")
clean_jsonl(phrasebank_path, "/content/phrasebank_clean.jsonl")

# Combine into master JSONL
combined_data = []

for file in ["/content/fiqa_clean.jsonl", "/content/phrasebank_clean.jsonl"]:
    with open(file, "r") as f:
        for line in f:
            combined_data.append(json.loads(line))

# Save combined master JSONL
master_file = "/content/fiqa_phrasebank_master.jsonl"
with open(master_file, "w") as f:
    for item in combined_data:
        f.write(json.dumps(item) + "\n")

print(f"Combined master JSONL saved to {master_file}")
print(f"Total annotated paragraphs: {len(combined_data)}")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Cleaned file saved to /content/fiqa_clean.jsonl
Cleaned file saved to /content/phrasebank_clean.jsonl
Combined master JSONL saved to /content/fiqa_phrasebank_master.jsonl
Total annotated paragraphs: 150


In [3]:
import json

# Load FIQA + Phrasebank combined JSONL
master_file = '/content/fiqa_phrasebank_master.jsonl'
data = []

with open(master_file, 'r') as f:
    for line in f:
        data.append(json.loads(line))

print(f'Total paragraphs: {len(data)}')


Total paragraphs: 150


In [4]:
valid_labels = {
    'ORG', 'METRIC', 'VALUE', 'DATE', 'EVENT', 'ACCOUNT'
}
from collections import Counter

def validate_ner_dataset_span(data, valid_labels):
    errors = []
    label_counts = Counter()

    for idx, sample in enumerate(data):
        spans = sample.get('labels', [])
        text = sample.get("text", "")

        for span in spans:
            if len(span) != 3:
                errors.append(f"Malformed label at index {idx}: {span}")
                continue

            start, end, label = span

            if label not in valid_labels:
                errors.append(f"Invalid label '{label}' at index {idx}")

            if not (0 <= start < end <= len(text)):
                errors.append(f"Invalid span range at index {idx}: {span}")

            label_counts[label] += 1

    return errors, label_counts
errors, label_counts = validate_ner_dataset_span(data, valid_labels)

print("Validation errors:", errors[:10])
print("Label distribution:", label_counts)



Validation errors: []
Label distribution: Counter({'ORG': 345, 'VALUE': 332, 'DATE': 288, 'METRIC': 59, 'EVENT': 4})


In [5]:
from transformers import AutoTokenizer

model_name = "yiyanghkust/finbert-pretrain"
tokenizer = AutoTokenizer.from_pretrained(model_name)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/359 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

In [6]:
# BIO label set (keep this cell ABOVE everything)
BIO_LABELS = {
    'O',
    'B-ORG','I-ORG',
    'B-VALUE','I-VALUE',
    'B-DATE','I-DATE',
    'B-METRIC','I-METRIC',
    'B-EVENT','I-EVENT',
    'B-ACCOUNT','I-ACCOUNT'
}

label_list = sorted(BIO_LABELS)
label2id = {label: i for i, label in enumerate(label_list)}
id2label = {i: label for label, i in label2id.items()}

print(label2id)


{'B-ACCOUNT': 0, 'B-DATE': 1, 'B-EVENT': 2, 'B-METRIC': 3, 'B-ORG': 4, 'B-VALUE': 5, 'I-ACCOUNT': 6, 'I-DATE': 7, 'I-EVENT': 8, 'I-METRIC': 9, 'I-ORG': 10, 'I-VALUE': 11, 'O': 12}


In [7]:
def tokenize_and_align_labels_from_text(example, tokenizer, label2id):
    text = example["text"]
    spans = example["labels"]

    encoding = tokenizer(
        text,
        return_offsets_mapping=True,
        truncation=True,
        padding="max_length",
        max_length=128
    )

    offset_mapping = encoding["offset_mapping"]
    labels = [-100] * len(offset_mapping)

    for start, end, label in spans:
        first_token = True
        for idx, (token_start, token_end) in enumerate(offset_mapping):
            if token_start == token_end == 0:
                continue
            if token_start < end and token_end > start:
                if first_token:
                    labels[idx] = label2id[f"B-{label}"]
                    first_token = False
                else:
                    labels[idx] = label2id[f"I-{label}"]

    encoding["labels"] = labels
    encoding.pop("offset_mapping")
    return encoding


In [12]:
from sklearn.model_selection import train_test_split

assert len(data) > 0, "Data not loaded!"

train_data, val_data = train_test_split(
    data,
    test_size=0.2,
    random_state=42,
    shuffle=True
)



In [13]:
train_ds = [tokenize_and_align_labels_from_text(ex, tokenizer, label2id)
            for ex in train_data]

val_ds   = [tokenize_and_align_labels_from_text(ex, tokenizer, label2id)
            for ex in val_data]


In [14]:
print(len(train_ds), len(val_ds))
print(train_ds[0].keys())


120 30
KeysView({'input_ids': [3, 4683, 14477, 21087, 4364, 2537, 615, 214, 21, 40, 1872, 19, 2180, 26, 21087, 4642, 44, 5674, 1276, 1483, 223, 41, 5674, 58, 797, 115, 4683, 14477, 4642, 60, 96, 585, 15, 5674, 58, 419, 185, 78, 2040, 85, 6, 939, 7, 2267, 491, 725, 11, 1025, 126, 582, 14, 23293, 333, 8, 2614, 765, 71, 30, 14, 2537, 543, 48, 796, 2040, 17, 15, 1043, 412, 635, 10, 6, 39, 228, 333, 2267, 63, 585, 2537, 615, 193, 585, 12440, 765, 29, 9, 2149, 5565, 14, 1214, 582, 379, 7334, 33, 440, 48, 26, 5272, 9, 2614, 8, 23293, 71, 262, 30, 1819, 9, 2537, 543, 48, 41, 5674, 58, 191, 14, 11, 2149, 9, 25, 10716, 333, 60, 6, 2267, 49, 4929, 139, 73, 4], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [15]:
sample = train_ds[0]

tokens = tokenizer.convert_ids_to_tokens(sample["input_ids"])
labels = [id2label[l] if l != -100 else "IGN" for l in sample["labels"]]

for t, l in zip(tokens, labels):
    print(f"{t:15} {l}")


[CLS]           IGN
van             IGN
##guard         IGN
etf             IGN
vs              IGN
mutual          IGN
fund            IGN
where           IGN
are             IGN
you             IGN
planning        IGN
on              IGN
buying          IGN
this            IGN
etf             IGN
?               IGN
i               IGN
'               IGN
m               IGN
guess           IGN
##ing           IGN
it              IGN
'               IGN
s               IGN
directly        IGN
through         IGN
van             B-ORG
##guard         I-ORG
?               IGN
if              IGN
so              IGN
,               IGN
that            IGN
'               IGN
s               IGN
likely          IGN
your            IGN
first           IGN
reason          IGN
-               IGN
the             IGN
majority        IGN
of              IGN
brokerage       IGN
accounts        IGN
charge          IGN
a               IGN
commission      IGN
per             IGN
trade           

In [16]:
from datasets import Dataset

train_dataset = Dataset.from_list(train_ds)
val_dataset   = Dataset.from_list(val_ds)

print(train_dataset)
print(val_dataset)


Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
    num_rows: 120
})
Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
    num_rows: 30
})


In [17]:
from transformers import AutoModelForTokenClassification

model = AutoModelForTokenClassification.from_pretrained(
    "yiyanghkust/finbert-pretrain",
    num_labels=len(label2id),
    id2label=id2label,
    label2id=label2id
)

print(model.config.num_labels)




pytorch_model.bin:   0%|          | 0.00/442M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/442M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at yiyanghkust/finbert-pretrain and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


13


In [18]:
!pip install seqeval


Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16162 sha256=94353e60ce8151c274f176544c35f9344f65dc71451adff5608a8f741a3bee0d
  Stored in directory: /root/.cache/pip/wheels/5f/b8/73/0b2c1a76b701a677653dd79ece07cfabd7457989dbfbdcd8d7
Successfully built seqeval
Installing collected packages: seqeval
Successfully installed seqeval-1.2.2


In [19]:
import numpy as np
from seqeval.metrics import precision_score, recall_score, f1_score


In [20]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    true_labels = []
    true_preds = []

    for pred, lab in zip(predictions, labels):
        curr_labels = []
        curr_preds = []
        for p, l in zip(pred, lab):
            if l != -100:
                curr_labels.append(id2label[l])
                curr_preds.append(id2label[p])
        true_labels.append(curr_labels)
        true_preds.append(curr_preds)

    return {
        "precision": precision_score(true_labels, true_preds),
        "recall": recall_score(true_labels, true_preds),
        "f1": f1_score(true_labels, true_preds)
    }


In [21]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./finbert_ner",
    do_train=True,
    do_eval=True,
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    weight_decay=0.01,
    logging_steps=10,
    save_total_limit=1,
    report_to="none"
)




In [22]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)


  trainer = Trainer(


In [23]:
trainer.train()




Step,Training Loss
10,2.2146
20,1.5402
30,1.1488
40,0.885
50,0.6701
60,0.5711
70,0.4994


TrainOutput(global_step=75, training_loss=1.0384545040130615, metrics={'train_runtime': 1113.073, 'train_samples_per_second': 0.539, 'train_steps_per_second': 0.067, 'total_flos': 39198411417600.0, 'train_loss': 1.0384545040130615, 'epoch': 5.0})

In [24]:
trainer.evaluate()




{'eval_loss': 0.6048870086669922,
 'eval_precision': 0.6853146853146853,
 'eval_recall': 0.7205882352941176,
 'eval_f1': 0.7025089605734767,
 'eval_runtime': 13.5692,
 'eval_samples_per_second': 2.211,
 'eval_steps_per_second': 0.295,
 'epoch': 5.0}

In [25]:
from seqeval.metrics import classification_report
import numpy as np

def entity_wise_report(trainer, dataset):
    preds, labels, _ = trainer.predict(dataset)
    preds = np.argmax(preds, axis=-1)

    true_labels = []
    true_preds = []

    for p, l in zip(preds, labels):
        curr_labels = []
        curr_preds = []
        for pi, li in zip(p, l):
            if li != -100:
                curr_labels.append(id2label[li])
                curr_preds.append(id2label[pi])
        true_labels.append(curr_labels)
        true_preds.append(curr_preds)

    print(classification_report(true_labels, true_preds))


In [26]:
entity_wise_report(trainer, val_dataset)


              precision    recall  f1-score   support

        DATE       0.75      0.86      0.80        28
      METRIC       1.00      0.56      0.71         9
         ORG       0.70      0.73      0.71        55
       VALUE       0.59      0.66      0.62        44

   micro avg       0.69      0.72      0.70       136
   macro avg       0.76      0.70      0.71       136
weighted avg       0.70      0.72      0.70       136



In [27]:
def show_errors(trainer, dataset, n=10):
    preds, labels, _ = trainer.predict(dataset)
    preds = np.argmax(preds, axis=-1)

    for i in range(len(dataset)):
        tokens = tokenizer.convert_ids_to_tokens(dataset[i]["input_ids"])
        for t, p, l in zip(tokens, preds[i], labels[i]):
            if l != -100 and p != l:
                print(f"{t:15} true={id2label[l]} pred={id2label[p]}")
                n -= 1
                if n == 0:
                    return


In [28]:
show_errors(trainer, val_dataset)


about           true=B-DATE pred=B-VALUE
45              true=I-DATE pred=B-DATE
belt            true=B-ORG pred=I-ORG
5000            true=B-VALUE pred=I-VALUE
65              true=I-DATE pred=I-VALUE
age             true=B-DATE pred=I-DATE
2011            true=B-VALUE pred=I-DATE
dollars         true=I-VALUE pred=I-DATE
$               true=B-VALUE pred=I-VALUE
20              true=I-VALUE pred=B-VALUE


In [29]:
def predict_ner(text):
    enc = tokenizer(text, return_tensors="pt", truncation=True)
    outputs = model(**enc)
    preds = outputs.logits.argmax(dim=-1).squeeze().tolist()
    tokens = tokenizer.convert_ids_to_tokens(enc["input_ids"].squeeze())

    return [(t, id2label[p]) for t, p in zip(tokens, preds)]


In [30]:
predict_ner("Apple reported revenue of $97.3B in Q4 2023.")


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


[('[CLS]', 'I-DATE'),
 ('apple', 'B-ORG'),
 ('reported', 'I-ORG'),
 ('revenue', 'B-ORG'),
 ('of', 'B-DATE'),
 ('$', 'I-VALUE'),
 ('97', 'B-VALUE'),
 ('.', 'I-VALUE'),
 ('3b', 'I-VALUE'),
 ('in', 'I-DATE'),
 ('q4', 'B-DATE'),
 ('2023', 'I-DATE'),
 ('.', 'I-VALUE'),
 ('[SEP]', 'I-DATE')]

In [31]:
import torch

def predict_ner(text, tokenizer, model, id2label):
    model.eval()

    encoding = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        max_length=128
    )

    with torch.no_grad():
        outputs = model(**encoding)

    predictions = outputs.logits.argmax(dim=-1).squeeze().tolist()
    tokens = tokenizer.convert_ids_to_tokens(encoding["input_ids"].squeeze())

    results = []
    for token, pred in zip(tokens, predictions):
        label = id2label[pred]
        if label != "O":
            results.append((token, label))

    return results


In [32]:
text = "Apple reported revenue of $97.3B in Q4 2023."

predict_ner(text, tokenizer, model, id2label)


[('[CLS]', 'I-DATE'),
 ('apple', 'B-ORG'),
 ('reported', 'I-ORG'),
 ('revenue', 'B-ORG'),
 ('of', 'B-DATE'),
 ('$', 'I-VALUE'),
 ('97', 'B-VALUE'),
 ('.', 'I-VALUE'),
 ('3b', 'I-VALUE'),
 ('in', 'I-DATE'),
 ('q4', 'B-DATE'),
 ('2023', 'I-DATE'),
 ('.', 'I-VALUE'),
 ('[SEP]', 'I-DATE')]

In [33]:
def pretty_print_ner(text):
    preds = predict_ner(text, tokenizer, model, id2label)
    for token, label in preds:
        print(f"{token:15} → {label}")


In [34]:
!pip install pdfplumber transformers torch    #Milestone4


Collecting pdfplumber
  Downloading pdfplumber-0.11.9-py3-none-any.whl.metadata (43 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
Collecting pdfminer.six==20251230 (from pdfplumber)
  Downloading pdfminer_six-20251230-py3-none-any.whl.metadata (4.3 kB)
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Downloading pypdfium2-5.3.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (67 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.8/67.8 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
Downloading pdfplumber-0.11.9-py3-none-any.whl (60 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.0/60.0 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pdfminer_six-20251230-py3-none-any.whl (6.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [35]:
import pdfplumber
import re
import json


In [38]:
from google.colab import files
files.upload()


Output hidden; open in https://colab.research.google.com to view.

In [40]:
doc_text = pdf_to_text("/content/annual_report(milestone4).pdf")
print(doc_text[:1000])


UNITED STATES
SECURITIES AND EXCHANGE COMMISSION
Washington, D.C. 20549
FORM 10-K
(Mark One)
☒ ANNUAL REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934
For the fiscal year ended September 27, 2025
or
☐ TRANSITION REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934
For the transition period from to .
Commission File Number: 001-36743
Apple Inc.
(Exact name of Registrant as specified in its charter)
California 94-2404110
(State or other jurisdiction (I.R.S. Employer Identification No.)
of incorporation or organization)
One Apple Park Way
Cupertino, California 95014
(Address of principal executive offices) (Zip Code)
(408) 996-1010
(Registrant’s telephone number, including area code)
Securities registered pursuant to Section 12(b) of the Act:
Trading
Title of each class symbol(s) Name of each exchange on which registered
Common Stock, $0.00001 par value per share AAPL The Nasdaq Stock Market LLC
0.000% Notes due 2025 — The Nasdaq Stock

In [74]:
import pdfplumber

def pdf_to_text(pdf_path):
    text = ""
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            page_text = page.extract_text()
            if page_text:
                text += page_text + "\n"
    return text


raw_text = pdf_to_text("annual_report(milestone4).pdf")  # change path
print("Characters extracted:", len(raw_text))


Characters extracted: 273013


In [75]:
SECTION_HEADERS = {
    "MD&A": [
        "item 7. management’s discussion",
        "management’s discussion and analysis"
    ],
    "Market Risk": [
        "item 7a. quantitative and qualitative disclosures about market risk"
    ],
    "Risk Factors": [
        "item 1a. risk factors"
    ],
    "Financial Statements": [
        "item 8. financial statements"
    ]
}





In [76]:
def detect_sections(text, headers):
    sections = {}
    current_section = "Unknown"
    sections[current_section] = []

    for line in text.split("\n"):
        for section, keywords in headers.items():
            if any(k.lower() in line.lower() for k in keywords):
                current_section = section
                sections.setdefault(current_section, [])
        sections[current_section].append(line)

    return sections


sections = detect_sections(raw_text, SECTION_HEADERS)
print("Detected sections:", sections.keys())


Detected sections: dict_keys(['Unknown', 'Risk Factors', 'MD&A', 'Market Risk', 'Financial Statements'])


In [77]:
def store_section_text(sections):
    section_texts = {}
    for section, lines in sections.items():
        clean_lines = [l.strip() for l in lines if l.strip()]
        section_texts[section] = "\n".join(clean_lines)
    return section_texts


section_texts = store_section_text(sections)

print(section_texts.get("MD&A", "")[:400])





Item 7. Management’s Discussion and Analysis of Financial Condition and Results of Operations 21
“Management’s Discussion and Analysis of Financial Condition and Results of Operations.” Forward-looking statements
provide current expectations of future events based on certain assumptions and include any statement that does not directly
relate to any historical or current fact. For example, statemen


In [78]:
import re

VALUE_PATTERN = re.compile(
    r'(\$|₹)?\s?\d+(?:\.\d+)?\s?(million|billion|mn|bn|crore|lakh|%)?',
    re.IGNORECASE
)

PERIOD_PATTERN = re.compile(
    r'(FY\s?\d{4}|Q[1-4]\s?\d{4}|quarter\s?ended|year\s?ended|\d{4})',
    re.IGNORECASE
)

METRIC_KEYWORDS = [
    "revenue",
    "net income",
    "profit",
    "loss",
    "cash flow",
    "earnings",
    "ebitda"
]


In [79]:
def extract_value_with_unit(text):
    match = VALUE_PATTERN.search(text)
    if match:
        return match.group().strip()
    return None


In [80]:
def extract_period(text):
    match = PERIOD_PATTERN.search(text)
    if match:
        return match.group().strip()
    return None


In [81]:
def apply_ner(section_text, section_name):
    records = []

    for line in section_text.split("\n"):
        l = line.lower()

        for metric in METRIC_KEYWORDS:
            if metric in l:
                value = extract_value_with_unit(line)
                period = extract_period(line)

                records.append({
                    "metric": metric,
                    "value": value,          # includes unit
                    "period": period,        # FY / Q / year
                    "text": line.strip(),
                    "section": section_name,
                    "type": "quantitative" if value else "qualitative"
                })
                break

    return records


In [82]:
NER_SECTIONS = ["MD&A", "Market Risk"]

ner_outputs = []

for section in NER_SECTIONS:
    if section in section_texts:
        ner_outputs.extend(
            apply_ner(section_texts[section], section)
        )

print("NER records extracted:", len(ner_outputs))


NER records extracted: 36


In [83]:
def is_table_line(line):
    return sum(c.isdigit() for c in line) > 10


def detect_tables(text):
    tables = []
    current = []

    for line in text.split("\n"):
        if is_table_line(line):
            current.append(line)
        else:
            if current:
                tables.append(current)
                current = []

    if current:
        tables.append(current)

    return tables


raw_tables = detect_tables(raw_text)
print("Tables detected:", len(raw_tables))


Tables detected: 220


In [84]:
def parse_table(table_lines, table_type="Unknown"):
    rows = []
    for line in table_lines:
        parts = line.split()
        if len(parts) >= 2:
            rows.append({
                "item": " ".join(parts[:-1]),
                "value": parts[-1]
            })
    return {
        "table_type": table_type,
        "rows": rows
    }


parsed_tables = [parse_table(t) for t in raw_tables]


In [89]:
final_output = {
    "company": "APPLE",
    "period": "UNKNOWN",
    "metrics": ner_outputs,
    "tables": parsed_tables
}


In [86]:
def validate(doc):
    assert "metrics" in doc
    assert "tables" in doc
    for m in doc["metrics"]:
        assert "section" in m
    print("Validation passed ✅")


validate(final_output)


Validation passed ✅


In [87]:
import json

with open("milestone_4_output.json", "w") as f:
    json.dump(final_output, f, indent=4)

print("Final output saved ✅")


Final output saved ✅


In [90]:
import json

output_file = "milestone_4_sample_output.json"

with open(output_file, "w") as f:
    json.dump(final_output, f, indent=4)

print("Saved output file:", output_file)
print("\n--- SAMPLE OUTPUT ---\n")
print(json.dumps(final_output, indent=4))


Saved output file: milestone_4_sample_output.json

--- SAMPLE OUTPUT ---

{
    "company": "APPLE",
    "period": "UNKNOWN",
    "metrics": [
        {
            "metric": "loss",
            "value": null,
            "period": null,
            "text": "services, and in many cases additional coverage for instances of accidental damage or theft and loss, depending on the country",
            "section": "MD&A",
            "type": "qualitative"
        },
        {
            "metric": "profit",
            "value": null,
            "period": null,
            "text": "provide products and services at little or no profit or even at a loss. The Company has a minority market share in the global",
            "section": "MD&A",
            "type": "qualitative"
        },
        {
            "metric": "revenue",
            "value": null,
            "period": null,
            "text": "and regions where the Company derives a significant portion of its revenues and/or has significa