In [None]:
# task4_preprocessing.ipynb
# Kept as a comment: as a bare expression this line is parsed as attribute
# access on an undefined name and raises NameError when the cell is executed.


In [1]:
# Read the whole of output_text/docling.txt (decoded as UTF-8) into `text`.
with open("output_text/docling.txt", mode="r", encoding="utf-8") as source_file:
    text = source_file.read()

# Preview only the first 500 characters rather than dumping the full document.
print(text[0:500])


## UNITED STATES SECURITIES AND EXCHANGE COMMISSION

Washington, D.C. 20549

## FORM 10-K

(Mark One)

☒ ANNUAL REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934

For the fiscal year ended September 28, 2024

or

- [ ] ☐ TRANSITION REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934

For the transition period from              to             .

Commission File Number:

001-36743

## Apple Inc.

(Exact name of Registrant as specified in its ch


In [2]:
import re

# Two-step normalisation, applied inside-out:
#   1. inner call: every character that is not an ASCII letter, a digit,
#      "$", "%", "/", "." or whitespace is replaced by a single space;
#   2. outer call: each run of whitespace (newlines included) is collapsed
#      into one space.
# NOTE(review): step 1 also replaces "-" and ",", so "10-K" becomes "10 K"
# and comma-grouped numbers would be split into pieces; confirm this is intended.
clean_text = re.sub(
    r"\s+",
    " ",
    re.sub(r"[^A-Za-z0-9$%/.\s]", " ", text),
)

# Preview the first 500 characters of the cleaned text.
print(clean_text[0:500])


 UNITED STATES SECURITIES AND EXCHANGE COMMISSION Washington D.C. 20549 FORM 10 K Mark One ANNUAL REPORT PURSUANT TO SECTION 13 OR 15 d OF THE SECURITIES EXCHANGE ACT OF 1934 For the fiscal year ended September 28 2024 or TRANSITION REPORT PURSUANT TO SECTION 13 OR 15 d OF THE SECURITIES EXCHANGE ACT OF 1934 For the transition period from to . Commission File Number 001 36743 Apple Inc. Exact name of Registrant as specified in its charter California 94 2404110 State or other jurisdiction of inco


In [5]:
import spacy

# Load the "en_core_web_sm" spaCy pipeline and run it over the cleaned text.
# `doc` is reused by later cells for stop-word filtering and lemmatisation.
nlp = spacy.load("en_core_web_sm")
doc = nlp(clean_text)

# Surface form of every token in document order (whitespace tokens included).
tokens = [tok.text for tok in doc]

# Display the first 20 tokens as the cell result.
tokens[0:20]


[' ',
 'UNITED',
 'STATES',
 'SECURITIES',
 'AND',
 'EXCHANGE',
 'COMMISSION',
 'Washington',
 'D.C.',
 '20549',
 'FORM',
 '10',
 'K',
 'Mark',
 'One',
 'ANNUAL',
 'REPORT',
 'PURSUANT',
 'TO',
 'SECTION']

In [6]:
# Keep the text of each token that is neither flagged as a stop word
# nor as whitespace by spaCy.
filtered_tokens = [
    tok.text
    for tok in doc
    if not (tok.is_stop or tok.is_space)
]

# Display the first 20 retained tokens as the cell result.
filtered_tokens[0:20]


['UNITED',
 'STATES',
 'SECURITIES',
 'EXCHANGE',
 'COMMISSION',
 'Washington',
 'D.C.',
 '20549',
 'FORM',
 '10',
 'K',
 'Mark',
 'ANNUAL',
 'REPORT',
 'PURSUANT',
 'SECTION',
 '13',
 '15',
 'd',
 'SECURITIES']

In [7]:
from nltk.stem import PorterStemmer

# Apply NLTK's Porter stemmer to every filtered token, preserving order so
# that `stemmed[i]` corresponds to `filtered_tokens[i]`.
stemmer = PorterStemmer()
stemmed = list(map(stemmer.stem, filtered_tokens))

# Display the first 20 stems as the cell result.
stemmed[0:20]


['unit',
 'state',
 'secur',
 'exchang',
 'commiss',
 'washington',
 'd.c.',
 '20549',
 'form',
 '10',
 'k',
 'mark',
 'annual',
 'report',
 'pursuant',
 'section',
 '13',
 '15',
 'd',
 'secur']

In [8]:
# Lower-cased, whitespace-trimmed spaCy lemma of each token, using the same
# filter as `filtered_tokens` (drop stop words and whitespace tokens) so the
# two lists line up index by index.
lemmatized = [
    tok.lemma_.lower().strip()
    for tok in doc
    if not (tok.is_stop or tok.is_space)
]

# Display the first 20 lemmas as the cell result.
lemmatized[0:20]


['united',
 'states',
 'securities',
 'exchange',
 'commission',
 'washington',
 'd.c.',
 '20549',
 'form',
 '10',
 'k',
 'mark',
 'annual',
 'report',
 'pursuant',
 'section',
 '13',
 '15',
 'd',
 'securities']

In [9]:
# Side-by-side comparison of the first 15 filtered tokens with their stem and
# lemma, printed as "token → stem / lemma".
# zip() over a slice stops at the shortest input, so a document with fewer
# than 15 filtered tokens prints what is available instead of raising
# IndexError as fixed-range indexing would.
for original, stem, lemma in zip(filtered_tokens[:15], stemmed, lemmatized):
    print(original, "→", stem, "/", lemma)


UNITED → unit / united
STATES → state / states
SECURITIES → secur / securities
EXCHANGE → exchang / exchange
COMMISSION → commiss / commission
Washington → washington / washington
D.C. → d.c. / d.c.
20549 → 20549 / 20549
FORM → form / form
10 → 10 / 10
K → k / k
Mark → mark / mark
ANNUAL → annual / annual
REPORT → report / report
PURSUANT → pursuant / pursuant
