In [1]:
!pip install nltk




In [6]:
import nltk

nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

Reading the file

In [3]:
with open("sample1.md", "r", encoding="utf-8") as f:
    text = f.read()

print("Total characters:", len(text))
print("\nPreview of text:\n")
print(text[:500])


Total characters: 110788

Preview of text:

(Mark One)

- ☒ QUARTERLY REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934

For the quarterly period ended June 28, 2025

or

- [ ] ☐ TRANSITION REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934

For the transition period from              to             .

Commission File Number:

001-36743

<!-- image -->

(Exact name of Registrant as specified in its charter)

California

(State or other jurisdiction of incorporation or organization)

O


Text cleaning

In [8]:
import re

def clean_text(text):
    # Remove HTML comments like <!-- image -->
    text = re.sub(r'<!--.*?-->', ' ', text)

    # Remove page numbers or isolated numbers
    text = re.sub(r'\n\d+\n', '\n', text)

    # Normalize whitespace (multiple spaces → single space)
    text = re.sub(r'\s+', ' ', text)

    # Keep important financial symbols
    text = re.sub(r'[^a-zA-Z0-9$%/., ]', '', text)

    return text.strip()

cleaned_text = clean_text(text)

print("Cleaned text preview:\n")
print(cleaned_text[:500])

Cleaned text preview:

Mark One   QUARTERLY REPORT PURSUANT TO SECTION 13 OR 15d OF THE SECURITIES EXCHANGE ACT OF 1934 For the quarterly period ended June 28, 2025 or     TRANSITION REPORT PURSUANT TO SECTION 13 OR 15d OF THE SECURITIES EXCHANGE ACT OF 1934 For the transition period from to . Commission File Number 00136743 Exact name of Registrant as specified in its charter California State or other jurisdiction of incorporation or organization One Apple Park Way Cupertino, California Address of principal executive


Tokenisation

In [9]:
from nltk.tokenize import word_tokenize

tokens = word_tokenize(cleaned_text)

print("Total tokens:", len(tokens))
print("Sample tokens:\n")
print(tokens[:30])


Total tokens: 11392
Sample tokens:

['Mark', 'One', 'QUARTERLY', 'REPORT', 'PURSUANT', 'TO', 'SECTION', '13', 'OR', '15d', 'OF', 'THE', 'SECURITIES', 'EXCHANGE', 'ACT', 'OF', '1934', 'For', 'the', 'quarterly', 'period', 'ended', 'June', '28', ',', '2025', 'or', 'TRANSITION', 'REPORT', 'PURSUANT']


Removing-stop words

In [11]:
from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))

filtered_tokens = [
    word for word in tokens
    if word.lower() not in stop_words
    and any(char.isalnum() for char in word)
]

print("Tokens after stopword removal:", len(filtered_tokens))
print(filtered_tokens[:30])

Tokens after stopword removal: 7206
['Mark', 'One', 'QUARTERLY', 'REPORT', 'PURSUANT', 'SECTION', '13', '15d', 'SECURITIES', 'EXCHANGE', 'ACT', '1934', 'quarterly', 'period', 'ended', 'June', '28', '2025', 'TRANSITION', 'REPORT', 'PURSUANT', 'SECTION', '13', '15d', 'SECURITIES', 'EXCHANGE', 'ACT', '1934', 'transition', 'period']


Stemming

In [12]:
from nltk.stem import PorterStemmer

stemmer = PorterStemmer()

stemmed_tokens = [stemmer.stem(word) for word in filtered_tokens]

print("Sample stemmed tokens:\n")
print(stemmed_tokens[:30])


Sample stemmed tokens:

['mark', 'one', 'quarterli', 'report', 'pursuant', 'section', '13', '15d', 'secur', 'exchang', 'act', '1934', 'quarterli', 'period', 'end', 'june', '28', '2025', 'transit', 'report', 'pursuant', 'section', '13', '15d', 'secur', 'exchang', 'act', '1934', 'transit', 'period']


Lemmatization

In [13]:
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

lemmatized_tokens = [lemmatizer.lemmatize(word) for word in filtered_tokens]

print("Sample lemmatized tokens:\n")
print(lemmatized_tokens[:30])


Sample lemmatized tokens:

['Mark', 'One', 'QUARTERLY', 'REPORT', 'PURSUANT', 'SECTION', '13', '15d', 'SECURITIES', 'EXCHANGE', 'ACT', '1934', 'quarterly', 'period', 'ended', 'June', '28', '2025', 'TRANSITION', 'REPORT', 'PURSUANT', 'SECTION', '13', '15d', 'SECURITIES', 'EXCHANGE', 'ACT', '1934', 'transition', 'period']


Comparision

In [16]:
for i in range(15):
    print(
        f"Original: {filtered_tokens[i]} | "
        f"Stemmed: {stemmed_tokens[i]} | "
        f"Lemmatized: {lemmatized_tokens[i]}"
    )

Original: Mark | Stemmed: mark | Lemmatized: Mark
Original: One | Stemmed: one | Lemmatized: One
Original: QUARTERLY | Stemmed: quarterli | Lemmatized: QUARTERLY
Original: REPORT | Stemmed: report | Lemmatized: REPORT
Original: PURSUANT | Stemmed: pursuant | Lemmatized: PURSUANT
Original: SECTION | Stemmed: section | Lemmatized: SECTION
Original: 13 | Stemmed: 13 | Lemmatized: 13
Original: 15d | Stemmed: 15d | Lemmatized: 15d
Original: SECURITIES | Stemmed: secur | Lemmatized: SECURITIES
Original: EXCHANGE | Stemmed: exchang | Lemmatized: EXCHANGE
Original: ACT | Stemmed: act | Lemmatized: ACT
Original: 1934 | Stemmed: 1934 | Lemmatized: 1934
Original: quarterly | Stemmed: quarterli | Lemmatized: quarterly
Original: period | Stemmed: period | Lemmatized: period
Original: ended | Stemmed: end | Lemmatized: ended
