In [1]:
print("NLP Text Preprocessing")

NLP Text Preprocessing


In [2]:
import nltk

In [3]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet') 

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\nicol\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\nicol\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\nicol\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [4]:
text = """
Page 1
The total revenue of the company was $10,000 in Q2 / 2025, which represents a 15% increase compared to the same period last year.
The operating margin improved from 12% to 18%, while expenses related to marketing and infrastructure grew by 5%.
In Q3 / 2025, net profit reached $4,200, driven by higher demand and improved cost efficiency.
Page 2
The company plans to invest $2,500 in research and development over the next 6 months to support long-term growth.
"""

In [5]:
import re

# Removing 'Page 1' and 'Page 2',etc
cleaned_text = re.sub(r'Page\s+\d+', '', text)

# Removing extra spaces and new lines
cleaned_text = re.sub(r'\s+', ' ', cleaned_text).strip()

print("CLEANED TEXT:")
print(cleaned_text)


CLEANED TEXT:
The total revenue of the company was $10,000 in Q2 / 2025, which represents a 15% increase compared to the same period last year. The operating margin improved from 12% to 18%, while expenses related to marketing and infrastructure grew by 5%. In Q3 / 2025, net profit reached $4,200, driven by higher demand and improved cost efficiency. The company plans to invest $2,500 in research and development over the next 6 months to support long-term growth.


In [6]:
from nltk.tokenize import word_tokenize

tokens = word_tokenize(cleaned_text)

print("TOKENS:")
print(tokens)

TOKENS:
['The', 'total', 'revenue', 'of', 'the', 'company', 'was', '$', '10,000', 'in', 'Q2', '/', '2025', ',', 'which', 'represents', 'a', '15', '%', 'increase', 'compared', 'to', 'the', 'same', 'period', 'last', 'year', '.', 'The', 'operating', 'margin', 'improved', 'from', '12', '%', 'to', '18', '%', ',', 'while', 'expenses', 'related', 'to', 'marketing', 'and', 'infrastructure', 'grew', 'by', '5', '%', '.', 'In', 'Q3', '/', '2025', ',', 'net', 'profit', 'reached', '$', '4,200', ',', 'driven', 'by', 'higher', 'demand', 'and', 'improved', 'cost', 'efficiency', '.', 'The', 'company', 'plans', 'to', 'invest', '$', '2,500', 'in', 'research', 'and', 'development', 'over', 'the', 'next', '6', 'months', 'to', 'support', 'long-term', 'growth', '.']


In [7]:
from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))

filtered_tokens = [
    word for word in tokens 
    if word.lower() not in stop_words
]

print("TOKENS AFTER REMOVING STOP WORDS:")
print(filtered_tokens)

TOKENS AFTER REMOVING STOP WORDS:
['total', 'revenue', 'company', '$', '10,000', 'Q2', '/', '2025', ',', 'represents', '15', '%', 'increase', 'compared', 'period', 'last', 'year', '.', 'operating', 'margin', 'improved', '12', '%', '18', '%', ',', 'expenses', 'related', 'marketing', 'infrastructure', 'grew', '5', '%', '.', 'Q3', '/', '2025', ',', 'net', 'profit', 'reached', '$', '4,200', ',', 'driven', 'higher', 'demand', 'improved', 'cost', 'efficiency', '.', 'company', 'plans', 'invest', '$', '2,500', 'research', 'development', 'next', '6', 'months', 'support', 'long-term', 'growth', '.']


In [8]:
from nltk.stem import PorterStemmer

stemmer = PorterStemmer()

stemmed_words = [stemmer.stem(word) for word in filtered_tokens]

print("STEMMING RESULT:")
print(stemmed_words)

STEMMING RESULT:
['total', 'revenu', 'compani', '$', '10,000', 'q2', '/', '2025', ',', 'repres', '15', '%', 'increas', 'compar', 'period', 'last', 'year', '.', 'oper', 'margin', 'improv', '12', '%', '18', '%', ',', 'expens', 'relat', 'market', 'infrastructur', 'grew', '5', '%', '.', 'q3', '/', '2025', ',', 'net', 'profit', 'reach', '$', '4,200', ',', 'driven', 'higher', 'demand', 'improv', 'cost', 'effici', '.', 'compani', 'plan', 'invest', '$', '2,500', 'research', 'develop', 'next', '6', 'month', 'support', 'long-term', 'growth', '.']


In [9]:
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

lemmatized_words = [lemmatizer.lemmatize(word) for word in filtered_tokens]

print("LEMMATIZATION RESULT:")
print(lemmatized_words)

LEMMATIZATION RESULT:
['total', 'revenue', 'company', '$', '10,000', 'Q2', '/', '2025', ',', 'represents', '15', '%', 'increase', 'compared', 'period', 'last', 'year', '.', 'operating', 'margin', 'improved', '12', '%', '18', '%', ',', 'expense', 'related', 'marketing', 'infrastructure', 'grew', '5', '%', '.', 'Q3', '/', '2025', ',', 'net', 'profit', 'reached', '$', '4,200', ',', 'driven', 'higher', 'demand', 'improved', 'cost', 'efficiency', '.', 'company', 'plan', 'invest', '$', '2,500', 'research', 'development', 'next', '6', 'month', 'support', 'long-term', 'growth', '.']


In [10]:
# Comparison:
# Stemming is faster but may distort financial terms.
# Lemmatization preserves correct word meaning, therefore is more accurate.
# Therefore, Lemmatization is more suitable for financial text analysis.