In [22]:
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [26]:
# Import the necessary libraries
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from textblob import TextBlob # Used for simple spelling correction

corpus_content = """
The quik brown fox jumpd over the lazy dog. Dogs r often seen as man's best friend. I am going to the store right now to purchas some milk and braed. Machine learning is a feild of study that gives computrs the ability to learn without being explicity programd. This is the fourth and final sentence.
"""
# Create and write the corpus content to 'file.txt'
with open('file.txt', 'w') as f:
    f.write(corpus_content.strip())

# Load the text corpus to a variable
with open('file.txt', 'r') as f:
    text_corpus = f.read()

print("✅ Libraries imported and 'file.txt' loaded to 'text_corpus' variable.")

✅ Libraries imported and 'file.txt' loaded to 'text_corpus' variable.


In [27]:
# Apply the tokenization process to the text corpus
tokens = word_tokenize(text_corpus)

# Print the first 30 tokens
print("-------Tokens (First 30)---------")
print(tokens[:30])

-------Tokens (First 30)---------
['The', 'quik', 'brown', 'fox', 'jumpd', 'over', 'the', 'lazy', 'dog', '.', 'Dogs', 'r', 'often', 'seen', 'as', 'man', "'s", 'best', 'friend', '.', 'I', 'am', 'going', 'to', 'the', 'store', 'right', 'now', 'to', 'purchas']


In [29]:
# Apply spelling correction on each token
corrected_tokens = []
corrected_text = []

# TextBlob is often more effective for whole-text correction
blob = TextBlob(text_corpus)
corrected_text_corpus = str(blob.correct())

# Re-tokenize the corrected text for subsequent steps
corrected_tokens = word_tokenize(corrected_text_corpus)

# Print the initial 10 corrected tokens
print("-------- Corrected Tokens (First 10) -------")
print(corrected_tokens[:10])

# Print the corrected text corpus
print("\n--------- Corrected Text Corpus ----------")
print(corrected_text_corpus)

-------- Corrected Tokens (First 10) -------
['The', 'quick', 'brown', 'fox', 'jumped', 'over', 'the', 'lazy', 'dog', '.']

--------- Corrected Text Corpus ----------
The quick brown fox jumped over the lazy dog. Dogs r often seen as man's best friend. I am going to the store right now to purchase some milk and bread. Machine learning is a field of study that gives computers the ability to learn without being explicitly program. His is the fourth and final sentence.


In [30]:
# Apply POS tags to each corrected token
pos_tags = nltk.tag.pos_tag(corrected_tokens)

print("------- POS Tags (First 15)-----------")
print(pos_tags[:15])

------- POS Tags (First 15)-----------
[('The', 'DT'), ('quick', 'JJ'), ('brown', 'NN'), ('fox', 'NN'), ('jumped', 'VBD'), ('over', 'IN'), ('the', 'DT'), ('lazy', 'JJ'), ('dog', 'NN'), ('.', '.'), ('Dogs', 'NNP'), ('r', 'NN'), ('often', 'RB'), ('seen', 'VBN'), ('as', 'IN')]


In [35]:
# Remove stop words from the corrected token list
# Get the English stop words list
english_stop_words = set(stopwords.words('english'))

# Filter the corrected tokens
filtered_tokens = [token for token in corrected_tokens if token.lower() not in english_stop_words and token.isalpha()]

# Print the initial 20 tokens
print("---------Filtered Tokens (Stop Words Removed - First 20)--------------")
print(filtered_tokens[:20])

---------Filtered Tokens (Stop Words Removed - First 20)--------------
['quick', 'brown', 'fox', 'jumped', 'lazy', 'dog', 'Dogs', 'r', 'often', 'seen', 'man', 'best', 'friend', 'going', 'store', 'right', 'purchase', 'milk', 'bread', 'Machine']


In [32]:
# Apply stemming and lemmatization to the corrected token list

# Initialize the stemmer and lemmatizer
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

stemmed_tokens = [stemmer.stem(token) for token in filtered_tokens]
lemmatized_tokens = [lemmatizer.lemmatize(token) for token in filtered_tokens]

print("------------Stemmed Tokens (First 20)------------")
print(stemmed_tokens[:20])

print("\n----------Lemmatized Tokens (First 20)----------")
print(lemmatized_tokens[:20])

------------Stemmed Tokens (First 20)------------
['quick', 'brown', 'fox', 'jump', 'lazi', 'dog', 'dog', 'r', 'often', 'seen', 'man', 'best', 'friend', 'go', 'store', 'right', 'purchas', 'milk', 'bread', 'machin']

----------Lemmatized Tokens (First 20)----------
['quick', 'brown', 'fox', 'jumped', 'lazy', 'dog', 'Dogs', 'r', 'often', 'seen', 'man', 'best', 'friend', 'going', 'store', 'right', 'purchase', 'milk', 'bread', 'Machine']


In [33]:
# Detect the sentence boundaries in the given text corpus
# Use the sent_tokenize function from NLTK
sentences = sent_tokenize(text_corpus)

# Print the total number of sentences
total_sentences = len(sentences)

print("---------Sentence Boundary Detection-----------")
print(f"Total number of sentences detected: <<<{total_sentences}>>>")
print("\nAll Sentences:")
for i, sent in enumerate(sentences):
    print(f"{i+1}. {sent}")

---------Sentence Boundary Detection-----------
Total number of sentences detected: <<<5>>>

All Sentences:
1. The quik brown fox jumpd over the lazy dog.
2. Dogs r often seen as man's best friend.
3. I am going to the store right now to purchas some milk and braed.
4. Machine learning is a feild of study that gives computrs the ability to learn without being explicity programd.
5. This is the fourth and final sentence.
