In [8]:
### 1

import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

# Download resources once
nltk.download('punkt_tab')
nltk.download('stopwords')

def preprocess_text(text):
    tokens = nltk.word_tokenize(text)                          # Tokenization
    tokens = [t for t in tokens if t.isalpha()]                # Filtration
    tokens = [t for t in tokens if re.fullmatch(r"[A-Za-z]+", t)]  # Script validation
    stop_words = set(stopwords.words('english'))
    tokens = [t for t in tokens if t.lower() not in stop_words]    # Stop word removal
    stemmer = PorterStemmer()
    return [stemmer.stem(t) for t in tokens]                   # Stemming

text = "NLP preprocessing includes Tokenization, cleaning the text, removing stopwords, and more!"
print("Processed Text:", preprocess_text(text))


Processed Text: ['nlp', 'preprocess', 'includ', 'token', 'clean', 'text', 'remov', 'stopword']


[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\dhanu\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\dhanu\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [9]:
### 2 

import nltk
from nltk.util import ngrams
from nltk.probability import FreqDist, ConditionalFreqDist
from nltk.tokenize import word_tokenize

nltk.download('punkt_tab')

# Corpus
corpus = """
I love natural language processing.
I love machine learning.
Language models learn patterns.
I love learning new things.
"""

# Tokenize
tokens = word_tokenize(corpus.lower())

# Unigram, Bigram, Trigram models
unigrams = FreqDist(tokens)
bigrams = ConditionalFreqDist(ngrams(tokens, 2))
trigrams = ConditionalFreqDist(((a, b), c) for a, b, c in ngrams(tokens, 3))

# Probability functions
def unigram_prob(w): 
    return unigrams[w] / unigrams.N()

def bigram_prob(w1, w2):
    return bigrams[w1][w2] / bigrams[w1].N() if bigrams[w1][w2] else 0

def trigram_prob(w1, w2, w3):
    return trigrams[(w1, w2)][w3] / trigrams[(w1, w2)].N() if trigrams[(w1, w2)][w3] else 0

# Sentence probability
def sentence_probability(sentence):
    words = word_tokenize(sentence.lower())

    uni = 1
    for w in words:
        uni *= unigram_prob(w)

    bi = 1
    for w1, w2 in ngrams(words, 2):
        bi *= bigram_prob(w1, w2)

    tri = 1
    for w1, w2, w3 in ngrams(words, 3):
        tri *= trigram_prob(w1, w2, w3)

    return uni, bi, tri

# Test sentences
sentences = ["I love learning", "language models learn", "learning models love"]

for s in sentences:
    up, bp, tp = sentence_probability(s)
    print(f"\nSentence: '{s}'")
    print("Unigram Probability: ", up)
    print("Bigram Probability : ", bp)
    print("Trigram Probability:", tp)



Sentence: 'I love learning'
Unigram Probability:  0.0016904583020285497
Bigram Probability :  0.3333333333333333
Trigram Probability: 0.3333333333333333

Sentence: 'language models learn'
Unigram Probability:  0.00018782870022539445
Bigram Probability :  0.5
Trigram Probability: 1.0

Sentence: 'learning models love'
Unigram Probability:  0.0005634861006761833
Bigram Probability :  0
Trigram Probability: 0


[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\dhanu\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [10]:
### 3

def med(a, b):
    m, n = len(a), len(b)
    dp = [[i + j if i * j == 0 else 0 for j in range(n + 1)] for i in range(m + 1)]
    for i in range(1, m + 1):
        for j in range(1, n + 1):
            dp[i][j] = min(dp[i-1][j] + 1, dp[i][j-1] + 1, dp[i-1][j-1] + (a[i-1] != b[j-1]))
    return dp[-1][-1], dp

def show(dp, a, b):
    print("   " + "  ".join("_" + b))
    for i, r in enumerate(dp):
        print((a[i-1] if i else "_"), r)

tests = [
    ("cat", "cut"),
    ("cat", "cats"),
    ("cats", "cat"),
    ("speling", "spelling"),
    ("flaw", "lawn"),
    ("intention", "execution")
]

print("\n========= Minimum Edit Distance Demonstration =========")
for s1, s2 in tests:
    print(f"\n{s1} → {s2}")
    d, dp = med(s1, s2)
    print("MED =", d)
    show(dp, s1, s2)




cat → cut
MED = 1
   _  c  u  t
_ [0, 1, 2, 3]
c [1, 0, 1, 2]
a [2, 1, 1, 2]
t [3, 2, 2, 1]

cat → cats
MED = 1
   _  c  a  t  s
_ [0, 1, 2, 3, 4]
c [1, 0, 1, 2, 3]
a [2, 1, 0, 1, 2]
t [3, 2, 1, 0, 1]

cats → cat
MED = 1
   _  c  a  t
_ [0, 1, 2, 3]
c [1, 0, 1, 2]
a [2, 1, 0, 1]
t [3, 2, 1, 0]
s [4, 3, 2, 1]

speling → spelling
MED = 1
   _  s  p  e  l  l  i  n  g
_ [0, 1, 2, 3, 4, 5, 6, 7, 8]
s [1, 0, 1, 2, 3, 4, 5, 6, 7]
p [2, 1, 0, 1, 2, 3, 4, 5, 6]
e [3, 2, 1, 0, 1, 2, 3, 4, 5]
l [4, 3, 2, 1, 0, 1, 2, 3, 4]
i [5, 4, 3, 2, 1, 1, 1, 2, 3]
n [6, 5, 4, 3, 2, 2, 2, 1, 2]
g [7, 6, 5, 4, 3, 3, 3, 2, 1]

flaw → lawn
MED = 2
   _  l  a  w  n
_ [0, 1, 2, 3, 4]
f [1, 1, 2, 3, 4]
l [2, 1, 2, 3, 4]
a [3, 2, 1, 2, 3]
w [4, 3, 2, 1, 2]

intention → execution
MED = 5
   _  e  x  e  c  u  t  i  o  n
_ [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
i [1, 1, 2, 3, 4, 5, 6, 6, 7, 8]
n [2, 2, 2, 3, 4, 5, 6, 7, 7, 7]
t [3, 3, 3, 3, 4, 5, 5, 6, 7, 8]
e [4, 3, 4, 3, 4, 5, 6, 6, 7, 8]
n [5, 4, 4, 4, 4, 5, 6, 7, 7, 7]
t 

In [11]:
### 4

from collections import Counter

docs = [
    ("fun couple love love".split(), "comedy"),
    ("fast furious shoot".split(), "action"),
    ("couple fly fast fun fun".split(), "comedy"),
    ("furious shoot shoot fun".split(), "action"),
    ("fly fast shoot love".split(), "action")
]
D = "fast couple shoot fly".split()

classes = {"comedy", "action"}
priors = {c: sum(c == cls for _, cls in docs) / len(docs) for c in classes}

vocab = {w for words, _ in docs for w in words}; V = len(vocab)
wc = {c: Counter() for c in classes}; tw = {c: 0 for c in classes}
for words, c in docs: wc[c].update(words); tw[c] += len(words)

def P(c):
    p = priors[c]
    for w in D: p *= (wc[c][w] + 1) / (tw[c] + V)
    return p

p_comedy, p_action = P("comedy"), P("action")
print(f"P(Comedy | D) = {p_comedy}")
print(f"P(Action | D) = {p_action}")
print("Predicted:", "action" if p_action > p_comedy else "comedy")


P(Comedy | D) = 7.324218750000001e-05
P(Action | D) = 0.00017146776406035664
Predicted: action


In [12]:
### 5


import os, nltk
from nltk.corpus import brown, inaugural, reuters, udhr, PlaintextCorpusReader
from nltk.probability import ConditionalFreqDist

for p in ['brown','inaugural','reuters','udhr','punkt_tab','averaged_perceptron_tagger_eng','universal_tagset']: nltk.download(p)

print("Brown Categories:", brown.categories())
print("Brown Words:", brown.words()[:20])
print("Brown Sentence:", brown.sents()[0])
print("Brown Raw:\n", brown.raw()[:200])

print("\nInaugural File IDs:", inaugural.fileids()[:5])
print("Inaugural 2009 Words:", inaugural.words('2009-Obama.txt')[:20])

print("\nReuters Categories:", reuters.categories()[:10])
print("Reuters Example Words:", reuters.words('training/9865')[:15])

print("\nUDHR Languages:", udhr.fileids()[:10])
print("UDHR English Words:", udhr.words('English-Latin1')[:20])

root="mycorpus"; sp=os.path.join(root,"sports"); te=os.path.join(root,"tech")
os.makedirs(sp,exist_ok=True); os.makedirs(te,exist_ok=True)
open(os.path.join(sp,"sports1.txt"),"w").write("The team won the match with excellent performance.")
open(os.path.join(te,"tech1.txt"),"w").write("Artificial intelligence is transforming the technology industry.")

mycorpus=PlaintextCorpusReader(root,r".*\.txt")
cfd=ConditionalFreqDist()
for fid in mycorpus.fileids():
    cat=fid.split('/')[0]
    for w in mycorpus.words(fid): cfd[cat][w.lower()]+=1
print("\nSports Word Counts:", cfd["sports"].most_common(10))
print("Tech Word Counts:", cfd["tech"].most_common(10))

all_words=mycorpus.words()
tagged_words=nltk.pos_tag(all_words)
tagged_sents=[nltk.pos_tag(s) for s in mycorpus.sents()]
print("\nTagged Words:", tagged_words[:10])
print("Tagged Sentence:", tagged_sents[0])

noun_tags={"NN","NNS","NNP","NNPS"}
freq=nltk.FreqDist(t for _,t in tagged_words if t in noun_tags)
print("\nMost Frequent Noun Tags:", freq.most_common())

default_tagger=nltk.DefaultTagger("NN")
print("\nRule-based Tagger:", default_tagger.tag(["This","is","a","sample","sentence"]))

print("\nUnigram Tagger:")
split=int(0.8*len(tagged_sents)); train,test=tagged_sents[:split],tagged_sents[split:]
uni=nltk.tag.UnigramTagger(train, backoff=default_tagger)
print("Accuracy:", uni.accuracy(test) if test else "No test sentences available.")
print("Custom Sentence Tagging:", uni.tag("AI is changing the world".split()))


[nltk_data] Downloading package brown to
[nltk_data]     C:\Users\dhanu\AppData\Roaming\nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package inaugural to
[nltk_data]     C:\Users\dhanu\AppData\Roaming\nltk_data...
[nltk_data]   Package inaugural is already up-to-date!
[nltk_data] Downloading package reuters to
[nltk_data]     C:\Users\dhanu\AppData\Roaming\nltk_data...
[nltk_data]   Package reuters is already up-to-date!
[nltk_data] Downloading package udhr to
[nltk_data]     C:\Users\dhanu\AppData\Roaming\nltk_data...
[nltk_data]   Package udhr is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\dhanu\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\dhanu\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!
[

Brown Categories: ['adventure', 'belles_lettres', 'editorial', 'fiction', 'government', 'hobbies', 'humor', 'learned', 'lore', 'mystery', 'news', 'religion', 'reviews', 'romance', 'science_fiction']
Brown Words: ['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', 'Friday', 'an', 'investigation', 'of', "Atlanta's", 'recent', 'primary', 'election', 'produced', '``', 'no', 'evidence', "''", 'that']
Brown Sentence: ['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', 'Friday', 'an', 'investigation', 'of', "Atlanta's", 'recent', 'primary', 'election', 'produced', '``', 'no', 'evidence', "''", 'that', 'any', 'irregularities', 'took', 'place', '.']
Brown Raw:
 

	The/at Fulton/np-tl County/nn-tl Grand/jj-tl Jury/nn-tl said/vbd Friday/nr an/at investigation/nn of/in Atlanta's/np$ recent/jj primary/nn election/nn produced/vbd ``/`` no/at evidence/nn ''/'' tha

Inaugural File IDs: ['1789-Washington.txt', '1793-Washington.txt', '1797-Adams.txt', '1801-Jefferson.txt', '1805-Jefferson.txt']
Inaugu

In [13]:
### 6

import nltk
from nltk.corpus import wordnet

nltk.download('wordnet')

word = "active"
synonyms = set()
antonyms = set()

for syn in wordnet.synsets(word):
    for lemma in syn.lemmas():
        synonyms.add(lemma.name())
        if lemma.antonyms():
            antonyms.add(lemma.antonyms()[0].name())

print("Synonyms of 'active':")
print(sorted(synonyms))

print("\nAntonyms of 'active':")
print(sorted(antonyms))


Synonyms of 'active':
['active', 'active_agent', 'active_voice', 'alive', 'combat-ready', 'dynamic', 'fighting', 'participating']

Antonyms of 'active':
['dormant', 'extinct', 'inactive', 'passive', 'passive_voice', 'quiet', 'stative']


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\dhanu\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [14]:
### 7

import nltk
from nltk.corpus import brown
from nltk.tag.hmm import HiddenMarkovModelTrainer

for p in ['brown','punkt_tab','universal_tagset']: nltk.download(p)

tagged = brown.tagged_sents(tagset='universal'); split = int(0.9*len(tagged))
train, test = tagged[:split], tagged[split:]
print("Training sentences:", len(train))

hmm = HiddenMarkovModelTrainer().train_supervised(train)
print("HMM Tagger Accuracy:", hmm.accuracy(test))

while True:
    s = input("\nEnter a sentence (or 'exit' to quit): ")
    if s.lower() == "exit": break
    print("Tagged Sentence:", hmm.tag(nltk.word_tokenize(s)))


[nltk_data] Downloading package brown to
[nltk_data]     C:\Users\dhanu\AppData\Roaming\nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\dhanu\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package universal_tagset to
[nltk_data]     C:\Users\dhanu\AppData\Roaming\nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!


Training sentences: 51606
HMM Tagger Accuracy: 0.7011378638335324



Enter a sentence (or 'exit' to quit):  quit


Tagged Sentence: [('quit', 'VERB')]



Enter a sentence (or 'exit' to quit):  exit
