1.Word Analysis in NLP

In [None]:
import nltk
from nltk.corpus import wordnet
nltk.download('wordnet')
nltk.download('omw-1.4')
def analyze_word(word):
  synsets = wordnet.synsets(word)
  if not synsets:
    return "No analysis found."
  result = f"\nAnalysis of '{word}':\n"
  for syn in synsets[:3]:
    synonyms = {lemma.name().replace('_', ' ') for lemma in syn.lemmas()}
    antonyms = {lemma.antonyms()[0].name().replace('_', ' ') for lemma in syn.lemmas() if
    lemma.antonyms()}
    result += f"\n- Definition: {syn.definition()}\n"
    result += f"- Part of Speech: {syn.pos()}\n"
    if syn.examples():
      result += f"- Example: {syn.examples()[0]}\n"
      result += f"- Synonyms: {', '.join(synonyms) or 'None'}\n"
      result += f"- Antonyms: {', '.join(antonyms) or 'None'}\n"
      result += "-" * 30
  return result
print(analyze_word(input("Enter:")))

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Enter:package

Analysis of 'package':

- Definition: a collection of things wrapped or boxed together
- Part of Speech: n

- Definition: a wrapped container
- Part of Speech: n

- Definition: (computer science) written programs or procedures or rules and associated documentation pertaining to the operation of a computer system and that are stored in read/write memory
- Part of Speech: n
- Example: the market for software is expected to expand
- Synonyms: package, software package, computer software, software system, software, software program
- Antonyms: hardware
------------------------------


2.Word Generation in NLP

In [None]:
import nltk
from nltk.corpus import wordnet as wn
nltk.download('wordnet')
nltk.download('omw-1.4')
def get_related_words(word):
  synsets = wn.synsets(word)
  related_words = set()
  for syn in synsets:
    for lemma in syn.lemmas():
      related_words.add(lemma.name().replace("_", " "))
    for hyper in syn.hypernyms():
      for lemma in hyper.lemmas():
        related_words.add(lemma.name().replace("_", " "))
    for hypo in syn.hyponyms():
      for lemma in hypo.lemmas():
        related_words.add(lemma.name().replace("_", " "))
  return list(related_words)[:10]
print(" WordNet-based Word Explorer")
word = input("Enter a word: ").lower().strip()
related = get_related_words(word)
if related:
  print("\n Related Words:", ", ".join(related))
else:
  print("\n No related words found in WordNet.")

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


 WordNet-based Word Explorer
Enter a word: glad

 Related Words: iridaceous plant, gladiolus, beaming, sword lily, gladiola, glad, happy


3.Morphological Analysis in NLP

In [None]:
import nltk
from nltk.stem import PorterStemmer, WordNetLemmatizer
# Download necessary NLTK resources
nltk.download('wordnet')
nltk.download('omw-1.4')
def morphology_analysis(word):
# Create instances for stemmer and lemmatizer
  ps = PorterStemmer()
  lemmatizer = WordNetLemmatizer()
  # Perform stemming and lemmatization
  stemmed_word = ps.stem(word)
  lemmatized_word = lemmatizer.lemmatize(word)
  # Display the results
  print("\nOriginal Word:", word)
  print("Stemmed Word:", stemmed_word)
  print("Lemmatized Word:", lemmatized_word)
# Main function
if __name__ == "__main__":
  word = input("Enter a word for morphological analysis: ")
  morphology_analysis(word)

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Enter a word for morphological analysis: running

Original Word: running
Stemmed Word: run
Lemmatized Word: running


4.N-Gram Analysis in NLP

In [None]:
import re
from collections import defaultdict
def preprocess(text):
  """Convert text to lowercase and remove punctuation."""
  return re.sub(r'[^\w\s]', '', text.lower())
def generate_ngrams(text, n):
  """Generate n-grams from text."""
  words = text.split()
  return [tuple(words[i:i+n]) for i in range(len(words) - n + 1)]
def ngram_model(text, n):
  """Create an n-gram model with frequency counts."""
  ngrams = generate_ngrams(preprocess(text), n)
  ngram_count = defaultdict(int)
  for ngram in ngrams:

    ngram_count[ngram] += 1
  return ngram_count
# ---- User Interaction ----
print(" N-gram Frequency Model")
user_text = input("Enter your text:\n")
try:
  n = int(input("\nEnter the n-gram size (e.g., 1 for unigrams, 2 for bigrams): "))
  if n < 1:
    raise ValueError
except ValueError:
  print("Invalid n-gram size. Please enter a positive integer.")
else:
  model = ngram_model(user_text, n)
  print(f"\n{n}-gram Frequencies:")
  for k, v in model.items():
    print(f"{k}: {v}")

 N-gram Frequency Model
Enter your text:
no boys are good and no girl

Enter the n-gram size (e.g., 1 for unigrams, 2 for bigrams): 3

3-gram Frequencies:
('no', 'boys', 'are'): 1
('boys', 'are', 'good'): 1
('are', 'good', 'and'): 1
('good', 'and', 'no'): 1
('and', 'no', 'girl'): 1


5.N-Gram Smoothing in NLP

In [None]:
import nltk
from nltk.util import ngrams
from collections import defaultdict, Counter
# Download the punkt_tab resource
nltk.download('punkt_tab')
nltk.download('punkt') #ensure punkt is also downloaded.
def generate_ngrams(text, n):
  words = nltk.word_tokenize(text.lower()) # Tokenization
  ngram_list = list(ngrams(words, n)) # Generate n-grams
  return ngram_list
def compute_smoothed_probabilities(ngrams_list, ngram_counts, unigram_counts,vocab_size):
  smoothed_probs = {}
  for ngram in ngrams_list:
    prefix = ngram[:-1] # Previous (n-1) words
    word = ngram[-1] # Current word
    count_ngram = ngram_counts[ngram]
    count_prefix = unigram_counts[prefix] if prefix in unigram_counts else 0
    # Apply Laplace Smoothing
    smoothed_prob = (count_ngram + 1) / (count_prefix + vocab_size)
    smoothed_probs[ngram] = smoothed_prob
  return smoothed_probs
# Main function

def main():
  text = input("Enter a sentence: ")
  n = int(input("Enter the value of N for N-grams: "))
  # Generate n-grams
  ngrams_list = generate_ngrams(text, n)
  # Compute n-gram counts
  ngram_counts = Counter(ngrams_list)
  unigram_counts = Counter(generate_ngrams(text, n - 1)) if n > 1 else Counter(nltk.word_tokenize(text.lower()))
  vocab_size = len(set(nltk.word_tokenize(text.lower()))) # Vocabulary size
  # Compute smoothed probabilities
  smoothed_probs = compute_smoothed_probabilities(ngrams_list, ngram_counts,
  unigram_counts, vocab_size)
  # Display results
  print("\nN-grams and their Smoothed Probabilities:")
  for ngram, prob in smoothed_probs.items():
    print(f"{ngram} -> {prob:.4f}")
if __name__ == "__main__":
  main()

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Enter a sentence: the dog chaes a cat
Enter the value of N for N-grams: 2

N-grams and their Smoothed Probabilities:
('the', 'dog') -> 0.3333
('dog', 'chaes') -> 0.3333
('chaes', 'a') -> 0.3333
('a', 'cat') -> 0.3333


6.POS Tagging using Hidden Markov Model (HMM) in NLP

In [None]:
import nltk
from nltk.corpus import treebank
from nltk.tag import hmm
from nltk.tokenize import word_tokenize
# Ensure necessary NLTK resources are downloaded
nltk.download('treebank')
nltk.download('punkt')
nltk.download('punkt_tab') # Download the punkt_tab resource
# Load the Treebank corpus and split into training and test sets
train_sents = treebank.tagged_sents()[:3000]
test_sents = treebank.tagged_sents()[3000:]
# Initialize and train the HMM tagger
trainer = hmm.HiddenMarkovModelTrainer()
tagger = trainer.train(train_sents)
# Function to tag a new sentence

def pos_tag_sentence(sentence):
  tokens = word_tokenize(sentence)
  tagged = tagger.tag(tokens)
  return tagged
# Get user input
sentence = input("Enter a sentence to POS tag: ")
# Tag the sentence and display the result
tagged_sentence = pos_tag_sentence(sentence)
print("\nTagged Sentence:")
for word, tag in tagged_sentence:
  print(f"{word}: {tag}")

[nltk_data] Downloading package treebank to /root/nltk_data...
[nltk_data]   Unzipping corpora/treebank.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Enter a sentence to POS tag: she enjoys reading book


  X[i, j] = self._transitions[si].logprob(self._states[j])
  O[i, k] = self._output_logprob(si, self._symbols[k])
  P[i] = self._priors.logprob(si)



Tagged Sentence:
she: PRP
enjoys: NNP
reading: NNP
book: NNP


  O[i, k] = self._output_logprob(si, self._symbols[k])


7.POS Tagging using Viterbi Decoding

In [None]:
import numpy as np
# POS tags and probabilities
states = ['Noun', 'Verb', 'Adjective']
start_prob = {'Noun': 0.5, 'Verb': 0.3, 'Adjective': 0.2}
transition_prob = {
'Noun': {'Noun': 0.1, 'Verb': 0.6, 'Adjective': 0.3},
'Verb': {'Noun': 0.4, 'Verb': 0.3, 'Adjective': 0.3},
'Adjective': {'Noun': 0.5, 'Verb': 0.2, 'Adjective': 0.3}
}
emission_prob = {
'Noun': {'dog': 0.4, 'cat': 0.4, 'runs': 0.1, 'fast': 0.1},
'Verb': {'dog': 0.1, 'cat': 0.1, 'runs': 0.6, 'fast': 0.2},
'Adjective': {'dog': 0.1, 'cat': 0.1, 'runs': 0.2, 'fast': 0.6}
}

def viterbi(sentence):
  words = sentence.lower().split()
  n, m = len(words), len(states)
  viterbi_matrix = np.zeros((m, n))
  backpointer = np.zeros((m, n), dtype=int)
  for i, state in enumerate(states):
    viterbi_matrix[i, 0] = start_prob[state] * emission_prob[state].get(words[0], 0.01)
  for t in range(1, n):
    for i, state in enumerate(states):
      probs = [(viterbi_matrix[j, t-1] * transition_prob[prev][state] * emission_prob[state].get(words[t], 0.01), j)for j, prev in enumerate(states)]
    viterbi_matrix[i, t], backpointer[i, t] = max(probs)
  best_path = [np.argmax(viterbi_matrix[:, -1])]
  for t in range(n-1, 0, -1):
    best_path.insert(0, backpointer[best_path[0], t])
  return list(zip(words, [states[i] for i in best_path]))
# User input and tagging
sentence = input("Enter a sentence: ")
print("\nPOS Tagged Sentence:")
for word, tag in viterbi(sentence):
  print(f"{word} ---> {tag}")

Enter a sentence: apple is a fruit

POS Tagged Sentence:
apple ---> Noun
is ---> Adjective
a ---> Adjective
fruit ---> Adjective


8.Building a POS Tagger in NLP

In [None]:
import spacy

# Load the English model
nlp = spacy.load("en_core_web_sm")

def pos_tagger_spacy(sentence):
    doc = nlp(sentence)
    print("\nPOS Tagging Results using SpaCy:")
    for token in doc:
        print(f"{token.text} ---> {token.pos_} ({token.tag_})")

sentence = input("Enter a sentence: ")
pos_tagger_spacy(sentence)


Enter a sentence: she is a good girl

POS Tagging Results using SpaCy:
she ---> PRON (PRP)
is ---> AUX (VBZ)
a ---> DET (DT)
good ---> ADJ (JJ)
girl ---> NOUN (NN)


9.Chunking in NLP

In [None]:
import spacy
from spacy import displacy

# Load SpaCy's English model
nlp = spacy.load("en_core_web_sm")

def chunking_tree_spacy(sentence):
    # Process the sentence using SpaCy
    doc = nlp(sentence)

    print("\nNoun Phrase Chunks (SpaCy):")
    for chunk in doc.noun_chunks:
        print(chunk.text)

    # Displaying tree visualization using displacy
    print("\nDependency Tree Visualization:")
    displacy.render(doc, style="dep", jupyter=True, options={"compact": True, "distance": 100})

sentence = input("Enter a sentence: ")
chunking_tree_spacy(sentence)


Enter a sentence: she is a bad idea girl

Noun Phrase Chunks (SpaCy):
she
a bad idea girl

Dependency Tree Visualization:


10.Building a Chunker in NLP

In [None]:
# Step 1: Install and import necessary libraries
import nltk
from nltk import word_tokenize, pos_tag
from nltk.chunk import RegexpParser
# Download required resources
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger')
nltk.download('averaged_perceptron_tagger_eng')
# Step 2: Tokenize and perform POS tagging on a sample sentence
sentence = input("enter the sting")
tokens = word_tokenize(sentence)
tagged_tokens = pos_tag(tokens)
print("POS Tagged Tokens:")
print(tagged_tokens)
# Step 3: Define a custom chunking grammar using regular expressions
chunk_grammar = r"""
NP: {<DT>?<JJ>*<NN.*>} # Noun Phrase
VP: {<VB.*><NP|PP|CLAUSE>*} # Verb Phrase
"""
# Step 4: Apply chunking using RegexpParser
chunk_parser = RegexpParser(chunk_grammar)
chunk_tree = chunk_parser.parse(tagged_tokens)
# Step 5: Display and visualize the extracted chunks
print("\nChunked Sentence Tree:")
chunk_tree.pprint()

# Optional: Visualize the chunk tree in a popup window (works only in desktop environments)
try:
  chunk_tree.draw()
except:
  print("Tree visualization is not supported in this environment.")

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.


enter the stingthe quick brown fox jumbs over lazy dog
POS Tagged Tokens:
[('the', 'DT'), ('quick', 'JJ'), ('brown', 'NN'), ('fox', 'NN'), ('jumbs', 'NN'), ('over', 'IN'), ('lazy', 'JJ'), ('dog', 'NN')]

Chunked Sentence Tree:
(S
  (NP the/DT quick/JJ brown/NN)
  (NP fox/NN)
  (NP jumbs/NN)
  over/IN
  (NP lazy/JJ dog/NN))
Tree visualization is not supported in this environment.
