In [2]:
import nltk
from nltk.tokenize import word_tokenize
from nltk import pos_tag
nltk.download('punkit')
nltk.download('averaged_perceptron_tagger_eng')


[nltk_data] Error loading punkit: Package 'punkit' not found in index
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /usr/share/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.


True

In [3]:
def pos_tagger(text):
    words=word_tokenize(text) #Toknize text into word
    pos_tags=pos_tag(words) #Get POS tags
    return pos_tags
    
    

In [4]:
sentence="The quick brown fox jumps over the lazy dog."
pos_tags=pos_tagger(sentence)

In [5]:
print("POS Tags:",pos_tags)

POS Tags: [('The', 'DT'), ('quick', 'JJ'), ('brown', 'NN'), ('fox', 'NN'), ('jumps', 'VBZ'), ('over', 'IN'), ('the', 'DT'), ('lazy', 'JJ'), ('dog', 'NN'), ('.', '.')]


# NER ( Named Entitiy Recognistion)

In [6]:
!pip install spacy

Collecting numpy>=1.19.0 (from spacy)
  Downloading numpy-2.3.2-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (62 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.1/62.1 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
INFO: pip is looking at multiple versions of mkl-fft to determine which version is compatible with other requirements. This could take a while.
Collecting mkl_fft (from numpy>=1.19.0->spacy)
  Downloading mkl_fft-2.0.0-22-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (7.1 kB)
Collecting numpy>=1.19.0 (from spacy)
  Downloading numpy-2.2.6-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (62 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.0/62.0 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[?25hINFO: pip is looking at multiple versions of mkl-random to determine which version is compatible with other requirements. This could take a while.
Collecting mkl_random (from numpy>=1.19.0->spacy

In [8]:
import spacy
nlp=spacy.load("en_core_web_sm")

In [9]:
def ner_extraction(text):
    doc=nlp(text) 
    return [(ent.text,ent.label_) for ent in doc.ents]


In [12]:
sentence="Apple Inc. was founded by Steve Jobs in California in 1978."
entities=ner_extraction(sentence)

print("named Entities:",entities)

named Entities: [('Apple Inc.', 'ORG'), ('Steve Jobs', 'PERSON'), ('California', 'GPE'), ('1978', 'DATE')]


# N-grams

In [14]:
from collections import Counter
import nltk
from nltk.util import ngrams
from nltk.tokenize import word_tokenize

In [15]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [16]:
def generate_ngrams(text,n):
    words=word_tokenize(text)
    n_grams=list(ngrams(words,n))
    return [" ".join(gram) for gram in n_grams]
   


In [17]:
text="I love Natural Language Processing"
n=2
bigrams=generate_ngrams(text,n)
print("Bigrams:",bigrams)

Bigrams: ['I love', 'love Natural', 'Natural Language', 'Language Processing']


# Prediction 

In [19]:
def train_ngram_model(text,n=2):
    words=word_tokenize(text.lower()) # Tokenize and convert lower
    n_grams=list(ngrams(words,n)) # generate n-grams
    model=Counter(n_grams) # Count occurrences
    return model

In [20]:
# Sample dataset(real-world applications need a large corpus)
text="I love natural language preprocessing. I love machine learning and deep learning"


In [21]:
bigram_model=train_ngram_model(text,n=2)
print("Bigram Model:",bigram_model)

Bigram Model: Counter({('i', 'love'): 2, ('love', 'natural'): 1, ('natural', 'language'): 1, ('language', 'preprocessing'): 1, ('preprocessing', '.'): 1, ('.', 'i'): 1, ('love', 'machine'): 1, ('machine', 'learning'): 1, ('learning', 'and'): 1, ('and', 'deep'): 1, ('deep', 'learning'): 1})


In [26]:
def predict_next_word(model, input_text, n=2):
    words = word_tokenize(input_text.lower())
    prev_words = tuple(words[-(n-1):])  # Last n-1 words
    candidates = {}

    for k, v in model.items():
        if k[:-1] == prev_words:  # Match preceding words
            candidates[k] = v

    if not candidates:
        return "No prediction found"

    print("Candidates:", candidates)
    next_word = max(candidates, key=candidates.get)[-1]
    return next_word

In [27]:
input_text="I love"
predicted_word=predict_next_word(bigram_model,input_text,n=2)
print("Predicted Next Words",predicted_word)

Candidates: {('love', 'natural'): 1, ('love', 'machine'): 1}
Predicted Next Words natural
