<a href="https://colab.research.google.com/github/smrutipunto/NLP/blob/main/Prcatical_4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Implement N-gram Model
import nltk
from collections import defaultdict, Counter

# Ensure you have the necessary NLTK data
nltk.download('punkt')
nltk.download('punkt_tab')

class UnigramModel:
    """Context-free word-frequency (unigram) model.

    The model simply counts how often each token occurs in the text it is
    fitted on; its "prediction" is the list of globally most frequent
    tokens, independent of any preceding word.
    """

    def __init__(self):
        # Running token counts, accumulated across every fit() call.
        self.word_freq = Counter()

    def preprocess(self, text):
        """Lower-case and tokenize *text*, keeping only alphanumeric tokens.

        Args:
            text: Raw input string.

        Returns:
            List of lower-cased tokens for which ``str.isalnum()`` is true.
        """
        tokens = nltk.word_tokenize(text.lower())
        # isalnum() is False for any token containing a non-alphanumeric
        # character, so pure punctuation tokens are filtered out here.
        return [word for word in tokens if word.isalnum()]

    def fit(self, text):
        """Add the token counts of *text* to the model and print all counts.

        Counts accumulate: calling fit() again adds to the existing
        frequencies instead of replacing them.

        Args:
            text: Raw training text.
        """
        tokens = self.preprocess(text)
        self.word_freq.update(tokens)

        # Debug: print the cumulative word frequencies
        print("Word frequencies:")
        for word, freq in self.word_freq.items():
            print(f"{word}: {freq}")

    def predict(self, word=None, top_k=3):
        """Return the most frequent tokens seen so far.

        Args:
            word: Accepted for interface compatibility only; a unigram model
                has no context, so this argument is ignored.
            top_k: Number of (token, count) pairs to return. Defaults to 3,
                the value that was previously hard-coded.

        Returns:
            List of ``(token, count)`` tuples, most frequent first; empty if
            the model has not been fitted.
        """
        return self.word_freq.most_common(top_k)

# Example usage: fit a unigram model on a small corpus and show its top words.
if __name__ == "__main__":
    # Sample text corpus
    corpus = """
    Natural language processing (NLP) is a subfield of artificial intelligence (AI).
    It enables computers to understand, interpret, and generate human language.
    With advances in machine learning and deep learning, NLP has made significant strides.
    Applications include sentiment analysis, machine translation, and chatbot development.
    """

    # Create and fit unigram model (fit() also prints every word frequency)
    unigram_model = UnigramModel()
    unigram_model.fit(corpus)

    # Make predictions; the argument is ignored because a unigram model
    # has no context -- the result is always the globally most frequent words.
    print("Top 3 predicted words based on frequency:")
    predictions = unigram_model.predict('any_word')
    print(predictions)  # List of (word, count) pairs for the 3 most frequent words

In [None]:
import nltk
from collections import defaultdict, Counter

nltk.download('punkt_tab')

class NGramModel:
    """Count-based n-gram next-word predictor.

    For every window of ``n`` consecutive tokens, the first ``n - 1`` tokens
    form the context and the last token is counted as a continuation of
    that context. Despite the ``*_bigram`` method names, the logic works for
    any ``n >= 1`` (``n == 1`` uses the empty context ``()``).
    """

    def __init__(self, n):
        """Create an empty model of order *n*.

        Args:
            n: Size of the n-grams (2 for bigrams, 3 for trigrams, ...).

        Raises:
            ValueError: If *n* is smaller than 1. Such a value would
                otherwise only fail later, inside fit_bigram, with an
                unhelpful IndexError.
        """
        if n < 1:
            raise ValueError(f"n must be at least 1, got {n}")
        self.n = n
        # Maps a context tuple of (n-1) tokens -> Counter of following tokens.
        self.ngrams_freq = defaultdict(Counter)

    def preprocess(self, text):
        """Lower-case and tokenize *text*, keeping only alphanumeric tokens.

        Args:
            text: Raw input string.

        Returns:
            List of lower-cased tokens for which ``str.isalnum()`` is true.
        """
        tokens = nltk.word_tokenize(text.lower())
        # isalnum() is False for any token containing a non-alphanumeric
        # character, so pure punctuation tokens are filtered out here.
        return [word for word in tokens if word.isalnum()]

    def fit_bigram(self, text):
        """Count every n-gram of *text* and print each one as it is added.

        Counts accumulate across calls. Texts with fewer than ``n`` tokens
        contribute nothing.

        Args:
            text: Raw training text.
        """
        tokens = self.preprocess(text)
        for i in range(len(tokens) - self.n + 1):
            ngram = tuple(tokens[i:i + self.n])
            context, next_word = ngram[:-1], ngram[-1]
            self.ngrams_freq[context][next_word] += 1

            # Debug: Print the generated ngram
            print(f"Generated bigram: {context} -> {next_word}")

    def predict_bigram(self, prefix, top_k=3):
        """Return the most frequent continuations of *prefix*.

        Only the last ``n - 1`` tokens of *prefix* are used as context.

        Args:
            prefix: Raw text whose trailing tokens form the context.
            top_k: Maximum number of candidates to return. Defaults to 3,
                the value that was previously hard-coded.

        Returns:
            List of ``(word, count)`` tuples, most frequent first; empty if
            the prefix has fewer than ``n - 1`` tokens or the context was
            never seen during fitting.
        """
        prefix = self.preprocess(prefix)
        context_len = self.n - 1
        if len(prefix) < context_len:
            return []  # Not enough words to form a prediction

        # Slice from an explicit start index: a negative-index slice would
        # become prefix[-0:] for n == 1 and return the whole list instead
        # of the empty context.
        prefix_tuple = tuple(prefix[len(prefix) - context_len:])
        # Debug: Print the prefix tuple being checked
        print(f"Checking prefix: {prefix_tuple}")

        # .get() avoids creating an empty Counter entry in the defaultdict
        # for contexts that were never seen.
        next_words = self.ngrams_freq.get(prefix_tuple)
        if next_words is None:
            return []
        return next_words.most_common(top_k)

# Example usage: fit a bigram model on a small corpus and query it.
if __name__ == "__main__":
    # Sample text corpus
    corpus = """
    Natural language processing (NLP) is a subfield of artificial intelligence (AI).
    It enables computers to understand, interpret, and generate human language.
    With advances in machine learning and deep learning, NLP has made significant strides.
    Applications include sentiment analysis, machine translation, and chatbot development.
    """
    # Create and fit bigram model (fit_bigram also prints every generated pair)
    bigram_model = NGramModel(n=2)
    bigram_model.fit_bigram(corpus)

    # Make predictions. With n=2 only the last word of the prefix
    # ('language') is used as context; 'natural' is ignored.
    print("Bigram Predictions for 'natural language':")
    predictions = bigram_model.predict_bigram('natural language')
    # NOTE(review): preprocess drops punctuation tokens, so n-grams can span
    # sentence boundaries; 'language' therefore appears to be followed by
    # both 'processing' and 'with' in this corpus -- confirm by running.
    print(predictions)  # Up to 3 (word, count) pairs that followed 'language'