<a href="https://colab.research.google.com/github/rutujapalatkar27/upgraded-octo-invention/blob/main/MCQs_Natural_Language_Processing_Coding_Questions.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [26]:
import pandas as pd
import numpy as np

import nltk
from nltk.stem import PorterStemmer

from collections import Counter
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt_tab')
from collections import defaultdict


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Question 11

In [27]:
# Example dictionary of words with their vectors
word_vectors = {
    'apple': np.array([[0.1, 0.2, 0.3]]),
    'banana': np.array([[0.2, 0.1, 0.4]]),
    'cherry': np.array([[0.3, 0.4, 0.2]]),
    'date': np.array([[0.1, 0.5, 0.2]]),
    'king': np.array([[0.4, 0.2, -0.3]])
}

def manhattan_distance(vec1, vec2):
    """
    Calculate the Manhattan distance between two vectors.

    Parameters:
    vec1 (numpy array): First vector.
    vec2 (numpy array): Second vector.

    Returns:
    float: Manhattan distance between vec1 and vec2.
    """

    # Calculate Manhattan distance
    distance = np.sum(np.abs(vec1 - vec2))
    return distance

def find_most_similar_word(target_word, word_vectors):
    # Check if the target word is in the dictionary
    if target_word not in word_vectors:
        raise ValueError(f"The word '{target_word}' is not in the dictionary.")

    # Get the vector for the target word
    target_vector = word_vectors[target_word]

    # Compute manhattan distance
    similarities = {}
    for word, vector in word_vectors.items():
        if word != target_word:
            # Compute manhattan distance
            dist = manhattan_distance(target_vector, vector)
            similarities[word] = dist

    # Find the most similar word
    most_similar_word = min(similarities, key=similarities.get)

    return most_similar_word

# Example usage
target_word = 'apple'
most_similar = find_most_similar_word(target_word, word_vectors)
print(f"The most similar word to '{target_word}' is '{most_similar}'.")

The most similar word to 'apple' is 'banana'.


Question 12

In [28]:
# Function for stemming a single word
def stem_word(word):
    """
    Stems a single word using the Porter Stemmer algorithm.

    Parameters:
    - word (str): The word to be stemmed.

    Returns:
    - str: The stemmed word.
    """
    # Initialize the Porter Stemmer
    stemmer = PorterStemmer()

    # Apply stemming to the word
    return stemmer.stem(word)

def convert_sentence(sentence):
    """
    Converts a sentence by applying stemming to each word.

    Parameters:
    - sentence (str): The sentence to be processed.

    Returns:
    - str: The sentence with each word stemmed.
    """
    # Split the sentence into words and apply stemming to each word
    output_sentence = sentence.split()
    output_sentence = [stem_word(word) for word in output_sentence]

    # Join the stemmed words back into a sentence
    return " ".join(output_sentence)

# Example usage
sentence = "the children are playing in the garden and enjoying their time."
# Convert the sentence by stemming each word
output_sentence = convert_sentence(sentence)
print(f"Output Sentence: {output_sentence}")

Output Sentence: the children are play in the garden and enjoy their time.


Question 13

In [29]:
def calculate_metrics(confusion_matrix):
    """
    Calculates precision, recall, and F1-score from a confusion matrix.

    Parameters:
    - confusion_matrix (list of lists): The confusion matrix in the form [[TN, FP], [FN, TP]]

    Returns:
    - tuple: (precision, recall, f1-score)
    """
    # Extract values from the confusion matrix
    tn, fp = confusion_matrix[0]
    fn, tp = confusion_matrix[1]

    # Calculate precision, recall, and F1-score
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    f1 = 2 * (precision * recall) / (precision + recall)

    return precision, recall, f1

# Example confusion matrix: [[TN, FP], [FN, TP]]
confusion_matrix = [[50, 10], [5, 35]]

# Calculate the metrics
precision, recall, f1 = calculate_metrics(confusion_matrix)

print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1-score: {f1:.2f}")

Precision: 0.78
Recall: 0.88
F1-score: 0.82


Question 14

In [30]:
def preprocess_text(text):
    """
    Preprocess the text by converting it to lowercase,
    removing punctuation, and tokenizing it into words.

    Parameters:
    - text (str): The input text to be preprocessed.

    Returns:
    - list: A list of words (tokens) from the text.
    """
    # Convert to lowercase
    text = text.lower()

    # Tokenize the text into words
    words = word_tokenize(text)

    return words

def build_vocabulary(corpus):
    """
    Build a vocabulary of unique words from the corpus.

    Parameters:
    - corpus (list of str): The list of documents (texts).

    Returns:
    - list: A list of unique words (vocabulary).
    """
    # Flatten the list of all words in the corpus
    all_words = [word for doc in corpus for word in preprocess_text(doc)]

    # Get unique words
    vocabulary = list(set(all_words))

    return vocabulary

def vectorize_text(text, vocabulary):
    """
    Create a Bag of Words vector for a given text.

    Parameters:
    - text (str): The input text to be vectorized.
    - vocabulary (list of str): The list of unique words (vocabulary).

    Returns:
    - list: A list of integers representing the BoW vector.
    """
    # Preprocess the text
    words = preprocess_text(text)

    # Create a Counter object to count word occurrences
    word_counts = Counter(words)

    # Create the vector based on the vocabulary
    vector = [word_counts[word] for word in vocabulary]

    return vector

# Example corpus of documents
corpus = [
    "The cat sat on the mat.",
    "The dog chased the cat.",
    "The cat climbed the tree.",
    "The bat hanging on the tree.",
    "The apple was kept on the table where the cat was sitting"
]

# Build the vocabulary from the corpus
vocabulary = build_vocabulary(corpus)

text = "The cat climbed on the where the bat was hanging."

# Vectorize each document in the corpus
bow_vector = vectorize_text(text, vocabulary)

# Print the Bag of Words vectors
print(f"Bag of words vector {bow_vector}")


Bag of words vector [1, 1, 0, 0, 1, 3, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0]


Question 15


### What is a Bigram?

A **bigram** is a pair of consecutive words in a sequence, often used in natural language processing (NLP) to analyze the relationship between words in a sentence. It helps in understanding the likelihood of a word following another word, which is useful in tasks like text prediction, speech recognition, and more.

### Example of a Bigram:

Consider the sentence: "I love pizza."

- The bigrams in this sentence are:
  - "I love"
  - "love pizza"

Each bigram represents a sequence of two consecutive words. If you have a large dataset of text, you can count how often each bigram appears. This helps you determine the probability of one word following another.

### Bigram Probability:

The **bigram probability** is the likelihood of the second word in the bigram appearing after the first word. For example, the probability of "pizza" appearing after "love" can be calculated using the formula:

$$
P(\text{next_word} \mid \text{word}) = \frac{\text{Count of bigram (word, next_word)}}{\text{Count of word}}
$$


This tells us how often "pizza" follows "love" compared to how often "love" appears overall.




In [31]:
class BigramProbability:
    def __init__(self):
        self.bigram_counts = defaultdict(lambda: defaultdict(int))
        self.unigram_counts = defaultdict(int)

    def tokenize(self, text):
        # Simple tokenization by splitting on non-alphabetical characters
        return text.lower().split()

    def train(self, sentences):
        # Calculate bigram and unigram counts
        for sentence in sentences:
            tokens = self.tokenize(sentence)
            for i in range(len(tokens) - 1):
                word = tokens[i]
                next_word = tokens[i + 1]
                self.unigram_counts[word] += 1
                self.bigram_counts[word][next_word] += 1

    def probability(self, word, next_word):
        # Calculate the conditional probability P(next_word | word)
        if self.unigram_counts[word] == 0:
            return 0.0
        return self.bigram_counts[word][next_word] / self.unigram_counts[word]

# Sample data
sentences = [
    "I love this movie, it's fantastic!",
    "This movie is horrible, I hate it.",
    "What a wonderful movie, amazing experience!",
    "The movie was the worst I've ever seen.",
    "I really enjoyed the movie, it was great.",
    "This was a terrible movie, never watching again."
]

# Create and train the BigramProbability model
bigram_model = BigramProbability()
bigram_model.train(sentences)

# Example: Find the probability of the word "movie" being followed by "is"
word = "movie"
next_word = "is"
prob = bigram_model.probability(word, next_word)
print(f"Probability of '{next_word}' occurring after '{word}': {prob:.2f}")

Probability of 'is' occurring after 'movie': 0.50
