<a href="https://colab.research.google.com/github/sakeththelu/NLP/blob/main/4082_NLP_assignment_7_3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Task : cosine jaccard and wordnet based similarity



In [29]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Download necessary NLTK data (if not already downloaded)
# This is done only once
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')
try:
    nltk.data.find('corpora/wordnet')
except LookupError:
    nltk.download('wordnet')
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt')

print("NLTK data (stopwords, wordnet, punkt) checked/downloaded.")

def preprocess_text(text):
    """
    Performs text cleaning operations: converting to lowercase, removing punctuation and numbers,
    tokenizing, removing stopwords, and lemmatizing words.
    """
    # 1. Convert to lowercase
    text = text.lower()
    print(f"Step 1 (Lowercase): {text}")

    # 2. Remove punctuation and numbers
    # Using regex to keep only letters and spaces
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    print(f"Step 2 (Remove Punctuation/Numbers): {text}")

    # 3. Tokenize the text
    tokens = word_tokenize(text)
    print(f"Step 3 (Tokenization): {tokens}")

    # 4. Remove stopwords
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [word for word in tokens if word not in stop_words]
    print(f"Step 4 (Remove Stopwords): {filtered_tokens}")

    # 5. Lemmatize words
    lemmatizer = WordNetLemmatizer()
    lemmas = [lemmatizer.lemmatize(word) for word in filtered_tokens]
    print(f"Step 5 (Lemmatization): {lemmas}")

    # 6. Join tokens back into a string
    processed_text = ' '.join(lemmas)
    print(f"Step 6 (Join Tokens): {processed_text}")

    return processed_text

print("Preprocessing function 'preprocess_text' defined.")

NLTK data (stopwords, wordnet, punkt) checked/downloaded.
Preprocessing function 'preprocess_text' defined.


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [30]:
sentence_pairs_df['Sentence1_processed'] = sentence_pairs_df['Sentence1'].apply(preprocess_text)
sentence_pairs_df['Sentence2_processed'] = sentence_pairs_df['Sentence2'].apply(preprocess_text)

print("Original and processed sentences added to DataFrame.")
sentence_pairs_df.head()

Step 1 (Lowercase): i like playing football
Step 2 (Remove Punctuation/Numbers): i like playing football
Step 3 (Tokenization): ['i', 'like', 'playing', 'football']
Step 4 (Remove Stopwords): ['like', 'playing', 'football']
Step 5 (Lemmatization): ['like', 'playing', 'football']
Step 6 (Join Tokens): like playing football
Step 1 (Lowercase): i like chicken biryani
Step 2 (Remove Punctuation/Numbers): i like chicken biryani
Step 3 (Tokenization): ['i', 'like', 'chicken', 'biryani']
Step 4 (Remove Stopwords): ['like', 'chicken', 'biryani']
Step 5 (Lemmatization): ['like', 'chicken', 'biryani']
Step 6 (Join Tokens): like chicken biryani
Step 1 (Lowercase): i love msdhoni
Step 2 (Remove Punctuation/Numbers): i love msdhoni
Step 3 (Tokenization): ['i', 'love', 'msdhoni']
Step 4 (Remove Stopwords): ['love', 'msdhoni']
Step 5 (Lemmatization): ['love', 'msdhoni']
Step 6 (Join Tokens): love msdhoni
Step 1 (Lowercase): i like playing football
Step 2 (Remove Punctuation/Numbers): i like playing f

Unnamed: 0,ID,Sentence1,Sentence2,cosine_similarity,jaccard_similarity,wordnet_similarity,Sentence1_processed,Sentence2_processed
0,1,i like playing football,i like playing cricket,0.455321,0.5,0.438889,like playing football,like playing cricket
1,2,i like chicken biryani,i like eating biryani,0.467251,0.5,0.358249,like chicken biryani,like eating biryani
2,3,i love MSDhoni,i like MSDhoni,0.565398,0.333333,0.5,love msdhoni,like msdhoni
3,4,i like playing football,i love MSDhoni,0.0,0.0,0.305556,like playing football,love msdhoni
4,5,i like chicken biryani,i like playing football,0.14901,0.2,0.272222,like chicken biryani,like playing football


In [31]:
import numpy as np

def create_bow_vectors(processed_sentences):
    """
    Generates Bag-of-Words (BoW) vectors for a list of preprocessed sentences
    and creates a vocabulary from all unique words.
    """
    # 1. Initialize an empty set vocabulary to store all unique words.
    vocabulary = set()

    # 2. Iterate through each processed_sentence to build the vocabulary
    for sentence in processed_sentences:
        words = sentence.split()
        for word in words:
            vocabulary.add(word)

    # 3. Convert the vocabulary set to a sorted list to maintain consistent ordering.
    vocabulary = sorted(list(vocabulary))
    print(f"Vocabulary created with {len(vocabulary)} unique words.")

    # 4. Initialize an empty list bow_vectors.
    bow_vectors = []

    # 5. For each processed_sentence, create its BoW vector
    for sentence in processed_sentences:
        # Create a BoW vector (a list or array of zeros) with a length equal to the size of the vocabulary.
        bow_vector = np.zeros(len(vocabulary), dtype=int)
        words = sentence.split()
        for word in words:
            if word in vocabulary:
                # Find its index in the vocabulary list.
                idx = vocabulary.index(word)
                # Increment the count at that index in the current sentence's BoW vector.
                bow_vector[idx] += 1
        bow_vectors.append(bow_vector)

    print("Bag-of-Words vectors generated.")

    # 6. Return the bow_vectors and the vocabulary list.
    return bow_vectors, vocabulary

print("Function 'create_bow_vectors' defined.")

Function 'create_bow_vectors' defined.


In [32]:
all_processed_sentences = sentence_pairs_df['Sentence1_processed'].tolist() + sentence_pairs_df['Sentence2_processed'].tolist()

bow_vectors, vocabulary = create_bow_vectors(all_processed_sentences)

print("BoW vectors and vocabulary generated for all processed sentences.")
print(f"First 5 BoW vectors:\n{bow_vectors[:5]}")
print(f"Vocabulary (first 10 words):\n{vocabulary[:10]}")

Vocabulary created with 9 unique words.
Bag-of-Words vectors generated.
BoW vectors and vocabulary generated for all processed sentences.
First 5 BoW vectors:
[array([0, 0, 0, 0, 1, 1, 0, 0, 1]), array([1, 1, 0, 0, 0, 1, 0, 0, 0]), array([0, 0, 0, 0, 0, 0, 1, 1, 0]), array([0, 0, 0, 0, 1, 1, 0, 0, 1]), array([1, 1, 0, 0, 0, 1, 0, 0, 0])]
Vocabulary (first 10 words):
['biryani', 'chicken', 'cricket', 'eating', 'football', 'like', 'love', 'msdhoni', 'playing']


## Represent Text Numerically (Bag-of-Words)


In [33]:
import numpy as np

# 2. Determine the number of sentence pairs in the sentence_pairs_df.
num_sentence_pairs = len(sentence_pairs_df)
print(f"Number of sentence pairs: {num_sentence_pairs}")

# 3. Split the bow_vectors list into two distinct lists:
# The total length of bow_vectors should be 2 * num_sentence_pairs

bow_vectors_s1 = bow_vectors[:num_sentence_pairs]
bow_vectors_s2 = bow_vectors[num_sentence_pairs:]

print(f"Length of bow_vectors_s1: {len(bow_vectors_s1)}")
print(f"Length of bow_vectors_s2: {len(bow_vectors_s2)}")

# 4. Convert both bow_vectors_s1 and bow_vectors_s2 into NumPy arrays.
bow_vectors_s1_np = np.array(bow_vectors_s1)
bow_vectors_s2_np = np.array(bow_vectors_s2)

print("Converted bow_vectors_s1 and bow_vectors_s2 to NumPy arrays.")

# 5. Print the shapes of the bow_vectors_s1 and bow_vectors_s2 NumPy arrays.
print(f"Shape of bow_vectors_s1_np: {bow_vectors_s1_np.shape}")
print(f"Shape of bow_vectors_s2_np: {bow_vectors_s2_np.shape}")

Number of sentence pairs: 5
Length of bow_vectors_s1: 5
Length of bow_vectors_s2: 5
Converted bow_vectors_s1 and bow_vectors_s2 to NumPy arrays.
Shape of bow_vectors_s1_np: (5, 9)
Shape of bow_vectors_s2_np: (5, 9)


# bow as a dataframe

In [45]:
# Create DataFrame for Sentence1 BoW vectors
sentence_labels = [f'Sentence {i+1}' for i in range(num_sentence_pairs)]
bow_df_s1 = pd.DataFrame(bow_vectors_s1_np, columns=vocabulary, index=sentence_labels)

print("Bag-of-Words DataFrame for Sentence1_processed:")
display(bow_df_s1)

# Create DataFrame for Sentence2 BoW vectors
bow_df_s2 = pd.DataFrame(bow_vectors_s2_np, columns=vocabulary, index=sentence_labels)

print("\nBag-of-Words DataFrame for Sentence2_processed:")
display(bow_df_s2)


Bag-of-Words DataFrame for Sentence1_processed:


Unnamed: 0,biryani,chicken,cricket,eating,football,like,love,msdhoni,playing
Sentence 1,0,0,0,0,1,1,0,0,1
Sentence 2,1,1,0,0,0,1,0,0,0
Sentence 3,0,0,0,0,0,0,1,1,0
Sentence 4,0,0,0,0,1,1,0,0,1
Sentence 5,1,1,0,0,0,1,0,0,0



Bag-of-Words DataFrame for Sentence2_processed:


Unnamed: 0,biryani,chicken,cricket,eating,football,like,love,msdhoni,playing
Sentence 1,0,0,1,0,0,1,0,0,1
Sentence 2,1,0,0,1,0,1,0,0,0
Sentence 3,0,0,0,0,0,1,0,1,0
Sentence 4,0,0,0,0,0,0,1,1,0
Sentence 5,0,0,0,0,1,1,0,0,1


## Compute Cosine Similarity



In [34]:
from sklearn.metrics.pairwise import cosine_similarity

print("Imported 'cosine_similarity' from sklearn.metrics.pairwise.")

Imported 'cosine_similarity' from sklearn.metrics.pairwise.


In [35]:
cosine_similarity_scores_bow = []

for i in range(len(sentence_pairs_df)):
    # Retrieve BoW vectors for Sentence1 and Sentence2
    vec1 = bow_vectors_s1_np[i].reshape(1, -1)
    vec2 = bow_vectors_s2_np[i].reshape(1, -1)

    # Calculate cosine similarity
    sim_score = cosine_similarity(vec1, vec2)[0][0]
    cosine_similarity_scores_bow.append(sim_score)

sentence_pairs_df['cosine_similarity_bow'] = cosine_similarity_scores_bow

print("Cosine similarity scores based on BoW vectors calculated and added to DataFrame.")
sentence_pairs_df.head()

Cosine similarity scores based on BoW vectors calculated and added to DataFrame.


Unnamed: 0,ID,Sentence1,Sentence2,cosine_similarity,jaccard_similarity,wordnet_similarity,Sentence1_processed,Sentence2_processed,cosine_similarity_bow
0,1,i like playing football,i like playing cricket,0.455321,0.5,0.438889,like playing football,like playing cricket,0.666667
1,2,i like chicken biryani,i like eating biryani,0.467251,0.5,0.358249,like chicken biryani,like eating biryani,0.666667
2,3,i love MSDhoni,i like MSDhoni,0.565398,0.333333,0.5,love msdhoni,like msdhoni,0.5
3,4,i like playing football,i love MSDhoni,0.0,0.0,0.305556,like playing football,love msdhoni,0.0
4,5,i like chicken biryani,i like playing football,0.14901,0.2,0.272222,like chicken biryani,like playing football,0.333333


## Compute Jaccard Similarity



In [36]:
def jaccard_similarity(text1, text2):
    """
    Calculates the Jaccard Similarity between two preprocessed text strings.
    """
    # 1. Split text into words and convert to sets
    set1 = set(text1.split())
    set2 = set(text2.split())

    # 2. Calculate the intersection of the two sets
    intersection = len(set1.intersection(set2))

    # 3. Calculate the union of the two sets
    union = len(set1.union(set2))

    # 4. Compute Jaccard similarity, handling division by zero
    if union == 0:
        return 0.0
    return intersection / union

print("Function 'jaccard_similarity' defined.")

Function 'jaccard_similarity' defined.


In [37]:
jaccard_similarity_scores_bow = []

for index, row in sentence_pairs_df.iterrows():
    sentence1_processed = row['Sentence1_processed']
    sentence2_processed = row['Sentence2_processed']

    score = jaccard_similarity(sentence1_processed, sentence2_processed)
    jaccard_similarity_scores_bow.append(score)

sentence_pairs_df['jaccard_similarity_bow'] = jaccard_similarity_scores_bow

print("Jaccard similarity scores based on processed texts calculated and added to DataFrame.")
sentence_pairs_df.head()

Jaccard similarity scores based on processed texts calculated and added to DataFrame.


Unnamed: 0,ID,Sentence1,Sentence2,cosine_similarity,jaccard_similarity,wordnet_similarity,Sentence1_processed,Sentence2_processed,cosine_similarity_bow,jaccard_similarity_bow
0,1,i like playing football,i like playing cricket,0.455321,0.5,0.438889,like playing football,like playing cricket,0.666667,0.5
1,2,i like chicken biryani,i like eating biryani,0.467251,0.5,0.358249,like chicken biryani,like eating biryani,0.666667,0.5
2,3,i love MSDhoni,i like MSDhoni,0.565398,0.333333,0.5,love msdhoni,like msdhoni,0.5,0.333333
3,4,i like playing football,i love MSDhoni,0.0,0.0,0.305556,like playing football,love msdhoni,0.0,0.0
4,5,i like chicken biryani,i like playing football,0.14901,0.2,0.272222,like chicken biryani,like playing football,0.333333,0.2


## Compute WordNet Semantic Similarity



In [38]:
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag

print("Imported 'wordnet' from nltk.corpus and 'word_tokenize' from nltk.tokenize.")

def get_wordnet_pos(word):
    """
    Maps NLTK POS tag to WordNet POS tag for lemmatization and synset retrieval.
    """
    # Get the first letter of the POS tag
    tag = pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ, "N": wordnet.NOUN, "V": wordnet.VERB, "R": wordnet.ADV}

    return tag_dict.get(tag, wordnet.NOUN)

def wordnet_similarity(text1, text2):
    """
    Calculates the WordNet-based semantic similarity between two preprocessed text strings.
    Uses wup_similarity between individual words and averages for sentences.
    """
    tokens1 = word_tokenize(text1)
    tokens2 = word_tokenize(text2)

    # Handle empty token lists
    if not tokens1 or not tokens2:
        return 0.0

    # Get all possible synsets for each word with appropriate POS tags
    synsets1 = []
    for word in tokens1:
        pos = get_wordnet_pos(word)
        synsets1.extend(wordnet.synsets(word, pos=pos))

    synsets2 = []
    for word in tokens2:
        pos = get_wordnet_pos(word)
        synsets2.extend(wordnet.synsets(word, pos=pos))

    if not synsets1 or not synsets2:
        return 0.0

    # Calculate pairwise Wu-Palmer similarity between all synsets
    scores = []
    for ss1 in synsets1:
        max_sim = 0.0
        for ss2 in synsets2:
            sim = ss1.wup_similarity(ss2)
            if sim is not None and sim > max_sim:
                max_sim = sim
        if max_sim > 0.0: # Only add if a valid similarity was found for ss1
            scores.append(max_sim)

    if not scores:
        return 0.0

    return sum(scores) / len(scores)

print("Functions 'get_wordnet_pos' and 'wordnet_similarity' defined.")

Imported 'wordnet' from nltk.corpus and 'word_tokenize' from nltk.tokenize.
Functions 'get_wordnet_pos' and 'wordnet_similarity' defined.


In [40]:
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
import nltk

# Download necessary NLTK data (if not already downloaded)
try:
    nltk.data.find('taggers/averaged_perceptron_tagger')
except LookupError:
    nltk.download('averaged_perceptron_tagger')

print("Imported 'wordnet' from nltk.corpus and 'word_tokenize' from nltk.tokenize.")

def get_wordnet_pos(word):
    """
    Maps NLTK POS tag to WordNet POS tag for lemmatization and synset retrieval.
    """
    # Get the first letter of the POS tag
    tag = pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ, "N": wordnet.NOUN, "V": wordnet.VERB, "R": wordnet.ADV}

    return tag_dict.get(tag, wordnet.NOUN)

def wordnet_similarity(text1, text2):
    """
    Calculates the WordNet-based semantic similarity between two preprocessed text strings.
    Uses wup_similarity between individual words and averages for sentences.
    """
    tokens1 = word_tokenize(text1)
    tokens2 = word_tokenize(text2)

    # Handle empty token lists
    if not tokens1 or not tokens2:
        return 0.0

    # Get all possible synsets for each word with appropriate POS tags
    synsets1 = []
    for word in tokens1:
        pos = get_wordnet_pos(word)
        synsets1.extend(wordnet.synsets(word, pos=pos))

    synsets2 = []
    for word in tokens2:
        pos = get_wordnet_pos(word)
        synsets2.extend(wordnet.synsets(word, pos=pos))

    if not synsets1 or not synsets2:
        return 0.0

    # Calculate pairwise Wu-Palmer similarity between all synsets
    scores = []
    for ss1 in synsets1:
        max_sim = 0.0
        for ss2 in synsets2:
            sim = ss1.wup_similarity(ss2)
            if sim is not None and sim > max_sim:
                max_sim = sim
        if max_sim > 0.0: # Only add if a valid similarity was found for ss1
            scores.append(max_sim)

    if not scores:
        return 0.0

    return sum(scores) / len(scores)

print("Functions 'get_wordnet_pos' and 'wordnet_similarity' defined.")

Imported 'wordnet' from nltk.corpus and 'word_tokenize' from nltk.tokenize.
Functions 'get_wordnet_pos' and 'wordnet_similarity' defined.


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


In [42]:
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
import nltk

# Download necessary NLTK data (if not already downloaded)
try:
    nltk.data.find('taggers/averaged_perceptron_tagger_eng')
except LookupError:
    nltk.download('averaged_perceptron_tagger_eng')

print("Imported 'wordnet' from nltk.corpus and 'word_tokenize' from nltk.tokenize.")

def get_wordnet_pos(word):
    """
    Maps NLTK POS tag to WordNet POS tag for lemmatization and synset retrieval.
    """
    # Get the first letter of the POS tag
    tag = pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ, "N": wordnet.NOUN, "V": wordnet.VERB, "R": wordnet.ADV}

    return tag_dict.get(tag, wordnet.NOUN)

def wordnet_similarity(text1, text2):
    """
    Calculates the WordNet-based semantic similarity between two preprocessed text strings.
    Uses wup_similarity between individual words and averages for sentences.
    """
    tokens1 = word_tokenize(text1)
    tokens2 = word_tokenize(text2)

    # Handle empty token lists
    if not tokens1 or not tokens2:
        return 0.0

    # Get all possible synsets for each word with appropriate POS tags
    synsets1 = []
    for word in tokens1:
        pos = get_wordnet_pos(word)
        synsets1.extend(wordnet.synsets(word, pos=pos))

    synsets2 = []
    for word in tokens2:
        pos = get_wordnet_pos(word)
        synsets2.extend(wordnet.synsets(word, pos=pos))

    if not synsets1 or not synsets2:
        return 0.0

    # Calculate pairwise Wu-Palmer similarity between all synsets
    scores = []
    for ss1 in synsets1:
        max_sim = 0.0
        for ss2 in synsets2:
            sim = ss1.wup_similarity(ss2)
            if sim is not None and sim > max_sim:
                max_sim = sim
        if max_sim > 0.0: # Only add if a valid similarity was found for ss1
            scores.append(max_sim)

    if not scores:
        return 0.0

    return sum(scores) / len(scores)

print("Functions 'get_wordnet_pos' and 'wordnet_similarity' defined.")

[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.


Imported 'wordnet' from nltk.corpus and 'word_tokenize' from nltk.tokenize.
Functions 'get_wordnet_pos' and 'wordnet_similarity' defined.


In [43]:
wordnet_similarity_scores_bow = []

for index, row in sentence_pairs_df.iterrows():
    sentence1_processed = row['Sentence1_processed']
    sentence2_processed = row['Sentence2_processed']

    score = wordnet_similarity(sentence1_processed, sentence2_processed)
    wordnet_similarity_scores_bow.append(score)

sentence_pairs_df['wordnet_similarity_bow'] = wordnet_similarity_scores_bow

print("WordNet semantic similarity scores based on processed texts calculated and added to DataFrame.")
sentence_pairs_df.head()

WordNet semantic similarity scores based on processed texts calculated and added to DataFrame.


Unnamed: 0,ID,Sentence1,Sentence2,cosine_similarity,jaccard_similarity,wordnet_similarity,Sentence1_processed,Sentence2_processed,cosine_similarity_bow,jaccard_similarity_bow,wordnet_similarity_bow
0,1,i like playing football,i like playing cricket,0.455321,0.5,0.438889,like playing football,like playing cricket,0.666667,0.5,0.981546
1,2,i like chicken biryani,i like eating biryani,0.467251,0.5,0.358249,like chicken biryani,like eating biryani,0.666667,0.5,0.597439
2,3,i love MSDhoni,i like MSDhoni,0.565398,0.333333,0.5,love msdhoni,like msdhoni,0.5,0.333333,0.253949
3,4,i like playing football,i love MSDhoni,0.0,0.0,0.305556,like playing football,love msdhoni,0.0,0.0,0.204836
4,5,i like chicken biryani,i like playing football,0.14901,0.2,0.272222,like chicken biryani,like playing football,0.333333,0.2,0.524602
