<a href="https://colab.research.google.com/github/Ashwitha-Pabba/NLP/blob/main/Assignment_7_4_2403a52223_NLP_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Import Libraries**

In [None]:

import numpy as np
import pandas as pd
import nltk
import string
import re

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer


**Create Dataset**

In [None]:
# Sentence pairs to compare; each tuple is (Sentence1, Sentence2).
data = [
    ("The doctor is treating a patient", "A physician is helping a sick person"),
    ("I love machine learning", "I enjoy studying AI"),
    ("The cat sits on the mat", "The cat is sitting on the mat"),
    ("He plays football", "She is cooking dinner"),
    ("Weather is very hot today", "It is extremely warm outside"),
    ("Python is a programming language", "Bananas are yellow"),
    ("I am reading a book", "I am studying from a textbook"),
    ("Cars move fast", "Vehicles travel quickly"),
    ("She likes music", "She enjoys songs"),
    ("Open the door", "Close the window")
]

# One row per pair, one column per side of the pair.
df = pd.DataFrame.from_records(data, columns=["Sentence1", "Sentence2"])

print(df.head())

                          Sentence1                             Sentence2
0  The doctor is treating a patient  A physician is helping a sick person
1           I love machine learning                   I enjoy studying AI
2           The cat sits on the mat         The cat is sitting on the mat
3                 He plays football                 She is cooking dinner
4         Weather is very hot today          It is extremely warm outside


**Preprocessing**

In [None]:
# Fetch the NLTK corpora used below: the stop-word list and WordNet.
for nltk_resource in ('stopwords', 'wordnet'):
    nltk.download(nltk_resource)

# Shared by preprocess(): a set gives fast membership tests for stop words.
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess(text):
    """Normalize a sentence into a space-separated string of lemmas.

    Steps, in order: lowercase the text, delete every character that is not
    ``a-z`` or whitespace, split on whitespace, drop tokens found in the
    module-level ``stop_words`` set, and pass each remaining token through
    ``lemmatizer.lemmatize`` (called without a part-of-speech argument).

    Args:
        text: The raw sentence as a string.

    Returns:
        The surviving lemmas joined by single spaces; an empty string when
        no token survives.
    """
    # Deleted characters are removed outright, not replaced by a space.
    letters_only = re.sub(r'[^a-z\s]', '', text.lower())
    lemmas = (
        lemmatizer.lemmatize(word)
        for word in letters_only.split()
        if word not in stop_words
    )
    return " ".join(lemmas)

# Run both sentence columns through the same cleaning pipeline.
for raw_col, clean_col in (("Sentence1", "clean1"), ("Sentence2", "clean2")):
    df[clean_col] = df[raw_col].apply(preprocess)

df.head()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Unnamed: 0,Sentence1,Sentence2,clean1,clean2
0,The doctor is treating a patient,A physician is helping a sick person,doctor treating patient,physician helping sick person
1,I love machine learning,I enjoy studying AI,love machine learning,enjoy studying ai
2,The cat sits on the mat,The cat is sitting on the mat,cat sits mat,cat sitting mat
3,He plays football,She is cooking dinner,play football,cooking dinner
4,Weather is very hot today,It is extremely warm outside,weather hot today,extremely warm outside


**TF-IDF Representation**

In [None]:
# Stack all first sentences followed by all second sentences so that both
# sides of every pair share one vocabulary; row i pairs with row i + len(df).
combined = [*df["clean1"].tolist(), *df["clean2"].tolist()]

vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(combined)


**Cosine Similarity**

In [None]:
# The first len(df) rows of tfidf_matrix hold the clean1 sentences and the
# next len(df) rows hold the matching clean2 sentences.
n_pairs = len(df)

# cosine_similarity returns a 1x1 matrix for two single-row inputs.
cosine_scores = [
    cosine_similarity(tfidf_matrix[idx], tfidf_matrix[idx + n_pairs])[0][0]
    for idx in range(n_pairs)
]

df["cosine_similarity"] = cosine_scores
df[["Sentence1", "Sentence2", "cosine_similarity"]].head(10)

Unnamed: 0,Sentence1,Sentence2,cosine_similarity
0,The doctor is treating a patient,A physician is helping a sick person,0.0
1,I love machine learning,I enjoy studying AI,0.0
2,The cat sits on the mat,The cat is sitting on the mat,0.607125
3,He plays football,She is cooking dinner,0.0
4,Weather is very hot today,It is extremely warm outside,0.0
5,Python is a programming language,Bananas are yellow,0.0
6,I am reading a book,I am studying from a textbook,0.0
7,Cars move fast,Vehicles travel quickly,0.0
8,She likes music,She enjoys songs,0.0
9,Open the door,Close the window,0.0


**Jaccard Similarity**

In [None]:
def jaccard_similarity(s1, s2):
    """Return the Jaccard similarity of the whitespace-token sets of two strings.

    The score is ``|A & B| / |A | B|`` where ``A`` and ``B`` are the sets of
    tokens obtained by splitting each string on whitespace.

    Args:
        s1: First string of space-separated tokens.
        s2: Second string of space-separated tokens.

    Returns:
        A float in ``[0.0, 1.0]``. When neither string contains any token
        the union is empty and ``0.0`` is returned instead of dividing by
        zero.
    """
    set1 = set(s1.split())
    set2 = set(s2.split())
    union = set1 | set2
    # Both inputs can be empty (e.g. a sentence made only of removed words);
    # treat "nothing to compare" as no similarity rather than raising.
    if not union:
        return 0.0
    return len(set1 & set2) / len(union)

# Score each cleaned sentence pair, row by row.
df["jaccard_similarity"] = [
    jaccard_similarity(first, second)
    for first, second in zip(df["clean1"], df["clean2"])
]

df[["Sentence1", "Sentence2", "jaccard_similarity"]].head(10)

Unnamed: 0,Sentence1,Sentence2,jaccard_similarity
0,The doctor is treating a patient,A physician is helping a sick person,0.0
1,I love machine learning,I enjoy studying AI,0.0
2,The cat sits on the mat,The cat is sitting on the mat,0.5
3,He plays football,She is cooking dinner,0.0
4,Weather is very hot today,It is extremely warm outside,0.0
5,Python is a programming language,Bananas are yellow,0.0
6,I am reading a book,I am studying from a textbook,0.0
7,Cars move fast,Vehicles travel quickly,0.0
8,She likes music,She enjoys songs,0.0
9,Open the door,Close the window,0.0


**WordNet Similarity**

In [None]:
def wordnet_sentence_similarity(s1, s2):
    """Average best Wu-Palmer similarity from tokens of ``s1`` to tokens of ``s2``.

    Each token is represented by the first synset that ``wordnet.synsets``
    returns for it; tokens with no synsets are ignored. For every usable
    token of ``s1`` the highest ``wup_similarity`` against the usable tokens
    of ``s2`` is taken, and the non-zero best scores are averaged.

    The measure is directional: only tokens of ``s1`` contribute terms to
    the average, so swapping the arguments can change the result.

    Args:
        s1: First string of space-separated tokens.
        s2: Second string of space-separated tokens.

    Returns:
        The mean of the per-token best scores as a float, or ``0.0`` when no
        token of ``s1`` obtains a non-zero score.
    """
    # The s2 synsets do not depend on the s1 token being scored, so look
    # them up once instead of once per s1 token.
    targets = [syns[0] for syns in map(wordnet.synsets, s2.split()) if syns]

    best_scores = []
    for w1 in s1.split():
        syn1 = wordnet.synsets(w1)
        if not syn1:
            continue

        best_score = 0
        for target in targets:
            score = syn1[0].wup_similarity(target)
            # wup_similarity may yield a falsy value (e.g. None); skip those.
            if score and score > best_score:
                best_score = score

        # Tokens with no positive match are left out of the average.
        if best_score:
            best_scores.append(best_score)

    return sum(best_scores) / len(best_scores) if best_scores else 0.0

# Score each cleaned sentence pair (clean1 -> clean2 direction).
df["wordnet_similarity"] = list(
    map(wordnet_sentence_similarity, df["clean1"], df["clean2"])
)

df[["Sentence1", "Sentence2", "wordnet_similarity"]].head(10)


Unnamed: 0,Sentence1,Sentence2,wordnet_similarity
0,The doctor is treating a patient,A physician is helping a sick person,0.65
1,I love machine learning,I enjoy studying AI,0.352273
2,The cat sits on the mat,The cat is sitting on the mat,0.727273
3,He plays football,She is cooking dinner,0.388158
4,Weather is very hot today,It is extremely warm outside,0.335979
5,Python is a programming language,Bananas are yellow,0.349542
6,I am reading a book,I am studying from a textbook,0.946779
7,Cars move fast,Vehicles travel quickly,0.664327
8,She likes music,She enjoys songs,0.534314
9,Open the door,Close the window,0.503289


**Comparison Summary**

In [None]:
# Summary statistics for the three similarity measures, side by side.
similarity_columns = ["cosine_similarity", "jaccard_similarity", "wordnet_similarity"]
df[similarity_columns].describe()


Unnamed: 0,cosine_similarity,jaccard_similarity,wordnet_similarity
count,10.0,10.0,10.0
mean,0.060712,0.05,0.545193
std,0.19199,0.158114,0.201495
min,0.0,0.0,0.335979
25%,0.0,0.0,0.361244
50%,0.0,0.0,0.518802
75%,0.0,0.0,0.660746
max,0.607125,0.5,0.946779
