<a href="https://colab.research.google.com/github/siddhesh1503/NLP/blob/main/WSD.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install nltk scikit-learn pandas


In [None]:
import nltk

# Resources needed by the cells below: WordNet (plus the Open Multilingual
# WordNet data) for sense lookup, the Punkt models behind word_tokenize, and
# the averaged-perceptron POS tagger.
#
# Recent NLTK releases (3.9 and later) load the pickle-free "punkt_tab" and
# "averaged_perceptron_tagger_eng" packages instead of the legacy "punkt" and
# "averaged_perceptron_tagger" ones, and raise LookupError if they are missing.
# Both generations are requested so the notebook runs on old and new installs;
# nltk.download() only reports a failure for an unknown id, it does not raise.
nltk.download('averaged_perceptron_tagger')
nltk.download('averaged_perceptron_tagger_eng')
nltk.download('omw-1.4')
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('wordnet')

In [None]:
from nltk.corpus import wordnet as wn
from nltk.tokenize import word_tokenize
import string

def normalize(words):
    """Lower-case tokens and trim punctuation from both ends.

    Tokens that consist solely of ASCII punctuation (or are empty) are
    dropped; punctuation inside a token is left untouched.

    Args:
        words: Iterable of string tokens.

    Returns:
        A new list with the cleaned tokens, in their original order.
    """
    punct = string.punctuation
    cleaned = []
    for token in words:
        # Skip tokens that have nothing left once the edges are trimmed.
        if not token.strip(punct):
            continue
        cleaned.append(token.lower().strip(punct))
    return cleaned

def simplified_lesk(context_sentence, ambiguous_word, pos=None):
    """Choose a WordNet sense for ``ambiguous_word`` using Simplified Lesk.

    Every candidate synset gets a "signature": the set of normalized words
    from its definition, its usage examples and its lemma names. The synset
    whose signature shares the most distinct words with the normalized tokens
    of ``context_sentence`` is selected; on a tie the synset listed first by
    WordNet wins.

    Args:
        context_sentence: Sentence in which the ambiguous word occurs.
        ambiguous_word: Word to look up in WordNet.
        pos: Optional WordNet part-of-speech filter; a falsy value searches
            all parts of speech.

    Returns:
        The best-scoring synset. If no signature overlaps the context, the
        first synset returned by WordNet is used as the default sense
        (conventionally the most frequent one). ``None`` is returned only
        when WordNet has no synsets for the word.
    """
    # Built once as a set: it is identical for every candidate sense.
    context = set(normalize(word_tokenize(context_sentence)))

    synsets = wn.synsets(ambiguous_word, pos=pos) if pos else wn.synsets(ambiguous_word)
    if not synsets:
        return None

    # Simplified Lesk starts from a default sense rather than "no answer",
    # so a context with zero overlap still yields a prediction.
    best_sense = synsets[0]
    max_overlap = 0

    for sense in synsets:
        signature = set(normalize(word_tokenize(sense.definition() or "")))
        for ex in sense.examples():
            signature.update(normalize(word_tokenize(ex)))
        # Multi-word lemma names use '_' as separator. Split them into single
        # words, because a joined phrase can never equal one context token.
        for lemma_name in sense.lemma_names():
            signature.update(part for part in lemma_name.lower().split('_') if part)

        overlap = len(signature & context)
        # Strict '>' keeps the earliest synset on ties.
        if overlap > max_overlap:
            max_overlap = overlap
            best_sense = sense

    return best_sense

# Demo inputs: (sentence, ambiguous word to disambiguate in that sentence)
sents = [
    ("I went to the bank to deposit my paycheck.", "bank"),
    ("The river overflowed the bank after the heavy rain.", "bank"),
    ("He sat on the bank and watched the ducks.", "bank"),
    ("I need to charge my phone", "charge"),
    ("The soldier received a charge by the court", "charge"),
]

# Run the disambiguator on each pair and report the chosen synset, if any.
for sent, word in sents:
    sense = simplified_lesk(sent, word)
    print(f"Sentence: {sent}")
    if not sense:
        print("No sense found.\n")
        continue
    print(f"Predicted sense: {sense.name()} -> {sense.definition()}\n")


Sentence: I went to the bank to deposit my paycheck.
Predicted sense: depository_financial_institution.n.01 -> a financial institution that accepts deposits and channels the money into lending activities

Sentence: The river overflowed the bank after the heavy rain.
Predicted sense: bank.n.01 -> sloping land (especially the slope beside a body of water)

Sentence: He sat on the bank and watched the ducks.
Predicted sense: bank.n.01 -> sloping land (especially the slope beside a body of water)

Sentence: I need to charge my phone
Predicted sense: charge.v.24 -> energize a battery by passing a current through it in the direction opposite to discharge

Sentence: The soldier received a charge by the court
Predicted sense: charge.v.12 -> pay with a credit card; pay with plastic money; postpone payment by recording a purchase as a debt



In [None]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
import nltk
from nltk import word_tokenize, pos_tag
import re

def get_context_window(sentence, target_idx, window=3):
    """Return the tokens surrounding position ``target_idx`` in ``sentence``.

    The sentence is tokenized with ``word_tokenize`` and a slice is taken
    that spans up to ``window`` tokens on each side of ``target_idx``
    (the token at ``target_idx`` itself included), clipped to the sentence
    boundaries.

    Args:
        sentence: Raw sentence text.
        target_idx: Token index around which the window is centred.
        window: Maximum number of tokens to keep on each side.

    Returns:
        The list of tokens inside the window.
    """
    tokens = word_tokenize(sentence)

    # Clip the lower bound at the start of the sentence.
    start = target_idx - window
    if start < 0:
        start = 0

    # Clip the (exclusive) upper bound at the end of the sentence.
    stop = target_idx + window + 1
    if stop > len(tokens):
        stop = len(tokens)

    return tokens[start:stop]

def find_target_index(sentence, target_word):
    """Locate ``target_word`` among the tokens of ``sentence``.

    Lookup order:
      1. the first token that equals ``target_word`` ignoring case;
      2. otherwise the first token whose lower-cased form starts with the
         lower-cased ``target_word``;
      3. otherwise ``0``.

    Args:
        sentence: Raw sentence text, tokenized with ``word_tokenize``.
        target_word: Word to search for.

    Returns:
        The index of the matching token, or ``0`` when nothing matches.
    """
    tokens = word_tokenize(sentence)

    # Pass 1: whole-token, case-insensitive match.
    exact = next(
        (
            pos
            for pos, tok in enumerate(tokens)
            if re.fullmatch(re.escape(target_word), tok, flags=re.IGNORECASE)
        ),
        None,
    )
    if exact is not None:
        return exact

    # Pass 2: prefix match on lower-cased text.
    prefixed = next(
        (
            pos
            for pos, tok in enumerate(tokens)
            if tok.lower().startswith(target_word.lower())
        ),
        None,
    )

    # No match at all falls back to the first position.
    return 0 if prefixed is None else prefixed

def train_evaluate(df, test_size=0.2, random_state=42, window=3):
    """Train and evaluate a Naive Bayes sense classifier on context windows.

    For each row, the target word is located in the sentence and the tokens
    within ``window`` positions of it are joined into a short text. The texts
    are split into train and test parts, turned into bag-of-words counts and
    used to fit a ``MultinomialNB`` model. Accuracy and a classification
    report for the test part are printed.

    Args:
        df: DataFrame with the columns ``'sentence'``, ``'target_word'`` and
            ``'sense'`` (the sense label is converted to ``str``).
        test_size: Passed to ``train_test_split``.
        random_state: Seed passed to ``train_test_split``.
        window: Number of tokens kept on each side of the target word.

    Returns:
        A ``(clf, vectorizer)`` tuple: the fitted classifier and the
        ``CountVectorizer`` fitted on the training texts.
    """
    sentences = df['sentence'].tolist()
    targets = df['target_word'].tolist()
    y = df['sense'].astype(str).tolist()

    # Extract context windows
    context_texts = [
        " ".join(get_context_window(sent, find_target_index(sent, target), window=window))
        for sent, target in zip(sentences, targets)
    ]

    # Stratify only when there is more than one class and every class has at
    # least two members; a class with a single member makes a stratified
    # split raise.
    labels = set(y)
    can_stratify = len(labels) > 1 and min(y.count(label) for label in labels) >= 2

    # Split the raw texts first so the vocabulary is learned from the
    # training part only; fitting the vectorizer on all texts would leak
    # information about the test set into the model.
    texts_train, texts_test, y_train, y_test = train_test_split(
        context_texts, y,
        test_size=test_size,
        random_state=random_state,
        stratify=y if can_stratify else None,
    )

    vectorizer = CountVectorizer(max_features=2000)
    X_train = vectorizer.fit_transform(texts_train)
    X_test = vectorizer.transform(texts_test)

    clf = MultinomialNB()
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("Classification Report:")
    print(classification_report(y_test, y_pred, zero_division=0))
    return clf, vectorizer

# Toy corpus: one (sentence, target word, gold sense label) tuple per example
data = [
    ("I went to the bank to deposit my paycheck.", "bank", "bank_financial"),
    ("The river overflowed the bank after the heavy rain.", "bank", "bank_river"),
    ("He sat on the bank and watched the ducks.", "bank", "bank_river"),
    ("She opened an account at the bank this morning.", "bank", "bank_financial"),
    ("The boat was pulled up onto the river bank.", "bank", "bank_river"),
    ("He withdrew cash from the bank ATM.", "bank", "bank_financial"),
    ("We picnicked on the grassy bank by the river.", "bank", "bank_river"),
]

# Column names must match what train_evaluate reads from the frame.
df = pd.DataFrame(
    data,
    columns=['sentence', 'target_word', 'sense'],
)

# Hold out 40% of the rows and use a two-token window around the target word.
clf, vect = train_evaluate(df, window=2, test_size=0.4)


Accuracy: 0.6666666666666666
Classification Report:
                precision    recall  f1-score   support

bank_financial       0.00      0.00      0.00         1
    bank_river       0.67      1.00      0.80         2

      accuracy                           0.67         3
     macro avg       0.33      0.50      0.40         3
  weighted avg       0.44      0.67      0.53         3

