In [18]:
import re
import pandas as pd
import nltk
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, classification_report
import tkinter as tk
from sklearn.model_selection import train_test_split
import json

In [19]:
def load_dataset(path='Dataset.json'):
    """Load the word-sense dataset from a JSON file into a DataFrame.

    The file is expected to hold a JSON array of objects, each providing the
    keys ``word``, ``translation``, ``senses`` and ``disambiguation``.

    Args:
        path: Location of the JSON file. Defaults to ``'Dataset.json'``
            (resolved against the current working directory), which is what
            callers that pass no argument get.

    Returns:
        pandas.DataFrame with one row per entry and the columns ``Word``,
        ``Translation``, ``Sense`` and ``Meaning`` (always present, even when
        the file contains an empty array). ``Meaning`` is a single string;
        if ``disambiguation`` is a list/tuple of strings its items are joined
        with ``'; '``.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
        KeyError: If an entry lacks one of the required keys.
        TypeError: If a ``disambiguation`` value (or list item) is not a string.
    """
    columns = ['Word', 'Translation', 'Sense', 'Meaning']

    with open(path, 'r', encoding='utf-8') as f:
        entries = json.load(f)

    records = []
    for item in entries:
        disambiguation = item['disambiguation']
        # Accept either one meaning (a string) or several (a list/tuple);
        # wrapping a list in another list would make the join below fail.
        if isinstance(disambiguation, (list, tuple)):
            meanings = list(disambiguation)
        else:
            meanings = [disambiguation]

        records.append({
            'Word': item['word'],
            'Translation': item['translation'],
            'Sense': item['senses'],
            'Meaning': '; '.join(meanings),
        })

    # Passing the columns explicitly keeps the schema stable for an empty file.
    return pd.DataFrame(records, columns=columns)

In [20]:
# Preprocessing Functions
def preprocess_text(text):
    """Normalise raw text into a space-separated string of lemmatised tokens.

    Pipeline, in order:
      1. lowercase the input;
      2. replace every character outside the allowed character class with a
         space (the class keeps ASCII letters plus the code-point ranges
         ``ಀ``-U+25FF, U+2600-U+26FF and U+2700-U+27BF);
      3. tokenize with ``nltk.word_tokenize``;
      4. drop tokens found in NLTK's English stopword list;
      5. lemmatize the survivors with ``WordNetLemmatizer``.

    Args:
        text: The string to normalise.

    Returns:
        The processed tokens joined by single spaces (``''`` if none remain).
    """
    # Digits, punctuation and anything else outside the class become spaces.
    cleaned = re.sub(r'[^a-zA-Zಀ-\u25FF\u2600-\u26FF\u2700-\u27BF]', ' ', text.lower())

    tokens = nltk.word_tokenize(cleaned)

    english_stopwords = set(stopwords.words('english'))
    kept_tokens = [token for token in tokens if token not in english_stopwords]

    lemmatizer = WordNetLemmatizer()
    return ' '.join(lemmatizer.lemmatize(token) for token in kept_tokens)

In [21]:
# Feature Extraction Functions
def extract_features(word):
    """Build the feature dictionary for a single token.

    The only feature produced is ``'POS'``: the tag ``pos_tag`` assigns to the
    word when it is tagged on its own (a one-element token list).

    Args:
        word: The token to describe.

    Returns:
        ``{'POS': <tag>}``, or an empty dict if the tagger returned nothing.
    """
    tagged = pos_tag([word])
    if len(tagged) == 0:
        return {}
    # Each tagger result is a (token, tag) pair; only the tag is kept.
    return {'POS': tagged[0][1]}


In [22]:
# Rule-Based Disambiguation
def rule_based_disambiguate_word(word, dataset, features):
    """Look up a word's meaning in the dataset using exact-match rules.

    A row matches when its ``Word`` or ``Translation`` equals ``word`` and,
    for every ``name -> value`` pair in ``features``, the row's ``name``
    column equals ``value``.

    The previous implementation tested ``value in row``; on a pandas Series
    that checks the index labels (i.e. the column names), not the cell values,
    so a feature such as ``{'POS': 'NN'}`` could never match.

    Args:
        word: Token to look up.
        dataset: DataFrame with at least ``Word``, ``Translation`` and
            ``Meaning`` columns.
        features: Mapping of feature name to required value. A feature whose
            name is not a column of ``dataset`` cannot be verified and is
            treated as a mismatch. An empty mapping imposes no constraint.

    Returns:
        The ``Meaning`` of the first matching row (in dataset order), or
        ``None`` when nothing matches.
    """
    if dataset.empty:
        return None

    # A feature without a corresponding column can never be confirmed.
    if any(name not in dataset.columns for name in features):
        return None

    mask = (dataset['Word'] == word) | (dataset['Translation'] == word)
    for name, expected in features.items():
        mask = mask & (dataset[name] == expected)

    meanings = dataset.loc[mask, 'Meaning']
    if len(meanings) == 0:
        return None
    return meanings.iloc[0]

In [23]:
# Model Training
def train_model(dataset):
    """Fit a TF-IDF + LinearSVC sense classifier on the dataset.

    Side effects:
        Adds a ``Preprocessed`` column to ``dataset`` (in place) holding
        ``preprocess_text`` applied to each ``Meaning``, and prints the
        accuracy measured on a 20% hold-out split.

    Args:
        dataset: DataFrame with ``Meaning`` (text) and ``Sense`` (label)
            columns.

    Returns:
        Tuple ``(model, vectorizer)``: the fitted ``LinearSVC`` and the fitted
        ``TfidfVectorizer``.
    """
    # Stored on the caller's frame; measure_performance reads this column.
    dataset['Preprocessed'] = dataset['Meaning'].apply(preprocess_text)

    # NOTE(review): the vectorizer is fitted on every row before the split,
    # so its vocabulary/IDF weights also reflect the hold-out rows.
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(dataset['Preprocessed'])
    sense_labels = dataset['Sense']

    # Fixed seed keeps the 80/20 split reproducible between runs.
    train_matrix, holdout_matrix, train_labels, holdout_labels = train_test_split(
        tfidf_matrix, sense_labels, test_size=0.2, random_state=42
    )

    model = LinearSVC()
    model.fit(train_matrix, train_labels)

    holdout_predictions = model.predict(holdout_matrix)
    accuracy = accuracy_score(holdout_labels, holdout_predictions)
    print(f"Model accuracy: {accuracy}")

    return model, vectorizer

In [24]:
# Model Prediction
def predict_sense(text, dataset, model, vectorizer):
    """Predict senses for the words of ``text`` and collect candidate meanings.

    Each preprocessed token is first tried against the rule-based lookup.
    Tokens the rules cannot resolve fall back to the classifier, which
    predicts one sense label for the whole preprocessed text.

    The classifier input does not depend on the token being examined, so the
    prediction is computed at most once (lazily, on the first unresolved
    token) instead of once per token.

    Args:
        text: Raw input sentence.
        dataset: DataFrame with ``Word``, ``Sense`` and ``Meaning`` columns
            (plus whatever ``rule_based_disambiguate_word`` reads).
        model: Fitted classifier exposing ``predict``.
        vectorizer: Fitted vectorizer exposing ``transform``.

    Returns:
        Tuple ``(word_senses, ambiguous_words)``:

        * ``word_senses`` maps each unresolved token that also appears in
          ``dataset['Word']`` to the predicted sense label.
        * ``ambiguous_words`` maps each unresolved token having at least one
          dataset meaning for the predicted sense to
          ``{'count': int, 'meanings': list}``. ``count`` is incremented and
          ``meanings`` extended once per occurrence of the token in the text.
    """
    preprocessed_text = preprocess_text(text)
    word_senses = {}
    ambiguous_words = {}
    predicted_senses = None  # computed on demand, then reused for every token

    for word in preprocessed_text.split():
        features = extract_features(word)
        if rule_based_disambiguate_word(word, dataset, features) is not None:
            # Resolved by the rules: nothing further is recorded for it.
            continue

        if predicted_senses is None:
            predicted_senses = model.predict(vectorizer.transform([preprocessed_text]))

        word_senses[word] = predicted_senses[0]
        entry = ambiguous_words.setdefault(word, {'count': 0, 'meanings': []})

        word_meanings = dataset.loc[
            (dataset['Word'] == word) & (dataset['Sense'].isin(predicted_senses)),
            'Meaning',
        ].unique()
        if len(word_meanings) > 0:
            entry['count'] += 1
            entry['meanings'].extend(list(word_meanings))

    # Drop tokens for which no dataset meaning was found.
    ambiguous_words = {word: data for word, data in ambiguous_words.items() if data['count'] > 0}

    # Keep only senses for tokens that exist in the dataset. The lookup set is
    # built once rather than scanning the column for every token.
    if word_senses:
        known_words = set(dataset['Word'].values)
        word_senses = {word: sense for word, sense in word_senses.items() if word in known_words}

    return word_senses, ambiguous_words

In [25]:
# User Interface Functions
def disambiguate(output_text):
    """Disambiguate the sentence in the input box and append the results.

    Reads the module-level ``input_text`` widget and the module-level
    ``dataset``, ``model`` and ``vectorizer``; writes to ``output_text``.
    Nothing is written when the input box is empty or whitespace only.

    Output order: the ambiguous-word count, one section per ambiguous word,
    a "No ambiguous words found" line if there were none, then one section
    for every remaining word in ``word_senses`` listing all of its dataset
    meanings.

    Args:
        output_text: Text widget (anything with ``insert(index, string)``)
            that receives the report. Existing content is left in place.
    """
    global input_text, dataset, model, vectorizer

    text = input_text.get("1.0", "end").strip()
    if not text:
        return

    word_senses, ambiguous_words = predict_sense(text, dataset, model, vectorizer)

    def write_section(word, meanings):
        # One "Word / Meanings" section with a 1-based numbered list.
        output_text.insert(tk.END, f"Word: {word}\n")
        output_text.insert(tk.END, "Meanings:\n")
        numbered = [f"{position}. {meaning}" for position, meaning in enumerate(meanings, start=1)]
        output_text.insert(tk.END, "\n".join(numbered))
        output_text.insert(tk.END, "\n\n")

    # A word counts as ambiguous only if it has at least one real meaning,
    # i.e. one that is not the 'Meaning not found' placeholder.
    ambiguous_words_list = [
        word
        for word, data in ambiguous_words.items()
        if any(meaning != 'Meaning not found' for meaning in data['meanings'])
    ]
    total_ambiguous = len(ambiguous_words_list)

    output_text.insert(tk.END, f"Ambiguous words count: {total_ambiguous}\n\n")

    for word in ambiguous_words_list:
        write_section(word, ambiguous_words[word]['meanings'])

    if total_ambiguous == 0:
        output_text.insert(tk.END, "No ambiguous words found\n\n")

    # Remaining words: show every meaning the dataset holds for them.
    for word in word_senses:
        if word in ambiguous_words_list:
            continue
        all_meanings = dataset.loc[dataset['Word'] == word, 'Meaning'].values
        write_section(word, all_meanings)

In [26]:
def measure_performance(dataset, model, vectorizer, output_text):
    """Append a classification report and accuracy for held-out data.

    The evaluation rows are obtained by repeating the split that
    ``train_model`` performs (``test_size=0.2, random_state=42``). For the
    same number of rows and the same seed ``train_test_split`` selects the
    same test indices, so the metrics are computed on rows the model was not
    fitted on. The previous ``dataset.sample(frac=0.2)`` drew rows
    independently of that split, so most of them were training rows and the
    reported scores were optimistic.

    Args:
        dataset: DataFrame with a ``Sense`` column and either a
            ``Preprocessed`` column (as added by ``train_model``) or a
            ``Meaning`` column from which it can be derived.
        model: Fitted classifier exposing ``predict``.
        vectorizer: Fitted vectorizer exposing ``transform``.
        output_text: Text widget (anything with ``insert(index, string)``)
            that receives the report.
    """
    # Must stay in sync with the split parameters used in train_model.
    _, test_data = train_test_split(dataset, test_size=0.2, random_state=42)

    # Do not depend on train_model having already mutated the frame.
    if 'Preprocessed' in test_data.columns:
        test_texts = test_data['Preprocessed']
    else:
        test_texts = test_data['Meaning'].apply(preprocess_text)

    test_X = vectorizer.transform(test_texts)
    test_y = test_data['Sense']

    # Predict the senses
    predicted_y = model.predict(test_X)

    # zero_division=0 reports 0 for labels with no predicted/true samples,
    # the same value as the default, without emitting a warning per metric.
    report = classification_report(test_y, predicted_y, zero_division=0)
    output_text.insert(tk.END, "Classification Report:\n")
    output_text.insert(tk.END, report)
    output_text.insert(tk.END, "\n")

    # Calculate and display the accuracy
    accuracy = accuracy_score(test_y, predicted_y)
    output_text.insert(tk.END, f"Accuracy: {accuracy}\n")

def main():
    """Load data, train the model and run the Tkinter disambiguation UI.

    Publishes ``input_text``, ``dataset``, ``model``, ``vectorizer`` and
    ``output_text`` as module-level globals because ``disambiguate`` reads
    them. Blocks in ``root.mainloop()`` until the window is closed.
    """
    global input_text, dataset, model, vectorizer, output_text

    # Data and model come first: train_model prints the hold-out accuracy.
    dataset = load_dataset()
    model, vectorizer = train_model(dataset)

    # Window setup.
    root = tk.Tk()
    root.title("Word Sense Disambiguation")
    root.geometry("400x400")

    # Results area; the performance report is written into it right away.
    output_text = tk.Text(root, height=15)
    output_text.pack()
    measure_performance(dataset, model, vectorizer, output_text)

    tk.Label(root, text="Enter a sentence:").pack()

    input_text = tk.Text(root, height=5)
    input_text.pack()

    def on_disambiguate_clicked():
        # Looks up output_text at click time, as a lambda would.
        disambiguate(output_text)

    tk.Button(root, text="Disambiguate", command=on_disambiguate_clicked).pack()

    root.mainloop()

# Entry point: start the application only when this code is executed as the
# main program (not when it is imported as a module). main() blocks until the
# Tk window is closed.
if __name__ == "__main__":
    main()

Model accuracy: 0.8676470588235294


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [27]:
pip install nltk


Note: you may need to restart the kernel to use updated packages.


In [28]:
# Fetch the 'punkt' resource with NLTK's downloader.
# Presumably required by nltk.word_tokenize in preprocess_text — confirm.
import nltk
nltk.download('punkt')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\91901\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [29]:
# NOTE(review): this cell repeats the 'punkt' download of the preceding cell;
# the recorded output shows the package was already up to date.
import nltk
nltk.download('punkt')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\91901\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [30]:
 # Fetch the 'stopwords' corpus; preprocess_text reads stopwords.words('english').
 # NOTE(review): the lines of this cell carry a stray leading space.
 import nltk
 nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\91901\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [31]:
# Fetch the 'wordnet' corpus.
# Presumably required by WordNetLemmatizer in preprocess_text — confirm.
import nltk
nltk.download('wordnet')


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\91901\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [32]:
# Fetch the 'omw-1.4' resource.
# NOTE(review): nothing in the visible code references it directly; presumably
# a dependency of the WordNet corpus reader — confirm it is still needed.
nltk.download('omw-1.4')


[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\91901\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [33]:
# Diagnostic: print nltk.data.path, the list of directories shown in the
# output below (presumably where NLTK looks for downloaded resources).
import nltk
print(nltk.data.path)


['C:\\Users\\91901/nltk_data', 'C:\\Users\\91901\\anaconda3\\nltk_data', 'C:\\Users\\91901\\anaconda3\\share\\nltk_data', 'C:\\Users\\91901\\anaconda3\\lib\\nltk_data', 'C:\\Users\\91901\\AppData\\Roaming\\nltk_data', 'C:\\nltk_data', 'D:\\nltk_data', 'E:\\nltk_data']


In [34]:
# Fetch the 'averaged_perceptron_tagger' resource.
# Presumably the model behind pos_tag, which extract_features calls — confirm.
import nltk
nltk.download('averaged_perceptron_tagger')
  

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\91901\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True