In [48]:
import pandas as pd

In [50]:
df=pd.read_csv("dictionary.csv")

In [52]:
df.head()

Unnamed: 0,Word,POS,Definition
0,A,,The first letter of the English and of many ot...
1,A,,The name of the sixth tone in the model major ...
2,A,,An adjective commonly called the indefinite ar...
3,A,,"In each; to or for each; as """"""""twenty leagues..."
4,A,prep.,In; on; at; by.


In [54]:
# dropping unnecessary columns
df.drop('POS', axis=1, inplace=True)

In [56]:
df.head()

Unnamed: 0,Word,Definition
0,A,The first letter of the English and of many ot...
1,A,The name of the sixth tone in the model major ...
2,A,An adjective commonly called the indefinite ar...
3,A,"In each; to or for each; as """"""""twenty leagues..."
4,A,In; on; at; by.


In [58]:
df.shape

(175733, 2)

In [60]:
#checking missing values
df.isnull().sum()
# Fill missing values (NaN) with the string "NaN"
df = df.fillna('NaN')


In [62]:
# checking duplicates
df.duplicated().sum()

29

In [64]:
# Remove duplicate rows
df1= df.drop_duplicates()


In [66]:
df1.shape

(175704, 2)

In [68]:
df1.head()

Unnamed: 0,Word,Definition
0,A,The first letter of the English and of many ot...
1,A,The name of the sixth tone in the model major ...
2,A,An adjective commonly called the indefinite ar...
3,A,"In each; to or for each; as """"""""twenty leagues..."
4,A,In; on; at; by.


In [70]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
Index: 175704 entries, 0 to 175723
Data columns (total 2 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   Word        175704 non-null  object
 1   Definition  175704 non-null  object
dtypes: object(2)
memory usage: 4.0+ MB


In [75]:
df1.to_csv('cleaned_dataset.csv', index=False)


In [None]:
ACCURACY

In [2]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import os
from nltk.stem import PorterStemmer

# Initialize the stemmer
stemmer = PorterStemmer()

# Load the dictionary CSV file
def load_data():
    if os.path.exists('dictionary.csv'):
        data = pd.read_csv('dictionary.csv')
    else:
        data = pd.DataFrame(columns=['Word', 'Definition'])
    return data

# Save the dictionary back to CSV
def save_data(data):
    data.to_csv('dictionary.csv', index=False)

# Clean dataset and apply stemming to the words
data = load_data()
words = data['Word'].fillna('').str.lower().values  # convert words to lowercase
definitions = data['Definition'].fillna('').values

# Apply stemming to the words
stemmed_words = [stemmer.stem(word) for word in words]

# Vectorize the words for similarity search
vectorizer = TfidfVectorizer()
word_vectors = vectorizer.fit_transform(stemmed_words)

# Define a function to handle word definitions
def get_word_definition(input_word):
    input_word_lower = input_word.lower()  # Lowercase the input for consistency
    input_word_stemmed = stemmer.stem(input_word_lower)

    # Exact match check (case-insensitive)
    if input_word_lower in words:
        idx = list(words).index(input_word_lower)
        return words[idx], definitions[idx]

    # Cosine similarity search if no exact match is found
    input_vector = vectorizer.transform([input_word_stemmed])
    similarities = cosine_similarity(input_vector, word_vectors).flatten()

    # Find the best match based on cosine similarity
    best_match_idx = similarities.argmax()

    # Set a similarity threshold to filter out irrelevant matches
    similarity_threshold = 0.3  # You can adjust this threshold
    if similarities[best_match_idx] >= similarity_threshold:
        return words[best_match_idx], definitions[best_match_idx]

    # No suitable match found
    return None, None

# Function to add a new word to the dictionary
def add_word(word, definition):
    new_entry = pd.DataFrame({'Word': [word], 'Definition': [definition]})
    global data, words, definitions, stemmed_words, word_vectors

    data = pd.concat([data, new_entry], ignore_index=True)
    save_data(data)

    # Recompute the vectorizer with the new dataset
    words = data['Word'].str.lower().values  # convert all words to lowercase
    definitions = data['Definition'].values

    # Reapply stemming and vectorization after adding the new word
    stemmed_words = [stemmer.stem(word) for word in words]
    word_vectors = vectorizer.fit_transform(stemmed_words)

# Example usage
if __name__ == '__main__':
    while True:
        action = input("Enter 'define' to look up a word, 'add' to add a word, or 'exit' to quit: ").strip().lower()
        if action == 'define':
            word_to_define = input("Enter the word to define: ").strip()
            retrieved_word, retrieved_definition = get_word_definition(word_to_define)
            if retrieved_definition:
                print(f"{retrieved_word}: {retrieved_definition}")
            else:
                print("Word not found.")
        elif action == 'add':
            new_word = input("Enter the word to add: ").strip()
            new_definition = input("Enter the definition: ").strip()
            add_word(new_word, new_definition)
            print(f"The word '{new_word}' has been added.")
        elif action == 'exit':
            break
        else:
            print("Invalid option. Please try again.")


Enter 'define' to look up a word, 'add' to add a word, or 'exit' to quit:  define
Enter the word to define:  man


man: A human being; -- opposed tobeast.


Enter 'define' to look up a word, 'add' to add a word, or 'exit' to quit:  exit


In [4]:
import pandas as pd
import os
from sklearn.model_selection import train_test_split
# from dictionary_bot import get_word_definition  # Import the function from your main application

# Function to load dictionary from CSV
def load_dictionary_from_csv():
    if os.path.exists('dictionary.csv'):
        data = pd.read_csv('dictionary.csv')
        return data
    return pd.DataFrame(columns=['Word', 'Definition'])

# Load all data from CSV
data = load_dictionary_from_csv()

# Drop rows with NaN values to avoid issues
data.dropna(inplace=True)

# Split the data into training and testing sets (80% training, 20% testing)
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

# Create a dictionary for easy access to test data
test_data_dict = dict(zip(test_data['Word'].str.lower(), test_data['Definition']))

def evaluate_model(test_data_dict):
    correct_predictions = 0
    total_predictions = len(test_data_dict)

    for word, expected_definition in test_data_dict.items():
        # Check if the key is a string
        if isinstance(word, str):
            retrieved_word, retrieved_definition = get_word_definition(word)
            
            # Check if the retrieved definition matches the expected definition
            if retrieved_definition and retrieved_definition.strip() == expected_definition.strip():
                correct_predictions += 1

    accuracy = (correct_predictions / total_predictions) * 100  # Calculate percentage
    print(f'Accuracy: {accuracy:.2f}%')

# Call the evaluation function with the test dataset
evaluate_model(test_data_dict)


Accuracy: 71.84%
