In [1]:
import nltk
from nltk.classify import NaiveBayesClassifier
from nltk.corpus import stopwords
from nltk import word_tokenize, WordNetLemmatizer
from nltk.classify.util import accuracy
import random

# Download necessary NLTK data files
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Initialize WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

# Sample training data with contexts and senses
data = [
    ("The bank of the river was beautiful", "river"),
    ("He went to the bank to deposit money", "finance"),
    ("She sat on the river bank", "river"),
    ("He is working at the financial bank", "finance"),
    ("The boat was near the river bank", "river"),
    ("She withdrew cash from the bank", "finance")
]

# Preprocessing function to extract features
def extract_features(sentence):
    stop_words = set(stopwords.words('english'))
    words = word_tokenize(sentence)
    words = [lemmatizer.lemmatize(word.lower()) for word in words if word.isalpha() and word not in stop_words]
    return {word: True for word in words}

# Create feature sets for training
feature_sets = [(extract_features(context), sense) for (context, sense) in data]

# Shuffle and split the data into training and test sets
random.shuffle(feature_sets)
train_set, test_set = feature_sets[:4], feature_sets[4:]

# Train the Naïve Bayes classifier
classifier = NaiveBayesClassifier.train(train_set)

# Evaluate the classifier
print(f'Accuracy: {accuracy(classifier, test_set):.2f}')
classifier.show_most_informative_features()

# Sample prediction
new_context = "He likes to fish by the bank"
features = extract_features(new_context)
predicted_sense = classifier.classify(features)
print(f"The predicted sense for '{new_context}' is '{predicted_sense}'")


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\admin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\admin\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Accuracy: 1.00
Most Informative Features
               beautiful = None           financ : river  =      1.7 : 1.0
                    cash = None            river : financ =      1.7 : 1.0
                 deposit = None            river : financ =      1.7 : 1.0
                      he = None            river : financ =      1.7 : 1.0
                   money = None            river : financ =      1.7 : 1.0
                     sat = None           financ : river  =      1.7 : 1.0
                     the = None           financ : river  =      1.7 : 1.0
                    went = None            river : financ =      1.7 : 1.0
                withdrew = None            river : financ =      1.7 : 1.0
                    bank = True           financ : river  =      1.0 : 1.0
The predicted sense for 'He likes to fish by the bank' is 'finance'


In [3]:
import nltk
from nltk.classify import NaiveBayesClassifier
from nltk.corpus import stopwords
from nltk import word_tokenize, WordNetLemmatizer
from nltk.classify.util import accuracy
import random

# Download necessary NLTK data files
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Initialize WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

# Preprocessing function to extract features
def extract_features(sentence):
    stop_words = set(stopwords.words('english'))
    words = word_tokenize(sentence)
    words = [lemmatizer.lemmatize(word.lower()) for word in words if word.isalpha() and word not in stop_words]
    return {word: True for word in words}

# Read the training data from the file
training_data = []
with open("E://126156048/leb_3/training_set.txt", 'r') as file:
    for line in file:
        context, sense = line.strip().split('\t')
        training_data.append((context, sense))

# Create feature sets for training
feature_sets = [(extract_features(context), sense) for (context, sense) in training_data]

# Shuffle and split the data into training and test sets
random.shuffle(feature_sets)
train_set, test_set = feature_sets, feature_sets[:100]

# Train the Naïve Bayes classifier
classifier = NaiveBayesClassifier.train(train_set)

# Evaluate the classifier
print(f'Accuracy: {accuracy(classifier, test_set):.2f}')
classifier.show_most_informative_features()

# Sample prediction
new_context = "He likes to fish by the bank"
features = extract_features(new_context)
predicted_sense = classifier.classify(features)
print(f"The predicted sense for '{new_context}' is '{predicted_sense}'")


Accuracy: 1.00
Most Informative Features
                      he = None            river : financ =      1.4 : 1.0
                     she = None            river : financ =      1.3 : 1.0
                 account = None            river : financ =      1.2 : 1.0
                   along = None           financ : river  =      1.2 : 1.0
               financial = None            river : financ =      1.2 : 1.0
                     new = None            river : financ =      1.2 : 1.0
                  picnic = None           financ : river  =      1.2 : 1.0
                 service = None            river : financ =      1.2 : 1.0
                  beauty = None           financ : river  =      1.1 : 1.0
                customer = None            river : financ =      1.1 : 1.0
The predicted sense for 'He likes to fish by the bank' is 'finance'


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\admin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\admin\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [4]:
from nltk import bigrams

def extract_features_with_bigrams(sentence):
    stop_words = set(stopwords.words('english'))
    words = word_tokenize(sentence)
    words = [lemmatizer.lemmatize(word.lower()) for word in words if word.isalpha() and word not in stop_words]
    word_features = {word: True for word in words}
    bigram_features = {f"{bigram[0]}_{bigram[1]}": True for bigram in bigrams(words)}
    return {**word_features, **bigram_features}

# Create feature sets with bigrams
feature_sets_with_bigrams = [(extract_features_with_bigrams(context), sense) for (context, sense) in training_data]

# Shuffle and split the data into training and test sets
random.shuffle(feature_sets_with_bigrams)
train_set, test_set = feature_sets_with_bigrams[:40], feature_sets_with_bigrams[40:]

# Train the Naïve Bayes classifier with bigrams
classifier_with_bigrams = NaiveBayesClassifier.train(train_set)

# Evaluate the classifier
print(f'Accuracy with bigrams: {accuracy(classifier_with_bigrams, test_set):.2f}')
classifier_with_bigrams.show_most_informative_features()

# Sample prediction
features_with_bigrams = extract_features_with_bigrams(new_context)
predicted_sense_with_bigrams = classifier_with_bigrams.classify(features_with_bigrams)
print(f"The predicted sense for '{new_context}' with bigrams is '{predicted_sense_with_bigrams}'")


Accuracy with bigrams: 1.00
Most Informative Features
                the_bank = None            river : financ =      1.5 : 1.0
                      he = None            river : financ =      1.4 : 1.0
                 account = None            river : financ =      1.3 : 1.0
                     she = None            river : financ =      1.3 : 1.0
                   along = None           financ : river  =      1.2 : 1.0
             along_river = None           financ : river  =      1.2 : 1.0
               the_river = None           financ : river  =      1.2 : 1.0
              bank_offer = None            river : financ =      1.2 : 1.0
                  beauty = None           financ : river  =      1.2 : 1.0
               financial = None            river : financ =      1.2 : 1.0
The predicted sense for 'He likes to fish by the bank' with bigrams is 'finance'


In [5]:
from nltk import pos_tag

def extract_features_with_pos(sentence):
    stop_words = set(stopwords.words('english'))
    words = word_tokenize(sentence)
    words = [lemmatizer.lemmatize(word.lower()) for word in words if word.isalpha() and word not in stop_words]
    pos_tags = pos_tag(words)
    
    # Consider only nouns, verbs, and adjectives for feature extraction
    relevant_words = [word for word, pos in pos_tags if pos.startswith('N') or pos.startswith('V') or pos.startswith('J')]
    
    word_features = {word: True for word in relevant_words}
    bigram_features = {f"{bigram[0]}_{bigram[1]}": True for bigram in bigrams(relevant_words)}
    
    return {**word_features, **bigram_features}

# Create feature sets with POS
feature_sets_with_pos = [(extract_features_with_pos(context), sense) for (context, sense) in training_data]

# Shuffle and split the data into training and test sets
random.shuffle(feature_sets_with_pos)
train_set, test_set = feature_sets_with_pos[:40], feature_sets_with_pos[40:]

# Train the Naïve Bayes classifier with POS features
classifier_with_pos = NaiveBayesClassifier.train(train_set)

# Evaluate the classifier
print(f'Accuracy with POS: {accuracy(classifier_with_pos, test_set):.2f}')
classifier_with_pos.show_most_informative_features()

# Sample prediction
features_with_pos = extract_features_with_pos(new_context)
predicted_sense_with_pos = classifier_with_pos.classify(features_with_pos)
print(f"The predicted sense for '{new_context}' with POS is '{predicted_sense_with_pos}'")


Accuracy with POS: 1.00
Most Informative Features
                   river = None           financ : river  =      5.3 : 1.0
              river_bank = None           financ : river  =      5.3 : 1.0
                 account = None            river : financ =      1.3 : 1.0
           bank_provided = True            river : financ =      1.2 : 1.0
               financial = None            river : financ =      1.2 : 1.0
                     new = None            river : financ =      1.2 : 1.0
                provided = True            river : financ =      1.2 : 1.0
                 service = None            river : financ =      1.2 : 1.0
                  beauty = None           financ : river  =      1.2 : 1.0
                 enjoyed = None           financ : river  =      1.2 : 1.0
The predicted sense for 'He likes to fish by the bank' with POS is 'finance'


In [12]:
import random
from nltk import NaiveBayesClassifier, pos_tag, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk import bigrams
from nltk.classify import accuracy
import nltk

nltk.download('averaged_perceptron_tagger')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

# Updated and expanded training data with additional examples
expanded_training_data = [
    # River sense
    ("The children played by the river bank.", "river"),
    ("They set up a picnic by the river bank.", "river"),
    ("We spent the afternoon walking along the river bank.", "river"),
    ("He enjoys kayaking near the river bank every weekend.", "river"),
    ("The river bank was bustling with people fishing.", "river"),
    ("The river flooded and covered the bank with water.", "river"),
    ("We followed the river bank trail through the forest.", "river"),
    ("The boat was anchored by the river bank.", "river"),
    ("The river bank was a perfect spot for our tent.", "river"),
    ("Wildflowers grew along the river bank.", "river"),
    ("The river bank had eroded after the heavy rains.", "river"),
    
    # Finance sense
    ("I went to the bank to deposit a check.", "finance"),
    ("The bank approved my loan application.", "finance"),
    ("She worked as a teller at the local bank.", "finance"),
    ("They offer excellent financial services at this bank.", "finance"),
    ("You can open an account at any bank in town.", "finance"),
    ("The bank charges high interest rates on loans.", "finance"),
    ("Our local bank has a great mobile app.", "finance"),
    ("He withdrew cash from the bank.", "finance"),
    ("She has a meeting with the bank manager.", "finance"),
    ("The bank is closed on public holidays.", "finance"),
    ("They are opening a new bank branch downtown.", "finance"),
    ("She visited the bank to discuss her investment portfolio.", "finance"),
    ("The bank provided a financial report for the last quarter.", "finance"),
    ("The bank's new policy on loans is quite strict.", "finance"),
    ("He worked in a bank before starting his own business.", "finance"),
    ("The bank approved a loan application yesterday.", "finance"),
]

lemmatizer = WordNetLemmatizer()

def extract_features_with_pos(sentence):
    stop_words = set(stopwords.words('english'))
    words = word_tokenize(sentence)
    words = [lemmatizer.lemmatize(word.lower()) for word in words if word.isalpha() and word not in stop_words]
    pos_tags = pos_tag(words)
    
    relevant_words = [word for word, pos in pos_tags if pos.startswith('N') or pos.startswith('V') or pos.startswith('J') or pos.startswith('R')]
    
    word_features = {word: True for word in relevant_words}
    bigram_features = {f"{bigram[0]}_{bigram[1]}": True for bigram in bigrams(relevant_words)}
    pos_features = {f"{word}_{pos}": True for word, pos in pos_tags}
    
    # Additional contextual features
    context_features = {
        'contains_fish': 'fish' in words,
        'contains_deposit': 'deposit' in words,
        'contains_loan': 'loan' in words,
        'contains_bank': 'bank' in words,
    }
    
    return {**word_features, **bigram_features, **pos_features, **context_features}

# Create feature sets with POS and additional features
feature_sets_with_pos = [(extract_features_with_pos(context), sense) for (context, sense) in expanded_training_data]

# Shuffle and split the data into training and test sets
random.shuffle(feature_sets_with_pos)
train_set, test_set = feature_sets_with_pos[:24], feature_sets_with_pos[24:]

# Train the Naïve Bayes classifier with POS and contextual features
classifier_with_pos = NaiveBayesClassifier.train(train_set)

# Evaluate the classifier
print(f'Accuracy with POS and contextual features: {accuracy(classifier_with_pos, test_set):.2f}')
classifier_with_pos.show_most_informative_features()

# Sample prediction
new_context = "He likes to fish by the bank."
features_with_pos = extract_features_with_pos(new_context)
predicted_sense_with_pos = classifier_with_pos.classify(features_with_pos)
print(f"The predicted sense for '{new_context}' with POS and contextual features is '{predicted_sense_with_pos}'")

Accuracy with POS and contextual features: 1.00
Most Informative Features
                river_NN = None           financ : river  =      2.8 : 1.0
                  the_DT = True            river : financ =      1.4 : 1.0
           contains_loan = False           river : financ =      1.3 : 1.0
                    loan = None            river : financ =      1.3 : 1.0
                 loan_NN = None            river : financ =      1.3 : 1.0
                  the_DT = None           financ : river  =      1.3 : 1.0
                   along = None           financ : river  =      1.3 : 1.0
                along_RB = None           financ : river  =      1.3 : 1.0
             along_river = None           financ : river  =      1.3 : 1.0
                river_JJ = None           financ : river  =      1.3 : 1.0
The predicted sense for 'He likes to fish by the bank.' with POS and contextual features is 'river'


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\admin\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\admin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\admin\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [13]:
import random
from nltk import NaiveBayesClassifier, pos_tag, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk import bigrams
from nltk.classify import accuracy
import nltk

nltk.download('averaged_perceptron_tagger')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

# Load training data from a text file
def load_training_data(file_path):
    with open(file_path, 'r') as file:
        lines = file.readlines()
    data = [(line.rsplit(' ', 1)[0], line.rsplit(' ', 1)[1].strip()) for line in lines]
    return data

# Example file path (adjust as needed)
file_path = 'E://126156048/leb_3/training_set.txt'
training_data = load_training_data(file_path)

lemmatizer = WordNetLemmatizer()

def extract_features_with_pos(sentence):
    stop_words = set(stopwords.words('english'))
    words = word_tokenize(sentence)
    words = [lemmatizer.lemmatize(word.lower()) for word in words if word.isalpha() and word not in stop_words]
    pos_tags = pos_tag(words)
    
    relevant_words = [word for word, pos in pos_tags if pos.startswith('N') or pos.startswith('V') or pos.startswith('J') or pos.startswith('R')]
    
    word_features = {word: True for word in relevant_words}
    bigram_features = {f"{bigram[0]}_{bigram[1]}": True for bigram in bigrams(relevant_words)}
    pos_features = {f"{word}_{pos}": True for word, pos in pos_tags}
    
    # Additional contextual features
    context_features = {
        'contains_fish': 'fish' in words,
        'contains_deposit': 'deposit' in words,
        'contains_loan': 'loan' in words,
        'contains_bank': 'bank' in words,
    }
    
    return {**word_features, **bigram_features, **pos_features, **context_features}

# Create feature sets with POS and additional features
feature_sets_with_pos = [(extract_features_with_pos(context), sense) for (context, sense) in training_data]

# Shuffle and split the data into training and test sets
random.shuffle(feature_sets_with_pos)
train_set, test_set = feature_sets_with_pos[:24], feature_sets_with_pos[24:]

# Train the Naïve Bayes classifier with POS and contextual features
classifier_with_pos = NaiveBayesClassifier.train(train_set)

# Evaluate the classifier
print(f'Accuracy with POS and contextual features: {accuracy(classifier_with_pos, test_set):.2f}')
classifier_with_pos.show_most_informative_features()

# Sample prediction
new_context = "He likes to fish by the bank."
features_with_pos = extract_features_with_pos(new_context)
predicted_sense_with_pos = classifier_with_pos.classify(features_with_pos)
print(f"The predicted sense for '{new_context}' with POS and contextual features is '{predicted_sense_with_pos}'")

Accuracy with POS and contextual features: 1.00
Most Informative Features
              river_bank = None           financ : river  =      6.5 : 1.0
                river_NN = None           financ : river  =      3.9 : 1.0
                  the_DT = True            river : financ =      1.9 : 1.0
                  the_DT = None           financ : river  =      1.9 : 1.0
           contains_loan = False           river : financ =      1.3 : 1.0
                    loan = None            river : financ =      1.3 : 1.0
                 loan_NN = None            river : financ =      1.3 : 1.0
                  we_PRP = None           financ : river  =      1.3 : 1.0
                 she_PRP = None            river : financ =      1.2 : 1.0
               afternoon = None           financ : river  =      1.1 : 1.0
The predicted sense for 'He likes to fish by the bank.' with POS and contextual features is 'finance'


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\admin\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\admin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\admin\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [18]:
# Define the file path
file_path = 'E://126156048/leb_3/training_set.txt'

# Open the file and read lines
with open(file_path, 'r') as file:
    # Create a list of tuples from each line
    data = [tuple(line.strip().split(' ', 1)) for line in file]

# Print the result
print(data)

[('The', 'children played by the river bank. river'), ('They', 'set up a picnic by the river bank. river'), ('We', 'spent the afternoon walking along the river bank. river'), ('He', 'enjoys kayaking near the river bank every weekend. river'), ('The', 'river bank was bustling with people fishing. river'), ('The', 'river flooded and covered the bank with water. river'), ('We', 'followed the river bank trail through the forest. river'), ('The', 'boat was anchored by the river bank. river'), ('The', 'river bank was a perfect spot for our tent. river'), ('Wildflowers', 'grew along the river bank. river'), ('The', 'river bank had eroded after the heavy rains. river'), ('I', 'went to the bank to deposit a check. finance'), ('The', 'bank approved my loan application. finance'), ('She', 'worked as a teller at the local bank. finance'), ('They', 'offer excellent financial services at this bank. finance'), ('You', 'can open an account at any bank in town. finance'), ('The', 'bank charges high int