In [7]:
import numpy as np
from collections import defaultdict
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

def get_data(prompt):
    """Read sentences from standard input until the user types 'done'.

    Each non-blank line is split on its *last* comma: the text before it is
    the sentence and the text after it is the sense label. Lines without a
    comma are stored with a label of ``None``.

    Args:
        prompt: Message printed once before reading starts.

    Returns:
        A list of ``(sentence, sense)`` tuples in input order, where
        ``sense`` is ``None`` for lines that contained no comma.
    """
    print(prompt)
    data = []
    while True:
        try:
            line = input().strip()
        except EOFError:
            # Input stream ended (piped input, closed stdin) without an
            # explicit 'done'; treat it as the end of the data.
            break
        if line.lower() == 'done':
            break
        if not line:
            # A blank line carries no sentence; recording it would create an
            # empty example downstream.
            continue
        # rpartition splits on the last comma so sentences may contain commas.
        sentence, comma, sense = line.rpartition(',')
        if comma:
            data.append((sentence.strip(), sense.strip()))
        else:
            data.append((line, None))
    return data

def main():
    """Train a Naive Bayes sense classifier from typed input and explain it.

    Prompts for labelled training sentences and unlabelled test sentences,
    fits a bag-of-words ``MultinomialNB`` model, and for every test sentence
    prints the per-sense word probabilities, the prior, the resulting
    log-score, and the predicted sense.

    The printed score is the class log prior plus the log probability of
    each token the vectorizer recognises. Tokens outside the training
    vocabulary are reported but not scored, because ``vectorizer.transform``
    drops them and the classifier therefore never sees them.

    Returns:
        None. All results are written to standard output.
    """
    # Get data
    training_data = get_data("Enter training data (sentence and sense, separated by a comma). Type 'done' to finish:")
    testing_data = get_data("\nEnter testing data (sentence). Type 'done' to finish:")

    # Only lines that carried a ",sense" suffix are usable for training.
    # Checking the filtered list (not the raw one) avoids unpacking an empty
    # zip() when lines were entered but none had a label.
    labelled = [(s, l) for s, l in training_data if l is not None]
    if not labelled:
        print("No training data provided. Exiting.")
        return

    # Prepare data
    train_sentences, train_labels = zip(*labelled)
    # Test input is sentence-only, so a comma split made by get_data is part
    # of the sentence; rejoin it instead of silently dropping the sentence.
    test_sentences = [s if l is None else f"{s}, {l}" for s, l in testing_data]

    # Train model
    vectorizer = CountVectorizer()
    X_train = vectorizer.fit_transform(train_sentences)
    clf = MultinomialNB()
    clf.fit(X_train, train_labels)

    if not test_sentences:
        return

    # Predict and show details
    X_test = vectorizer.transform(test_sentences)
    predicted_senses = clf.predict(X_test)

    feature_log_probs = clf.feature_log_prob_
    vocab = vectorizer.vocabulary_
    # Use the vectorizer's own analyzer so the explanation sees exactly the
    # tokens the model saw (lowercased, punctuation removed).
    tokenize = vectorizer.build_analyzer()

    print("\nPredictions:\n")
    for sentence, predicted in zip(test_sentences, predicted_senses):
        print(f"Sentence: '{sentence}'\n")
        tokens = tokenize(sentence)

        for j, sense in enumerate(clf.classes_):
            print(f"Probabilities for sense '{sense}':")
            log_prior = clf.class_log_prior_[j]
            total_score = log_prior

            for word in tokens:
                idx = vocab.get(word)
                if idx is None:
                    # Out-of-vocabulary tokens have no column in X_test, so
                    # they contribute nothing to the classifier's decision.
                    print(f"P({word}|{sense}) = UNSEEN -> ignored")
                    continue
                word_score = feature_log_probs[j][idx]
                print(f"P({word}|{sense}) = {np.exp(word_score):.4f}")
                total_score += word_score

            print(f"Prior P({sense}) = {np.exp(log_prior):.4f}")
            print(f"Score for sense '{sense}': {total_score:.4f}\n")

        print(f"Predicted Sense: '{predicted}'\n")


# Run the interactive session only when executed directly (script or notebook
# cell), not when this module is imported.
if __name__ == "__main__":
    main()




Enter training data (sentence and sense, separated by a comma). Type 'done' to finish:


 Bass eat super,Fish
 Bass lunch excellent,Fish
 Bass ate like,Fish
 Bass interest play,music
 Bass play music,music
 done



Enter testing data (sentence). Type 'done' to finish:


 bass super excellent play
 done



Predictions:

Sentence: 'bass super excellent play'

Probabilities for sense 'Fish':
P(bass|Fish) = 0.2105
P(super|Fish) = 0.1053
P(excellent|Fish) = 0.1053
P(play|Fish) = 0.0526
Prior P(Fish) = 0.6000
Score for sense 'Fish': -9.5160

Probabilities for sense 'music':
P(bass|music) = 0.1875
P(super|music) = 0.0625
P(excellent|music) = 0.0625
P(play|music) = 0.1875
Prior P(music) = 0.4000
Score for sense 'music': -9.8094

Predicted Sense: 'Fish'

