In [4]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.classify import NaiveBayesClassifier
from nltk.classify.util import accuracy

# Training data
train_data = [
    (dict([(word, True) for word in word_tokenize('fish smoked fish')]), 'f'),
    (dict([(word, True) for word in word_tokenize('fish line')]), 'f'),
    (dict([(word, True) for word in word_tokenize('fish haul smoked')]), 'f'),
    (dict([(word, True) for word in word_tokenize('guitar jazz line')]), 'g')
]

# Train the classifier
classifier = NaiveBayesClassifier.train(train_data)

# Test data
test_data = word_tokenize('line guitar jazz jazz')
test_features = dict([(word, True) for word in test_data])

# Predict the class for the test document
predicted_class = classifier.classify(test_features)
print(test_data)
print(f'The predicted class for the test document is: {predicted_class}')

# Show the most informative features
classifier.show_most_informative_features()


['line', 'guitar', 'jazz', 'jazz']
The predicted class for the test document is: g
Most Informative Features
                    line = True                g : f      =      2.0 : 1.0
                  smoked = None                g : f      =      2.0 : 1.0
                    haul = None                g : f      =      1.2 : 1.0


In [8]:
from collections import Counter
import math

# Vocabulary (V)
vocab = ['fish', 'smoked', 'line', 'haul', 'guitar', 'jazz']
V = len(vocab)

# Training data (using word counts)
train_data = {
    'f': ['fish smoked fish', 'fish line', 'fish haul smoked'],
    'g': ['guitar jazz line'],
    'h': ['line guitar jazz']
}

# Priors
N = sum(len(doc.split()) for docs in train_data.values() for doc in docs)  # Total number of words
N_f = sum(len(doc.split()) for doc in train_data['f'])  # Number of words in class 'f'
N_g = sum(len(doc.split()) for doc in train_data['g'])  # Number of words in class 'g'

P_f = N_f / N  # Prior for class 'f'
P_g = N_g / N  # Prior for class 'g'

# Function to calculate conditional probabilities
def conditional_probability(word, class_docs, total_class_words):
    word_count = sum(doc.split().count(word) for doc in class_docs)
    return (word_count + 1) / (total_class_words + V)  # Additive smoothing

# Conditional probabilities for class 'f'
P_line_f = conditional_probability('line', train_data['f'], N_f)
P_guitar_f = conditional_probability('guitar', train_data['f'], N_f)
P_jazz_f = conditional_probability('jazz', train_data['f'], N_f)

# Conditional probabilities for class 'g'
P_line_g = conditional_probability('line', train_data['g'], N_g)
P_guitar_g = conditional_probability('guitar', train_data['g'], N_g)
P_jazz_g = conditional_probability('jazz', train_data['g'], N_g)

# Test document
test_doc = 'line guitar jazz jazz'.split()

# Calculate the probabilities for each class
P_d5_f = P_f * P_line_f * P_guitar_f * P_jazz_f * P_jazz_f
P_d5_g = P_g * P_line_g * P_guitar_g * P_jazz_g * P_jazz_g

# Choosing the class with the highest probability
if P_d5_f > P_d5_g:
    predicted_class = 'f'
else:
    predicted_class = 'g'

# Output results
print(f"P(d5|f) ≈ {P_d5_f}")
print(f"P(d5|g) ≈ {P_d5_g}")
print(f"The predicted class for the test document is: {predicted_class}")


P(d5|f) ≈ 2.9749509133099296e-05
P(d5|g) ≈ 0.0005225684238029916
The predicted class for the test document is: g


In [9]:
import math
from collections import defaultdict, Counter
from nltk import word_tokenize

# Sample training data (documents and their respective classes)
training_data = [
    ("The team won the football match.", "sports"),
    ("The government passed a new law.", "politics"),
    ("The latest smartphone has great features.", "technology"),
    ("The player scored a goal in the match.", "sports"),
    ("The senator gave a speech on healthcare.", "politics"),
    ("The new software update improves security.", "technology")
]

# Feature extraction function (tokenizing words)
def extract_features(text):
    words = word_tokenize(text.lower())
    return words

# Naive Bayes Document Classifier
class NaiveBayesClassifier:
    def __init__(self, training_data):
        self.feature_prob = defaultdict(lambda: defaultdict(float))
        self.class_prob = defaultdict(float)
        self.train(training_data)

    def train(self, training_data):
        # Calculate prior probabilities for classes
        class_counts = Counter()
        feature_counts = defaultdict(Counter)

        for document, category in training_data:
            features = extract_features(document)
            class_counts[category] += 1
            for feature in features:
                feature_counts[category][feature] += 1

        total_documents = sum(class_counts.values())
        self.class_prob = {category: count / total_documents for category, count in class_counts.items()}

        # Calculate feature probabilities P(feature|category)
        for category, features in feature_counts.items():
            total_features = sum(features.values())
            self.feature_prob[category] = {feature: (count / total_features) for feature, count in features.items()}

    def classify(self, document):
        features = extract_features(document)
        max_prob = float('-inf')
        best_class = None

        for category in self.class_prob:
            log_prob = math.log(self.class_prob[category])
            for feature in features:
                if feature in self.feature_prob[category]:
                    log_prob += math.log(self.feature_prob[category][feature])
                else:
                    # Apply Laplace smoothing for unseen features
                    log_prob += math.log(1 / (sum(self.feature_prob[category].values()) + len(features)))

            if log_prob > max_prob:
                max_prob = log_prob
                best_class = category

        return best_class

# Train the model
nb_classifier = NaiveBayesClassifier(training_data)

# Test the model with a new document
test_document = "The latest update includes new features for security."
predicted_class = nb_classifier.classify(test_document)
print(f"Predicted class for '{test_document}': {predicted_class}")

# Calculate and display the conditional probabilities for comparison
features = extract_features(test_document)
for category in nb_classifier.class_prob:
    log_prob = math.log(nb_classifier.class_prob[category])
    print(f"\nClass: {category}")
    for feature in features:
        if feature in nb_classifier.feature_prob[category]:
            prob = nb_classifier.feature_prob[category][feature]
            log_prob += math.log(prob)
            print(f"  P({feature}|{category}) = {prob:.5f}")
        else:
            prob = 1 / (sum(nb_classifier.feature_prob[category].values()) + len(features))
            log_prob += math.log(prob)
            print(f"  P({feature}|{category}) (Laplace smoothed) = {prob:.5f}")
    print(f"  Log-probability: {log_prob:.5f}")

Predicted class for 'The latest update includes new features for security.': sports

Class: sports
  P(the|sports) = 0.25000
  P(latest|sports) (Laplace smoothed) = 0.10000
  P(update|sports) (Laplace smoothed) = 0.10000
  P(includes|sports) (Laplace smoothed) = 0.10000
  P(new|sports) (Laplace smoothed) = 0.10000
  P(features|sports) (Laplace smoothed) = 0.10000
  P(for|sports) (Laplace smoothed) = 0.10000
  P(security|sports) (Laplace smoothed) = 0.10000
  P(.|sports) = 0.12500
  Log-probability: -20.68244

Class: politics
  P(the|politics) = 0.13333
  P(latest|politics) (Laplace smoothed) = 0.10000
  P(update|politics) (Laplace smoothed) = 0.10000
  P(includes|politics) (Laplace smoothed) = 0.10000
  P(new|politics) = 0.06667
  P(features|politics) (Laplace smoothed) = 0.10000
  P(for|politics) (Laplace smoothed) = 0.10000
  P(security|politics) (Laplace smoothed) = 0.10000
  P(.|politics) = 0.13333
  Log-probability: -21.65198

Class: technology
  P(the|technology) = 0.14286
  P(la

In [14]:
import math
from collections import defaultdict

# Given data
data = {
    1: ['Bass', 'eat', 'super'],
    2: ['Bass', 'lunch', 'excellent'],
    3: ['Bass', 'ate', 'like'],
    4: ['guitar', 'play', 'music'],
    5: ['Bass', 'interest', 'pay','line'],
    6: ['guitar','play','melody'],
    7: ['fish', 'haul','line']
}

# Corresponding classes (senses)
classes = {
    1: 'fish',
    2: 'fish',
    3: 'fish',
    4: 'instrument',
    5: 'music',
    6: 'instrument',
    7: 'fish'
}

# 1) Calculate priors
class_counts = defaultdict(int)
for cls in classes.values():
    class_counts[cls] += 1

# Number of documents
N = len(classes)

# Calculate prior probabilities
priors = {cls: count / N for cls, count in class_counts.items()}

# 2) Calculate the conditional probability of each word with each class
word_counts = defaultdict(lambda: defaultdict(int))

for idx, words in data.items():
    cls = classes[idx]
    for word in words:
        word_counts[cls][word] += 1

# Total words per class
total_words_per_class = {cls: sum(counts.values()) for cls, counts in word_counts.items()}

# Calculate conditional probabilities
conditional_probabilities = defaultdict(dict)

for cls, words in word_counts.items():
    for word, count in words.items():
        # Use Laplace smoothing
        conditional_probabilities[cls][word] = (count + 1) / (total_words_per_class[cls] + len(word_counts[cls]))

# 3) Define the target words and find v (count of words in to-be-found case/test case)
target_words = ['Bass', 'haul', 'line']

# 4) Score calculation
scores = defaultdict(float)

# Calculate scores for each class
for cls in priors.keys():
    scores[cls] = math.log(priors[cls])  # Initialize with log prior

    for word in target_words:
        vj = word.lower()  # Convert word to lower case for comparison
        if vj in conditional_probabilities[cls]:
            scores[cls] += math.log(conditional_probabilities[cls][vj])
        else:
            # If the word is not found, assume a small probability (Laplace smoothing)
            scores[cls] += math.log(1 / (total_words_per_class[cls] + len(word_counts[cls])))

# Determine the class with the highest score
predicted_class = max(scores, key=scores.get)

# Output the results
print(f"Scores: {scores}")
print(f"Predicted Class for target words {target_words}: {predicted_class}")


Scores: defaultdict(<class 'float'>, {'fish': -8.44644878689048, 'instrument': -8.160518247477505, 'music': -7.491087593534877})
Predicted Class for target words ['Bass', 'haul', 'line']: music


In [16]:
import math
from collections import defaultdict
from nltk import word_tokenize

# Given data
data = {
    1: ['bass', 'eat', 'amount'],
    2: ['bass', 'lunch', 'excellent'],
    3: ['bass', 'ate','like'],
    4: ['guitar', 'play', 'music'],
    5: ['money', 'interest', 'pay','amount'],
    6: ['guitar','interest','melody'],
    7: ['fish', 'haul','line'],
    8: ['guitar','like','play'],
    9: ['rate']
}

# Corresponding classes (senses)
classes = {
    1: 'fish',
    2: 'fish',
    3: 'fish',
    4: 'instrument',
    5: 'finance',
    6: 'instrument',
    7: 'fish',
    8: 'instrument',
    9: 'finance'
}

# 1) Calculate priors
class_counts = defaultdict(int)
for cls in classes.values():
    class_counts[cls] += 1

# Number of documents
N = len(classes)

# Calculate prior probabilities
priors = {cls: count / N for cls, count in class_counts.items()}

# 2) Calculate the conditional probability of each word with each class
word_counts = defaultdict(lambda: defaultdict(int))

for idx, words in data.items():
    cls = classes[idx]
    for word in words:
        word_counts[cls][word.lower()] += 1  # Convert words to lowercase for consistency

# Total words per class
total_words_per_class = {cls: sum(counts.values()) for cls, counts in word_counts.items()}

# Full vocabulary size
vocab = set(word.lower() for words in data.values() for word in words)
V = len(vocab)

# Calculate conditional probabilities
conditional_probabilities = defaultdict(dict)

for cls, words in word_counts.items():
    for word in vocab:
        count = words[word]  # This will be 0 if the word isn't in the class
        conditional_probabilities[cls][word] = (count + 1) / (total_words_per_class[cls] + V)

# 3) Define the target words and find v (count of words in to-be-found case/test case)
x = input("ENTER target words:")
target_words = word_tokenize(x)
# 4) Score calculation
scores = defaultdict(float)

# Calculate scores for each class
for cls in priors.keys():
    scores[cls] = math.log(priors[cls])  # Initialize with log prior

    for word in target_words:
        vj = word.lower()  # Convert word to lower case for comparison
        if vj in conditional_probabilities[cls]:
            scores[cls] += math.log(conditional_probabilities[cls][vj])
        else:
            # If the word is not found, assume a small probability (Laplace smoothing)
            scores[cls] += math.log(1 / (total_words_per_class[cls] + V))

# Determine the class with the highest score
predicted_class = max(scores, key=scores.get)

# Output the results
print(f"Scores: {scores}")
print(f"Predicted Class for target words {target_words}: {predicted_class}")


ENTER target words: guitar bass melody interest pay rate play


Scores: defaultdict(<class 'float'>, {'fish': -23.233017526731523, 'instrument': -20.29826933979052, 'finance': -21.373095366600488})
Predicted Class for target words ['guitar', 'bass', 'melody', 'interest', 'pay', 'rate', 'play']: instrument
