# Basic Probability Distribution Example
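Here's a very basic example showing how to calculate and display the probability distribution of words in a text. The text is lower-cased and tokenized, and each token's probability is its count divided by the total number of tokens. Note that punctuation marks such as `.` are tokens too, so they appear in the distribution.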

In [3]:
from collections import Counter
import nltk

# Fetch the tokenizer models that word_tokenize loads. Newer NLTK releases
# read the 'punkt_tab' resource while older ones read 'punkt', so both are
# requested to keep this cell runnable on a fresh kernel with either version.
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)

text = "Natural language processing enables computers to understand human language."
# Lower-case first so differently-cased spellings of a word share one entry
words = nltk.word_tokenize(text.lower())

# Count the frequency of each token (the trailing '.' is a token as well)
freq = Counter(words)

# Calculate probabilities: each count divided by the total number of tokens
total_words = sum(freq.values())
prob_dist = {word: count/total_words for word, count in freq.items()}

print(prob_dist)


{'natural': 0.1, 'language': 0.2, 'processing': 0.1, 'enables': 0.1, 'computers': 0.1, 'to': 0.1, 'understand': 0.1, 'human': 0.1, '.': 0.1}


# Conditional Probability in Bigram Model
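This example demonstrates how to calculate conditional probabilities using a bigram model. For each observed bigram $(w_1, w_2)$ the estimate is $P(w_2 \mid w_1) = \dfrac{\text{count}(w_1, w_2)}{\text{count}(w_1, \cdot)}$, i.e. how often $w_2$ follows $w_1$ out of all the times $w_1$ is followed by any word.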

In [4]:
from nltk import bigrams, ConditionalFreqDist

sentence = 'the quick brown fox jumps over the lazy dog'

# Whitespace tokenization, then pair every word with the word that follows it
words = sentence.split()
bigram = [*bigrams(words)]

# Indexing cfdist by a word gives the frequency distribution of its successors
cfdist = ConditionalFreqDist(bigram)

# Relative frequency of the second word among everything seen after the first
conditional_prob = {
    (first_word, second_word): cfdist[first_word].freq(second_word)
    for first_word, second_word in bigram
}

print(conditional_prob)


{('the', 'quick'): 0.5, ('quick', 'brown'): 1.0, ('brown', 'fox'): 1.0, ('fox', 'jumps'): 1.0, ('jumps', 'over'): 1.0, ('over', 'the'): 1.0, ('the', 'lazy'): 0.5, ('lazy', 'dog'): 1.0}


# Naive Bayes Classifier for Text Classification
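This example uses a multinomial Naive Bayes classifier on bag-of-words counts to classify 20 Newsgroups posts into four categories, then prints the confusion matrix for the held-out test split.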

In [5]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline

# Needed by the evaluation step at the end of this cell
from sklearn.metrics import confusion_matrix

# Load data
# Only the four newsgroups below are loaded; the train and test subsets are
# fetched separately so the model is evaluated on posts it was not fitted on.
categories = ['talk.religion.misc', 'soc.religion.christian', 'sci.space', 'comp.graphics']
train = fetch_20newsgroups(subset='train', categories=categories)
test = fetch_20newsgroups(subset='test', categories=categories)

# Create pipeline: raw text -> token counts -> multinomial Naive Bayes
model = make_pipeline(CountVectorizer(), MultinomialNB())

# Train model
model.fit(train.data, train.target)

# Predict and evaluate
labels = model.predict(test.data)

# Check the performance
# Rows are the true classes and columns the predicted classes.
# NOTE(review): the integer targets appear to follow alphabetical category
# order rather than the order of `categories` -- confirm with train.target_names.
mat = confusion_matrix(test.target, labels)
print(mat)


[[371  11   2   5]
 [ 11 377   5   1]
 [  5   4 379  10]
 [  5  11  49 186]]


# Spam Detection Using Naive Bayes

In [8]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Simulated larger dataset: (message text, label) pairs
data = [
    ("Free money!!!", "spam"),
    ("Hi Bob, how about a game of golf tomorrow?", "ham"),
    ("Urgent: Please send your bank details to claim your inheritance", "spam"),
    ("Congratulations! You've won a $1000 gift card. Click here.", "spam"),
    ("Can we meet for lunch today?", "ham"),
    ("Hello, I love your blog post on Naive Bayes.", "ham"),
    ("This is not spam, but we are selling cheap Viagra", "spam"),
    ("Good morning, are you available for a meeting today?", "ham"),
    ("Win big with our new lottery system", "spam"),
    ("Please read our new research paper on machine learning", "ham"),
    ("Lowest price drugs for you", "spam"),
    ("Hey, can you send me the report?", "ham"),
    ("You have won free tickets to the concert", "spam"),
    ("Meeting reminder for tomorrow", "ham"),
    ("Get your free trial of our product now!", "spam"),
    ("How about dinner tonight?", "ham"),
    ("You are a lucky winner of our contest", "spam"),
    ("Could you please help me with the project?", "ham"),
    ("Unbeatable prices for branded watches", "spam"),
    ("Do you want to go out this weekend?", "ham"),
    ("Big sale on all electronics!", "spam"),
]

# Split data into messages and labels
messages, labels = zip(*data)

# Split the raw text BEFORE vectorizing (smaller test size ratio).
# Fitting the vectorizer on every message would let words that occur only in
# the test messages shape the vocabulary, leaking test information into training.
raw_train, raw_test, label_train, label_test = train_test_split(
    messages, labels, test_size=0.15, random_state=42
)

# Convert text data into numerical data: learn the vocabulary from the
# training messages only, then reuse that vocabulary for the test messages
vectorizer = CountVectorizer()
msg_train = vectorizer.fit_transform(raw_train)
msg_test = vectorizer.transform(raw_test)

# Train a Naive Bayes classifier
classifier = MultinomialNB()
classifier.fit(msg_train, label_train)

# Predict on test data
predictions = classifier.predict(msg_test)

# Evaluate the model
print("Accuracy:", accuracy_score(label_test, predictions))
print("Confusion Matrix:\n", confusion_matrix(label_test, predictions))
print("Classification Report:\n", classification_report(label_test, predictions))


Accuracy: 1.0
Confusion Matrix:
 [[3 0]
 [0 1]]
Classification Report:
               precision    recall  f1-score   support

         ham       1.00      1.00      1.00         3
        spam       1.00      1.00      1.00         1

    accuracy                           1.00         4
   macro avg       1.00      1.00      1.00         4
weighted avg       1.00      1.00      1.00         4

