In [None]:
# Importing necessary libraries
import csv

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import *

import matplotlib.pyplot as plt
from wordcloud import WordCloud
import seaborn as sns

# Load the Dataset

In [None]:
# The file is read as headerless, tab-separated text: one label column and
# one message column.
file_path = 'SMSSpamCollection'
sms_df = pd.read_csv(
    file_path,
    sep='\t',
    header=None,
    names=['label', 'message'],
    # Treat quote characters as ordinary text. With the default CSV quoting, an
    # unbalanced '"' inside a message can make the parser swallow the following
    # lines into the same row, silently dropping messages.
    quoting=csv.QUOTE_NONE,
)

# Binary target used by the classifier: 0 for 'ham', 1 for any other label.
sms_df['type'] = np.where(sms_df["label"] == 'ham', 0, 1)

In [None]:
# Preview the first 20 rows (label, message and the derived 0/1 `type` column)
sms_df.head(n=20)

# Train and test Multinomial Naive Bayes Classifier

### Split Dataset into Train/Test Sets

In [None]:
# Split the dataset into training and test sets (70% training, 30% testing)
features = sms_df["message"]
target = sms_df["type"]
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.3,
    # Fixed seed so every metric and plot below is reproducible on re-run.
    random_state=42,
    # Keep the spam/ham proportion the same in both splits.
    stratify=target,
)

In [None]:
# Turn the raw messages into TF-IDF vectors, dropping English stop words.
tfidf = TfidfVectorizer(stop_words='english')

# The vocabulary and IDF weights are learned from the training messages only ...
X_train_tfidf = tfidf.fit_transform(X_train)
# ... and then applied unchanged to the held-out messages.
X_test_tfidf = tfidf.transform(X_test)

### Train on MultinomialNB

In [None]:
# Smoothing strength passed to the classifier as `alpha`
alp = 0.1

# Build the Multinomial Naive Bayes classifier with that smoothing value
mnb = MultinomialNB(alpha=alp)

# Fit it on the TF-IDF training matrix and the 0/1 training labels
mnb.fit(X_train_tfidf, y_train)

### Predict Using Fitted Model

In [None]:
# Predict on the test set
y_pred_idf = mnb.predict(X_test_tfidf)

# Evaluate the model: overall accuracy plus per-class precision/recall/F1
accuracy = accuracy_score(y_test, y_pred_idf)
class_report = classification_report(y_test, y_pred_idf)

# Print results (accuracy was previously computed but never shown)
print(f"Accuracy: {accuracy:.4f}")
print("\nClassification Report:")
print(class_report)

# Analysis of the Model

### Confusion Matrix

In [None]:
# Counts of actual vs. predicted classes on the test set
cm = confusion_matrix(y_test, y_pred_idf)

# Tick labels shared by both axes
class_names = ['Not Spam', 'Spam']

plt.figure(figsize=(8, 6))
ax = sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
)
ax.set_ylabel('Actual')
ax.set_xlabel('Predicted')
ax.set_title('Confusion Matrix (TF-IDF)')
plt.show()

### Precision-Recall Curve

In [None]:
# Precision/recall at every decision threshold, scored on the class-1 column
# of predict_proba for the test set.
precision_vals, recall_vals, _ = precision_recall_curve(y_test, mnb.predict_proba(X_test_tfidf)[:, 1])

# Plot
plt.figure(figsize=(8, 6))
# Use plt.plot so the points are drawn exactly as returned. sns.lineplot sorts
# by x and, by default, averages every y that shares an x value (adding a
# confidence band), which distorts a curve with many repeated recall values.
plt.plot(recall_vals, precision_vals)
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall Curve')
plt.grid()
plt.show()

### ROC Curve

In [None]:
# False/true positive rates at every decision threshold, scored on the class-1
# column of predict_proba for the test set, and the area under that curve.
fpr, tpr, _ = roc_curve(y_test, mnb.predict_proba(X_test_tfidf)[:, 1])
roc_auc = auc(fpr, tpr)

# Plot
plt.figure(figsize=(8, 6))
# Use plt.plot so the curve is drawn point by point. sns.lineplot would average
# all tpr values that share the same fpr, collapsing the vertical segments of
# the ROC curve and adding a spurious confidence band.
plt.plot(fpr, tpr, label='AUC = {:.2f}'.format(roc_auc))
plt.plot([0, 1], [0, 1], 'k--')  # Diagonal line
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend()
plt.grid()
plt.show()

### Cumulative Gain/Lift Chart

In [None]:
# Predicted probability of class 1 (spam) for every test message
spam_scores = mnb.predict_proba(X_test_tfidf)[:, 1]

# Get precision and recall values (kept here because a later cell plots
# precision and recall against `thresholds`)
precision_vals, recall_vals, thresholds = precision_recall_curve(y_test, spam_scores)

# Calculate cumulative gain: rank the test messages from most to least likely
# spam, then track the share of all actual spam captured after each message.
y_true = np.asarray(y_test)
ranking = np.argsort(-spam_scores, kind='stable')
spam_captured = np.cumsum(y_true[ranking])
# Prepend 0 so the curve starts at the origin (no messages inspected yet).
cumulative_gain = np.concatenate(([0.0], spam_captured / y_true.sum()))
fraction_inspected = np.arange(len(y_true) + 1) / len(y_true)

# Plot
plt.figure(figsize=(8, 6))
plt.plot(fraction_inspected, cumulative_gain, label='Cumulative Gain', color='blue')
# A random ordering captures spam in proportion to the messages inspected,
# so its gain curve is the diagonal.
plt.plot([0, 1], [0, 1], color='red', linestyle='--', label='Random Guessing')
plt.xlabel('Fraction of Samples (ranked by predicted spam probability)')
plt.ylabel('Fraction of Spam Captured')
plt.title('Cumulative Gain Chart')
plt.legend()
plt.grid()
plt.show()

### Comparison of Precision and Recall

In [None]:
# Precision and recall as functions of the decision threshold.
# Both arrays are sliced with [:-1] so their length matches `thresholds`.
plt.figure(figsize=(8, 6))
for scores, series_label, series_color in (
    (precision_vals[:-1], 'Precision', 'blue'),
    (recall_vals[:-1], 'Recall', 'orange'),
):
    plt.plot(thresholds, scores, label=series_label, color=series_color)
plt.xlabel('Threshold')
plt.ylabel('Score')
plt.title('Precision and Recall vs. Threshold')
plt.legend()
plt.grid()
plt.show()

# Deeper Analysis of the Data

### Counts of Spam vs Ham

In [None]:
# Split the frame by label and measure the share of spam among the two groups
sms_spam = sms_df[sms_df["label"] == "spam"]
sms_ham = sms_df[sms_df["label"] == "ham"]

count_spam = len(sms_spam)
count_ham = len(sms_ham)

# Fraction of spam among all spam + ham rows
ratio_spam = count_spam / (count_spam + count_ham)

In [None]:
# Bar chart with the number of rows per label
ax = sns.countplot(data=sms_df, x="label")
ax.set_title("Count of Spam and Ham labels")
plt.show()

# Spam share computed in the previous cell, shown as a percentage
print(f"Percent Spam: {ratio_spam*100}%")

### Top Words in Spam vs Ham

In [None]:
# Vocabulary learned by the TF-IDF vectorizer, in column order
feature_names = tfidf.get_feature_names_out()

# Per-class log-probability of each vocabulary term from the fitted model
# (row 0 = class 0 / ham, row 1 = class 1 / spam)
log_probs = mnb.feature_log_prob_

# One row per word: its log-probability under each class and the gap between
# them (the columns are named "Probability" but hold log-probabilities)
spam_words = pd.DataFrame({
    'Word': feature_names,
    'Spam Probability': log_probs[1],
    'Ham Probability': log_probs[0]
}).assign(
    Difference=lambda frame: frame['Spam Probability'] - frame['Ham Probability']
)

# The 20 words whose log-probability is highest for spam relative to ham
top_spam_words = spam_words.nlargest(20, 'Difference')

# Horizontal bar chart of those words
plt.figure(figsize=(12, 6))
ax = sns.barplot(
    data=top_spam_words,
    x='Difference',
    y='Word',
    hue="Word",
    palette='Reds',
    legend=False,
)
ax.set_title('Top Words Indicating Spam')
ax.set_xlabel('Log Probability Difference (Spam - Ham)')
ax.set_ylabel('Words')
plt.show()

### Word Cloud for Spam Messages

In [None]:
# Messages whose numeric label is 1 (spam)
spam_messages = sms_df.loc[sms_df['type'] == 1, 'message']

# Concatenate them into one space-separated string
spam_text = ' '.join(spam_messages)

# Build the word cloud from that text
cloud_builder = WordCloud(width=800, height=400, background_color='white')
wordcloud = cloud_builder.generate(spam_text)

# Render the cloud as an image without axes
plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.title('Word Cloud for Spam Messages')
plt.show()

### Distribution of Message Lengths

In [None]:
# Character count of every message, stored as a new `length` column
sms_df['length'] = sms_df['message'].map(len)

# Stacked histogram of lengths, split by label, with a KDE overlay
plt.figure(figsize=(12, 6))
ax = sns.histplot(
    data=sms_df,
    x='length',
    hue='label',
    multiple='stack',
    bins=30,
    kde=True,
)
ax.set_title('Distribution of Message Lengths')
ax.set_xlabel('Length of Message')
ax.set_ylabel('Frequency')
plt.show()

### Box Plot of Message Lengths

In [None]:
# Box plot of the `length` column, one box per label
plt.figure(figsize=(12, 6))
ax = sns.boxplot(data=sms_df, x='label', y='length')
ax.set_title('Box Plot of Message Lengths by Type')
ax.set_xlabel('Message Type')
ax.set_ylabel('Length of Message')
plt.show()

### Heatmap of Word Correlation

In [None]:
# Create a CountVectorizer to get word counts with a limit on max features
count_vectorizer = CountVectorizer(max_features=50)  # Adjust max_features as needed
count_data = count_vectorizer.fit_transform(X_train)

# Pearson correlation between word-count columns across the training messages.
# The matrix is (n_messages x 50), so densifying it is cheap. rowvar=False
# tells corrcoef that each column (word) is a variable and each row (message)
# an observation. Correlating the co-occurrence product count_data.T @ count_data
# instead would compare co-occurrence profiles, not the word counts themselves.
correlation_matrix = np.corrcoef(count_data.toarray(), rowvar=False)

# Create a DataFrame for the correlation matrix, labelled by word on both axes
vocabulary = count_vectorizer.get_feature_names_out()
correlation_df = pd.DataFrame(correlation_matrix, index=vocabulary, columns=vocabulary)

# Plot the heatmap; pin the colour scale to [-1, 1] centred on 0 so the neutral
# colour of the diverging palette means "uncorrelated".
plt.figure(figsize=(12, 10))
sns.heatmap(correlation_df, cmap='coolwarm', vmin=-1, vmax=1, center=0, square=True, cbar_kws={"shrink": .8})
plt.title('Word Correlation Heatmap')
plt.show()

### Feature Importance Bar Plot

In [None]:
# Get the feature names
feature_names = tfidf.get_feature_names_out()

# Get the per-class log probabilities of the features
# (row 0 = class 0 / ham, row 1 = class 1 / spam)
log_probs = mnb.feature_log_prob_

# Create a DataFrame for the words and their log probabilities
# (the columns are named "Probability" but hold log-probabilities)
spam_words = pd.DataFrame({
    'Word': feature_names,
    'Spam Probability': log_probs[1],
    'Ham Probability': log_probs[0]
})

# Calculate the difference (positive = more typical of spam)
spam_words['Difference'] = spam_words['Spam Probability'] - spam_words['Ham Probability']

# Get the 20 most spam-leaning and the 20 most ham-leaning words
top_spam_words = spam_words.nlargest(20, 'Difference')
top_ham_words = spam_words.nsmallest(20, 'Difference')

# Combine the two DataFrames and order them by Difference. Colours are assigned
# to bars in row order, so sorting makes the coolwarm gradient run monotonically
# from the most ham-leaning word (cool) to the most spam-leaning word (warm).
top_features = pd.concat([top_ham_words, top_spam_words]).sort_values('Difference')

# Plot (taller figure so all 40 word labels stay legible)
plt.figure(figsize=(12, 10))
sns.barplot(x='Difference', y='Word', data=top_features, palette='coolwarm', hue="Word", legend=False)
plt.title('Top Words Indicating Spam and Ham')
plt.xlabel('Log Probability Difference (Spam - Ham)')
plt.ylabel('Words')
plt.show()

### Distribution of Predicted Probabilities

In [None]:
# Probability assigned to class 1 (spam) for each test message
predicted_probabilities = mnb.predict_proba(X_test_tfidf)[:, 1]

# Pair every predicted probability with the true label of the same message
probability_df = pd.DataFrame({
    'Predicted Probability': predicted_probabilities,
    'Actual': y_test,
})

# Stacked histogram of the probabilities, coloured by true label
plt.figure(figsize=(12, 6))
ax = sns.histplot(
    data=probability_df,
    x='Predicted Probability',
    hue='Actual',
    multiple='stack',
    bins=30,
    kde=True,
)
ax.set_title('Distribution of Predicted Probabilities')
ax.set_xlabel('Predicted Probability of Spam')
ax.set_ylabel('Frequency')
# Log-scale the counts axis
ax.set_yscale('log')
plt.show()