In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# --- 1. Data Loading and Exploration ---
# Load the raw CSV, drop the three 'Unnamed' columns, and give the remaining
# v1/v2 columns descriptive names.
data = pd.read_csv("spam.csv", encoding='latin-1')
data = data.drop(['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], axis=1)
data = data.rename(columns={'v1': 'label', 'v2': 'text'})

print(data.head())
# DataFrame.info() writes its summary to stdout itself and returns None, so
# wrapping it in print() only appended a stray "None" line to the output.
data.info()
print(data.isnull().sum())

# Class balance: number of messages per label.
sns.countplot(x='label', data=data)
plt.title('Distribution of Spam and Ham')
plt.show()

# Character count of each message, plotted per label.
data['message_length'] = data['text'].apply(len)
plt.figure(figsize=(8, 6))
sns.histplot(data=data, x='message_length', hue='label', bins=50)
plt.title('Message Length Distribution')
plt.show()

# --- 2. Data Preprocessing ---
# Fetch the NLTK resources used below: the stop-word lists and the Punkt
# tokenizer models behind nltk.word_tokenize.
nltk.download('stopwords')
nltk.download('punkt')
# Newer NLTK releases load the tokenizer models from 'punkt_tab' instead of
# 'punkt'; without it word_tokenize raises LookupError. Requesting both keeps
# the script working across NLTK versions.
nltk.download('punkt_tab')

# Shared by preprocess_text: one stemmer instance, and the stop words as a set
# so that each membership test is O(1).
stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    """Normalize one message into a space-joined string of stemmed tokens.

    Every character that is not an ASCII letter is replaced by a space, the
    result is lowercased and tokenized with NLTK, tokens found in the
    module-level ``stop_words`` set are dropped, and each remaining token is
    passed through the module-level ``stemmer``.
    """
    letters_only = re.sub('[^a-zA-Z]', ' ', text)
    stems = []
    for token in nltk.word_tokenize(letters_only.lower()):
        if token in stop_words:
            continue
        stems.append(stemmer.stem(token))
    return ' '.join(stems)

data['processed_text'] = data['text'].apply(preprocess_text)

# --- 3. Feature Extraction ---
# Hold out the test documents BEFORE fitting the TF-IDF vectorizer. Fitting it
# on the full corpus lets test-set vocabulary and document frequencies leak
# into the training features, which makes the evaluation scores optimistic.
# test_size and random_state are unchanged, so the same rows are held out as
# when the split was applied to the already-vectorized matrix.
text_train, text_test, y_train, y_test = train_test_split(
    data['processed_text'], data['label'], test_size=0.2, random_state=42
)

# CountVectorizer (Bag-of-Words)
# Built on the whole corpus for exploration; the model trained in this section
# does not use X_bow.
vectorizer = CountVectorizer()
X_bow = vectorizer.fit_transform(data['processed_text'])

# TF-IDF
# Learn vocabulary and IDF weights from the training documents only, then
# project the held-out documents into that same feature space.
tfidf_vectorizer = TfidfVectorizer()
X_train = tfidf_vectorizer.fit_transform(text_train)
X_test = tfidf_vectorizer.transform(text_test)
# Whole corpus expressed in the train-fitted feature space; kept under its
# original name in case later code refers to it.
X_tfidf = tfidf_vectorizer.transform(data['processed_text'])

# --- 4. Model Training ---
nb_model = MultinomialNB()
nb_model.fit(X_train, y_train)

y_pred = nb_model.predict(X_test)

# --- 5. Model Evaluation ---
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

# Compute the confusion matrix once and pin its row/column order to the
# fitted model's classes, so the heatmap tick labels cannot drift out of sync
# with the matrix (the labels were previously hard-coded).
class_labels = nb_model.classes_
cm = confusion_matrix(y_test, y_pred, labels=class_labels)
print(cm)

plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_labels, yticklabels=class_labels)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()

# --- 6. Interpretation and Insights ---
# Feature names of the fitted TF-IDF vectorizer; get_top_words pairs them
# positionally with the model's per-feature log-probabilities.
feature_names = tfidf_vectorizer.get_feature_names_out()

def get_top_words(model, feature_names, n=10):
    """Print the ``n`` features with the highest log-probability per class.

    Row 1 of ``model.feature_log_prob_`` is reported as spam and row 0 as
    ham. Both rankings are written to standard output; nothing is returned.
    """

    def ranked(class_row):
        # Pair every feature with its log-probability, highest value first.
        table = pd.DataFrame({
            'word': feature_names,
            'log_prob': model.feature_log_prob_[class_row],
        })
        return table.sort_values('log_prob', ascending=False)

    spam_ranking = ranked(1)
    ham_ranking = ranked(0)

    print(f"Top {n} words associated with spam:")
    print(spam_ranking.head(n))

    print(f"\nTop {n} words associated with ham:")
    print(ham_ranking.head(n))

# Report the highest-probability features per class for the trained model.
get_top_words(model=nb_model, feature_names=feature_names)