In [15]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

In [17]:
# Read the e-mail corpus and preview the first rows exactly as stored on disk
data = pd.read_csv("/email.csv")
print(data.head())

# Header names may carry stray surrounding whitespace; trim it
data.columns = data.columns.str.strip()

# Encode the label column and clean the frame in a single pass
label_codes = {'ham': 0, 'spam': 1}
data = (
    data
    # Trim and lower-case the label text, then encode it: ham -> 0, spam -> 1.
    # Any label outside the mapping becomes NaN.
    .assign(Category=lambda frame: frame['Category'].str.strip().str.lower().map(label_codes))
    # Discard rows whose label could not be mapped or whose message is missing
    .dropna(subset=['Category', 'Message'])
    # With the NaN labels gone the column can be cast to integer
    .astype({'Category': int})
)

  Category                                            Message
0      ham  Go until jurong point, crazy.. Available only ...
1      ham                      Ok lar... Joking wif u oni...
2     spam  Free entry in 2 a wkly comp to win FA Cup fina...
3      ham  U dun say so early hor... U c already then say...
4      ham  Nah I don't think he goes to usf, he lives aro...


In [18]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    data['Message'],
    data['Category'],
    random_state=42,
    test_size=0.2,
)

# Learn the vocabulary from the training messages only, then turn both
# partitions into count matrices over that same vocabulary
vectorizer = CountVectorizer().fit(X_train)
X_train_counts = vectorizer.transform(X_train)
X_test_counts = vectorizer.transform(X_test)

In [19]:
# Fit a multinomial Naive Bayes classifier (alpha=0.1) on the training counts
model = MultinomialNB(alpha=0.1).fit(X_train_counts, y_train)

# Predict the held-out messages and report the share classified correctly
y_pred = model.predict(X_test_counts)
test_accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", round(test_accuracy, 3))

Accuracy: 0.989


In [20]:
# Asymmetric misclassification costs for the Bayesian decision rule.
# The two kinds of mistake are deliberately weighted differently.
cost_legit_as_spam = 5  # penalty when a legitimate message is labelled spam
cost_spam_as_legit = 3  # penalty when a spam message is labelled legitimate

In [21]:
# A single hand-written message to classify; the vectorizer expects an
# iterable of documents, hence the one-element list
msg = [
    "Limited time deal waiting for you",
]

# Encode it with the vocabulary already learned by the fitted vectorizer
msg_vec = vectorizer.transform(msg)

In [22]:
# Baseline: the class the model picks on its own, with no cost weighting
original_class = model.predict(msg_vec)[0]

# Label 1 is spam (see the Category mapping); anything else is reported as legit
if original_class == 1:
    original_label = "SPAM"
else:
    original_label = "LEGIT"
print("\nOriginal Prediction (without costs):", original_label)


Original Prediction (without costs): LEGIT


In [23]:
# Class posteriors for the single example message (first and only row)
probs = model.predict_proba(msg_vec)[0]

# NOTE(review): assumes the probability columns follow label order
# 0 (ham/legit) then 1 (spam) -- confirm against model.classes_
P_legit_given_msg, P_spam_given_msg = probs[0], probs[1]

In [24]:
# Expected risk of each action = cost of that action being wrong
# multiplied by the posterior probability that it is wrong
risk_legit = cost_spam_as_legit * P_spam_given_msg   # wrong if the message is really spam
risk_spam = cost_legit_as_spam * P_legit_given_msg   # wrong if the message is really legit

# Report posteriors and risks, each rounded to three decimals
for caption, value in (
    ("\nP(Spam | message):", P_spam_given_msg),
    ("P(Legit | message):", P_legit_given_msg),
    ("Risk if classify as SPAM:", risk_spam),
    ("Risk if classify as LEGIT:", risk_legit),
):
    print(caption, round(value, 3))


P(Spam | message): 0.001
P(Legit | message): 0.999
Risk if classify as SPAM: 4.996
Risk if classify as LEGIT: 0.002


In [25]:
# Choose the action with the strictly smaller expected risk;
# a tie falls through to LEGIT
if risk_spam < risk_legit:
    decision = "SPAM"
else:
    decision = "LEGIT"
print("\nDecision (Bayesian with cost):", decision)


Decision (Bayesian with cost): LEGIT
