In [1]:
# Step 1: Import libraries
import os

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB


In [2]:
# Location of the mail dataset CSV. Set the MAIL_DATA_CSV environment variable to
# point the notebook at a different copy; when it is unset, the original
# machine-specific path is used so existing behaviour is unchanged.
DATA_PATH = os.environ.get('MAIL_DATA_CSV', r'C:\Users\admin\Downloads\mail_data.csv')

# NOTE(review): the 'unicode escape' codec decodes bytes Latin-1 style and also
# interprets backslash escape sequences found in the text; if the raw messages can
# contain literal backslashes, confirm this is intended (plain 'latin-1' may be safer).
data = pd.read_csv(DATA_PATH, encoding='unicode escape')

In [3]:
# Preview the first ten rows of the loaded frame.
data.head(n=10)

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
5,spam,FreeMsg Hey there darling it's been 3 week's n...
6,ham,Even my brother is not like to speak with me. ...
7,ham,As per your request 'Melle Melle (Oru Minnamin...
8,spam,WINNER!! As a valued network customer you have...
9,spam,Had your mobile 11 months or more? U R entitle...


In [4]:
# Integer encoding of the target column: spam -> 0, ham -> 1.
LABEL_ENCODING = {'spam': 0, 'ham': 1}

# Only encode while the column still holds the raw string labels. Mapping an
# already-encoded column again would turn every value into NaN, so this guard
# makes the cell safe to re-run.
if not pd.api.types.is_numeric_dtype(data['Category']):
    encoded_category = data['Category'].map(LABEL_ENCODING)

    # Series.map() yields NaN for any value that is not a key of the dict; fail
    # here with the offending values instead of letting NaN labels flow downstream.
    unmapped = encoded_category.isna()
    if unmapped.any():
        unknown_labels = data.loc[unmapped, 'Category'].unique().tolist()
        raise ValueError(
            f"Unexpected values in 'Category' (expected 'spam' or 'ham'): {unknown_labels}"
        )

    data['Category'] = encoded_category

# Features are the raw message text; the target is the encoded category.
x = data['Message']
y = data['Category']

# Hold out 20% of the rows for testing. Stratifying on y keeps the spam/ham
# proportions the same in both splits; random_state fixes the shuffle.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, stratify=y, random_state=3)


In [5]:
# TF-IDF text featurizer: lowercases input, drops English stop words, and keeps
# any term that occurs in at least one document (min_df=1).
vectorizer = TfidfVectorizer(
    lowercase=True,
    stop_words='english',
    min_df=1,
)


In [6]:
# Tuple elements are evaluated left to right, so the vectorizer is fitted on the
# training messages first and only then applied to the test messages.
x_train_features, x_test_features = (
    vectorizer.fit_transform(x_train),  # fit on the training text and encode it
    vectorizer.transform(x_test),       # encode the test text with the fitted vectorizer
)


In [7]:
# Cast both label vectors to an integer dtype.
y_train, y_test = y_train.astype(int), y_test.astype(int)

In [8]:
# Multinomial Naive Bayes with default settings, trained on the TF-IDF features
# of the training split.
model = MultinomialNB()
model.fit(X=x_train_features, y=y_train)

In [15]:
# Evaluation: predict on the held-out split and print each metric in turn.
predictions = model.predict(x_test_features)

for heading, metric in (
    ("\n✅ Accuracy:", accuracy_score),
    ("\n🧮 Confusion Matrix:\n", confusion_matrix),
    ("\n📊 Classification Report:\n", classification_report),
):
    print(heading, metric(y_test, predictions))

# Prediction on a new message: reuse the fitted vectorizer, then classify.
input_mail = ['SIX chances to win CASH! From 100 to 20,000 pounds txt> CSH11 and send to 87575. Cost 150p/day, 6days, 16+ TsandCs apply Reply HL 4 info']
input_data = vectorizer.transform(input_mail)
prediction = model.predict(input_data)

# The check treats a predicted label of 0 as spam and anything else as not spam.
print("\n✅ Not a Spam" if prediction[0] != 0 else "\n🚫 Spam")
    




✅ Accuracy: 0.9713004484304932

🧮 Confusion Matrix:
 [[117  32]
 [  0 966]]

📊 Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.79      0.88       149
           1       0.97      1.00      0.98       966

    accuracy                           0.97      1115
   macro avg       0.98      0.89      0.93      1115
weighted avg       0.97      0.97      0.97      1115


🚫 Spam
