In [36]:
import pandas as pd
import numpy as np
import re
import string

url = "https://raw.githubusercontent.com/justmarkham/pycon-2016-tutorial/master/data/sms.tsv"
df = pd.read_csv(url, sep='\t', names=['label', 'message'])

x , y = df.shape[0], df.shape[1]
print(f"No. of rows = {x} and No. of columns = {y}")

print(df.sample(10))
print(f"\nThe shape of the Dataset: {df.shape}")

No. of rows = 5572 and No. of columns = 2
     label                                            message
756    ham                            So why didnt you holla?
1870   ham                     Mom wants to know where you at
3404   ham       Good night my dear.. Sleepwell&amp;Take care
1402   ham  Kaiez... Enjoy ur tuition... Gee... Thk e seco...
1973   ham  Yes but can we meet in town cos will go to gep...
4826   ham         I am going to sleep. I am tired of travel.
2054   ham  Oh... I was thkin of goin yogasana at 10 den n...
2202   ham  A boy was late 2 home. His father: "POWER OF F...
4128   ham                             Sorry, I'll call later
3690   ham                          You still coming tonight?

The shape of the Dataset: (5572, 2)


In [37]:
df.dropna(inplace=True)
df.drop_duplicates(inplace=True)
ck = df.shape[0]
co = x- ck
print(f"No. of rows removed due to duplicates = {co}")

def clean_text(text):
    text = str(text).lower()
    text = re.sub(r'\d+', '', text)
    text = re.sub(f"[{re.escape(string.punctuation)}]", "", text)
    text = " ".join(text.split())
    return text

df['clean_message'] = df['message'].apply(clean_text)
print(df[['message', 'clean_message']].sample(5))

No. of rows removed due to duplicates = 403
                                                message  \
2604  Im at arestaurant eating squid! i will be out ...   
4313           I keep ten rs in my shelf:) buy two egg.   
1292  Hey babe! I saw you came online for a second a...   
3850                                   U in town alone?   
4230  Have you bookedthe hut? And also your time off...   

                                          clean_message  
2604  im at arestaurant eating squid i will be out a...  
4313              i keep ten rs in my shelf buy two egg  
1292  hey babe i saw you came online for a second an...  
3850                                    u in town alone  
4230  have you bookedthe hut and also your time off ...  


In [38]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(stop_words = 'english')

X = tfidf.fit_transform(df['clean_message'])
y = df['label']

print(f"Feature matrix shape after vectorized: {X.shape}")

Feature matrix shape after vectorized: (5169, 8341)


In [39]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

print(f"No. of values in training set: {X_train.shape[0]}")
print(f"No. of values in test set: {X_test.shape[0]}")

No. of values in training set: 4135
No. of values in test set: 1034


In [40]:
from sklearn.naive_bayes import MultinomialNB
model = MultinomialNB()
model.fit(X_train, y_train)
print("Model training complete.")

Model training complete.


In [42]:
from sklearn.metrics import classification_report
y_pred = model.predict(X_test)

print("--- Metrics using Multimodal Naive Bias---")
print(classification_report(y_test, y_pred))

--- Metrics using Multimodal Naive Bias---
              precision    recall  f1-score   support

         ham       0.96      1.00      0.98       894
        spam       1.00      0.76      0.86       140

    accuracy                           0.97      1034
   macro avg       0.98      0.88      0.92      1034
weighted avg       0.97      0.97      0.97      1034



In [43]:
def predict_spam(text):
    cleaned_text = clean_text(text)
    vectorized_input = tfidf.transform([cleaned_text])
    prediction = model.predict(vectorized_input)
    return prediction[0]

test_msg1 = "Congratulations! You won $1000."
test_msg2 = "Request for internship"

print(f"Test 1: '{test_msg1}' -> Result: {predict_spam(test_msg1)}")
print(f"Test 2: '{test_msg2}' -> Result: {predict_spam(test_msg2)}")

Test 1: 'Congratulations! You won $1000.' -> Result: spam
Test 2: 'Request for internship' -> Result: ham


In [44]:
words = tfidf.get_feature_names_out()
spam_probs = model.feature_log_prob_[1]

word_importance = pd.DataFrame({'word': words, 'importance': spam_probs})
print("Checking the Feature Importancce (most spam like words)")
print(word_importance.sort_values(by='importance', ascending=False).head(10))

Checking the Feature Importancce (most spam like words)
        word  importance
2559    free   -5.975126
4470  mobile   -6.304349
1232   claim   -6.326930
7141    text   -6.331545
7507     txt   -6.344608
6798    stop   -6.419383
5879   reply   -6.430322
5538   prize   -6.486446
7633      ur   -6.486636
7636  urgent   -6.667895


In [45]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

lr_model = LogisticRegression()
lr_model.fit(X_train, y_train)

y_train_num = y_train.map({'ham': 0, 'spam': 1})
y_test_num = y_test.map({'ham': 0, 'spam': 1})

rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
xgb_model = XGBClassifier(eval_metric='logloss')
rf_model.fit(X_train, y_train_num)
xgb_model.fit(X_train, y_train_num)

print("Ensemble models (RF & XGB) trained successfully.")

Ensemble models (RF & XGB) trained successfully.


In [46]:
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

models_list = { "Naive Bayes": (model, y_test),
    "Logistic Regression": (lr_model, y_test),
    "Random Forest": (rf_model, y_test_num),
    "XGBoost": (xgb_model, y_test_num)}

comparison = []
for name, (clf, target) in models_list.items():
    preds = clf.predict(X_test)
    pos_label = 1 if name in ["Random Forest", "XGBoost"] else 'spam'

    comparison.append({"Model": name,
        "Accuracy": accuracy_score(target, preds),
        "Precision": precision_score(target, preds, pos_label=pos_label),
        "Recall": recall_score(target, preds, pos_label=pos_label),
        "F1-Score": f1_score(target, preds, pos_label=pos_label)})

df_metrics = pd.DataFrame(comparison)
print(df_metrics.sort_values(by="F1-Score", ascending=False))

                 Model  Accuracy  Precision    Recall  F1-Score
2        Random Forest  0.968085   1.000000  0.764286  0.866397
0          Naive Bayes  0.967118   1.000000  0.757143  0.861789
3              XGBoost  0.966151   0.964602  0.778571  0.861660
1  Logistic Regression  0.950677   0.978495  0.650000  0.781116
