In [11]:

# ========================= MODELING PART =========================
import numpy as np
import re
import string
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV


# Text cleaning
def normalize_text(text):
    """Return a lowercase, digit-free, punctuation-free version of *text*.

    Used as the ``preprocessor`` of the TF-IDF vectorizer, so it is called
    once per document with a single string.

    Steps, in order:
      1. lowercase,
      2. drop every run of digits,
      3. drop ASCII punctuation (``string.punctuation``),
      4. collapse whitespace runs to one space and trim both ends.

    Whitespace is collapsed *after* punctuation is removed; doing it before
    leaves double spaces behind wherever a stand-alone punctuation mark was
    deleted (e.g. ``"a - b"``).
    """
    text = text.lower()
    text = re.sub(r"\d+", "", text)
    text = text.translate(str.maketrans("", "", string.punctuation))
    # Collapse last so gaps opened by the two removals above are closed too.
    return re.sub(r"\s+", " ", text).strip()


# Extra numeric features
def extra_features(X):
    """Compute five hand-crafted numeric features for each message.

    Parameters
    ----------
    X : pandas.DataFrame, pandas.Series, or array-like of str
        For a DataFrame (or any 2-D array-like) only the first column is
        used. Missing values are treated as empty strings.

    Returns
    -------
    numpy.ndarray of shape (n_messages, 5)
        Columns, in order:
          0. character count,
          1. whitespace-separated token count,
          2. number of ``!`` characters,
          3. number of digit characters,
          4. uppercase characters divided by ``len(text) + 1``.
    """
    if isinstance(X, pd.DataFrame):
        txt = X.iloc[:, 0]
    elif isinstance(X, pd.Series):
        txt = X
    else:
        # Lists / ndarrays have no ``.str`` accessor; wrap them in a Series.
        values = np.atleast_1d(np.asarray(X, dtype=object))
        if values.ndim > 1:
            values = values[:, 0]  # mirror the DataFrame branch: first column
        txt = pd.Series(values)

    # Missing text would yield NaN features and break the isupper() lambda,
    # so treat it as an empty message.
    txt = txt.fillna("").astype(str)

    return np.column_stack([
        txt.str.len(),
        txt.str.split().str.len(),
        txt.str.count("!"),
        txt.str.count(r"\d"),
        # +1 in the denominator avoids division by zero on empty messages.
        txt.apply(lambda t: sum(c.isupper() for c in t) / (len(t) + 1)),
    ])


# Feature pipeline
# Two feature branches, concatenated column-wise by the ColumnTransformer:
#   * "tfidf": uni- and bi-gram TF-IDF over the cleaned "text" column
#              (the column is selected as a string, i.e. passed 1-D);
#   * "stats": the numeric features from extra_features
#              (the column is selected as a list, i.e. passed as a DataFrame).
preprocess = ColumnTransformer(
    transformers=[
        (
            "tfidf",
            TfidfVectorizer(
                max_features=3000,
                ngram_range=(1, 2),
                stop_words="english",
                preprocessor=normalize_text,
            ),
            "text",
        ),
        (
            "stats",
            FunctionTransformer(func=extra_features, validate=False),
            ["text"],
        ),
    ]
)


def build_model(model):
    """Return an unfitted Pipeline: feature extraction followed by *model*.

    Parameters
    ----------
    model : estimator
        Classifier placed in the final ``"clf"`` step.

    Returns
    -------
    sklearn.pipeline.Pipeline
        Steps ``"prep"`` (a fresh copy of the module-level ``preprocess``
        transformer) and ``"clf"`` (*model*).

    Notes
    -----
    Pipeline does not copy its steps, so passing ``preprocess`` directly
    would make every pipeline built here share one transformer object, and
    fitting any of them would overwrite the fitted state seen by the others.
    Cloning gives each pipeline its own independent, unfitted transformer;
    as a consequence the module-level ``preprocess`` itself stays unfitted.
    """
    # Local import keeps this function self-contained within the cell.
    from sklearn.base import clone

    return Pipeline([
        ("prep", clone(preprocess)),
        ("clf", model),
    ])


def evaluate(model, X, y):
    """Print accuracy and the classification report of *model* on ``(X, y)``.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``
    X : features accepted by ``model.predict``
    y : true labels aligned with ``X``
    """
    # Predict once and reuse: both metrics are computed on the same output.
    y_pred = model.predict(X)
    print("Accuracy:", accuracy_score(y, y_pred))
    print(classification_report(y, y_pred))


# Load splits
# Reads the train / validation / test CSV files via relative paths.
# The code below reads a "text" column (message) and a "category" column
# (label) from each frame.
# NOTE(review): `pd` is not imported in this cell; presumably
# `import pandas as pd` was executed in an earlier cell. Confirm.
train_data = pd.read_csv("train_data.csv")
val_data   = pd.read_csv("val_data.csv")
test_data  = pd.read_csv("test_data.csv")


# Train baseline models
# Candidate classifiers, keyed by the display name used in the log output
# and as the key of `scores`.
models = {
    "LogReg": LogisticRegression(max_iter=1000),
    "NaiveBayes": MultinomialNB(),
    "LinearSVM": LinearSVC()
}

# name -> (fitted pipeline, validation accuracy); read later to pick the
# best baseline.
scores = {}

for name, algo in models.items():
    print("=" * 80)
    print("Training:", name)

    # Fit the full pipeline (features + classifier) on the training split only.
    model = build_model(algo)
    model.fit(train_data[["text"]], train_data["category"])

    # Train accuracy is printed for comparison with validation accuracy;
    # only the validation accuracy is stored.
    train_acc = accuracy_score(train_data["category"], model.predict(train_data[["text"]]))
    val_acc   = accuracy_score(val_data["category"], model.predict(val_data[["text"]]))

    scores[name] = (model, val_acc)

    print("Train Acc:", train_acc)
    print("Val Acc:", val_acc)


# Hyperparameter tuning
# Grid over the logistic-regression regularisation parameter C; the
# "clf__" prefix routes each parameter to the pipeline's "clf" step.
# The solver is fixed to liblinear (a single-value grid entry).
params = {
    "clf__C": [0.01, 0.1, 1, 10],
    "clf__solver": ["liblinear"]
}

# 5-fold cross-validated search on the training split, scored by accuracy.
# n_jobs=-1 asks scikit-learn to use all available processors.
grid_model = GridSearchCV(
    build_model(LogisticRegression(max_iter=1000)),
    params,
    cv=5,
    scoring="accuracy",
    n_jobs=-1
)

grid_model.fit(train_data[["text"]], train_data["category"])

# The tuned model is only reported here; it is not added to `scores`, so it
# does not take part in the best-model selection below.
print("Best Params:", grid_model.best_params_)
print("Tuned Val Accuracy:",
      accuracy_score(val_data["category"],
                     grid_model.predict(val_data[["text"]])))


# Select best baseline
# Pick the entry of `scores` with the highest validation accuracy
# (scores[k] is a (fitted pipeline, val_acc) tuple). On ties, max() keeps
# the first such key in dict insertion order.
best_name = max(scores, key=lambda k: scores[k][1])
best_model = scores[best_name][0]

print("Best Baseline Model:", best_name)


# Test evaluation
# The test split is used only here, after model selection on validation data.
print("=" * 80)
print("Test Performance")
evaluate(best_model, test_data[["text"]], test_data["category"])


# ========================= PREDICTION =========================
def predict_sms(model, message):
    """Classify a single SMS *message* with a fitted *model*.

    Parameters
    ----------
    model : fitted estimator
        Anything exposing ``predict`` on a DataFrame with a ``"text"``
        column, e.g. a pipeline from ``build_model`` or a fitted search
        object wrapping one.
    message : str
        Raw message text.

    Returns
    -------
    tuple
        ``(pred, prob)`` where ``pred`` is the predicted label and ``prob``
        is the first row of ``model.predict_proba`` or ``None`` when the
        model does not offer probability estimates.
    """
    df = pd.DataFrame({"text": [message]})
    pred = model.predict(df)[0]

    # Ask the model itself rather than model.named_steps["clf"]: that lookup
    # only works for a Pipeline with a step called "clf", whereas hasattr
    # works for any estimator that exposes (or hides) predict_proba.
    prob = model.predict_proba(df)[0] if hasattr(model, "predict_proba") else None

    return pred, prob


# Hand-written example messages for a quick manual check of the selected model.
sample_msgs = [
    "Limited time offer! Get a loan approved instantly with zero paperwork.",
    "Maa said dinner is ready, come home soon.",
    "Your bank account will be blocked. Verify details immediately.",
    "Can you send me yesterday's lecture notes?",
    "Exclusive deal!!! Buy 1 get 3 FREE. Click the link now!!!",
    "I'll be late today, stuck in traffic.",
    "You have been selected for a lucky draw reward worth ₹1 lakh.",
    "Meeting has been postponed to tomorrow 10 AM.",
    "Act now! Your SIM card will be deactivated within 24 hours.",
    "Happy birthday! Have an amazing year ahead "
]

for msg in sample_msgs:
    label, prob = predict_sms(best_model, msg)
    print("-" * 60)
    print("Message:", msg)
    # Label 1 is displayed as SPAM, anything else as HAM.
    print("Prediction:", "SPAM" if label == 1 else "HAM")
    # prob is None when predict_sms found no predict_proba on the classifier.
    if prob is not None:
        # NOTE(review): the [HAM, SPAM] order assumes the classifier's classes
        # are ordered (0, 1); confirm against the fitted model's classes_.
        print("Probabilities [HAM, SPAM]:", prob)


Training: LogReg
Train Acc: 0.9792703150912107
Val Acc: 0.9690322580645161
Training: NaiveBayes
Train Acc: 0.9709784411276948
Val Acc: 0.9625806451612903
Training: LinearSVM
Train Acc: 0.9983416252072969
Val Acc: 0.9806451612903225
Best Params: {'clf__C': 10, 'clf__solver': 'liblinear'}
Tuned Val Accuracy: 0.9806451612903225
Best Baseline Model: LinearSVM
Test Performance
Accuracy: 0.9858247422680413
              precision    recall  f1-score   support

           0       0.99      1.00      0.99       678
           1       0.97      0.92      0.94        98

    accuracy                           0.99       776
   macro avg       0.98      0.96      0.97       776
weighted avg       0.99      0.99      0.99       776

------------------------------------------------------------
Message: Limited time offer! Get a loan approved instantly with zero paperwork.
Prediction: HAM
------------------------------------------------------------
Message: Maa said dinner is ready, come home soon.
