In [8]:
import re   # re is the regular expressions module in Python, used for searching, matching, and manipulating strings using regex patterns
import joblib # is a library often used for saving and loading Python objects efficiently, especially large ones
from typing import List, Tuple
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score




# Load the data: two columns -- 1. text (email body/subject), 2. label (0 = ham/not spam, 1 = spam)

def build_demo_dataframe() -> pd.DataFrame:
    """Return a tiny hand-written spam/ham dataset for the demo.

    Returns:
        A DataFrame with a ``text`` column holding the raw messages and a
        ``label`` column where 1 marks spam and 0 marks ham. The four spam
        rows come first, followed by the four ham rows.
    """
    spam_messages = [
        "limited time offer!!! click on this link to win a prize",
        "URGENT: Your account has been suspended please verify now.",
        "you won a lottery, claim your cash reward now!",
        "Free gift card!!! click here to redeem today",
    ]
    ham_messages = [
        "Hey, we can reschedule our meeting tomorrow?",
        "here are the slides from the class -- see you!",
        "Lunch at 1pm? Let's meet in the cafeteria.",
        "Project update attached. Please review when you can.",
    ]

    # Labels are generated to line up with the concatenated message order.
    labels = [1] * len(spam_messages) + [0] * len(ham_messages)
    return pd.DataFrame({"text": spam_messages + ham_messages, "label": labels})


# light text cleaner


def clean_text(s: str) -> str:
    """Normalize a raw message into a lower-case, whitespace-collapsed string.

    Steps, in order:
      1. Lower-case the text.
      2. Replace every ``http(s)://...`` or ``www....`` run with the token
         ``url``.
      3. Replace every character other than ``a-z``, ``0-9``, ``@``, ``.``
         and whitespace with a space.
      4. Collapse whitespace runs to a single space and strip the ends.

    Args:
        s: The raw text to clean.

    Returns:
        The cleaned text; an empty string if nothing survives cleaning.
    """
    s = s.lower()

    # The placeholder must be lower-case: the character filter below only
    # keeps a-z, so an upper-case "URL" token would be erased right after
    # being inserted and the "message contains a link" signal would be lost.
    s = re.sub(r"(https?://\S+|www\.\S+)", " url ", s)

    s = re.sub(r"[^a-z0-9@.\s]", " ", s)   # keep alphanumerics, @, dot, whitespace

    s = re.sub(r"\s+", " ", s).strip()     # collapse to ONE space (not zero)
    return s


def apply_cleaning(df: pd.DataFrame, text_col: str = "text") -> pd.DataFrame:
    """Return a copy of ``df`` whose text column has been run through ``clean_text``.

    Args:
        df: Input frame; it is not modified.
        text_col: Name of the column to clean. Its values are cast to ``str``
            before cleaning.

    Returns:
        A new DataFrame with the same columns, where ``text_col`` holds the
        cleaned strings.
    """
    cleaned_column = df[text_col].astype(str).map(clean_text)

    # Work on a copy so the caller's frame is left untouched.
    result = df.copy()
    result[text_col] = cleaned_column
    return result


# Model PipeLine

def make_pipeline() -> Pipeline:
    """Build the unfitted model: TF-IDF (uni- and bi-grams) + logistic regression.

    Returns:
        A two-step ``Pipeline`` with a ``"tfidf"`` vectorizer step followed by
        a ``"clf"`` classifier step.
    """
    vectorizer = TfidfVectorizer(ngram_range=(1, 2), min_df=1)
    classifier = LogisticRegression(max_iter=2000)
    return Pipeline(steps=[("tfidf", vectorizer), ("clf", classifier)])



# Train and evaluate the model


def train_and_evaluate(
    df: pd.DataFrame,
    text_col: str = "text",
    label_col: str = "label",
    test_size: float = 0.2,
) -> tuple[Pipeline, dict]:
    """Fit the pipeline on a stratified train split and score it on the rest.

    Args:
        df: Frame holding the text and label columns.
        text_col: Name of the column with the input texts.
        label_col: Name of the column with the class labels.
        test_size: Forwarded to ``train_test_split`` as the test share.

    Returns:
        ``(model, metrics)`` where ``model`` is the fitted pipeline and
        ``metrics`` is a dict with the keys ``"accuracy"`` (float),
        ``"confusion_matrix"`` (nested list) and ``"classification_report"``
        (formatted string).
    """
    features = df[text_col].values
    targets = df[label_col].values

    # Fixed seed and stratification keep the split reproducible and preserve
    # the class ratio in both partitions.
    split = train_test_split(
        features,
        targets,
        test_size=test_size,
        random_state=30,
        stratify=targets,
    )
    train_texts, test_texts, train_labels, test_labels = split

    model = make_pipeline()
    model.fit(train_texts, train_labels)
    predicted = model.predict(test_texts)

    report = {
        "accuracy": float(accuracy_score(test_labels, predicted)),
        "confusion_matrix": confusion_matrix(test_labels, predicted).tolist(),
        "classification_report": classification_report(test_labels, predicted, digits=3),
    }
    return model, report




# persist / load


def save_model(model: Pipeline, path: str) -> str:
    """Serialize ``model`` to ``path`` with joblib, creating parent folders.

    Args:
        model: The pipeline to persist.
        path: Destination file path.

    Returns:
        The same ``path`` that was passed in.
    """
    # Imported here because the file's import block does not bring ``Path``
    # into scope; without it this function raises NameError when called.
    from pathlib import Path

    # Make sure the target directory exists before joblib tries to write.
    Path(path).parent.mkdir(parents=True, exist_ok=True)
    joblib.dump(model, path)
    return path

def load_model(path: str) -> Pipeline:
    """Load and return the object stored at ``path`` via ``joblib.load``.

    NOTE(review): joblib files are pickle-based, so this should only be
    pointed at files from a trusted source.
    """
    restored = joblib.load(path)
    return restored


# Inference helper

def predict_texts(model: Pipeline, texts: List[str]) -> List[int]:
    """Clean each text with ``clean_text`` and return the model's predictions.

    Args:
        model: A fitted pipeline exposing ``predict``.
        texts: Raw input strings.

    Returns:
        The predictions converted to a plain Python list, one entry per text.
    """
    normalized = list(map(clean_text, texts))
    return model.predict(normalized).tolist()



# Main

def main():
    """Run the end-to-end demo: build data, train, report, save, and predict."""
    # 1. Load and clean the demo data
    frame = apply_cleaning(build_demo_dataframe(), text_col="text")

    # 2. Train and evaluate
    model, metrics = train_and_evaluate(
        frame,
        text_col="text",
        label_col="label",
        test_size=0.25,
    )
    print("accuracy:", metrics["accuracy"])
    print("Confusion_matrix:", metrics["confusion_matrix"])
    print("classification_report:\n", metrics["classification_report"])

    # 3. Save the fitted model
    model_path = "spam_text_classifier.joblib"
    save_model(model, model_path)
    print(f"Model saved to : {model_path}")

    # 4. Demo inference on a few unseen messages
    samples = [
        "Claim your prize now!!! Click here.",
        "Can we move the meeting to tomorrow afternoon?",
        "Your account has been suspended. Verify your account immediately",
        "you won a lottery, claim your cash reward now!",
        "Here are the notes from the class.",
        "Let's grab coffee at 10am next Tuesday.",
    ]
    predictions = predict_texts(model, samples)

    for message, predicted in zip(samples, predictions):
        # Label 1 is spam; anything else is reported as ham.
        if predicted == 1:
            label = "spam"
        else:
            label = "ham"
        print(f"[{label}]  {message}")



# Run the demo only when this file is executed as a script, not on import.
if __name__ =="__main__":
    main()





accuracy: 1.0
Confusion_matrix: [[1, 0], [0, 1]]
classification_report:
               precision    recall  f1-score   support

           0      1.000     1.000     1.000         1
           1      1.000     1.000     1.000         1

    accuracy                          1.000         2
   macro avg      1.000     1.000     1.000         2
weighted avg      1.000     1.000     1.000         2

Model saved to : spam_text_classifier.joblib
[spam]  Claim your prize now!!! Click here.
[ham]  Can we move the meeting to tomorrow afternoon?
[spam]  Your account has been suspended. Verify your account immediately
[spam]  you won a lottery, claim your cash reward now!
[ham]  Here are the notes from the class.
[ham]  Let's grab coffee at 10am next Tuesday.
