In [64]:
import re
import joblib
from pathlib import Path
from typing import List, Tuple
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import joblib  # is a library often used for saving and loading Python objects efficiently, especially large ones
import re   # re is the regular expressions module in Python, used for searching, matching, and manipulating strings using regex patterns



# Load the data  -- >   Two columns   1.text (email body/subject)  , 2. lable( 0 = ham/not_spam , 1 = spam)

def build_demo_dataframe() -> pd.DataFrame:
    return pd.DataFrame({
        "text": [
            "limited time offer!!! click on this link to win a prize",  # spam
            "URGENT: Your account has been suspended please verify now.",  # spam
            "you won a lottery, claim your cash reward now!",  # spam
            "Hey, we can reschedule our meeting tomorrow?",  # ham
            "here are the slides from the class -- see you!",  # ham
            "Don’t forget our coffee meeting later today",  # ham
            "please review the attached report and share feedback",  # ham
            
        ],
        "lable": [1, 1, 1, 0, 0, 0, 0]
    })


# light text cleaner

def clean_text(s: str) -> str:
    s = s.lower()
    s = re.sub(r"(https?://\S+| www\.S+)", "URL", s)
    s = re.sub(r"[^a-z0-9@.\s]", "", s)
    s = re.sub(r"\s+", "", s).strip()
    
    return s



def apply_cleaning(df : pd.DataFrame ,text_col: str ="text") -> pd.DataFrame:
    out = df.copy()
    out[text_col] = out[text_col].astype(str).map(clean_text)
    return out


# Model PipeLine

def make_pipeline() -> Pipeline:
    """ TF - IDF 9 (uni- bi- grams) + logistic regression """
    return Pipeline(

        steps=[
            ("tfidf", TfidfVectorizer(ngram_range =(1,2), min_df =1)),
            ("clf", LogisticRegression(max_iter =2000)),
        ]
    )



# train test the model


def train_and_evaluate(
    df:pd.DataFrame,text_col: str ="text" , lable_col: str ="lable" , test_size:float = 0.2) -> tuple[Pipeline, dict]:

    x= df[text_col].values
    y= df[lable_col].values


    x_train ,x_test ,y_train ,y_test = train_test_split(x,y, test_size=test_size, random_state =  30, stratify =y)


    pipe = make_pipeline()
    pipe.fit(x_train , y_train)

    y_pred = pipe.predict(x_test)

    metrics ={
         "accuracy": float(accuracy_score(y_test, y_pred)),
        "confusion_matrix" : confusion_matrix(y_test,y_pred).tolist(),
        "classification_report": classification_report(y_test, y_pred, digits= 3),
    }


    return pipe, metrics




# presist / load


def save_model(model: Pipeline, path :str) -> str:
    Path(path).parent.mkdir(parents =True , exist_ok = True)
    joblib.dump(model,path)
    return path

def load_model(path:str) -> Pipeline:
    return joblib.load(path)


# Inference helper

def predict_texts(model: Pipeline, texts: List[str]) -> List[int]:
    cleaned =[clean_text(t) for t in texts]
    preds= model.predict(cleaned)
    return preds.tolist()



# Main

def main():
  # 1. Load/prepare your data

  df = build_demo_dataframe()
  df = apply_cleaning(df, text_col ="text")

 # 2. Train/evaluate

  model, metrics = train_and_evaluate(df, text_col ="text", lable_col ="lable", test_size =0.25)
  print("accuracy:" , metrics["accuracy"])
  print("Confusion_matrix:", metrics["confusion_matrix"])
  print("classification_report:\n",metrics["classification_report"] )


 # 3. save model

  model_path = "spam_text_classifier.joblib"
  save_model(model, model_path)
  print(f"Model saved to : {model_path}")

# 4. Demo Inference

  demo_texts =[

      "Claim your prize now!!! Click here.",
      "here are the slides from the class -- see you!",
      "Can we Move the Meeting to tomorrow afternoon?",
      "Your acccount has been suspended. verify your account immediately",
      "you won a lottery , claim your cash reward now!",
      "here are the notes from the class",
       "URGENT: Your account has been suspended please verify now.", 
      
  ]



  demo_preds = predict_texts(model, demo_texts)
  for t, p in zip(demo_texts , demo_preds):
      label ="spam" if p==1 else "ham"
      print(f"[{label}]  {t}")



if __name__ =="__main__":
    main()





accuracy: 0.5
Confusion_matrix: [[1, 0], [1, 0]]
classification_report:
               precision    recall  f1-score   support

           0      0.500     1.000     0.667         1
           1      0.000     0.000     0.000         1

    accuracy                          0.500         2
   macro avg      0.250     0.500     0.333         2
weighted avg      0.250     0.500     0.333         2

Model saved to : spam_text_classifier.joblib
[ham]  Claim your prize now!!! Click here.
[ham]  here are the slides from the class -- see you!
[ham]  Can we Move the Meeting to tomorrow afternoon?
[ham]  Your acccount has been suspended. verify your account immediately
[spam]  you won a lottery , claim your cash reward now!
[ham]  here are the notes from the class
[ham]  URGENT: Your account has been suspended please verify now.


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
