In [127]:
# ---------------------------
# SMS -> Sentence -> Word2Vec
# Classification pipeline
# ---------------------------

# --- Standard library (re) and core data libraries (numpy, pandas) ---
import re                       # regular expressions for cleaning text
import numpy as np              # numerical arrays and vector math
import pandas as pd             # DataFrame for tidy outputs and inspection

# --- Third-party NLP / ML imports ---
from gensim.utils import simple_preprocess     # light tokenizer + filtering (lowercase, short tokens)
import gensim                                   # for Word2Vec training
from nltk.tokenize import sent_tokenize         # sentence splitter (preserves sentence boundaries)
from nltk.corpus import stopwords               # english stopwords list
from tqdm import tqdm                           # progress bars (handy for longer corpora)
from sklearn.preprocessing import LabelEncoder  # convert string labels to integers
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# ---------------------------
# 1) Load the raw dataset
# ---------------------------
# The dataset file SMSSpamCollection.txt is expected to be tab-separated (label \t message)
# and contains lines like: "ham\tGo until jurong point, ...".
# Read the tab-separated corpus into a two-column DataFrame: 'label' (ham/spam)
# and 'message' (raw SMS text). The file has no header row, so names are supplied.
# NOTE(review): pandas' default quote handling can merge rows that contain stray
# double quotes; if the row count looks low, try quoting=csv.QUOTE_NONE - confirm.
messages = pd.read_csv(
    'SMSSpamCollection.txt',
    sep='\t',
    names=['label', 'message'],
)

# ---------------------------
# 2) Prepare stopwords and containers
# ---------------------------
# A set gives constant-time average membership tests during token filtering.
stop_words = set(stopwords.words('english'))

# Three parallel lists, one entry per kept sentence:
#   sentence_texts  - cleaned tokens joined with spaces (for human inspection)
#   sentence_tokens - the token list itself (Word2Vec input)
#   sentence_labels - the parent message's label, repeated for each sentence
sentence_texts, sentence_tokens, sentence_labels = [], [], []

# ---------------------------
# 3) Sentence-level preprocessing
# ---------------------------
# Walk labels and messages in lockstep so every sentence inherits its message's label.
for msg_label, raw_msg in zip(messages['label'], messages['message']):
    # Split on the RAW text: punctuation must still be present for the
    # sentence tokenizer to find boundaries.
    for sent in sent_tokenize(raw_msg):
        # Blank out everything that is not an ASCII letter (digits, punctuation,
        # URL symbols), then lowercase.
        letters_only = re.sub('[^a-zA-Z]', ' ', sent)
        # Tokenize with gensim, then drop English stopwords.
        toks = [w for w in simple_preprocess(letters_only.lower()) if w not in stop_words]
        # Sentences with no surviving tokens are dropped entirely; because all
        # three appends happen together, the lists stay aligned.
        if toks:
            sentence_texts.append(' '.join(toks))
            sentence_tokens.append(toks)
            sentence_labels.append(msg_label)

# Sanity check: the three parallel lists must have identical lengths.
assert len(sentence_texts) == len(sentence_tokens) == len(sentence_labels), \
    f"Lengths mismatch: {len(sentence_texts)}, {len(sentence_tokens)}, {len(sentence_labels)}"

# Report how many sentence rows were produced from how many message rows.
print("messages rows:", len(messages))
print("sentence rows after cleaning:", len(sentence_texts))

# ---------------------------
# 4) Train Word2Vec on sentence tokens
# ---------------------------
# Hyper-parameters:
#   vector_size=100 - embedding dimensionality
#   window=5        - context window size
#   min_count=1     - keep every token, however rare (raise to >=2 to prune)
#   workers=4       - training threads
#   epochs=5        - passes over the corpus
# NOTE(review): the embeddings are trained on every sentence, including those
# that later land in the test split - confirm this is intended.
model = gensim.models.Word2Vec(
    sentences=sentence_tokens,
    vector_size=100,
    window=5,
    min_count=1,
    workers=4,
    epochs=5,
)

# ---------------------------
# 5) Build averaged sentence vectors (skip empties)
# ---------------------------
def avg_word2vec_from_tokens(tokens, model):
    """
    Average the embeddings of the in-vocabulary tokens of one sentence.

    Parameters:
        tokens: iterable of token strings.
        model: object exposing `model.wv[token]` and `model.wv.key_to_index`
               (a trained gensim Word2Vec model in this notebook).

    Returns:
        The element-wise mean of the vectors of all tokens found in the
        vocabulary, or None when no token is in the vocabulary (the caller
        is expected to skip such sentences).
    """
    keyed_vectors = model.wv
    vocabulary = keyed_vectors.key_to_index

    # Gather one vector per known token; unknown tokens are ignored.
    embeddings = []
    for token in tokens:
        if token in vocabulary:
            embeddings.append(keyed_vectors[token])

    # Nothing to average: signal "no vector" instead of returning NaNs.
    if len(embeddings) == 0:
        return None

    # Mean across tokens -> a single vector with the embedding's dimensionality.
    return np.mean(embeddings, axis=0)

# We'll build lists and keep only sentences that produced a real vector
# Compute one candidate (vector, label, text) triple per sentence. The vector is
# None when none of the sentence's tokens are in the Word2Vec vocabulary.
sentence_candidates = [
    (avg_word2vec_from_tokens(sent_toks, model), sent_label, sent_text)
    for sent_toks, sent_label, sent_text in zip(sentence_tokens, sentence_labels, sentence_texts)
]

# Keep only sentences that produced a real vector. Filtering whole triples keeps
# vectors, labels and texts aligned by construction.
kept_candidates = [triple for triple in sentence_candidates if triple[0] is not None]

X_list = [vec for vec, _, _ in kept_candidates]          # 1D sentence vectors
y_list = [lbl for _, lbl, _ in kept_candidates]          # string labels
kept_texts = [txt for _, _, txt in kept_candidates]      # readable cleaned sentences

# Stack the 1D vectors into a 2D feature matrix (one row per kept sentence).
X = np.vstack(X_list)
# Labels as a NumPy array (still strings at this point).
y = np.array(y_list)

# Shape report: all three counts should match.
print("avg_vectors.shape:", X.shape)
print("y.shape:", y.shape)
print("kept_texts:", len(kept_texts))

# ---------------------------
# 6) Label encoding (ham/spam -> 0/1)
# ---------------------------
# Learn the string -> integer mapping from the labels, then apply it.
le = LabelEncoder().fit(y)
y_encoded = le.transform(y)
print("label mapping:", dict(zip(le.classes_, le.transform(le.classes_))))

# ---------------------------
# 7) Train / Test split
# ---------------------------
# Split features, encoded labels and row indices together; the indices let us
# look up the cleaned sentence text (kept_texts) for any train/test row later.
# stratify=y_encoded keeps the class proportions the same in both parts.
# NOTE(review): rows are sentences, so sentences from one SMS can end up on
# both sides of the split - confirm whether a message-level split is wanted.
(
    X_train, X_test,
    y_train, y_test,
    idx_train, idx_test,
) = train_test_split(
    X,
    y_encoded,
    np.arange(len(X)),
    test_size=0.2,
    random_state=42,
    stratify=y_encoded,
)

# ---------------------------
# 8) Train classifier (RandomForest)
# ---------------------------
# 200 trees, fixed seed; fit() returns the fitted estimator itself.
clf = RandomForestClassifier(n_estimators=200, random_state=42).fit(X_train, y_train)

# ---------------------------
# 9) Evaluate on test set
# ---------------------------
# Integer class predictions for the held-out rows.
y_pred = clf.predict(X_test)

# Overall fraction of correct predictions.
print("Accuracy:", accuracy_score(y_test, y_pred))

# Rows are true classes, columns are predicted classes.
print("Confusion matrix:\n", confusion_matrix(y_test, y_pred))

# Per-class precision / recall / F1, labelled with the original class names.
print(classification_report(y_test, y_pred, target_names=le.classes_))

# ---------------------------
# 10) Print sample predictions with readable text
# ---------------------------
# Use idx_test to map test rows back to their original cleaned sentence text in kept_texts
# Show up to ten test rows. Slicing idx_test and zipping the three arrays stops
# at the shortest one, so a test split with fewer than ten rows no longer
# raises IndexError (the previous hard-coded range(10) did).
for text_idx, predicted, actual in zip(idx_test[:10], y_pred, y_test):
    # idx_test holds positions into kept_texts for each test row.
    print("text:", kept_texts[text_idx])
    print("pred:", 'SPAM' if predicted == 1 else 'HAM', "actual:", 'SPAM' if actual == 1 else 'HAM')
    print("-" * 60)


messages rows: 5572
sentence rows after cleaning: 10713
avg_vectors.shape: (10713, 100)
y.shape: (10713,)
kept_texts: 10713
label mapping: {np.str_('ham'): np.int64(0), np.str_('spam'): np.int64(1)}
Accuracy: 0.918338777414839
Confusion matrix:
 [[1681   18]
 [ 157  287]]
              precision    recall  f1-score   support

         ham       0.91      0.99      0.95      1699
        spam       0.94      0.65      0.77       444

    accuracy                           0.92      2143
   macro avg       0.93      0.82      0.86      2143
weighted avg       0.92      0.92      0.91      2143

text: way back call
pred: HAM actual: HAM
------------------------------------------------------------
text: sms
pred: SPAM actual: SPAM
------------------------------------------------------------
text: okie thanx
pred: HAM actual: HAM
------------------------------------------------------------
text: speak live operator claim call pm
pred: SPAM actual: SPAM
--------------------------------------

In [125]:
# ---- Test with a new message (like TF-IDF style) ----
new_text = "WIN a FREE prize! Click here now."
# Same cleaning as training: letters only, lowercase, gensim tokenization, stopwords removed.
tokens = [w for w in simple_preprocess(re.sub('[^a-zA-Z]', ' ', new_text).lower()) if w not in stop_words]
# Embeddings of the tokens the Word2Vec model actually knows.
known_vecs = [model.wv[w] for w in tokens if w in model.wv.key_to_index]
if known_vecs:
    # Average to one sentence vector and shape it as a single-row batch.
    new_vec = np.mean(known_vecs, axis=0).reshape(1, -1)
else:
    # No in-vocabulary token: averaging an empty list would produce NaN and make
    # clf.predict fail, so fall back to a zero vector of the right width.
    new_vec = np.zeros((1, model.vector_size))
pred = clf.predict(new_vec)[0]
# NOTE(review): the label suffixes below look mis-encoded (probably emoji) - confirm the file encoding.
print("\n# New Text Prediction:", "SPAM üö®" if pred == 1 else "HAM ‚úÖ")



# New Text Prediction: SPAM üö®


In [128]:
# --- Import required libraries ---
import re                     # Regular expressions for text cleaning (e.g., removing punctuation/numbers)
import numpy as np             # For numerical computations and handling vectors
import pandas as pd            # For creating DataFrames (nice tabular results)
from gensim.utils import simple_preprocess   # Lightweight tokenizer & cleaner from gensim
from nltk.corpus import stopwords            # Stopword list from NLTK (common useless words like 'the', 'is')

# --- Reusable stopword lookup ---
# Built once as a set so each membership test during token filtering is
# constant-time on average (a list would be scanned linearly).
stop_words = set()
stop_words.update(stopwords.words('english'))


# ===============================
# 1) TEXT PREPROCESSING FUNCTION
# ===============================
def preprocess_text(text):
    """
    Turn one raw text string into a list of cleaned tokens.

    Steps: replace every character that is not an ASCII letter with a space,
    lowercase, tokenize with gensim's simple_preprocess (which also drops
    tokens outside its default length limits), and remove English stopwords
    using the module-level `stop_words` set.

    Returns:
        list[str]: the surviving tokens, in their original order.
    """
    # Digits, punctuation and symbols become spaces so they act as separators.
    letters_only = re.sub('[^a-zA-Z]', ' ', text)

    # Tokenize the lowercased text.
    candidates = simple_preprocess(letters_only.lower())

    # Filter out stopwords, preserving token order.
    kept = []
    for word in candidates:
        if word in stop_words:
            continue
        kept.append(word)
    return kept


# ================================================
# 2) FUNCTION: CONVERT TEXTS -> AVG WORD2VEC VECTORS
# ================================================
def texts_to_avgvecs(texts, model, fallback='zero'):
    """
    Convert raw text strings into averaged Word2Vec embeddings, one row per text.

    Parameters:
        texts (iterable[str]): Raw text strings to convert.
        model: Trained gensim Word2Vec model; `model.wv` supplies the vectors
               and `model.vector_size` the embedding width.
        fallback (str): What to do when a text has no in-vocabulary token.
                        'zero' = use a zero vector for that row.
                        'skip' = accepted for backward compatibility; the row is
                                 ALSO filled with a zero vector, because the
                                 result must stay aligned with `texts`.

    Returns:
        np.ndarray of shape (n_texts, vector_size). Row i always corresponds to
        texts[i]. Empty input yields an array of shape (0, vector_size).

    Raises:
        ValueError: if `fallback` is neither 'zero' nor 'skip'. Previously an
                    unknown value silently dropped rows and broke alignment.
    """
    if fallback not in ('zero', 'skip'):
        raise ValueError(f"fallback must be 'zero' or 'skip', got {fallback!r}")

    dim = model.vector_size
    keyed_vectors = model.wv
    vocabulary = keyed_vectors.key_to_index

    rows = []
    for t in texts:
        # Clean and tokenize with the same routine used everywhere else.
        toks = preprocess_text(t)

        # Keep only vectors for tokens the model knows.
        word_vecs = [keyed_vectors[w] for w in toks if w in vocabulary]

        if word_vecs:
            # Mean of the token vectors = sentence embedding.
            rows.append(np.mean(word_vecs, axis=0))
        else:
            # No known token: emit a zero row so row i still maps to texts[i].
            rows.append(np.zeros(dim))

    # np.vstack rejects an empty sequence, so handle "no texts" explicitly.
    if not rows:
        return np.empty((0, dim))

    return np.vstack(rows)


# ===================================
# 3) FUNCTION: BATCH PREDICT TEXTS
# ===================================
def batch_predict_texts(texts, model, clf, return_proba=True, label_encoder=None):
    """
    Classify a batch of raw texts with a Word2Vec model and a fitted classifier.

    Parameters:
        texts (iterable[str]): Raw input messages to classify.
        model: Trained gensim Word2Vec model (used to compute embeddings).
        clf: Fitted classifier exposing `predict` and, optionally, `predict_proba`.
        return_proba (bool): Include probability columns when the classifier
                             supports `predict_proba`.
        label_encoder: Optional fitted encoder whose `classes_` maps integer
                       predictions back to label names. When omitted, a
                       module-level `le` is used if one exists.

    Returns:
        pd.DataFrame with columns:
            'text'        - the input text
            'pred_label'  - the classifier's prediction
            'pred_name'   - label name (only when an encoder is available)
            'prob_spam'   - probability of class index 1 (two-class case), or
            'prob_class_i' - one column per class (more than two classes)
        Empty input returns an empty frame with just 'text' and 'pred_label'.
    """
    # Materialize once: the texts are needed both for embedding and for the
    # output table, and a one-shot iterator would be exhausted after the first.
    texts = list(texts)
    if not texts:
        # Nothing to embed or predict; avoid calling the classifier on 0 rows.
        return pd.DataFrame({'text': [], 'pred_label': []})

    # Embed every text as an averaged word vector (zero vector when nothing is known).
    X_batch = texts_to_avgvecs(texts, model, fallback='zero')

    # Class predictions for each row.
    preds = clf.predict(X_batch)

    # Per-class probabilities, shape (n_samples, n_classes), if requested and supported.
    probs = clf.predict_proba(X_batch) if return_proba and hasattr(clf, "predict_proba") else None

    df = pd.DataFrame({'text': texts, 'pred_label': preds})

    # Resolve the encoder explicitly instead of swallowing every exception:
    # prefer the argument, otherwise fall back to a module-level `le` if defined.
    encoder = label_encoder if label_encoder is not None else globals().get('le')
    class_names = getattr(encoder, 'classes_', None)  # None when missing or unfitted
    if class_names is not None:
        # Position i in classes_ is the integer code for that class name.
        df['pred_name'] = df['pred_label'].map(dict(enumerate(class_names)))

    if probs is not None:
        if probs.shape[1] == 2:
            # Two classes: report only the probability of class index 1.
            df['prob_spam'] = probs[:, 1]
        else:
            # Otherwise: one probability column per class index.
            for i in range(probs.shape[1]):
                df[f'prob_class_{i}'] = probs[:, i]

    return df


# ========================
# 4) EXAMPLE USAGE / DEMO
# ========================
# Four hand-written messages covering both classes.
batch = [
    # Promotional wording - expected to look like spam.
    "WIN a FREE prize! Click here now.",
    # Everyday conversation - expected to look like ham.
    "Hey, are we meeting for lunch tomorrow?",
    # Short reward link - spam-like.
    "Claim your reward: http://bit.ly/xyz",
    # Competition entry wording - spam-like.
    "Free entry in 2 a weekly competition to win tickets",
]

# Run the whole batch through embedding + classifier (labels and probabilities).
result_df = batch_predict_texts(texts=batch, model=model, clf=clf)

# One row per input message.
print(result_df)


                                                text  pred_label pred_name  \
0                  WIN a FREE prize! Click here now.           1      spam   
1            Hey, are we meeting for lunch tomorrow?           0       ham   
2               Claim your reward: http://bit.ly/xyz           0       ham   
3  Free entry in 2 a weekly competition to win ti...           1      spam   

   prob_spam  
0   0.510000  
1   0.205000  
2   0.301667  
3   0.762500  
