#Use supervised learning methods (Naive Bayes, Random Forest, GBM, XGBoost) to classify text messages (e.g., spam or ham) using embeddings like Word2Vec or GloVe.



1. Install Required Libraries


In [13]:
pip install pandas nltk scikit-learn gensim xgboost


Collecting gensim
  Downloading gensim-4.3.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.1 kB)
Collecting numpy>=1.23.2 (from pandas)
  Downloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting scipy>=1.6.0 (from scikit-learn)
  Downloading scipy-1.13.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.6/60.6 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
Downloading gensim-4.3.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (26.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m26.7/26.7 MB[0m [31m58.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━

2. Load and Inspect the Data

In [1]:
import pandas as pd
import nltk
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Load your dataset
df = pd.read_csv("spam.csv")  # replace with actual path

# Preprocessing setup
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    text = str(text).lower()
    text = re.sub(r'[^\w\s]', '', text)
    tokens = word_tokenize(text)
    tokens = [lemmatizer.lemmatize(w) for w in tokens if w not in stop_words]
    return tokens

# Apply preprocessing to 'text' column
df['tokens'] = df['text'].apply(preprocess_text)


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


4. Train Word2Vec or GloVe


In [2]:
from gensim.models import Word2Vec

# Train Word2Vec model
w2v_model = Word2Vec(sentences=df['tokens'], vector_size=100, window=5, min_count=1, workers=4)

# Function to get average word vectors for a sentence
import numpy as np

def get_vector(tokens):
    vectors = [w2v_model.wv[token] for token in tokens if token in w2v_model.wv]
    return np.mean(vectors, axis=0) if vectors else np.zeros(100)

df['embedding'] = df['tokens'].apply(get_vector)


5. Prepare Train/Test Data


In [9]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import numpy as np

# Encode target variable
le = LabelEncoder()
df['target'] = le.fit_transform(df['target'])

# Stack embeddings into matrix
X = np.vstack(df['embedding'])
y = df['target']

# Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

6. Train Classifiers

 **Naive Bayes**

In [4]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report

nb = GaussianNB()
nb.fit(X_train, y_train)
y_pred_nb = nb.predict(X_test)
print("Naive Bayes:\n", classification_report(y_test, y_pred_nb))


Naive Bayes:
               precision    recall  f1-score   support

         ham       0.93      0.42      0.58       965
        spam       0.18      0.80      0.29       150

    accuracy                           0.48      1115
   macro avg       0.55      0.61      0.44      1115
weighted avg       0.83      0.48      0.54      1115



**Random Forest**

In [6]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier()
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)
print("Random Forest:\n", classification_report(y_test, y_pred_rf))


Random Forest:
               precision    recall  f1-score   support

         ham       0.95      1.00      0.97       965
        spam       0.99      0.65      0.79       150

    accuracy                           0.95      1115
   macro avg       0.97      0.83      0.88      1115
weighted avg       0.95      0.95      0.95      1115



**Gradient Boosting Machine (GBM)**

In [7]:
from sklearn.ensemble import GradientBoostingClassifier

gbm = GradientBoostingClassifier()
gbm.fit(X_train, y_train)
y_pred_gbm = gbm.predict(X_test)
print("GBM:\n", classification_report(y_test, y_pred_gbm))


GBM:
               precision    recall  f1-score   support

         ham       0.94      0.99      0.96       965
        spam       0.90      0.59      0.71       150

    accuracy                           0.94      1115
   macro avg       0.92      0.79      0.84      1115
weighted avg       0.93      0.94      0.93      1115



**XGBoost**

In [10]:
from xgboost import XGBClassifier

xgb = XGBClassifier(use_label_encoder=False, eval_metric='logloss')
xgb.fit(X_train, y_train)
y_pred_xgb = xgb.predict(X_test)
print("XGBoost:\n", classification_report(y_test, y_pred_xgb))

Parameters: { "use_label_encoder" } are not used.



XGBoost:
               precision    recall  f1-score   support

           0       0.96      0.99      0.98       965
           1       0.92      0.73      0.82       150

    accuracy                           0.96      1115
   macro avg       0.94      0.86      0.90      1115
weighted avg       0.96      0.96      0.95      1115

