In [89]:
import gensim
from gensim.models import Word2Vec, KeyedVectors
from gensim.utils import simple_preprocess # Covert a documements into a list of lowecase tokens, ignoring tokens that are too short or too long
import pandas as pd
import numpy as np
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize 
np.set_printoptions(edgeitems=30, linewidth=100000, formatter=dict(float=lambda x: "%.3g" % x))


## ⚙️ What is tqdm?
- `tqdm` is a progress bar library in Python.
- It helps you visually track the progress of loops 
- especially helpful when processing large datasets, training models, or running time-consuming tasks.

In [90]:
!pip install tqdm



In [91]:
from tqdm import tqdm

In [92]:
# If needed (run once in environment):
# import nltk
# nltk.download('punkt')
# nltk.download('stopwords')

# Read the SMS Spam Collection file into a DataFrame.
# File expected to be tab-separated with two columns: label and message.
messages = pd.read_csv('SMSSpamCollection.txt', sep='\t', names=['label', 'message'])

In [93]:
# ---- Build stopwords ONCE ----
# Get the set of English stopwords from NLTK. Converting to a set
# gives O(1) lookups during token filtering and avoids rebuilding it
# inside the loop for every message.
stop_words = set(stopwords.words('english'))

In [94]:
# Corpus will hold cleaned text strings (one per message).
corpus = []
for i in tqdm(range(len(messages)), desc="Preparing Corpus Data", mininterval=1.0):
    # 1) Remove all non-letter characters and lowercase the text.
    #    This removes numbers, punctuation, URLs, etc., leaving only letters and spaces.
    review = re.sub('[^a-zA-Z]', ' ', messages['message'][i]).lower()

    # 2) Tokenize the cleaned text into word tokens.
    #    word_tokenize splits on whitespace and punctuation (we already removed most punctuation).
    tokens = word_tokenize(review)

    # 3) Filter tokens:
    #    - keep alphabetic tokens only (filter out residual tokens like empty strings),
    #    - remove stopwords to reduce noise (common words that don't help classification).
    #    This is done once per message; using the prebuilt stop_words set is efficient.
    filtered_words = [w for w in tokens if w.isalpha() and w not in stop_words]

    # 4) Join the filtered tokens back into a single cleaned string and append to corpus.
    #    The vectorizer later expects raw strings (one per sample).
    corpus.append(' '.join(filtered_words))

Preparing Corpus Data: 100%|██████████| 5572/5572 [00:00<00:00, 26713.97it/s]


In [95]:
# Outer loop: for c in corpus → iterates over each document/sentence in the corpus
# Inner loop: for t in sent_tokenize(c) → tokenizes each document into sentences
# Expression: simple_preprocess(t) → processes each sentence
# All results are appended directly into the words list

words = [simple_preprocess(t) for c in tqdm(corpus, desc="Cleaning-up Corpus") for t in sent_tokenize(c)]

words


Cleaning-up Corpus: 100%|██████████| 5572/5572 [00:00<00:00, 143216.63it/s]


[['go',
  'jurong',
  'point',
  'crazy',
  'available',
  'bugis',
  'great',
  'world',
  'la',
  'buffet',
  'cine',
  'got',
  'amore',
  'wat'],
 ['ok', 'lar', 'joking', 'wif', 'oni'],
 ['free',
  'entry',
  'wkly',
  'comp',
  'win',
  'fa',
  'cup',
  'final',
  'tkts',
  'st',
  'may',
  'text',
  'fa',
  'receive',
  'entry',
  'question',
  'std',
  'txt',
  'rate',
  'apply'],
 ['dun', 'say', 'early', 'hor', 'already', 'say'],
 ['nah', 'think', 'goes', 'usf', 'lives', 'around', 'though'],
 ['freemsg',
  'hey',
  'darling',
  'week',
  'word',
  'back',
  'like',
  'fun',
  'still',
  'tb',
  'ok',
  'xxx',
  'std',
  'chgs',
  'send',
  'rcv'],
 ['even', 'brother', 'like', 'speak', 'treat', 'like', 'aids', 'patent'],
 ['per',
  'request',
  'melle',
  'melle',
  'oru',
  'minnaminunginte',
  'nurungu',
  'vettam',
  'set',
  'callertune',
  'callers',
  'press',
  'copy',
  'friends',
  'callertune'],
 ['winner',
  'valued',
  'network',
  'customer',
  'selected',
  'receiv

## Train Word2Vec from scratch

In [96]:
model = gensim.models.Word2Vec(words, vector_size=100)

To get all the Vocabulary

In [97]:
model.wv.index_to_key

['call',
 'get',
 'ur',
 'gt',
 'lt',
 'ok',
 'go',
 'free',
 'know',
 'got',
 'good',
 'like',
 'day',
 'come',
 'time',
 'love',
 'send',
 'want',
 'text',
 'txt',
 'one',
 'going',
 'need',
 'home',
 'stop',
 'lor',
 'sorry',
 'today',
 'see',
 'still',
 'back',
 'da',
 'dont',
 'reply',
 'mobile',
 'hi',
 'take',
 'tell',
 'new',
 'later',
 'please',
 'pls',
 'think',
 'phone',
 'week',
 'dear',
 'well',
 'night',
 'much',
 'oh',
 'great',
 'hope',
 'msg',
 'claim',
 'hey',
 'na',
 'happy',
 'wat',
 'give',
 'way',
 'yes',
 'make',
 'work',
 'www',
 'number',
 'message',
 'wan',
 'tomorrow',
 'prize',
 'say',
 'right',
 'already',
 'said',
 'ask',
 'amp',
 'cash',
 'im',
 'yeah',
 'really',
 'life',
 'win',
 'find',
 'meet',
 'miss',
 'babe',
 'morning',
 'let',
 'last',
 'thanks',
 'cos',
 'would',
 'anything',
 'com',
 'uk',
 'also',
 'nokia',
 'lol',
 'care',
 'every',
 'pick',
 'sure',
 'keep',
 'sent',
 'min',
 'urgent',
 'something',
 'contact',
 'gud',
 'buy',
 'us',
 'cant'

To get the count of the Vocabulary

In [98]:
model.corpus_count

5564

`model.epochs` means how many times the training algorithm will iterate over the entire corpus during training.

In [99]:
# 
model.epochs



5

So, if `epochs=5`, the model will process every sentence 5 times, refining word vectors after each full pass.

`model.wv.similar_by_word('great')` returns the most similar words to the given word (here 'great') based on cosine similarity between word vectors.


In [100]:
model.wv.similar_by_word('great')


[('one', 0.9995867013931274),
 ('see', 0.9995717406272888),
 ('got', 0.999566912651062),
 ('think', 0.999565064907074),
 ('work', 0.9995591640472412),
 ('keep', 0.9995591044425964),
 ('like', 0.9995549917221069),
 ('today', 0.9995527863502502),
 ('go', 0.999552309513092),
 ('make', 0.9995503425598145)]

In [101]:
model.wv.similar_by_word('great', topn=20)

[('one', 0.9995867013931274),
 ('see', 0.9995717406272888),
 ('got', 0.999566912651062),
 ('think', 0.999565064907074),
 ('work', 0.9995591640472412),
 ('keep', 0.9995591044425964),
 ('like', 0.9995549917221069),
 ('today', 0.9995527863502502),
 ('go', 0.999552309513092),
 ('make', 0.9995503425598145),
 ('haha', 0.9995497465133667),
 ('much', 0.9995446801185608),
 ('always', 0.9995304346084595),
 ('know', 0.9995284080505371),
 ('new', 0.9995276927947998),
 ('good', 0.9995197057723999),
 ('also', 0.9995154142379761),
 ('try', 0.9995145201683044),
 ('back', 0.9995134472846985),
 ('msg', 0.9995107054710388)]

| Parameter | Description                                                       |
| --------- | ----------------------------------------------------------------- |
| `word`    | The word you want to find similar terms for.                      |
| `topn`    | (optional) Number of most similar words to return (default = 10). |

In [102]:
model.wv['good']

array([-0.28, 0.434, 0.109, 0.0591, 0.0277, -0.677, 0.28, 0.89, -0.247, -0.194, -0.357, -0.489, -0.139, 0.0718, 0.209, -0.443, 0.0755, -0.582, 0.041, -0.896, 0.27, 0.208, 0.208, -0.11, -0.165, 0.0444, -0.362, -0.424, -0.365, 0.303, 0.415, -0.0978, 0.275, -0.322, -0.195, 0.391, -0.165, -0.381, -0.298, -0.828, 0.109, -0.297, -0.0515, 0.12, 0.268, -0.295, -0.284, -0.0261, 0.303, 0.411, 0.197, -0.414, -0.188, 0.0128, -0.319, 0.186, 0.177, -0.185, -0.602, 0.186, 0.155, 0.0996, 0.0148, -0.122, -0.579, 0.298, 0.226, 0.36, -0.559, 0.585, -0.262, 0.21, 0.532, -0.099, 0.487, 0.231, -0.068, -0.0781, -0.476, 0.294, -0.156, 0.0608, -0.48, 0.705, -0.0549, 0.00433, 0.00182, 0.563, 0.58, 0.0307, 0.585, 0.176, 0.0598, 0.0551, 0.632, 0.472, 0.307, -0.559, 0.338, -0.169], dtype=float32)

In [103]:
model.wv['good'].shape

(100,)

## 🔹 What is AvgWord2Vec?
- When using Word2Vec, each word is represented by a vector. 
- But for many NLP tasks (like sentiment analysis, classification, etc.), you need a single vector per sentence or document — not per word.

👉 AvgWord2Vec (Average Word2Vec) solves this by averaging all word vectors in a sentence/document.


In [104]:
# Function to compute AvgWord2Vec for a sentence
def avg_word2vec(words, model):
    vectors = [model.wv[w] for w in words if w in model.wv.key_to_index]
    if len(vectors) == 0:
        return np.zeros(model.vector_size)
    return np.mean(vectors, axis=0)

In [105]:
# Wrap your iterable with tqdm to visualize progress
# avg_vectors will be independency features
avg_vectors = np.array([avg_word2vec(word, model) for word in tqdm(words, desc="Computing AvgWord2Vec")])
avg_vectors

Computing AvgWord2Vec: 100%|██████████| 5564/5564 [00:00<00:00, 53895.82it/s]


array([[-0.157, 0.231, 0.0545, 0.0379, 0.014, -0.358, 0.155, 0.471, -0.133, -0.101, -0.192, -0.261, -0.0806, 0.047, 0.111, -0.237, 0.0436, -0.311, 0.0195, -0.478, 0.148, 0.115, 0.107, -0.0593, -0.0814, 0.0204, -0.194, -0.224, -0.196, 0.159, ..., -0.144, 0.108, 0.285, -0.0615, 0.26, 0.118, -0.0404, -0.0345, -0.251, 0.152, -0.0813, 0.0245, -0.261, 0.372, -0.021, 0.00169, 0.00176, 0.294, 0.311, 0.0156, 0.306, 0.0912, 0.0336, 0.026, 0.342, 0.253, 0.158, -0.296, 0.182, -0.0853],
       [-0.136, 0.196, 0.0467, 0.0324, 0.0118, -0.305, 0.134, 0.399, -0.119, -0.0885, -0.168, -0.223, -0.0753, 0.0293, 0.093, -0.204, 0.0375, -0.265, 0.0165, -0.41, 0.124, 0.0984, 0.0958, -0.0502, -0.0722, 0.0151, -0.168, -0.196, -0.168, 0.143, ..., -0.122, 0.0904, 0.249, -0.05, 0.222, 0.106, -0.0403, -0.0267, -0.213, 0.129, -0.0636, 0.0233, -0.224, 0.312, -0.0177, -0.000138, 0.00194, 0.253, 0.265, 0.014, 0.267, 0.0776, 0.0262, 0.0205, 0.291, 0.221, 0.131, -0.258, 0.157, -0.0728],
       [-0.166, 0.249, 0.0597, 0.04

Each vector will be of 100 dimensions 

In [106]:
avg_vectors[0].shape

(100,)

Dependent feature are my output feature

In [107]:
# 1) Label encoding: convert "ham"/"spam" to numeric values (0/1)
#    - Input: messages DataFrame (must have 'label' column)
#    - Output: y (numpy array of 0/1)
from sklearn.preprocessing import LabelEncoder

# 1) Initialize label encoder
labelencoder = LabelEncoder()

# 2) Fit and transform the 'label' column
#    - 'ham' -> 0, 'spam' -> 1 (confirm mapping after transformation)
y = labelencoder.fit_transform(messages['label'])

# 3) Print mapping for clarity
mapping = dict(zip(labelencoder.classes_, labelencoder.transform(labelencoder.classes_)))
print("# 1) Label mapping:", mapping)
# 4) Optional quick sanity check
print("# 2) Example labels (first 10):", y[:10])

print(messages.shape)
print(avg_vectors.shape)
print(y.shape)

# 1) Label mapping: {'ham': np.int64(0), 'spam': np.int64(1)}
# 2) Example labels (first 10): [0 0 1 0 0 1 0 0 1 1]
(5572, 2)
(5564, 100)
(5572,)


Here we have a problem:
- Original data set has 5572 rows and 2 columns
- In the compiled data we have 5564 and 100 columns 
- Output feature/dependent feature is also having 5572

This mismatch happened because some records got cleaned out (empty after stopword removal).

In [108]:
[[i, j, k] for i, j, k in zip(map(len, corpus), corpus, messages['message']) if i < 1]

[[0, '', 'What you doing?how are you?'],
 [0, '', 'Where @'],
 [0, '', '645'],
 [0, '', 'Can a not?'],
 [0, '', ':) '],
 [0, '', 'What you doing?how are you?'],
 [0, '', ':( but your not here....'],
 [0, '', ':-) :-)']]

Aforementioned words were removed during preprocessing

What happened here? 
 - During preprocessing after applying stop words and regex, these data are cleaned up, which is why we’re seeing empty corpus entries.


Now we need to find out with respect to `y` if the length is less than 1

In [109]:
# Now we need to find out with respect to `y` if the length is less than 1
dependent_feature = messages[list(map(lambda x: len(x) > 0, corpus))]
y = labelencoder.fit_transform(dependent_feature['label'])
y.shape

(5564,)

In [110]:
X = [avg_word2vec(word, model) for word in tqdm(words, desc="Computing AvgWord2Vec")]

Computing AvgWord2Vec: 100%|██████████| 5564/5564 [00:00<00:00, 96008.64it/s]


final independent feature 

In [111]:
## final independent feature 
data_frame = pd.DataFrame([X[i].reshape(-1) for i in tqdm(range(len(X)), desc='Preparing final independent feature')])
data_frame['Output'] = y
data_frame.dropna(axis=0, inplace=True)

data_frame

Preparing final independent feature: 100%|██████████| 5564/5564 [00:00<00:00, 3075528.13it/s]


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,91,92,93,94,95,96,97,98,99,Output
0,-0.157334,0.231478,0.054548,0.037908,0.013971,-0.358092,0.154592,0.471237,-0.133182,-0.100510,...,0.091194,0.033592,0.025964,0.341959,0.252728,0.157875,-0.296483,0.182414,-0.085294,0
1,-0.136302,0.196414,0.046676,0.032406,0.011828,-0.304715,0.133966,0.398854,-0.118595,-0.088550,...,0.077604,0.026193,0.020480,0.290560,0.221192,0.130648,-0.257536,0.157145,-0.072819,0
2,-0.166107,0.248569,0.059688,0.049280,0.012988,-0.390098,0.171635,0.506940,-0.147730,-0.115899,...,0.094544,0.036307,0.027408,0.362933,0.271190,0.162409,-0.332157,0.192313,-0.088785,1
3,-0.222099,0.338481,0.078117,0.055345,0.018697,-0.515366,0.221863,0.680702,-0.196731,-0.143107,...,0.130452,0.044453,0.028128,0.494617,0.365457,0.226410,-0.435598,0.259261,-0.126049,0
4,-0.136360,0.204481,0.044003,0.033497,0.014224,-0.313901,0.134687,0.407574,-0.115615,-0.087248,...,0.078106,0.026717,0.018206,0.295643,0.217678,0.138141,-0.256968,0.153422,-0.076208,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5559,-0.190687,0.280248,0.069520,0.051760,0.015692,-0.442797,0.192259,0.561211,-0.169946,-0.132113,...,0.109515,0.046782,0.026222,0.411504,0.310802,0.178489,-0.371609,0.216841,-0.096981,1
5560,-0.194991,0.285340,0.071474,0.046796,0.011904,-0.436880,0.185010,0.581288,-0.165172,-0.124629,...,0.109004,0.037863,0.023428,0.418045,0.313544,0.196232,-0.364211,0.217398,-0.102199,0
5561,-0.071459,0.092147,0.021687,0.013530,-0.001330,-0.155038,0.064328,0.195680,-0.059198,-0.032733,...,0.042645,0.004502,0.011602,0.149328,0.101702,0.073114,-0.117785,0.069379,-0.044173,0
5562,-0.177007,0.268251,0.065754,0.053271,0.016157,-0.416399,0.181654,0.544188,-0.155706,-0.124629,...,0.107582,0.039769,0.033074,0.393773,0.294817,0.178323,-0.348449,0.202476,-0.099983,0


In [112]:
# ---- Train-test split (keep RAW TEXTS) ----
X = data_frame
y = data_frame['Output']
data_frame = data_frame.drop('Output', axis=1, inplace=True)

from sklearn.model_selection import train_test_split
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [113]:
# Print sizes to verify split
print(f"# raw train size: {len(X_train_raw)}")
print(f"# raw test  size: {len(X_test_raw)}")

# raw train size: 4451
# raw test  size: 1113


In [114]:
X.isnull().sum()

0     0
1     0
2     0
3     0
4     0
     ..
95    0
96    0
97    0
98    0
99    0
Length: 100, dtype: int64

In [115]:
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier()
classifier.fit(X_train_raw, y_train)

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Predict on the test set and evaluate performance
y_pred = classifier.predict(X_test_raw)

acc = accuracy_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)
print(f"# Test Accuracy: {acc:.4f}")
print("# Confusion Matrix (rows=true, cols=pred):\n", cm)

print("# Classification Report (named classes):")
print(classification_report(y_test, y_pred, target_names=labelencoder.classes_, zero_division=0))

# ---- Show some TEST samples with predictions ----
print("\n# Sample Test Predictions")
print("------------------------")
y_test_arr = y_test.values  # numpy array aligned with X_test_raw.iloc order if split used arrays

N = min(20, len(X_test_raw))  # limit output to at most 20 samples
for i in range(N):
    # Use the raw cleaned string for display
    text = X_test_raw[i]

    # Extract the corresponding 2D row from the sparse test matrix so it can be fed to predict().
    # Keep shape (1, n_features) with slicing [i:i+1].
    vec = X_test_raw.iloc[i:i+1]

    # Predict single sample and map numeric class to a human-friendly label.
    pred = classifier.predict(vec)[0]
    pred_label = 'SPAM 🚨' if pred == 1 else 'HAM ✅'
    true_label = 'SPAM' if y_test_arr[i] == 1 else 'HAM'

    # Print a truncated text for readability, and the prediction vs actual
    print(f"Text: {text[:80]}...")
    print(f"Pred : {pred_label} | Actual: {true_label}")
    print("-" * 80)

# Test Accuracy: 0.9641
# Confusion Matrix (rows=true, cols=pred):
 [[956   8]
 [ 32 117]]
# Classification Report (named classes):
              precision    recall  f1-score   support

         ham       0.97      0.99      0.98       964
        spam       0.94      0.79      0.85       149

    accuracy                           0.96      1113
   macro avg       0.95      0.89      0.92      1113
weighted avg       0.96      0.96      0.96      1113


# Sample Test Predictions
------------------------
Text: 3816   -0.225146
591    -0.159704
2164   -0.136492
3953   -0.223628
1536   -0.108854
          ...   
3864   -0.227368
1362   -0.222385
4807   -0.258006
995    -0.189272
3260   -0.212906
Name: 0, Length: 80, dtype: float64...
Pred : HAM ✅ | Actual: HAM
--------------------------------------------------------------------------------
Text: 3816    0.336201
591     0.231810
2164    0.205612
3953    0.335744
1536    0.166236
          ...   
3864    0.335488
1362    0.323332
4807   

In [117]:
# ---- Test with a new message (like TF-IDF style) ----
new_text = "WIN a FREE prize! Click here now."
tokens = [w for w in simple_preprocess(re.sub('[^a-zA-Z]', ' ', new_text).lower()) if w not in stop_words]
new_vec = np.mean([model.wv[w] for w in tokens if w in model.wv.key_to_index], axis=0).reshape(1, -1)
pred = classifier.predict(new_vec)[0]
print("\n# New Text Prediction:", "SPAM 🚨" if pred == 1 else "HAM ✅")



# New Text Prediction: SPAM 🚨
