In [1]:
# ! pip install numpy
# !pip install pandas
# !pip install nltk
# !pip install scikit-learn
# !pip install gensim
# !pip install tqdm

## **Spam or Ham Classifier Project**

### **Importing the Dataset**

In [2]:
import pandas as pd
import numpy as np

messages = pd.read_csv(
    '../NLP-for-Transformers/Datum/SMSSpamCollection',
    sep='\t',
    names=['label', 'message']
)

messages

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will Ã¼ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [3]:
print(messages['message'],'\n')

print(messages['message'].loc[100],'\n')
print(messages['message'].loc[451])

'''
1. messages refers to the pandas DataFrame that contains the dataset.

2. ['message'] selects the "message" column from the DataFrame,
   which contains the SMS/text content.

3. .loc[451] selects the row with index label 451.

4. This returns the text message present at index 451 in the dataset.

5. It is commonly used to inspect or view a specific sample message
   from the corpus for understanding or debugging.
''';

0       Go until jurong point, crazy.. Available only ...
1                           Ok lar... Joking wif u oni...
2       Free entry in 2 a wkly comp to win FA Cup fina...
3       U dun say so early hor... U c already then say...
4       Nah I don't think he goes to usf, he lives aro...
                              ...                        
5567    This is the 2nd time we have tried 2 contact u...
5568                 Will Ã¼ b going to esplanade fr home?
5569    Pity, * was in mood for that. So...any other s...
5570    The guy did some bitching but I acted like i'd...
5571                           Rofl. Its true to its name
Name: message, Length: 5572, dtype: object 

Please don't text me anymore. I have nothing else to say. 

hanks lotsly!


### **Step-1: Text Preprocessing** 
    1. Tokenization, 
    2. StopWords, 
    3. Stemmming, 
    4. Lemmatization, 
    5. NLTK

In [4]:
# Data cleaning and preprocessing

import re
import nltk
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/codespace/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/codespace/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [5]:
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer # See the Capital letters

ps = PorterStemmer()

In [6]:
stop_words = set(stopwords.words('english')) # List to Set
'''
The final corpus contains all messages after:
    - Cleaning
    - Lowercasing
    - Stopword removal
    - Stemming
'''

corpus = [] # Group of Sentences
for i in range(len(messages)): # for each sentence in that message
    review = re.sub('[^a-zA-Z]', ' ', messages['message'][i])
    review = review.lower().split()

    cleaned_words = [] # Group of Individual Words
    for word in review: # for each word in that sentence
        if word not in stop_words:
            s_word = ps.stem(word)
            cleaned_words.append(s_word) # Adds the Stemmed word to the Sentence List

    Sentence = ' '.join(cleaned_words) # Forming the sentance from Cleaned Words
    corpus.append(Sentence) # Adds the cleaned sentence to the corpus List 

corpus

'''
1. corpus = [] initializes an empty list to store the cleaned and processed text data.

2. stop_words stores the set of English stopwords from NLTK,
   which will be removed from the text during preprocessing.

3. The for loop iterates over each message in the dataset.

4. re.sub('[^a-zA-Z]', ' ', messages['message'][i])
   - Replaces all non-alphabetic characters with spaces.
   - Removes numbers, punctuation, and special symbols.

5. review.lower().split()
   - Converts text to lowercase.
   - Splits the sentence into individual words (tokens).

6. review1 = [] initializes a list to store processed words for the current message.

7. For each word in review:
   - If the word is not in stop_words,
   - Apply Porter stemming using ps.stem(word),
   - Append the stemmed word to review1.
''';

corpus

['go jurong point crazi avail bugi n great world la e buffet cine got amor wat',
 'ok lar joke wif u oni',
 'free entri wkli comp win fa cup final tkt st may text fa receiv entri question std txt rate c appli',
 'u dun say earli hor u c alreadi say',
 'nah think goe usf live around though',
 'freemsg hey darl week word back like fun still tb ok xxx std chg send rcv',
 'even brother like speak treat like aid patent',
 'per request mell mell oru minnaminungint nurungu vettam set callertun caller press copi friend callertun',
 'winner valu network custom select receivea prize reward claim call claim code kl valid hour',
 'mobil month u r entitl updat latest colour mobil camera free call mobil updat co free',
 'gonna home soon want talk stuff anymor tonight k cri enough today',
 'six chanc win cash pound txt csh send cost p day day tsandc appli repli hl info',
 'urgent week free membership prize jackpot txt word claim c www dbuk net lccltd pobox ldnw rw',
 'search right word thank breather

#### **Applying OHE for the y lable**

In [7]:
y = pd.get_dummies(messages['label']) # Dummies --> OHE.
y = y.iloc[:,1] # selecting the 2nd column of the spam,ham

'''
0. get_dummies is a pandas function used to convert categorical values into numeric (one-hot encoded) columns.

1. pd.get_dummies(messages['label']) converts the text labels (like "ham" and "spam")
   into separate binary columns (one-hot encoding).

2. Each label becomes a column with values 0 or 1.
   Example:
      "ham"  â†’ [1, 0]
      "spam" â†’ [0, 1]

3. y.iloc[:, 1] selects the second column from the dummy DataFrame,
   which usually corresponds to the "spam" class.

4. .values converts the selected column into a NumPy array.

5. The final target vector y becomes:
      spam â†’ 1
      ham  â†’ 0

6. This numeric y is used as the output/label variable
   for training the machine learning classification model.
'''

# y = y.astype(int)
y.values

array([False, False,  True, ..., False, False, False], shape=(5572,))

### **Step-2: Text --> Vectors** 
**BoW, TF-IDF, Word2Vec, AvgWord2Vec â†’ all are text vectorization / embedding methods.**

    1.BoW, 
    2. TF-IDF, 
    3. Word2Vec, 
    4. AvgWord2Vec

### **Vectorization: (BoW, TF-IDF, Word2Vec, AvgWord2Vec)**

#### **1. Creating the Bag Of Words Model for Vectorization**

In [8]:
from sklearn.feature_extraction.text import CountVectorizer

# Using CountVectorizer for BoW Vectors
cv = CountVectorizer(max_features = 2500) # Implement the Bag of Words
# cv = CountVectorizer(max_features = 2500, binary = True, ngram_range = (1,2))
X = cv.fit_transform(corpus)

X = X.toarray()

'''
0. These below lines implement the Bag of Words (BoW) model on your text data.

1. CountVectorizer is a tool from scikit-learn that converts raw text into 
numerical feature vectors using the Bag of Words (BoW) model.

2. This creates a CountVectorizer object.
    What it does:
    Converts text â†’ numeric vectors
    Builds a vocabulary from your corpus
    Represents each document by word counts

3. Keeps only the top 2500 most frequent words in the corpus.

4. x = cv.fit_transform(corpus)
This does two things:
    ðŸ”¹ fit(corpus)
        Scans all documents in corpus
        Learns the vocabulary (up to 2500 words)
        Assigns each word an index (column)

    ðŸ”¹ transform(corpus)
        Converts each document into a vector
        Each vector length = vocabulary size (â‰¤ 2500)
        Values = word counts in that document
'''

X

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], shape=(5572, 2500))

In [9]:
# Train Test split
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)

In [10]:
from sklearn.naive_bayes import MultinomialNB

spam_detect_model = MultinomialNB()
spam_detect_model.fit(X_train, y_train)

'''
1. from sklearn.naive_bayes import MultinomialNB
   - Imports the Multinomial Naive Bayes classifier from scikit-learn.
   - This algorithm is commonly used for text classification problems
     such as spam detection.

2. MultinomialNB()
   - Creates an instance of the Naive Bayes model.
   - It is designed to work with discrete features like
     word counts or TF-IDF values.

3. .fit(X_train, y_train)
   - Trains (fits) the model on the training data.
   - X_train contains the feature vectors for the messages.
   - y_train contains the corresponding labels
     (e.g., 0 = ham, 1 = spam).

4. spam_detect_model
   - Stores the trained classifier.
   - This model can now be used to make predictions on new data
     using spam_detect_model.predict().
''';

In [11]:
# Prediction

y_pred = spam_detect_model
y_pred = y_pred.predict(X_test)

'''
1. spam_detect_model.predict(X_test)
   - Uses the trained Naive Bayes model to make predictions.
   - X_test contains the feature vectors of unseen/test messages.
   - The model predicts the class label for each message.

2. The result is stored in y_pred.
   - y_pred is a NumPy array of predicted labels.
   - Each value represents the predicted class:
       0 â†’ ham (not spam)
       1 â†’ spam
''';

In [12]:
from sklearn.metrics import accuracy_score

accuracyScore = accuracy_score(y_test, y_pred)
print(accuracyScore)

'''
1. from sklearn.metrics import accuracy_score
   - Imports the accuracy_score function from scikit-learn.
   - It is used to evaluate classification models.

2. accuracy_score(y_test, y_pred)
   - Compares the true labels (y_test) with the predicted labels (y_pred).
   - Computes the accuracy as:
       (Number of correct predictions) / (Total predictions)

5. This accuracy indicates how well the spam detection model
   correctly classifies messages as spam or ham.
''';


0.9865470852017937


In [13]:
from sklearn.metrics import classification_report

classy_report = classification_report(y_test, y_pred)
print(classy_report)

              precision    recall  f1-score   support

       False       0.99      0.99      0.99       955
        True       0.95      0.96      0.95       160

    accuracy                           0.99      1115
   macro avg       0.97      0.97      0.97      1115
weighted avg       0.99      0.99      0.99      1115



#### **2. Creating the TF-IDF Model for Vectorization**

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Using TfidfVectorizer for TF-IDF Vectors
tv = TfidfVectorizer(max_features = 2500)
# tv = TfidfVectorizer(max_features = 2500, ngram_range = (1,2))
X = tv.fit_transform(corpus)
# X.toarray()
# y

In [15]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)

In [16]:
from sklearn.naive_bayes import MultinomialNB

spam_detect_model = MultinomialNB()
spam_detect_model.fit(X_train, y_train)

0,1,2
,"alpha  alpha: float or array-like of shape (n_features,), default=1.0 Additive (Laplace/Lidstone) smoothing parameter (set alpha=0 and force_alpha=True, for no smoothing).",1.0
,"force_alpha  force_alpha: bool, default=True If False and alpha is less than 1e-10, it will set alpha to 1e-10. If True, alpha will remain unchanged. This may cause numerical errors if alpha is too close to 0. .. versionadded:: 1.2 .. versionchanged:: 1.4  The default value of `force_alpha` changed to `True`.",True
,"fit_prior  fit_prior: bool, default=True Whether to learn class prior probabilities or not. If false, a uniform prior will be used.",True
,"class_prior  class_prior: array-like of shape (n_classes,), default=None Prior probabilities of the classes. If specified, the priors are not adjusted according to the data.",


In [17]:
y_pred = spam_detect_model
y_pred = y_pred.predict(X_test)

In [18]:
from sklearn.metrics import accuracy_score

accuracyScore = accuracy_score(y_test,y_pred)
print(accuracyScore)

0.979372197309417


In [19]:
from sklearn.metrics import classification_report

classy_report = classification_report(y_test,y_pred)
print(classy_report)

              precision    recall  f1-score   support

       False       0.98      1.00      0.99       955
        True       1.00      0.86      0.92       160

    accuracy                           0.98      1115
   macro avg       0.99      0.93      0.96      1115
weighted avg       0.98      0.98      0.98      1115



#### **Sample Usage of "RandomForestClassifier" insted of "NaiveBayes"**

#### **RandomForestClassifier**

In [20]:
from sklearn.ensemble import RandomForestClassifier

# Initialising
rf = RandomForestClassifier()

# Training
rf.fit(X_train, y_train) # not fit_transform

# Predicting
y_pred = rf.predict(X_test)

# Accuracy
acc = accuracy_score(y_test, y_pred)
print(acc)

clss = classification_report(y_test, y_pred)
print(clss)

0.9856502242152466
              precision    recall  f1-score   support

       False       0.98      1.00      0.99       955
        True       1.00      0.90      0.95       160

    accuracy                           0.99      1115
   macro avg       0.99      0.95      0.97      1115
weighted avg       0.99      0.99      0.99      1115



### ðŸ§  **Word2Vec Implementation**

Word2Vec is a technique to learn dense vector representations (embeddings) for words based on their context.

It has **two main architectures**:
1. **Skip-gram** â†’ predicts surrounding context words given a target word.  
2. **CBOW (Continuous Bag of Words)** â†’ predicts the target word from surrounding context words.
---
1. **Pretrained Models**  
   - Example: **Google News Word2Vec (300-dimensional vectors)**  
   - Trained on massive corpora.  
   - Ready to use and often give strong results.  

2. **Fine-tuning / Training from Scratch**  
   - Train Word2Vec on your own dataset.  
   - Useful when domain-specific vocabulary is important  
     (e.g., medical, legal, social media text).  

#### **3. Creating the Word2Vec Model for Vectorization**

In [21]:
from gensim.models import Word2Vec

In [22]:
import nltk

nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /home/codespace/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /home/codespace/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [23]:
import gensim.downloader as api

wv = api.load("word2vec-google-news-300") # This is just Collecting the dataset

'''
1. import gensim.downloader as api, which provides access to pre-trained NLP models and datasets.

2. api.load("word2vec-google-news-300")
   - "word2vec-google-news-300" is trained on Google News data.

3. The model produces word vectors of size 300.
   - Each word is represented as a 300-dimensional dense vector.
   - These vectors capture semantic meaning (similar words have similar vectors).

4. w2v
   - Stores the loaded Word2Vec model.
   - You can access vectors like:
       w2v['king'], w2v['computer'], etc.
''';

In [24]:
vec_king = wv['king']
print(len(vec_king))
print(vec_king)

300
[ 1.25976562e-01  2.97851562e-02  8.60595703e-03  1.39648438e-01
 -2.56347656e-02 -3.61328125e-02  1.11816406e-01 -1.98242188e-01
  5.12695312e-02  3.63281250e-01 -2.42187500e-01 -3.02734375e-01
 -1.77734375e-01 -2.49023438e-02 -1.67968750e-01 -1.69921875e-01
  3.46679688e-02  5.21850586e-03  4.63867188e-02  1.28906250e-01
  1.36718750e-01  1.12792969e-01  5.95703125e-02  1.36718750e-01
  1.01074219e-01 -1.76757812e-01 -2.51953125e-01  5.98144531e-02
  3.41796875e-01 -3.11279297e-02  1.04492188e-01  6.17675781e-02
  1.24511719e-01  4.00390625e-01 -3.22265625e-01  8.39843750e-02
  3.90625000e-02  5.85937500e-03  7.03125000e-02  1.72851562e-01
  1.38671875e-01 -2.31445312e-01  2.83203125e-01  1.42578125e-01
  3.41796875e-01 -2.39257812e-02 -1.09863281e-01  3.32031250e-02
 -5.46875000e-02  1.53198242e-02 -1.62109375e-01  1.58203125e-01
 -2.59765625e-01  2.01416016e-02 -1.63085938e-01  1.35803223e-03
 -1.44531250e-01 -5.68847656e-02  4.29687500e-02 -2.46582031e-02
  1.85546875e-01  4.4

In [25]:
from nltk.stem import WordNetLemmatizer # To simply use the Lemmatixation..!

lm = WordNetLemmatizer()

In [35]:
from nltk import sent_tokenize
from gensim.utils import simple_preprocess # simple_preprocess will simply convert to lowercase..!

words=[]
for sent in corpus:
    sent_token = sent_tokenize(sent)
    for sent in sent_token:
        words.append(simple_preprocess(sent))

print(len(words))
words

5564


[['go',
  'jurong',
  'point',
  'crazi',
  'avail',
  'bugi',
  'great',
  'world',
  'la',
  'buffet',
  'cine',
  'got',
  'amor',
  'wat'],
 ['ok', 'lar', 'joke', 'wif', 'oni'],
 ['free',
  'entri',
  'wkli',
  'comp',
  'win',
  'fa',
  'cup',
  'final',
  'tkt',
  'st',
  'may',
  'text',
  'fa',
  'receiv',
  'entri',
  'question',
  'std',
  'txt',
  'rate',
  'appli'],
 ['dun', 'say', 'earli', 'hor', 'alreadi', 'say'],
 ['nah', 'think', 'goe', 'usf', 'live', 'around', 'though'],
 ['freemsg',
  'hey',
  'darl',
  'week',
  'word',
  'back',
  'like',
  'fun',
  'still',
  'tb',
  'ok',
  'xxx',
  'std',
  'chg',
  'send',
  'rcv'],
 ['even', 'brother', 'like', 'speak', 'treat', 'like', 'aid', 'patent'],
 ['per',
  'request',
  'mell',
  'mell',
  'oru',
  'minnaminungint',
  'nurungu',
  'vettam',
  'set',
  'callertun',
  'caller',
  'press',
  'copi',
  'friend',
  'callertun'],
 ['winner',
  'valu',
  'network',
  'custom',
  'select',
  'receivea',
  'prize',
  'reward',
  

In [None]:
from gensim.models import Word2Vec

model = Word2Vec(words,window=5,min_count=2) # Here Vector_size=100 and epochs=5 are default values..!
model.wv.index_to_key

['call',
 'go',
 'get',
 'ur',
 'gt',
 'lt',
 'come',
 'day',
 'ok',
 'free',
 'know',
 'love',
 'like',
 'time',
 'good',
 'want',
 'got',
 'text',
 'send',
 'txt',
 'need',
 'one',
 'today',
 'take',
 'stop',
 'see',
 'home',
 'think',
 'repli',
 'lor',
 'sorri',
 'tell',
 'still',
 'mobil',
 'back',
 'da',
 'dont',
 'make',
 'phone',
 'pleas',
 'week',
 'hi',
 'say',
 'new',
 'work',
 'later',
 'pl',
 'hope',
 'miss',
 'ask',
 'co',
 'msg',
 'min',
 'meet',
 'dear',
 'night',
 'messag',
 'happi',
 'wait',
 'well',
 'give',
 'thing',
 'tri',
 'much',
 'great',
 'oh',
 'claim',
 'wat',
 'hey',
 'number',
 'thank',
 'friend',
 'way',
 'ye',
 'www',
 'let',
 'prize',
 'feel',
 'right',
 'even',
 'tomorrow',
 'win',
 'pick',
 'alreadi',
 'tone',
 'care',
 'said',
 'cash',
 'im',
 'amp',
 'leav',
 'yeah',
 'realli',
 'babe',
 'find',
 'life',
 'morn',
 'sleep',
 'last',
 'keep',
 'servic',
 'year',
 'uk',
 'nokia',
 'sure',
 'would',
 'buy',
 'anyth',
 'com',
 'use',
 'contact',
 'start',

In [None]:
print(model.corpus_count)
print(model.epochs)
print(model.wv.index_to_key) # wv is the pretrained model from google-300
print(model.wv['kid'])
print(model.wv['kid'].shape)

5564
5
['call', 'go', 'get', 'ur', 'gt', 'lt', 'come', 'day', 'ok', 'free', 'know', 'love', 'like', 'time', 'good', 'want', 'got', 'text', 'send', 'txt', 'need', 'one', 'today', 'take', 'stop', 'see', 'home', 'think', 'repli', 'lor', 'sorri', 'tell', 'still', 'mobil', 'back', 'da', 'dont', 'make', 'phone', 'pleas', 'week', 'hi', 'say', 'new', 'work', 'later', 'pl', 'hope', 'miss', 'ask', 'co', 'msg', 'min', 'meet', 'dear', 'night', 'messag', 'happi', 'wait', 'well', 'give', 'thing', 'tri', 'much', 'great', 'oh', 'claim', 'wat', 'hey', 'number', 'thank', 'friend', 'way', 'ye', 'www', 'let', 'prize', 'feel', 'right', 'even', 'tomorrow', 'win', 'pick', 'alreadi', 'tone', 'care', 'said', 'cash', 'im', 'amp', 'leav', 'yeah', 'realli', 'babe', 'find', 'life', 'morn', 'sleep', 'last', 'keep', 'servic', 'year', 'uk', 'nokia', 'sure', 'would', 'buy', 'anyth', 'com', 'use', 'contact', 'start', 'also', 'lol', 'everi', 'wish', 'urgent', 'look', 'smile', 'sent', 'end', 'show', 'someth', 'watch', 'a

In [29]:
print(model.wv.similar_by_word('hope'))

[('make', 0.9997361302375793), ('love', 0.9997239708900452), ('even', 0.9997162222862244), ('im', 0.999714732170105), ('well', 0.9997057318687439), ('realli', 0.999701976776123), ('much', 0.999701201915741), ('day', 0.9996955990791321), ('come', 0.9996946454048157), ('one', 0.999693751335144)]


#### **4. Creating the AvgWord2Vec Model for Vectorization**

In [30]:
def avg_word2vec(doc):
    """
    Compute the average Word2Vec embedding for a document.
    doc: list of tokens (words)
    """
    
    vectors = []  # to store word vectors found in the model
    
    for word in doc:
        # check if word exists in Word2Vec vocabulary
        if word in model.wv.index_to_key:
            vectors.append(model.wv[word])
    
    # if no words from doc are in vocabulary, return a zero vector
    if len(vectors) == 0:
        return np.zeros(model.vector_size)
    
    # compute and return the mean vector
    avg_vector = np.mean(vectors, axis=0)
    return avg_vector

**Summing along rows**

```
vectors =
[
  [0.2, 0.1, 0.4, 0.3, 0.5],
  [0.6, 0.2, 0.1, 0.4, 0.3],
  [0.5, 0.7, 0.2, 0.1, 0.4]
]
```

In [31]:
from tqdm import tqdm

print(words[73])
print(type(model.wv.index_to_key))


['perform']
<class 'list'>


In [32]:
# Apply for the entire sentences

X=[]
for i in tqdm(range(len(words))):
    # print("Hello",i)
    X.append(avg_word2vec(words[i]))

x.shape

100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 5564/5564 [00:00<00:00, 9196.06it/s]


NameError: name 'x' is not defined

In [None]:
# Words means list of words inside list sentences..!

words

In [None]:
X_new = np.array(X)
# Now we have X_new which has the weighted vectors for "AvgWord2Vec"..!  

print(type(X),'\n')

print(X_new[1],'\n')

print(X_new.shape)

### Train Test Split and Apply a Model to this..!