In [None]:
# ! pip install numpy
# !pip install pandas
# !pip install nltk
# !pip install scikit-learn
# !pip install gensim

## **Spam or Ham Classifier Project**

### **Importing the Dataset**

In [None]:
import csv

import pandas as pd

# The dataset is a headerless, tab-separated file: one "<label>\t<message>"
# pair per line, so the column names are supplied explicitly.
#
# quoting=csv.QUOTE_NONE makes the parser treat '"' as ordinary text. With the
# default quoting rules, a message that begins with an unbalanced double quote
# is read as the start of a quoted field, which can swallow the following
# line(s) into a single row and silently drop samples.
messages = pd.read_csv(
    '../NLP-for-Transformers/Datum/SMSSpamCollection',
    sep='\t',
    names=['label', 'message'],
    quoting=csv.QUOTE_NONE,
)

messages

In [None]:
# Quick look at the raw text.
#   - First the whole "message" column (pandas abbreviates long Series).
#   - Then two individual SMS texts, picked by index label with .loc,
#     to see what a single sample looks like before any cleaning.
print(messages.loc[:, 'message'], '\n')

print(messages.loc[100, 'message'], '\n')
print(messages.loc[451, 'message'])

### **Step-1: Text Preprocessing** 
    1. Tokenization, 
    2. StopWords, 
    3. Stemming, 
    4. Lemmatization, 
    5. NLTK

In [None]:
# Data cleaning and preprocessing

import re  # regular expressions, used to strip non-letter characters
import nltk
# Fetch the NLTK "stopwords" resource that the stop-word list is read from.
nltk.download('stopwords')

In [None]:
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer # Mind the capitalisation: class `PorterStemmer`, module `porter`

# One stemmer instance, reused for every word during preprocessing.
ps = PorterStemmer()

In [None]:
# Build `corpus`: one cleaned string per SMS message. Each message is
#   1. stripped of every character that is not an ASCII letter
#      (digits, punctuation and symbols are replaced by spaces),
#   2. lowercased and split on whitespace into tokens,
#   3. filtered against the NLTK English stop-word list,
#   4. reduced to Porter stems with ps.stem(),
#   5. joined back into a single space-separated string.

stop_words = set(stopwords.words('english'))  # set -> constant-time membership tests
non_letters = re.compile('[^a-zA-Z]')         # compiled once, reused for every message


def clean_message(text):
    """Return `text` cleaned, lowercased, stop-word-filtered and stemmed.

    Parameters
    ----------
    text : str
        Raw SMS text.

    Returns
    -------
    str
        The remaining Porter stems joined by single spaces. Empty when every
        token was a stop word or the text contained no letters.
    """
    tokens = non_letters.sub(' ', text).lower().split()
    return ' '.join(ps.stem(token) for token in tokens if token not in stop_words)


# Iterate over the column's values instead of `messages['message'][i]`:
# that form looks rows up by index *label* using a positional counter, so it
# only works while the index is exactly 0..n-1 and otherwise raises KeyError
# or returns the wrong row.
corpus = [clean_message(text) for text in messages['message']]

# Show a small sample rather than dumping every message into the cell output.
corpus[:5]

#### **Applying OHE for the y label**

In [None]:
# Build the numeric target vector from the text labels:
#     spam -> 1
#     ham  -> 0
#
# pd.get_dummies one-hot encodes the label column, producing one indicator
# column per distinct label ("ham", "spam"). Only the "spam" indicator is kept,
# since the "ham" column is its exact complement.
#
# The column is selected by name rather than by position (`iloc[:, 1]`) so the
# target does not depend on the order of the dummy columns, and it is cast to
# int because recent pandas versions return boolean indicator columns by
# default.
y = pd.get_dummies(messages['label'])['spam'].astype(int)

# .values exposes the target as a NumPy array (0 = ham, 1 = spam).
y.values

### **Step-2: Text --> Vectors** 
    1. BoW, 
    2. TF-IDF, 
    3. Word2Vec, 
    4. AvgWord2Vec

### **Vectorization: (BoW / TF-IDF)**

#### **1. Creating the Bag Of Words Model for Vectorization**

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag of Words (BoW) vectorisation of the cleaned corpus.
#
# CountVectorizer turns raw text into numeric feature vectors:
#   - fit:       scans every document in `corpus`, learns the vocabulary and
#                assigns each retained word a column index;
#   - transform: converts each document into a vector whose length is the
#                vocabulary size and whose values are the word counts in
#                that document.
# max_features=2500 keeps only the 2500 most frequent words (so each vector
# has at most 2500 entries).
#
# NOTE(review): the vocabulary is learned from the full corpus before the
# train/test split in the next cell, so the test messages influence the
# feature space. Fitting on the training portion only would avoid this.
cv = CountVectorizer(max_features=2500)
# Alternative: binary presence flags over unigrams and bigrams.
# cv = CountVectorizer(max_features = 2500, binary = True, ngram_range = (1,2))

# fit_transform returns a sparse matrix; toarray() converts it to a dense array.
X = cv.fit_transform(corpus).toarray()

X

In [None]:
# Train Test split
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for evaluation; random_state pins the shuffle
# so the same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=0,
)

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Multinomial Naive Bayes is a standard classifier for text problems such as
# spam detection. It models discrete features like word counts (fractional
# values such as TF-IDF weights are also commonly used with it).
#
# fit(X_train, y_train) trains the model:
#   - X_train holds the feature vectors of the training messages,
#   - y_train holds the matching labels (0 = ham, 1 = spam).
# fit returns the fitted estimator itself, which is stored in
# spam_detect_model and later used via spam_detect_model.predict().
spam_detect_model = MultinomialNB().fit(X_train, y_train)

In [None]:
# Prediction

# Ask the trained Naive Bayes model for a class label for every held-out
# message in X_test. The result is an array of predicted labels:
#     0 -> ham (not spam)
#     1 -> spam
y_pred = spam_detect_model.predict(X_test)

In [None]:
from sklearn.metrics import accuracy_score

# accuracy_score compares the true labels (y_test) with the predictions
# (y_pred) and returns
#     (number of correct predictions) / (total predictions),
# i.e. how often the model classifies a message correctly as spam or ham.
accuracyScore = accuracy_score(y_true=y_test, y_pred=y_pred)
print(accuracyScore)


In [None]:
from sklearn.metrics import classification_report

# Per-class precision, recall, F1 and support for the BoW model.
classy_report = classification_report(y_true=y_test, y_pred=y_pred)
print(classy_report)

#### **2. Creating the TF-IDF Model for Vectorization**

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF vectorisation of the same cleaned corpus, capped at 2500 features.
# X is overwritten here and, unlike the BoW section, is left in the form
# returned by fit_transform (no .toarray() call).
tv = TfidfVectorizer(max_features=2500)
# Alternative: unigrams and bigrams.
# tv = TfidfVectorizer(max_features = 2500, ngram_range = (1,2))
X = tv.fit_transform(raw_documents=corpus)

In [None]:
from sklearn.model_selection import train_test_split

# Same 80/20 split and seed as the BoW experiment, now on the TF-IDF features.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=0,
)

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Fresh Naive Bayes model trained on the TF-IDF features; this replaces the
# BoW-trained model stored under the same name.
spam_detect_model = MultinomialNB()
spam_detect_model.fit(X=X_train, y=y_train)

In [None]:
# Predicted labels (0 = ham, 1 = spam) for the held-out TF-IDF vectors.
y_pred = spam_detect_model.predict(X_test)

In [None]:
from sklearn.metrics import accuracy_score

# Fraction of held-out messages the TF-IDF model labels correctly.
accuracyScore = accuracy_score(y_true=y_test, y_pred=y_pred)
print(accuracyScore)

In [None]:
from sklearn.metrics import classification_report

# Per-class precision, recall, F1 and support for the TF-IDF model.
classy_report = classification_report(y_true=y_test, y_pred=y_pred)
print(classy_report)

#### **Sample Usage of "RandomForestClassifier" instead of "NaiveBayes"**

#### **RandomForestClassifier**

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Initialising
# A random forest is a randomised estimator, so an unseeded model gives
# different scores on every run. random_state=0 matches the seed used for
# train_test_split and makes the reported numbers reproducible.
rf = RandomForestClassifier(random_state=0)

# Training
rf.fit(X_train, y_train) # not fit_transform

# Predicting
y_pred = rf.predict(X_test)

# Accuracy
acc = accuracy_score(y_test, y_pred)
print(acc)

# Per-class precision / recall / F1
clss = classification_report(y_test, y_pred)
print(clss)

### 🧠 **Word2Vec Implementation**

Word2Vec is a technique to learn dense vector representations (embeddings) for words based on their context.

It has **two main architectures**:
1. **Skip-gram** → predicts surrounding context words given a target word.  
2. **CBOW (Continuous Bag of Words)** → predicts the target word from surrounding context words.

---

There are two ways to obtain Word2Vec embeddings:

1. **Pretrained Models**  
   - Example: **Google News Word2Vec (300-dimensional vectors)**  
   - Trained on massive corpora.  
   - Ready to use and often give strong results.  

2. **Fine-tuning / Training from Scratch**  
   - Train Word2Vec on your own dataset.  
   - Useful when domain-specific vocabulary is important  
     (e.g., medical, legal, social media text).  

In [None]:
import gensim.downloader as api

