In [None]:
# !pip install pandas
# !pip install nltk
# !pip install scikit-learn

## **Spam or Ham Classifier Project**

### **Importing the Dataset**

In [None]:
import pandas as pd

# The dataset is tab-separated. `names=` is passed without `header=`, so the
# first line of the file is read as data and the two columns are named here.
DATASET_PATH = '../NLP-for-Transformers/Datum/SMSSpamCollection'
messages = pd.read_csv(DATASET_PATH, sep='\t', names=['label', 'message'])

messages

In [None]:
# Look at the text column as a whole, then at two individual rows picked by
# index label, to get a feel for what the raw SMS messages look like.
message_col = messages['message']

print(message_col, '\n')
print(message_col.loc[100], '\n')
print(message_col.loc[451])

'''
1. messages refers to the pandas DataFrame that contains the dataset.

2. ['message'] selects the "message" column from the DataFrame,
   which contains the SMS/text content.

3. .loc[451] selects the row with index label 451.

4. This returns the text message present at index 451 in the dataset.

5. It is commonly used to inspect or view a specific sample message
   from the corpus for understanding or debugging.
''';

### **Step-1: Text Preprocessing** 
    1. Tokenization, 
    2. StopWords, 
    3. Stemming, 
    4. Lemmatization, 
    5. NLTK

In [None]:
# Data cleaning and preprocessing

import re
import nltk
# Download the NLTK 'stopwords' resource; stopwords.words('english') is
# called on it later when the corpus is built.
nltk.download('stopwords')

In [None]:
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer # class name is CamelCase: capital P and capital S

# One shared stemmer instance; its stem() method is applied to each word
# when the corpus is built.
ps = PorterStemmer()

In [None]:
# Build the cleaned corpus: one normalised, stemmed string per SMS message.

# Compiled once instead of re-parsing the pattern for every message.
NON_LETTERS = re.compile('[^a-zA-Z]')

# A set gives constant-time membership tests inside the per-word filter.
stop_words = set(stopwords.words('english'))


def clean_message(text):
    """Normalise one raw message.

    Replaces every non-letter character with a space, lower-cases the text,
    splits it on whitespace, drops English stop words and stems the
    remaining words with the Porter stemmer `ps`.

    Parameters
    ----------
    text : str
        Raw message text.

    Returns
    -------
    str
        The surviving stemmed words joined by single spaces (may be empty).
    """
    words = NON_LETTERS.sub(' ', text).lower().split()
    return ' '.join(ps.stem(word) for word in words if word not in stop_words)


# Iterate over the column's values directly. Indexing with
# messages['message'][i] is a *label* lookup and only works while the frame
# still has its default 0..n-1 index (it breaks after filtering or sorting).
corpus = [clean_message(text) for text in messages['message']]

# Every message must yield exactly one corpus entry, in the same order.
assert len(corpus) == len(messages)

# Preview a few entries instead of dumping the whole list into the output.
corpus[:5]

### **Step-2: Text --> Vectors** 
    1. BoW, 
    2. TF-IDF, 
    3. Word2Vec, 
    4. AvgWord2Vec

### **Creating the Bag of Words model**

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag of Words: learn a vocabulary capped at 2500 terms from the cleaned
# corpus and turn every document into a dense vector of word counts.
cv = CountVectorizer(max_features=2500)
x = cv.fit_transform(corpus).toarray()  # sparse matrix -> dense NumPy array

'''
1. The lines above implement the Bag of Words (BoW) model on the text data.

2. CountVectorizer(max_features=2500) creates a CountVectorizer object.
    What it does:
    Converts text -> numeric vectors
    Builds a vocabulary from the corpus
    Represents each document by word counts

3. max_features=2500 keeps only the 2500 most frequent words in the corpus.

4. cv.fit_transform(corpus) does two things:
    - fit(corpus)
        Scans all documents in corpus
        Learns the vocabulary (up to 2500 words)
        Assigns each word an index (column)

    - transform(corpus)
        Converts each document into a vector
        Each vector length = vocabulary size (<= 2500)
        Values = word counts in that document

5. .toarray() converts the sparse result into a dense NumPy array.
''';

In [None]:
# Target vector: one-hot encode the labels and keep the "spam" indicator.
# The column is selected by name rather than by position, so the result
# cannot silently point at the wrong class if the set of labels changes
# (a missing "spam" label raises a KeyError instead).
y = pd.get_dummies(messages['label'])['spam'].values

'''
1. pd.get_dummies(messages['label']) converts the text labels ("ham" and "spam")
   into separate indicator columns (one-hot encoding).

2. Each label becomes a column holding 0/1 (or False/True) values.
   Example:
      "ham"  -> [1, 0]
      "spam" -> [0, 1]

3. ['spam'] selects the indicator column for the "spam" class by name.

4. .values converts the selected column into a NumPy array.

5. The final target vector y becomes:
      spam -> 1
      ham  -> 0

6. This numeric y is used as the output/label variable
   for training the machine learning classification model.
''';