### Importing the libraries

In [1]:
# Importing necessary libraries
## %pip install matplotlib
## %pip install wordcloud
## %pip install nltk
import numpy as np                              # For numerical operations
import matplotlib.pyplot as plt                 # For data visualization
%matplotlib inline
import pandas as pd                             # For data manipulation and analysis

from wordcloud import WordCloud                 # For generating word clouds

import nltk                                     # For natural language processing
from nltk.corpus import stopwords               # For accessing stopwords


# Downloading the NLTK resources this notebook relies on (tokenizer data and the stopword list)
nltk.download('punkt')                          # Punkt tokenizer models
nltk.download('punkt_tab')                      # Punkt tokenizer data in the 'punkt_tab' packaging (presumably what newer NLTK releases load for word_tokenize; confirm against the installed version)
nltk.download('stopwords')                      # Stopword lists read through nltk.corpus.stopwords


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ravi\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\ravi\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ravi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

### Reading the Data

In [2]:
# Source of the spam dataset: a raw CSV file hosted on GitHub
DATA_URL = 'https://raw.githubusercontent.com/rkydx/datasets_repo/refs/heads/main/spam.csv'

# Load the CSV into a DataFrame
df = pd.read_csv(DATA_URL)

# Preview the first five rows
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


### Data Preprocessing - Data Cleaning

In [3]:
# Drop the three trailing unnamed columns (they hold no values in the preview rows).
# The result is reassigned instead of using inplace=True, and errors='ignore' is passed,
# so re-running this cell after the columns are already gone does not raise a KeyError.
df = df.drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], errors='ignore')
df.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
# Give the label column (v1) and the message column (v2) descriptive names
column_names = {'v1': 'target', 'v2': 'text'}
df = df.rename(columns=column_names)
df.head()

Unnamed: 0,target,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [5]:
# Count the rows flagged as duplicates of an earlier row (called with the default arguments, so all columns are compared)
df.duplicated().sum()

np.int64(403)

In [6]:
# Number of rows before the duplicates are removed
len(df)

5572

In [7]:
# Remove duplicate rows, keeping the first occurrence of each, then show the remaining row count
df = df.drop_duplicates(keep='first')
len(df)

5169

### Data Preprocessing - Encoding

In [8]:
#pip install scikit-learn
from sklearn.preprocessing import LabelEncoder

# Encoder that maps each distinct label string to an integer code
le = LabelEncoder()

# fit_transform learns the distinct labels and returns their integer codes in one step;
# the codes then overwrite the original 'target' column
encoded_target = le.fit_transform(df['target'])
df['target'] = encoded_target

df.head()

Unnamed: 0,target,text
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


### Data Preprocessing - Feature Engineering

In [9]:
# Importing the Porter Stemmer for text stemming, which reduces words to their root form
from nltk.stem.porter import PorterStemmer

# Importing the string module for handling special characters and punctuation
import string

# Single module-level PorterStemmer instance; transform_text calls ps.stem() on every kept token
ps = PorterStemmer()

In [10]:
# English stopword list from NLTK; transform_text drops any token found in it
stop_words = stopwords.words('english')
# Printed in full for inspection
print(stop_words)

['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren', "aren't", 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can', 'couldn', "couldn't", 'd', 'did', 'didn', "didn't", 'do', 'does', 'doesn', "doesn't", 'doing', 'don', "don't", 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', 'hadn', "hadn't", 'has', 'hasn', "hasn't", 'have', 'haven', "haven't", 'having', 'he', "he'd", "he'll", 'her', 'here', 'hers', 'herself', "he's", 'him', 'himself', 'his', 'how', 'i', "i'd", 'if', "i'll", "i'm", 'in', 'into', 'is', 'isn', "isn't", 'it', "it'd", "it'll", "it's", 'its', 'itself', "i've", 'just', 'll', 'm', 'ma', 'me', 'mightn', "mightn't", 'more', 'most', 'mustn', "mustn't", 'my', 'myself', 'needn', "needn't", 'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 're', 's', 'same', 'shan', "shan't", 'she

In [11]:
# Text normalisation used to turn raw messages into model-ready strings
def transform_text(text):
    """Normalise a raw message into a space-separated string of stemmed tokens.

    Steps, in order: lowercase the text, delete every character listed in
    ``string.punctuation``, tokenize with ``nltk.word_tokenize``, discard
    tokens present in the module-level ``stop_words`` list, and stem the
    remaining tokens with the module-level Porter stemmer ``ps``.

    NOTE(review): punctuation is stripped before the stopword check, so a
    contraction such as "don't" is compared as "dont"; such tokens may slip
    past the stopword list. Confirm whether that is intended.
    """
    # Translation table that deletes all punctuation characters
    punctuation_table = str.maketrans('', '', string.punctuation)
    cleaned = text.lower().translate(punctuation_table)

    # Tokenize, then stem every token that is not a stopword
    stems = [
        ps.stem(token)
        for token in nltk.word_tokenize(cleaned)
        if token not in stop_words
    ]

    return " ".join(stems)


In [12]:
# Spot-check the preprocessing on a single sample message
transform_text("As per your request 'Melle Melle (Oru Minnaminunginte Nurungu Vettam)' has been set as your callertune for all Callers. Press *9 to copy your friends Callertune")

'per request mell mell oru minnaminungint nurungu vettam set callertun caller press 9 copi friend callertun'

In [13]:
# Preview the frame before the transformed_text column is added
df.head()

Unnamed: 0,target,text
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [14]:
# Run every message in 'text' through transform_text ...
transformed = df['text'].apply(transform_text)
# ... and store the cleaned strings in a new 'transformed_text' column
df['transformed_text'] = transformed

In [15]:
# Preview the frame with the new transformed_text column
df.head()

Unnamed: 0,target,text,transformed_text
0,0,"Go until jurong point, crazy.. Available only ...",go jurong point crazi avail bugi n great world...
1,0,Ok lar... Joking wif u oni...,ok lar joke wif u oni
2,1,Free entry in 2 a wkly comp to win FA Cup fina...,free entri 2 wkli comp win fa cup final tkt 21...
3,0,U dun say so early hor... U c already then say...,u dun say earli hor u c alreadi say
4,0,"Nah I don't think he goes to usf, he lives aro...",nah dont think goe usf live around though


In [16]:
# Feature Extraction using TF-IDF Vectorizer
# It converts text data into numerical format by calculating the importance of words in documents, considering their frequency and distribution.
# Higher weight to important words, lower weight to common words. 
# The resulting vectors can be used as input for machine learning models. 
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Cap the vocabulary at 500 features via max_features
# NOTE(review): scikit-learn documents max_features as keeping the terms with the highest
# term frequency across the corpus, not the highest TF-IDF "importance"; confirm the intent
tfid = TfidfVectorizer(max_features = 500)

In [17]:
# Learn the vocabulary and IDF weights from 'transformed_text', then vectorize every message.
# NOTE(review): the vectorizer is fitted on the full dataset before the train/test split,
# so test messages influence the learned vocabulary and weights; consider fitting on the
# training split only.
tfidf_matrix = tfid.fit_transform(df['transformed_text'])

# Dense feature matrix X (one row per message) and target vector y (encoded labels)
X = tfidf_matrix.toarray()
y = df['target'].values


### Data Preprocessing - Train-Test-Split

In [18]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; random_state pins the shuffle so the split is repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=2,
)

### Model Training
> The models compared below, and what each one is best suited for:
```
| Model                             | Category         | Best for                                       |
| --------------------------------- | ---------------- | ---------------------------------------------- |
| `LogisticRegression`              | Linear model     | Fast, works well for text classification       |
| `SVC` (Support Vector Classifier) | SVM              | Powerful for small/medium text data            |
| `MultinomialNB`                   | Naïve Bayes      | **Very good for text (spam classification)**   |
| `DecisionTreeClassifier`          | Tree model       | Simple but can overfit                         |
| `KNeighborsClassifier`            | Distance-based   | Not great for high-dim text data               |
| `RandomForestClassifier`          | Ensemble (trees) | Strong, stable, avoids overfitting             |
| `AdaBoostClassifier`              | Boosting         | Focuses on correcting errors                   |
| `BaggingClassifier`               | Ensemble         | Reduces variance, stable model                 |
| `ExtraTreesClassifier`            | Random Trees     | Faster than RandomForest                       |
| `GradientBoostingClassifier`      | Boosting         | Strong performance but slow                    |
| `XGBClassifier`                   | Extreme Boosting | **One of the best ML models for tabular data** |
```


In [None]:
# Imports of various machine learning models/algorithms
# %pip install xgboost
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, BaggingClassifier, ExtraTreesClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report


In [20]:
# Candidate classifiers keyed by the display name used in the training log.
# Most estimators pin random_state=2 so repeated runs give the same scores.
models = {
    "SVC": SVC(kernel="rbf", gamma="scale"),           # rbf is usually a better default
    "KNN": KNeighborsClassifier(),
    "MultinomialNB": MultinomialNB(),
    "DecisionTree": DecisionTreeClassifier(max_depth=5, random_state=2),
    "LogisticRegression": LogisticRegression(solver="liblinear", penalty="l1", random_state=2),
    "RandomForest": RandomForestClassifier(n_estimators=50, random_state=2, n_jobs=-1),
    "AdaBoost": AdaBoostClassifier(n_estimators=50, random_state=2),
    "Bagging": BaggingClassifier(n_estimators=50, random_state=2, n_jobs=-1),
    "ExtraTrees": ExtraTreesClassifier(n_estimators=50, random_state=2, n_jobs=-1),
    "GradientBoosting": GradientBoostingClassifier(n_estimators=50, random_state=2),
    # use_label_encoder is no longer passed: XGBoost reported it as an unused parameter
    "XGBoost": XGBClassifier(n_estimators=50, random_state=2, eval_metric='logloss', n_jobs=-1)
}

In [21]:
# Fit-and-score helper shared by every candidate model
def train_and_evaluate(clf, X_train, y_train, X_test, y_test):
    """Fit ``clf`` on the training split and score it on the test split.

    Returns a dict with exactly two keys, "accuracy" and "precision".
    Precision is computed with ``zero_division=0``. Recall, F1 and the
    confusion matrix are not computed.
    """
    clf.fit(X_train, y_train)
    predictions = clf.predict(X_test)
    return {
        "accuracy": accuracy_score(y_test, predictions),
        "precision": precision_score(y_test, predictions, zero_division=0),
    }

In [22]:
# Train and score each candidate in turn, collecting its metrics under the model's name.
# A failure in one model is reported and skipped so the remaining models still run.
results = {}
for name, clf in models.items():
    try:
        print(f"\nTraining {name} ...")
        metrics = train_and_evaluate(clf, X_train, y_train, X_test, y_test)
    except Exception as e:
        print(f"{name} failed: {e}")
    else:
        try:
            results[name] = metrics
            print(f"{name} - acc: {metrics['accuracy']:.4f}, prec: {metrics['precision']:.4f}")
        except Exception as e:
            print(f"{name} failed: {e}")


Training SVC ...
SVC - acc: 0.9729, prec: 0.9825

Training KNN ...
KNN - acc: 0.9284, prec: 1.0000

Training MultinomialNB ...
MultinomialNB - acc: 0.9710, prec: 0.9655

Training DecisionTree ...
DecisionTree - acc: 0.9352, prec: 0.8989

Training LogisticRegression ...
LogisticRegression - acc: 0.9662, prec: 0.9478

Training RandomForest ...
RandomForest - acc: 0.9720, prec: 0.9504

Training AdaBoost ...
AdaBoost - acc: 0.9246, prec: 0.8750

Training Bagging ...
Bagging - acc: 0.9632, prec: 0.9032

Training ExtraTrees ...
ExtraTrees - acc: 0.9758, prec: 0.9449

Training GradientBoosting ...
GradientBoosting - acc: 0.9497, prec: 0.9388

Training XGBoost ...
XGBoost - acc: 0.9710, prec: 0.9500


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
