In this exercise, I will create a spam classifier. 
Take the data, apply cleaning steps such as tokenization, stemming, and stopword removal, and then compare the F1 score of different models.

Download the Spam Assassin email classification dataset from 'https://www.kaggle.com/datasets/ganiyuolalekan/spam-assassin-email-classification-dataset'. 
Create a folder named 'datasets' in your Project Root. (ignore if already created)
Place the 'spam_assassin.csv' in the datasets folder.

In [1]:
import os
import pandas as pd

# Relative location of the dataset: <working directory>/datasets/spam_assassin.csv
path = os.path.join("datasets", "spam_assassin.csv")
# Read the CSV into a DataFrame; later cells use its "text" and "target" columns.
spam_ds = pd.read_csv(path)
# Last expression of the cell, so the notebook displays the first rows as a preview.
spam_ds.head()

Unnamed: 0,text,target
0,From ilug-admin@linux.ie Mon Jul 29 11:28:02 2...,0
1,From gort44@excite.com Mon Jun 24 17:54:21 200...,1
2,From fork-admin@xent.com Mon Jul 29 11:39:57 2...,1
3,From dcm123@btamail.net.cn Mon Jun 24 17:49:23...,1
4,From ilug-admin@linux.ie Mon Aug 19 11:02:47 2...,0


In [6]:
import numpy as np
import nltk
from sklearn.metrics import f1_score
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn .pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

# Separate the raw e-mail text (feature) from the target column (label)
X, y = spam_ds["text"], spam_ds["target"]

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fetch the NLTK resources used below for tokenizing and stopword filtering
for resource in ('punkt_tab', 'stopwords'):
    nltk.download(resource)
# Custom scikit-learn transformer that cleans and stems raw text
class TextStemming(BaseEstimator, TransformerMixin):
    """Turn raw documents into space-separated strings of stemmed words.

    Each document is converted to ``str``, lower-cased and tokenized with
    NLTK. Tokens that are not purely alphabetic or that appear in NLTK's
    English stopword list are dropped; the remaining tokens are stemmed
    with the Porter stemmer and joined back with single spaces.
    """

    def __init__(self):
        # Built once per instance so they are not re-created for every document.
        self.stemmer = PorterStemmer()
        self.stop_words = set(stopwords.words('english'))

    def fit(self, X, y=None):
        """Do nothing and return ``self``; there is nothing to learn from the data."""
        return self

    def _clean(self, document):
        """Return the cleaned, stemmed form of a single document as one string."""
        words = nltk.word_tokenize(str(document).lower())
        stems = (
            self.stemmer.stem(token)
            for token in words
            if token.isalpha() and token not in self.stop_words
        )
        return " ".join(stems)

    def transform(self, X):
        """Clean every document in ``X``.

        Returns a NumPy array holding one cleaned string per input document,
        in the same order as ``X``.
        """
        return np.array([self._clean(document) for document in X])

# Text-preparation pipeline: clean/stem each e-mail, then convert it to TF-IDF features
pipeline = Pipeline(steps=[
    ('stems', TextStemming()),
    ('vector', TfidfVectorizer()),
])

# Fit the pipeline on the training split only and vectorize that split
X_train_pipe = pipeline.fit_transform(X_train, y_train)

# Hyperparameter grid for k-NN: k = 1..9 combined with both neighbour-weighting schemes
param_grid = {
    'n_neighbors': list(range(1, 10)),
    'weights': ['uniform', 'distance'],
}

# 5-fold cross-validated grid search, scored by F1
# NOTE(review): the TF-IDF step was fitted on the whole training split above, so the
# cross-validation folds share vocabulary/IDF statistics; the test split is untouched.
knc = KNeighborsClassifier()
gridcv = GridSearchCV(estimator=knc, param_grid=param_grid, scoring='f1', cv=5)
gridcv.fit(X_train_pipe, y_train)

# Apply the already-fitted pipeline to the held-out split (transform only, no refitting)
X_test_pipe = pipeline.transform(X_test)

# Predict on the held-out split with the fitted search object
y_pred = gridcv.predict(X_test_pipe)

# Last expression of the cell: the F1 score on the test split
f1_score(y_test, y_pred)

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\shoun\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\shoun\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


0.9800796812749004

In [7]:
# Show the hyperparameter combination selected by the grid search
print(gridcv.best_params_)

{'n_neighbors': 4, 'weights': 'distance'}
