In [8]:
!pip install pandas numpy scikit-learn matplotlib -q

In [51]:
!python -m spacy download en_core_web_sm -q

[+] Download and installation successful
You can now load the package via spacy.load('en_core_web_sm')


In [None]:
import numpy as np
import pandas as pd
import spacy
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import HalvingGridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier

In [89]:
class SpamClassifier:
    """Tune and compare spam classifiers on a CSV of labelled e-mails.

    The methods are meant to be called in this order:

    1. ``preprocess_data()`` - lemmatise and filter the ``email`` column.
    2. ``split_data()`` - hold out a test set.
    3. ``train_and_evaluate_models()`` - tune three classifiers for one
       vectorizer type and score them on the held-out test set.

    Attributes:
        data: DataFrame read from ``data_path``. The code reads its ``email``
            and ``label`` columns and adds a ``cleaned_text`` column.
        nlp: spaCy pipeline loaded from ``"en_core_web_sm"``.
        random_state: Seed passed to the split, the hyper-parameter search and
            the classifiers that accept one.
        X, y: Cleaned texts and labels; ``None`` until ``preprocess_data()``.
        X_train, X_test, y_train, y_test: Train/test partitions; ``None``
            until ``split_data()``.
    """

    # Vectorizer names accepted by train_and_evaluate_models().
    _VECTORIZER_TYPES = ("CountVectorizer", "TfidfVectorizer")

    def __init__(self, data_path, random_state=42):
        """Read the CSV at ``data_path`` and load the spaCy English pipeline.

        Args:
            data_path: Path (or anything ``pd.read_csv`` accepts) of the data set.
            random_state: Seed used by every stochastic step of this object.
        """
        self.y_test = None
        self.y_train = None
        self.X_test = None
        self.X_train = None
        self.y = None
        self.X = None
        self.data = pd.read_csv(data_path)
        self.nlp = spacy.load("en_core_web_sm")
        self.random_state = random_state

    @staticmethod
    def _is_informative(token):
        """Return True for tokens that should be kept in the cleaned text."""
        return not (
            token.is_stop
            or token.is_punct
            or token.is_digit
            or token.like_email
            or token.like_num
            or token.is_space
        )

    def preprocess_data(self):
        """Build ``cleaned_text`` from ``email`` and populate ``self.X`` / ``self.y``.

        Rows with a missing e-mail or a missing label are dropped. Every
        remaining e-mail is replaced by the space-joined, lower-cased lemmas
        of its tokens, skipping stop words, punctuation, digits, number-like
        and e-mail-like tokens and whitespace.

        The method reassigns ``self.data`` instead of mutating it in place, so
        it can safely be re-run.
        """
        # A row without text cannot be tokenised and a row without a label
        # cannot be used for supervised training.
        usable = self.data.dropna(subset=['email', 'label'])

        # astype(str) guards against non-string cells (e.g. purely numeric
        # bodies parsed as numbers by read_csv), which spaCy would reject.
        texts = usable['email'].astype(str).tolist()

        # nlp.pipe processes the texts in batches, which is considerably
        # faster than calling the pipeline once per row.
        cleaned = [
            ' '.join(
                token.lemma_.lower() for token in doc if self._is_informative(token)
            )
            for doc in self.nlp.pipe(texts)
        ]

        self.data = usable.assign(cleaned_text=cleaned)
        self.X = self.data['cleaned_text']
        self.y = self.data['label']

    def split_data(self, test_size=0.2, stratify=False):
        """Split ``self.X`` / ``self.y`` into train and test partitions.

        Args:
            test_size: Forwarded to ``train_test_split``.
            stratify: When True, split so that both partitions keep the label
                proportions of ``self.y``. Defaults to False (plain shuffle).

        Raises:
            RuntimeError: If ``preprocess_data()`` has not been called yet.
        """
        if self.X is None or self.y is None:
            raise RuntimeError("No preprocessed data available: call preprocess_data() first.")

        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            self.X,
            self.y,
            test_size=test_size,
            random_state=self.random_state,
            stratify=self.y if stratify else None,
        )

    def _vectorizer_spec(self, vectorizer_type):
        """Return ``(vectorizer class, hyper-parameter grid)`` for a validated type name."""
        if vectorizer_type == "CountVectorizer":
            return CountVectorizer, {
                'vectorizer__max_df': np.linspace(0.3, 0.7, 10),
                'vectorizer__min_df': [0.0, 0.001, 0.003, 0.005],
                'vectorizer__ngram_range': ((1, 1), (1, 2)),
            }
        return TfidfVectorizer, {
            "vectorizer__norm": ("l1", "l2"),
        }

    def _classifier_specs(self):
        """Return ``{model name: (fresh classifier, hyper-parameter grid)}`` in evaluation order."""
        return {
            "DecisionTree": (
                DecisionTreeClassifier(random_state=self.random_state),
                {
                    'clf__criterion': ['gini', 'entropy'],
                    'clf__max_depth': [None, 10, 20, 30],
                    'clf__min_samples_split': [2, 5, 10],
                },
            ),
            "LogisticRegression": (
                LogisticRegression(max_iter=1000, random_state=self.random_state),
                {
                    'clf__C': [0.001, 0.01, 0.1, 1, 10],
                    'clf__penalty': ['l2'],
                },
            ),
            "NaiveBayes": (
                MultinomialNB(),
                {
                    'clf__alpha': [0.01, 0.1, 0.5, 1.0, 10.0],
                },
            ),
        }

    def train_and_evaluate_models(self, vectorizer_type, cv=2):
        """Tune three classifiers for one vectorizer and score them on the test set.

        For each of DecisionTree, LogisticRegression and NaiveBayes a
        ``vectorizer -> clf`` pipeline is tuned with ``HalvingGridSearchCV``
        (accuracy scoring) on the training partition; the refitted best
        pipeline is then scored once on the test partition.

        Args:
            vectorizer_type: ``"CountVectorizer"`` or ``"TfidfVectorizer"``.
            cv: Cross-validation setting forwarded to ``HalvingGridSearchCV``.

        Returns:
            A list with one dict per model, in the order listed above, with
            the keys ``"Model"``, ``"Vectorizer"`` and ``"Test accuracy"``.

        Raises:
            ValueError: If ``vectorizer_type`` is not a supported name.
            RuntimeError: If ``split_data()`` has not been called yet.
        """
        if vectorizer_type not in self._VECTORIZER_TYPES:
            raise ValueError("Invalid vectorizer type. Use 'CountVectorizer' or 'TfidfVectorizer'.")

        partitions = (self.X_train, self.y_train, self.X_test, self.y_test)
        if any(part is None for part in partitions):
            raise RuntimeError("No train/test split available: call split_data() first.")

        vectorizer_cls, vectorizer_grid = self._vectorizer_spec(vectorizer_type)

        results = []
        for model_type, (classifier, classifier_grid) in self._classifier_specs().items():
            # Each pipeline gets its own vectorizer instance so that no fitted
            # state can ever be shared between models.
            pipeline = Pipeline([
                ('vectorizer', vectorizer_cls()),
                ('clf', classifier),
            ])

            grid_search = HalvingGridSearchCV(
                estimator=pipeline,
                param_grid={**vectorizer_grid, **classifier_grid},
                cv=cv,
                n_jobs=-1,
                scoring='accuracy',
                random_state=self.random_state,
            )
            grid_search.fit(self.X_train, self.y_train)

            # best_estimator_ is the best candidate refitted on the whole
            # training partition; the test set is only used for this score.
            y_pred_test = grid_search.best_estimator_.predict(self.X_test)
            results.append({
                "Model": model_type,
                "Vectorizer": vectorizer_type,
                "Test accuracy": accuracy_score(self.y_test, y_pred_test),
            })

        return results

In [None]:
# Build the classifier wrapper, clean the e-mails and hold out 20% for testing.
spam_classifier = SpamClassifier(data_path='spam_or_not_spam.csv', random_state=42)
spam_classifier.preprocess_data()
spam_classifier.split_data(test_size=0.2)

# Evaluate every model once per vectorizer and gather the per-model result
# dicts into a single flat list.
vectorizer_types = ["CountVectorizer", "TfidfVectorizer"]
evaluation_results = []
for vectorizer_type in vectorizer_types:
    results = spam_classifier.train_and_evaluate_models(vectorizer_type)
    evaluation_results += results

# One row per (model, vectorizer) combination.
results_df = pd.DataFrame(evaluation_results)

In [91]:
# Rank the (model, vectorizer) combinations by held-out accuracy, best first.
# Leaving the frame as the cell's last expression uses the notebook's rich
# DataFrame display instead of a plain-text print.
results_df.sort_values("Test accuracy", ascending=False)

                Model       Vectorizer  Test accuracy
5          NaiveBayes  TfidfVectorizer       0.993333
1  LogisticRegression  CountVectorizer       0.990000
2          NaiveBayes  CountVectorizer       0.985000
4  LogisticRegression  TfidfVectorizer       0.985000
3        DecisionTree  TfidfVectorizer       0.953333
0        DecisionTree  CountVectorizer       0.950000
