In [49]:
import pandas as pd
import string
import re
import warnings
warnings.filterwarnings('ignore')

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import train_test_split, GridSearchCV, HalvingGridSearchCV
from sklearn.metrics import classification_report, f1_score
from sklearn.pipeline import Pipeline

from nltk.tokenize import (
    word_tokenize,
)
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [2]:
df = pd.read_csv('spam_or_not_spam.csv')
df.head(5)

Unnamed: 0,email,label
0,date wed NUMBER aug NUMBER NUMBER NUMBER NUMB...,0
1,martin a posted tassos papadopoulos the greek ...,0
2,man threatens explosion in moscow thursday aug...,0
3,klez the virus that won t die already the most...,0
4,in adding cream to spaghetti carbonara which ...,0


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000 entries, 0 to 2999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   email   2999 non-null   object
 1   label   3000 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 47.0+ KB


In [4]:
df = df[df['email'].notna()]

In [5]:
stop_words = stopwords.words('english')
stop_words.extend('NUMBER')

In [6]:
def delete_digits(s):
    return re.sub("[0-9]", "", s)

def delete_punkt(s):
    return s.translate(str.maketrans('', '', string.punctuation + "«»—№–"))

lemmatizer = WordNetLemmatizer()
def normalize(text):
    tokens = word_tokenize(text, 'english')
    lemmatized = [lemmatizer.lemmatize(token) for token in tokens
                  if token not in set(stop_words)]
    return ' '.join(lemmatized)

def preprocessing(text):
    text_wo_numbers = delete_digits(text)
    text_wo_punct = delete_punkt(text)
    text_lem = normalize(text_wo_punct)
    return text_lem

In [7]:
df['preproc_email'] = df['email'].apply(preprocessing)
df.head()

Unnamed: 0,email,label,preproc_email
0,date wed NUMBER aug NUMBER NUMBER NUMBER NUMB...,0,date wed NUMBER aug NUMBER NUMBER NUMBER NUMBE...
1,martin a posted tassos papadopoulos the greek ...,0,martin posted tasso papadopoulos greek sculpto...
2,man threatens explosion in moscow thursday aug...,0,man threatens explosion moscow thursday august...
3,klez the virus that won t die already the most...,0,klez virus die already prolific virus ever kle...
4,in adding cream to spaghetti carbonara which ...,0,adding cream spaghetti carbonara effect pasta ...


In [8]:
def custom_tokenize(text):
    text = word_tokenize(text)
    return text

In [9]:
X = df['email']
y = df['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [10]:
# создаем выборку через CountVectorizer
count_vectorizer = CountVectorizer(max_df=0.7, min_df=0.003, tokenizer=custom_tokenize)
X_train_count_vectorizer = count_vectorizer.fit_transform(X_train)
X_test_count_vectorizer = count_vectorizer.transform(X_test)

In [11]:
# создаем выборку через Tf-Idf
count_vectorizer = TfidfVectorizer(max_df=0.7, min_df=0.003, tokenizer=custom_tokenize)
X_train_tfidf_vectorizer = count_vectorizer.fit_transform(X_train)
X_test_tfidf_vectorizer = count_vectorizer.transform(X_test)

## LogReg + TfIdf/CountVectorize pipelines

#### LogReg + Tfidf

In [None]:
pipe_LR_TF = Pipeline(
    steps=[
        ('vectorizer', TfidfVectorizer(tokenizer=word_tokenize)),
        ('lr', LogisticRegression())
    ]
)

param_grid_LR_TF = {
    'vectorizer__min_df': [0.003, 0.00],
    'vectorizer__max_df': [0.6, 0.7],
    'lr__C': [0.1, 1, 10],
    'lr__penalty': ['l2'],
    'lr__solver': ['liblinear', 'lbfgs', 'sag', 'saga'],
    'lr__max_iter': [100, 150],
    'lr__class_weight': [None, 'balanced'],
    'lr__dual': [False],
    'lr__tol': [0.0001, 0.001]
}

grid_pipeline_LR_TF = HalvingGridSearchCV(pipe_LR_TF, param_grid_LR_TF, verbose=1, n_jobs=-1)
grid_pipeline_LR_TF.fit(X_train,y_train)

In [53]:
grid_pipeline_LR_TF.best_params_

{'lr__C': 10,
 'lr__class_weight': 'balanced',
 'lr__dual': False,
 'lr__max_iter': 100,
 'lr__penalty': 'l2',
 'lr__solver': 'saga',
 'lr__tol': 0.001,
 'vectorizer__max_df': 0.6,
 'vectorizer__min_df': 0.0}

In [54]:
y_pred_lr_tf = grid_pipeline_LR_TF.best_estimator_.predict(X_test)
f1_lr_tf = f1_score(y_test, y_pred_lr_tf)
print(classification_report(y_test, y_pred_lr_tf))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99       821
           1       0.99      0.93      0.96       169

    accuracy                           0.99       990
   macro avg       0.99      0.97      0.98       990
weighted avg       0.99      0.99      0.99       990



In [55]:
f1_lr_tf

0.9634146341463414

#### LogReg + CountVectorize

In [56]:
pipe_LR_CVec = Pipeline(
    steps=[
        ('vectorizer', CountVectorizer(tokenizer=custom_tokenize)),
        ('lr', LogisticRegression())
    ]
)

param_grid_LR_CVec = {
    'vectorizer__min_df': [0.003, 0.00],
    'vectorizer__max_df': [0.6, 0.7],
    'lr__C': [0.1, 1, 10],
    'lr__penalty': ['l2'],
    'lr__solver': ['liblinear', 'lbfgs', 'sag', 'saga'],
    'lr__max_iter': [100, 150],
    'lr__class_weight': [None, 'balanced'],
    'lr__dual': [False],
    'lr__tol': [0.0001, 0.001]
}

grid_pipeline_LR_CVec = HalvingGridSearchCV(pipe_LR_CVec, param_grid_LR_CVec, verbose=1, n_jobs=-1)
grid_pipeline_LR_CVec.fit(X_train,y_train)

n_iterations: 5
n_required_iterations: 6
n_possible_iterations: 5
min_resources_: 20
max_resources_: 2009
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 384
n_resources: 20
Fitting 5 folds for each of 384 candidates, totalling 1920 fits




----------
iter: 1
n_candidates: 128
n_resources: 60
Fitting 5 folds for each of 128 candidates, totalling 640 fits




----------
iter: 2
n_candidates: 43
n_resources: 180
Fitting 5 folds for each of 43 candidates, totalling 215 fits




----------
iter: 3
n_candidates: 15
n_resources: 540
Fitting 5 folds for each of 15 candidates, totalling 75 fits




----------
iter: 4
n_candidates: 5
n_resources: 1620
Fitting 5 folds for each of 5 candidates, totalling 25 fits




In [57]:
grid_pipeline_LR_CVec.best_params_

{'lr__C': 0.1,
 'lr__class_weight': 'balanced',
 'lr__dual': False,
 'lr__max_iter': 100,
 'lr__penalty': 'l2',
 'lr__solver': 'liblinear',
 'lr__tol': 0.001,
 'vectorizer__max_df': 0.7,
 'vectorizer__min_df': 0.003}

In [58]:
y_pred_lr_cvec = grid_pipeline_LR_CVec.best_estimator_.predict(X_test)
f1_lr_cvec = f1_score(y_test, y_pred_lr_cvec)
print(classification_report(y_test, y_pred_lr_cvec))

              precision    recall  f1-score   support

           0       0.99      1.00      1.00       821
           1       0.99      0.96      0.98       169

    accuracy                           0.99       990
   macro avg       0.99      0.98      0.99       990
weighted avg       0.99      0.99      0.99       990



### Сравнение моделей DecisionTree

In [59]:
nb_dict = {
    'DT+Tfidf HP': list(grid_pipeline_LR_CVec.best_params_.keys())+['f1-score'],
    'DT+Tfidf': list(grid_pipeline_LR_CVec.best_params_.values())+[f1_lr_cvec],
    'DT+CountVec':list(grid_pipeline_LR_TF.best_params_.values())+[f1_lr_tf]
}
nb = pd.DataFrame.from_dict(nb_dict)
nb

Unnamed: 0,DT+Tfidf HP,DT+Tfidf,DT+CountVec
0,lr__C,0.1,10
1,lr__class_weight,balanced,balanced
2,lr__dual,False,False
3,lr__max_iter,100,100
4,lr__penalty,l2,l2
5,lr__solver,liblinear,saga
6,lr__tol,0.001,0.001
7,vectorizer__max_df,0.7,0.6
8,vectorizer__min_df,0.003,0.0
9,f1-score,0.975904,0.963415


## DecisionTree + TfIdf/CountVectorize pipelines

#### DecisionTree + CountVectorize

In [60]:
pipe_dt_cvec = Pipeline(
        steps=[
            ('vectorizer', CountVectorizer(tokenizer=custom_tokenize)),
            ('dt', DecisionTreeClassifier())
        ]
    )

param_grid_dt_cvec = {
    'vectorizer__min_df': [0.003, 0.05],
    'vectorizer__max_df': [0.6, 0.7],
    'dt__criterion': ['gini', 'entropy'],
    'dt__max_depth': [10, 20],
    'dt__min_samples_split': [2, 5],
    'dt__min_samples_leaf': [2, 5],
    'dt__max_features': ['auto', 'log2'],
    'dt__class_weight': ['balanced'],
}

grid_search_dt_cv = HalvingGridSearchCV(pipe_dt_cvec, param_grid_dt_cvec, cv=5, scoring='accuracy', n_jobs=-1)
grid_search_dt_cv.fit(X_train, y_train)



In [61]:
grid_search_dt_cv.best_params_

{'dt__class_weight': 'balanced',
 'dt__criterion': 'entropy',
 'dt__max_depth': 10,
 'dt__max_features': 'log2',
 'dt__min_samples_leaf': 2,
 'dt__min_samples_split': 2,
 'vectorizer__max_df': 0.7,
 'vectorizer__min_df': 0.05}

In [62]:
y_pred_dt_cvec = grid_search_dt_cv.best_estimator_.predict(X_test)
f1_dt_cvec = f1_score(y_test, y_pred_dt_cvec)
print(classification_report(y_test, y_pred_dt_cvec))

              precision    recall  f1-score   support

           0       0.95      0.93      0.94       821
           1       0.69      0.77      0.73       169

    accuracy                           0.90       990
   macro avg       0.82      0.85      0.83       990
weighted avg       0.91      0.90      0.90       990



#### DecisionTree + TfidfVectorizer

In [63]:
pipe_dt_tf = Pipeline(
    steps=[
        ('vectorizer', TfidfVectorizer(tokenizer=custom_tokenize)),
        ('dt', DecisionTreeClassifier())
    ]
)

param_grid_dt_tf = {
    'vectorizer__min_df': [0.003, 0.05],
    'vectorizer__max_df': [0.6, 0.7],
    'dt__criterion': ['gini', 'entropy'],
    'dt__max_depth': [10, 20],
    'dt__min_samples_split': [2, 5],
    'dt__min_samples_leaf': [2, 5],
    'dt__max_features': ['auto', 'log2'],
    'dt__class_weight': ['balanced'],
}

grid_search_dt_tf = HalvingGridSearchCV(pipe_dt_tf, param_grid_dt_tf, cv=5, scoring='accuracy', n_jobs=-1)
grid_search_dt_tf.fit(X_train, y_train)



In [64]:
grid_search_dt_tf.best_params_

{'dt__class_weight': 'balanced',
 'dt__criterion': 'gini',
 'dt__max_depth': 20,
 'dt__max_features': 'log2',
 'dt__min_samples_leaf': 2,
 'dt__min_samples_split': 5,
 'vectorizer__max_df': 0.7,
 'vectorizer__min_df': 0.05}

In [65]:
y_pred_dt_tf = grid_search_dt_cv.best_estimator_.predict(X_test)
f1_dt_tf = f1_score(y_test, y_pred_dt_tf)
print(classification_report(y_test, y_pred_dt_tf))

              precision    recall  f1-score   support

           0       0.95      0.93      0.94       821
           1       0.69      0.77      0.73       169

    accuracy                           0.90       990
   macro avg       0.82      0.85      0.83       990
weighted avg       0.91      0.90      0.90       990



### Сравнение моделей DecisionTree

In [66]:
nb_dict = {
    'DT+Tfidf HP': list(grid_search_dt_tf.best_params_.keys())+['f1-score'],
    'DT+Tfidf': list(grid_search_dt_tf.best_params_.values())+[f1_dt_tf],
    'DT+CountVec':list(grid_search_dt_cv.best_params_.values())+[f1_dt_cvec]
}
nb = pd.DataFrame.from_dict(nb_dict)
nb

Unnamed: 0,DT+Tfidf HP,DT+Tfidf,DT+CountVec
0,dt__class_weight,balanced,balanced
1,dt__criterion,gini,entropy
2,dt__max_depth,20,10
3,dt__max_features,log2,log2
4,dt__min_samples_leaf,2,2
5,dt__min_samples_split,5,2
6,vectorizer__max_df,0.7,0.7
7,vectorizer__min_df,0.05,0.05
8,f1-score,0.726257,0.726257


## MultinomialNB + TfIdf/CountVectorize pipelines

#### MultinomialNB + Tdidf

In [67]:
pipe_nb_tf = Pipeline(
    steps=[
        ('vectorizer', TfidfVectorizer(tokenizer=custom_tokenize)),
        ('nb', MultinomialNB())
    ]
)

param_grid_nb_tf = {
    'vectorizer__min_df': [0.002, 0.003],
    'vectorizer__max_df': [0.5, 0.6, 0.7],
    'nb__alpha': [0.1, 0.2, 0.3],
}

grid_search_nb_tf = HalvingGridSearchCV(pipe_nb_tf, param_grid_nb_tf, cv=5, scoring='f1', n_jobs=-1)
grid_search_nb_tf.fit(X_train, y_train)



In [68]:
grid_search_nb_tf.best_params_

{'nb__alpha': 0.1, 'vectorizer__max_df': 0.5, 'vectorizer__min_df': 0.002}

In [69]:
y_pred_nb_tf = grid_search_nb_tf.best_estimator_.predict(X_test)
f1_nb_tf = f1_score(y_test, y_pred_nb_tf)
print(classification_report(y_test, y_pred_nb_tf))

              precision    recall  f1-score   support

           0       0.98      1.00      0.99       821
           1       0.99      0.89      0.94       169

    accuracy                           0.98       990
   macro avg       0.99      0.95      0.96       990
weighted avg       0.98      0.98      0.98       990



#### MultinomialNB + CountVectorize

In [70]:
pipe_nb_cvec = Pipeline(
    steps=[
        ('vectorizer', CountVectorizer(tokenizer=word_tokenize)),
        ('nb', MultinomialNB())
    ]
)

param_grid_nb_cvec = {
    'vectorizer__min_df': [0.002, 0.003],
    'vectorizer__max_df': [0.5, 0.6, 0.7],
    'nb__alpha': [0.1, 0.2, 0.3, 0.4, 0.5],
}

grid_search_nb_cvec = HalvingGridSearchCV(pipe_nb_cvec, param_grid_nb_cvec, cv=5, scoring='f1', n_jobs=-1, refit=True)
grid_search_nb_cvec.fit(X_train, y_train)



In [71]:
grid_search_nb_cvec.best_params_

{'nb__alpha': 0.1, 'vectorizer__max_df': 0.5, 'vectorizer__min_df': 0.003}

In [72]:
grid_search_nb_cvec.best_score_

0.9604899771108701

In [73]:
y_pred_nb_cvec = grid_search_nb_cvec.best_estimator_.predict(X_test)
f1_nb_cvec = f1_score(y_test, y_pred_nb_cvec)
print(classification_report(y_test, y_pred_nb_cvec))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99       821
           1       1.00      0.93      0.97       169

    accuracy                           0.99       990
   macro avg       0.99      0.97      0.98       990
weighted avg       0.99      0.99      0.99       990



In [74]:
f1_nb_cvec

0.9663608562691132

### Сравнение моделей NaiveBayes

In [75]:
nb_dict = {
    'NB+Tfidf HP': list(grid_search_nb_tf.best_params_.keys())+['f1-score'],
    'NB+Tfidf': list(grid_search_nb_tf.best_params_.values())+[f1_nb_tf],
    'NB+CountVec':list(grid_search_nb_cvec.best_params_.values())+[f1_nb_cvec]
}
nb = pd.DataFrame.from_dict(nb_dict)
nb

Unnamed: 0,NB+Tfidf HP,NB+Tfidf,NB+CountVec
0,nb__alpha,0.1,0.1
1,vectorizer__max_df,0.5,0.5
2,vectorizer__min_df,0.002,0.003
3,f1-score,0.94081,0.966361


## Сравнение всех моделей

In [76]:
res_dict = {
    'Model':['LogReg + TF', 'LogReg + CountVec', 'DecTree + TF', 'DecTree + CountVec', 'NB + TF', 'NB + CountVec'],
    'F1 score': [f1_lr_tf, f1_lr_cvec, f1_dt_tf, f1_dt_cvec, f1_nb_tf, f1_nb_cvec],
}
res = pd.DataFrame.from_dict(res_dict)
res

Unnamed: 0,Model,F1 score
0,LogReg + TF,0.963415
1,LogReg + CountVec,0.975904
2,DecTree + TF,0.726257
3,DecTree + CountVec,0.726257
4,NB + TF,0.94081
5,NB + CountVec,0.966361
