In [96]:
import pandas as pd
import numpy as np
import spacy
from sklearn.model_selection import train_test_split
from gensim.models import FastText
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
import warnings
import gensim.models

warnings.simplefilter(action='ignore', category=FutureWarning)

In [None]:
!python -m spacy download en_core_web_sm

# Получение данных
[
Spam or Not Spam Dataset](https://www.kaggle.com/datasets/ozlerhakan/spam-or-not-spam-dataset/)

In [98]:
data = pd.read_csv('spam_or_not_spam.csv')
data = data.astype({'email':'string'})
data.head()

Unnamed: 0,email,label
0,date wed NUMBER aug NUMBER NUMBER NUMBER NUMB...,0
1,martin a posted tassos papadopoulos the greek ...,0
2,man threatens explosion in moscow thursday aug...,0
3,klez the virus that won t die already the most...,0
4,in adding cream to spaghetti carbonara which ...,0


In [99]:
data.dropna(inplace=True)
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2999 entries, 0 to 2999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   email   2999 non-null   string
 1   label   2999 non-null   int64 
dtypes: int64(1), string(1)
memory usage: 70.3 KB


In [100]:
data['label'].value_counts()

label
0    2500
1     499
Name: count, dtype: int64

# Очистка

Стоп-слова для английского языка в spaCy

In [101]:
nlp = spacy.load("en_core_web_sm")
stopwords = nlp.Defaults.stop_words
print(f'Spacy english stopwords size: {len(stopwords)}', end='\n\n')
' '.join(stopwords)

Spacy english stopwords size: 326



"together six under show another third even for thereby never ’ve upon ’ll same yours else ours first yourself below can these eight almost several also its whither with themselves otherwise a n't why being behind since over towards only less put becomes neither thence whatever nor your mine itself their is or 'd well herein become four side as so further just he see due via anyhow however doing each me therein always someone everywhere such most empty not until make 'm on again regarding are twelve from whereby him formerly becoming hereafter any rather she twenty whereupon anyway every indeed throughout 's you our by everyone was here move few whom ‘ll n‘t thru this amount 're two them moreover whenever except give afterwards might thus many serious during there 'll ever ’re and could other myself hereupon ’m anywhere about whence either re ca ‘d keep ‘ve more while where elsewhere ’s am at already much although the next go nine if yourselves then back into between in both yet part p

Атрибуты [Token](https://spacy.io/api/token#attributes)

In [102]:
%%time

data['cleaned_text'] = data['email'].apply(
    lambda x: [
        token.lemma_.lower() for token in nlp(x) if
        not token.is_stop
        and not token.is_punct
        and not token.is_digit
        and not token.like_email
        and not token.like_num
        and not token.is_space
    ]
)
data.head()

CPU times: total: 2min 3s
Wall time: 2min 4s


Unnamed: 0,email,label,cleaned_text
0,date wed NUMBER aug NUMBER NUMBER NUMBER NUMB...,0,"[date, d, number, aug, number, number, number,..."
1,martin a posted tassos papadopoulos the greek ...,0,"[martin, post, tassos, papadopoulo, greek, scu..."
2,man threatens explosion in moscow thursday aug...,0,"[man, threaten, explosion, moscow, thursday, a..."
3,klez the virus that won t die already the most...,0,"[klez, virus, win, t, die, prolific, virus, kl..."
4,in adding cream to spaghetti carbonara which ...,0,"[add, cream, spaghetti, carbonara, effect, pas..."


# Векторизация: Word2Vec CBOW

In [103]:
X_train, X_test, y_train, y_test = train_test_split(data['cleaned_text'], data['label'], random_state=2023)

In [104]:
%%time
VECTOR_SIZE = 256
model = gensim.models.Word2Vec(
    sentences=X_train,
    vector_size=VECTOR_SIZE, # default = 100
    window=7, # default = 5
    min_count=10,
    sg=1, # Training algorithm: 1 for skip-gram; otherwise CBOW
    hs=0, #  If 1, hierarchical softmax will be used for model training. If 0, and negative is non-zero, negative sampling will be used.
    negative=5, # If > 0, negative sampling will be used, if set to 0, no negative sampling is used.
    epochs=25, # Number of iterations (epochs) over the corpus
    seed=2023,
)

CPU times: total: 1min 7s
Wall time: 24.4 s


Intrinsic оценка качества эмбеддингов

In [105]:
# model.wv.most_similar(['please'])

KeyError: "Key 'please' not present in vocabulary"

In [106]:
model.wv.most_similar(['year'])

[('ago', 0.4628926217556),
 ('critic', 0.35923197865486145),
 ('cleaning', 0.3206169009208679),
 ('regulator', 0.3192872107028961),
 ('graduate', 0.3017882704734802),
 ('quarter', 0.29894009232521057),
 ('fortune', 0.29685911536216736),
 ('percent', 0.29421526193618774),
 ('decade', 0.29244181513786316),
 ('hundred', 0.2905483543872833)]

In [107]:
model.wv.most_similar(['yes'])

[('waider', 0.3095357120037079),
 ('diagnostic', 0.2753773331642151),
 ('ro', 0.27116987109184265),
 ('skeptical', 0.2611181139945984),
 ('sheep', 0.25926998257637024),
 ('insane', 0.25922849774360657),
 ('systemwork', 0.2590065896511078),
 ('integral', 0.2578268051147461),
 ('location', 0.24919913709163666),
 ('postal', 0.24743232131004333)]

In [108]:
model.wv.most_similar(['hi'])

[('egwn', 0.357582151889801),
 ('tar', 0.3538835048675537),
 ('mord', 0.35096606612205505),
 ('alvie', 0.3468831479549408),
 ('iso', 0.3438396155834198),
 ('justin', 0.3435012400150299),
 ('suse', 0.34079691767692566),
 ('digest', 0.3402199447154999),
 ('z', 0.33434292674064636),
 ('securityfocus', 0.33361315727233887)]

In [109]:
model.wv.doesnt_match(['yes', 'no', 'I', 'agree'])

'yes'

In [110]:
model.wv.doesnt_match(['week', 'year', 'month', 'car', 'second'])

'second'

In [111]:
model.wv.doesnt_match(['milk', 'bread', 'sky', 'cherry'])

'sky'

Не очень хороший корпус, где-то ошибается.

# Классификация

In [112]:
X_train

1826    [url, url, date, tue, number, sep, number, num...
122     [let, write, mind, know, awful, lot, oh, inter...
1726    [tim, peters, write, ve, get, summary, file, c...
1123    [hal, devore, say, brent, say, book, paste, fu...
1836    [url, url, date, supply, carny, concessionaire...
                              ...                        
1953                        [url, url, date, supply, url]
2743    [url, email, sponsor, osdn, tired, old, cell, ...
2502    [number, fight, risk, cancer, url, number, sli...
1561    [thu, sep, number, number, number, number, num...
855     [original, message, gary, lawrence, murphy, ga...
Name: cleaned_text, Length: 2249, dtype: object

In [113]:
# Получаем эмбеддинг для текста
def document_vector(doc):
    doc = [model.wv[word] for word in doc if word in model.wv.key_to_index.keys()]
    if doc == []:
        return [0 for i in range(VECTOR_SIZE)]
    return np.mean(doc, axis=0)

X = list(X_train.apply(document_vector))
X_test_ = list(X_test.apply(document_vector))
X

[array([ 2.76934262e-03, -7.83861428e-02, -4.54314910e-02,  3.07542477e-02,
         8.07556286e-02, -2.12888364e-02,  1.91212143e-03, -2.68369257e-01,
        -2.09231943e-01,  6.81395829e-02,  1.33434804e-02, -1.97013244e-01,
         4.64863610e-03,  1.03042960e-01, -3.33820313e-01,  1.01430498e-01,
        -3.49107571e-02,  1.84482008e-01, -2.16911748e-01, -5.28413169e-02,
        -6.15932047e-03, -1.96598485e-01,  3.14396061e-03, -3.60742435e-02,
        -1.58025786e-01, -7.30277449e-02,  2.88940575e-02,  1.97554268e-02,
         1.17751800e-01, -4.89424430e-02, -4.60033789e-02, -6.99835196e-02,
        -1.34003177e-01, -1.36134103e-01,  1.52961284e-01, -6.52266517e-02,
        -3.35801363e-01,  2.22276170e-02,  6.71886876e-02, -4.49466370e-02,
        -2.68169306e-02,  1.24076650e-01,  1.71066031e-01, -1.02465406e-01,
         1.26317263e-01,  1.12709627e-01,  9.19478238e-02, -9.08295140e-02,
        -1.42961323e-01, -1.90446138e-01, -1.33090436e-01,  1.19336806e-01,
        -2.0

In [114]:
clf = LogisticRegression()
clf.fit(X, y_train)

y_pred = clf.predict(X_test_)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.98      1.00      0.99       616
           1       0.99      0.92      0.95       134

    accuracy                           0.98       750
   macro avg       0.99      0.96      0.97       750
weighted avg       0.98      0.98      0.98       750



# Векторизация: fastText

In [None]:
%%time
VECTOR_SIZE = 256
model = gensim.models.FastText(
    sentences=X_train,
    vector_size=VECTOR_SIZE, # default = 100
    window=7, # default = 5
    min_count=10,
    sg=1, # Training algorithm: 1 for skip-gram; otherwise CBOW
    hs=0, #  If 1, hierarchical softmax will be used for model training. If 0, and negative is non-zero, negative sampling will be used.
    negative=5, # If > 0, negative sampling will be used, if set to 0, no negative sampling is used.
    epochs=25, # Number of iterations (epochs) over the corpus
    seed=2023,
)

Intrinsic оценка качества эмбеддингов

In [None]:
model.wv.most_similar(['year'])

In [None]:
model.wv.most_similar(['hi'])

In [None]:
model.wv.doesnt_match(['yes', 'no', 'I', 'agree'])

In [None]:
model.wv.doesnt_match(['week', 'year', 'month', 'car', 'second'])

In [None]:
model.wv.doesnt_match(['milk', 'bread', 'sky', 'cherry'])

# Классификация

In [None]:
# Получаем эмбеддинг для текста
def document_vector(doc):
    doc = [model.wv[word] for word in doc if word in model.wv.key_to_index.keys()]
    if doc == []:
        return [0 for i in range(VECTOR_SIZE)]
    return np.mean(doc, axis=0)

X = list(X_train.apply(document_vector))
X_test_ = list(X_test.apply(document_vector))
X

In [None]:
clf = LogisticRegression()
clf.fit(X, y_train)

y_pred = clf.predict(X_test_)
print(classification_report(y_test, y_pred))

Работает примерно одинаково.