In [1]:
import pandas as pd
import numpy as np
import spacy
from sklearn.model_selection import train_test_split
from gensim.models import FastText
from sklearn.linear_model import LogisticRegression
from scipy.spatial.distance import cosine
from sklearn.metrics import classification_report
import warnings
import gensim.models

warnings.simplefilter(action='ignore', category=FutureWarning)

In [None]:
!python -m spacy download en_core_web_sm

# Получение данных
[
Spam or Not Spam Dataset](https://www.kaggle.com/datasets/ozlerhakan/spam-or-not-spam-dataset/)

In [3]:
data = pd.read_csv('spam_or_not_spam.csv')
data = data.astype({'email':'string'})
data.head()

Unnamed: 0,email,label
0,date wed NUMBER aug NUMBER NUMBER NUMBER NUMB...,0
1,martin a posted tassos papadopoulos the greek ...,0
2,man threatens explosion in moscow thursday aug...,0
3,klez the virus that won t die already the most...,0
4,in adding cream to spaghetti carbonara which ...,0


In [4]:
data.dropna(inplace=True)
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2999 entries, 0 to 2999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   email   2999 non-null   string
 1   label   2999 non-null   int64 
dtypes: int64(1), string(1)
memory usage: 70.3 KB


In [5]:
data['label'].value_counts()

label
0    2500
1     499
Name: count, dtype: int64

# Очистка

Стоп-слова для английского языка в spaCy

In [6]:
nlp = spacy.load("en_core_web_sm")
stopwords = nlp.Defaults.stop_words
print(f'Spacy english stopwords size: {len(stopwords)}', end='\n\n')
' '.join(stopwords)

Spacy english stopwords size: 326



"towards i anyone already per yours mostly itself ’d please throughout what some moreover ‘d top yet ’m only sixty even anyway least one forty others made eleven neither why whereafter above will wherein former several hundred about they up upon than before had ’re someone seem except ourselves sometime which wherever hereupon beside otherwise whether often off became always ‘m everyone the with ’s everything become 's such or without here myself 've nobody whereupon doing nevertheless whenever together does due just full keep toward ever thereupon three its unless no every are two formerly of really until to me 'd nine within yourselves them nowhere show would done regarding noone much each n‘t during you re next beforehand were bottom she thence cannot none front almost these take where through for whoever go this around never between move becomes he over our name either therein ours hence an as eight somewhere along ca latterly many among being another behind can herein namely ‘re s

Атрибуты [Token](https://spacy.io/api/token#attributes)

In [7]:
%%time

data['cleaned_text'] = data['email'].apply(
    lambda x: [
        token.lemma_.lower() for token in nlp(x) if
        not token.is_stop
        and not token.is_punct
        and not token.is_digit
        and not token.like_email
        and not token.like_num
        and not token.is_space
    ]
)
data.head()

CPU times: total: 1min 58s
Wall time: 2min 1s


Unnamed: 0,email,label,cleaned_text
0,date wed NUMBER aug NUMBER NUMBER NUMBER NUMB...,0,"[date, d, number, aug, number, number, number,..."
1,martin a posted tassos papadopoulos the greek ...,0,"[martin, post, tassos, papadopoulo, greek, scu..."
2,man threatens explosion in moscow thursday aug...,0,"[man, threaten, explosion, moscow, thursday, a..."
3,klez the virus that won t die already the most...,0,"[klez, virus, win, t, die, prolific, virus, kl..."
4,in adding cream to spaghetti carbonara which ...,0,"[add, cream, spaghetti, carbonara, effect, pas..."


# Векторизация: Word2Vec CBOW

In [37]:
X_train, X_test, y_train, y_test = train_test_split(data['cleaned_text'], data['label'], random_state=2023)

In [38]:
%%time
VECTOR_SIZE = 256
model = gensim.models.Word2Vec(
    sentences=X_train,
    vector_size=VECTOR_SIZE, # default = 100
    window=7, # default = 5
    min_count=10,
    sg=1, # Training algorithm: 1 for skip-gram; otherwise CBOW
    hs=0, #  If 1, hierarchical softmax will be used for model training. If 0, and negative is non-zero, negative sampling will be used.
    negative=5, # If > 0, negative sampling will be used, if set to 0, no negative sampling is used.
    epochs=25, # Number of iterations (epochs) over the corpus
    seed=2023,
)

CPU times: total: 1min 6s
Wall time: 23.2 s


Intrinsic оценка качества эмбеддингов

In [39]:
# model.wv.most_similar(['please'])
# "Key 'please' not present in vocabulary"

Видимо, оно было исключено, как часто встречаемое.

In [40]:
model.wv.most_similar(['year'])

[('ago', 0.4501018226146698),
 ('age', 0.31290510296821594),
 ('fortune', 0.31080934405326843),
 ('critic', 0.30952364206314087),
 ('roongthip', 0.30392321944236755),
 ('cleaning', 0.3028695285320282),
 ('quarter', 0.2958633601665497),
 ('thai', 0.2842932343482971),
 ('completion', 0.2838909924030304),
 ('graduate', 0.2837061285972595)]

In [41]:
model.wv.most_similar(['yes'])

[('waider', 0.32578831911087036),
 ('insane', 0.2726579010486603),
 ('diagnostic', 0.2679702043533325),
 ('ro', 0.26339051127433777),
 ('skeptical', 0.2619013488292694),
 ('sheep', 0.2600741684436798),
 ('yer', 0.2566085755825043),
 ('flag', 0.25405460596084595),
 ('ed', 0.25297287106513977),
 ('prostitute', 0.25235515832901)]

In [42]:
model.wv.most_similar(['hi'])

[('egwn', 0.3757016658782959),
 ('alvie', 0.3662855625152588),
 ('tar', 0.35801631212234497),
 ('poker', 0.3545455038547516),
 ('charset', 0.35070282220840454),
 ('harri', 0.34718915820121765),
 ('eircom', 0.34385186433792114),
 ('mord', 0.338209867477417),
 ('automate', 0.33741509914398193),
 ('z', 0.32958826422691345)]

In [43]:
model.wv.doesnt_match(['yes', 'no', 'I', 'agree'])

'agree'

In [44]:
model.wv.doesnt_match(['week', 'year', 'month', 'car', 'second'])

'second'

In [45]:
model.wv.doesnt_match(['milk', 'bread', 'sky', 'cherry'])

'sky'

In [50]:
def cos(v1, v2):
    return 1 - cosine(v1, v2)

def correlation():
    with open("Таблицы для оценки эмбеддингов/SimLex-999.txt", "r") as file:
        cos_arr = []
        simlex_arr = []
        next(file)
        for line in file:
            a = line.split()
            try:
                coss = cos(model.wv[a[0]], model.wv[a[1]])
                cos_arr.append(coss)
                simlex_arr.append(float(a[3]))
            except:
                pass
    return np.corrcoef(cos_arr, simlex_arr)

r = correlation()
r

array([[1.        , 0.03452344],
       [0.03452344, 1.        ]])

Не очень хороший корпус, doesnt_match и most_similar могут выдавать странные результаты, косинусное расстояние между парами эмбеддингов не коррелирует с SimLex (но это не значит, что у них нет связи).

# Классификация

In [51]:
X_train[:5]

1826    [url, url, date, tue, number, sep, number, num...
122     [let, write, mind, know, awful, lot, oh, inter...
1726    [tim, peters, write, ve, get, summary, file, c...
1123    [hal, devore, say, brent, say, book, paste, fu...
1836    [url, url, date, supply, carny, concessionaire...
Name: cleaned_text, dtype: object

In [52]:
# Получаем эмбеддинг для текста
def document_vector(doc):
    doc = [model.wv[word] for word in doc if word in model.wv.key_to_index.keys()]
    if not doc:
        return [0 for i in range(VECTOR_SIZE)]
    return np.mean(doc, axis=0)

X = list(X_train.apply(document_vector))
X_test_ = list(X_test.apply(document_vector))
X[:1]

[array([-0.00496055, -0.07714188,  0.01091508,  0.07927296,  0.06003686,
        -0.04377781,  0.02439905, -0.31756824, -0.17448705,  0.10151234,
         0.05152039, -0.2149051 , -0.03201369,  0.0373418 , -0.18609734,
         0.0345365 ,  0.07851401,  0.19927263, -0.1290992 , -0.07626686,
        -0.04006838, -0.10021342,  0.0415928 , -0.05251959, -0.15674683,
        -0.05584528,  0.04873844, -0.05055373,  0.0428902 , -0.11151214,
         0.07711145, -0.09085753, -0.17528711, -0.09000844,  0.1586474 ,
        -0.09739301, -0.22469726,  0.02275746,  0.07961558,  0.0374274 ,
         0.11444626,  0.10080155,  0.10311528, -0.21540403,  0.07663862,
         0.1615595 ,  0.07117678, -0.09629361, -0.27383086, -0.10360359,
        -0.06900878,  0.13113798,  0.0195047 ,  0.10483648,  0.01638907,
         0.13924871, -0.04046673, -0.14869334,  0.15351908,  0.05855192,
        -0.02404684,  0.18745303, -0.16233197, -0.07317162, -0.03365894,
        -0.0945463 ,  0.0142761 , -0.13006046, -0.0

In [53]:
clf = LogisticRegression()
clf.fit(X, y_train)

y_pred = clf.predict(X_test_)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.98      1.00      0.99       616
           1       0.99      0.92      0.95       134

    accuracy                           0.98       750
   macro avg       0.99      0.96      0.97       750
weighted avg       0.98      0.98      0.98       750



# Векторизация: FastText

In [54]:
%%time
VECTOR_SIZE = 256
model = gensim.models.FastText(
    sentences=X_train,
    vector_size=VECTOR_SIZE, # default = 100
    window=7, # default = 5
    min_count=10,
    sg=1, # Training algorithm: 1 for skip-gram; otherwise CBOW
    hs=0, #  If 1, hierarchical softmax will be used for model training. If 0, and negative is non-zero, negative sampling will be used.
    negative=5, # If > 0, negative sampling will be used, if set to 0, no negative sampling is used.
    epochs=25, # Number of iterations (epochs) over the corpus
    seed=2023,
)

CPU times: total: 2min 43s
Wall time: 1min


Intrinsic оценка качества эмбеддингов

In [55]:
model.wv.most_similar(['please'])

[('pleased', 0.8312321901321411),
 ('lease', 0.8252145648002625),
 ('release', 0.7291569709777832),
 ('ease', 0.5907385945320129),
 ('taongi', 0.38943272829055786),
 ('announcement', 0.38047829270362854),
 ('hawaii', 0.37859633564949036),
 ('november', 0.37410104274749756),
 ('hat', 0.37103644013404846),
 ('melchizedek', 0.3686017692089081)]

In [56]:
model.wv.most_similar(['year'])

[('ago', 0.43607810139656067),
 ('tear', 0.34255263209342957),
 ('boom', 0.33648842573165894),
 ('ear', 0.32442706823349),
 ('yeah', 0.32187536358833313),
 ('quarter', 0.31909021735191345),
 ('roongthip', 0.3099408745765686),
 ('fortune', 0.29858332872390747),
 ('hindu', 0.2835412621498108),
 ('decade', 0.28084489703178406)]

In [57]:
model.wv.most_similar(['hi'])

[('alvie', 0.36906862258911133),
 ('shanumber', 0.35358619689941406),
 ('numberanumber', 0.34773364663124084),
 ('eircom', 0.3470231890678406),
 ('suse', 0.3432796895503998),
 ('mord', 0.3382689952850342),
 ('numberbit', 0.33592864871025085),
 ('yannick', 0.33114826679229736),
 ('hash', 0.3303244709968567),
 ('egwn', 0.3283159136772156)]

In [58]:
model.wv.doesnt_match(['yes', 'no', 'I', 'agree'])

'I'

In [59]:
model.wv.doesnt_match(['week', 'year', 'month', 'car', 'second'])

'second'

In [60]:
model.wv.doesnt_match(['milk', 'bread', 'sky', 'cherry'])

'bread'

In [61]:
r = correlation()
r

array([[1.        , 0.02755002],
       [0.02755002, 1.        ]])

In [62]:
cos(model.wv['old'], model.wv['new']) # simlex A, 1.58

0.16400949656963348

In [63]:
cos(model.wv['bread'], model.wv['flour']) # simlex N, 3.33

0.24328738451004028

In [64]:
cos(model.wv['bad'], model.wv['awful']) # simlex A, 8.42

0.1955680102109909

In [65]:
cos(model.wv['large'], model.wv['huge']) # simlex A, 9.47

0.1391594260931015

In [66]:
cos(model.wv['cat'], model.wv['dog']) # simlex N, 1.75

0.28068363666534424

FastText тоже ошибается, но уже на других примерах, корреляции с SimLex тоже нет.

# Классификация

In [67]:
# Получаем эмбеддинг для текста
def document_vector(doc):
    doc = [model.wv[word] for word in doc if word in model.wv.key_to_index.keys()]
    if not doc:
        return [0 for i in range(VECTOR_SIZE)]
    return np.mean(doc, axis=0)

X = list(X_train.apply(document_vector))
X_test_ = list(X_test.apply(document_vector))

In [68]:
clf = LogisticRegression()
clf.fit(X, y_train)

y_pred = clf.predict(X_test_)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.98      1.00      0.99       616
           1       0.99      0.91      0.95       134

    accuracy                           0.98       750
   macro avg       0.99      0.95      0.97       750
weighted avg       0.98      0.98      0.98       750



И для Word2Vec CBOW, и для FastText эмбеддингов логистическая регрессия выдала примерно одинаковый результат.