In [None]:
import pandas as pd
import numpy as np
import spacy
from sklearn.model_selection import train_test_split
from gensim.models import FastText
from sklearn.linear_model import LogisticRegression
from scipy.spatial.distance import cosine
from sklearn.metrics import classification_report
import warnings
import gensim.models

warnings.simplefilter(action='ignore', category=FutureWarning)

In [None]:
# Fetch spaCy's small English pipeline; it is loaded further down with spacy.load("en_core_web_sm").
!python -m spacy download en_core_web_sm

# Получение данных
[Spam or Not Spam Dataset](https://www.kaggle.com/datasets/ozlerhakan/spam-or-not-spam-dataset/)

In [None]:
# Read the CSV and keep the e-mail bodies in pandas' dedicated string dtype.
data = pd.read_csv('spam_or_not_spam.csv').astype({'email': 'string'})
data.head()

Unnamed: 0,email,label
0,date wed NUMBER aug NUMBER NUMBER NUMBER NUMB...,0
1,martin a posted tassos papadopoulos the greek ...,0
2,man threatens explosion in moscow thursday aug...,0
3,klez the virus that won t die already the most...,0
4,in adding cream to spaghetti carbonara which ...,0


In [None]:
# Drop rows with missing values. The frame is rebound instead of mutated with
# inplace=True: the result is the same, and the statement stays chainable.
data = data.dropna()
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2999 entries, 0 to 2999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   email   2999 non-null   string
 1   label   2999 non-null   int64 
dtypes: int64(1), string(1)
memory usage: 70.3 KB


In [None]:
# Class balance: number of e-mails per label value.
data['label'].value_counts()

label
0    2500
1     499
Name: count, dtype: int64

# Очистка

Стоп-слова для английского языка в spaCy

In [None]:
# Load the small English spaCy pipeline; `nlp` is reused below to process every e-mail.
nlp = spacy.load("en_core_web_sm")
# spaCy's default English stop-word set.
stopwords = nlp.Defaults.stop_words
print(f'Spacy english stopwords size: {len(stopwords)}', end='\n\n')
# Show the whole set as one space-separated string (cell output).
' '.join(stopwords)

Spacy english stopwords size: 326



"always show elsewhere get if beside otherwise such your whatever more been ‘d whole ’ll almost others becoming this every him may there beforehand next should six out thru yours somewhere 're 've least nothing the somehow until just 'd he two above go n‘t afterwards off them perhaps were else became she either too here hence nine anything our not former upon yet many four whenever seemed empty across sometime for each everyone have bottom might of via we her really ‘re everywhere toward well beyond once hereafter ’ve around as nevertheless see most than is behind anywhere whereby ten except hundred fifteen much you whoever keep neither thereby few back whither onto rather has again nor per top although can part who ca along both when among noone his whence further only amongst latter everything before does hereby third another ‘s someone anyone whereafter nobody at what throughout unless full itself take also ’d since do fifty various had or which none ‘m any how please below me first

Атрибуты [Token](https://spacy.io/api/token#attributes)

In [None]:
%%time

data['cleaned_text'] = data['email'].apply(
    lambda x: [
        token.lemma_.lower() for token in nlp(x) if
        not token.is_stop
        and not token.is_punct
        and not token.is_digit
        and not token.like_email
        and not token.like_num
        and not token.is_space
    ]
)
data.head()

CPU times: total: 1min 57s
Wall time: 1min 58s


Unnamed: 0,email,label,cleaned_text
0,date wed NUMBER aug NUMBER NUMBER NUMBER NUMB...,0,"[date, d, number, aug, number, number, number,..."
1,martin a posted tassos papadopoulos the greek ...,0,"[martin, post, tassos, papadopoulo, greek, scu..."
2,man threatens explosion in moscow thursday aug...,0,"[man, threaten, explosion, moscow, thursday, a..."
3,klez the virus that won t die already the most...,0,"[klez, virus, win, t, die, prolific, virus, kl..."
4,in adding cream to spaghetti carbonara which ...,0,"[add, cream, spaghetti, carbonara, effect, pas..."


# Векторизация: Word2Vec Skip-gram

In [None]:
# Hold out part of the cleaned e-mails for evaluation; random_state fixes the shuffle.
# NOTE(review): the labels are imbalanced (2500 vs 499) and the split is not stratified —
# consider passing stratify=data['label'] if per-class proportions should be preserved.
X_train, X_test, y_train, y_test = train_test_split(data['cleaned_text'], data['label'], random_state=2023)

In [23]:
%%time
# Dimensionality of the word vectors (passed as vector_size below).
VECTOR_SIZE = 256
# Train the embeddings on the training split only.
# NOTE(review): gensim's documentation says `seed` alone does not make training fully
# reproducible when several worker threads are used; workers=1 (and a fixed
# PYTHONHASHSEED) would be needed for that — confirm whether exact repeatability matters.
model = gensim.models.Word2Vec(
    sentences=X_train,
    vector_size=VECTOR_SIZE, # default = 100
    window=7, # default = 5
    min_count=10, # words seen fewer than 10 times are left out of the vocabulary
    sg=1, # Training algorithm: 1 for skip-gram; otherwise CBOW
    hs=0, #  If 1, hierarchical softmax will be used for model training. If 0, and negative is non-zero, negative sampling will be used.
    negative=5, # If > 0, negative sampling will be used, if set to 0, no negative sampling is used.
    epochs=25, # Number of iterations (epochs) over the corpus
    seed=2023,
)

CPU times: total: 1min 7s
Wall time: 24.4 s


Intrinsic оценка качества эмбеддингов

In [None]:
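# 'please' is in spaCy's stop-word list, so it is removed during cleaning and never
# reaches the Word2Vec vocabulary; the lookup is left commented out because it raises
# the KeyError shown below.
# model.wv.most_similar(['please'])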

KeyError: "Key 'please' not present in vocabulary"

In [11]:
# Sanity check: nearest neighbours of 'year' in the embedding space.
model.wv.most_similar(['year'])

[('ago', 0.48007732629776),
 ('critic', 0.32793354988098145),
 ('cleaning', 0.3176586627960205),
 ('decline', 0.306522399187088),
 ('fortune', 0.3062097430229187),
 ('quarter', 0.30355849862098694),
 ('decade', 0.30012303590774536),
 ('ancient', 0.29547443985939026),
 ('boom', 0.29300931096076965),
 ('roongthip', 0.2895958125591278)]

In [12]:
# Sanity check: nearest neighbours of 'yes'.
model.wv.most_similar(['yes'])

[('waider', 0.38061824440956116),
 ('diagnostic', 0.310423880815506),
 ('integral', 0.3077784776687622),
 ('systemwork', 0.29766395688056946),
 ('insane', 0.29694104194641113),
 ('sheep', 0.2927458882331848),
 ('special', 0.29171574115753174),
 ('yer', 0.2860548198223114),
 ('skeptical', 0.28199058771133423),
 ('ro', 0.27686643600463867)]

In [13]:
# Sanity check: nearest neighbours of 'hi'.
model.wv.most_similar(['hi'])

[('egwn', 0.3719348609447479),
 ('tar', 0.36590030789375305),
 ('alvie', 0.35488829016685486),
 ('automate', 0.3468570411205292),
 ('mord', 0.34682178497314453),
 ('charset', 0.3431834876537323),
 ('iso', 0.3379915654659271),
 ('hash', 0.336651086807251),
 ('eircom', 0.3355945348739624),
 ('lance', 0.33499741554260254)]

In [14]:
# Odd-one-out probe.
# NOTE(review): the cleaning step lower-cases every lemma and drops stop words, so 'I'
# (and probably 'no') cannot be in this vocabulary; gensim appears to ignore unknown
# words here, which would leave only 'yes' and 'agree' to compare — confirm.
model.wv.doesnt_match(['yes', 'no', 'I', 'agree'])

'yes'

In [15]:
# Odd-one-out probe: time units plus one unrelated noun ('car').
model.wv.doesnt_match(['week', 'year', 'month', 'car', 'second'])

'car'

In [16]:
# Odd-one-out probe: food words plus one unrelated noun ('sky').
model.wv.doesnt_match(['milk', 'bread', 'sky', 'cherry'])

'sky'

In [17]:
def cos(v1, v2):
    """Return the cosine similarity of two vectors (1 minus SciPy's cosine distance)."""
    return 1 - cosine(v1, v2)


def correlation(path="Таблицы для оценки эмбеддингов/SimLex-999.txt", keyed_vectors=None):
    """Correlate embedding cosine similarities with the scores of a word-pair file.

    The first line of the file is treated as a header and skipped. Every other
    line is split on whitespace: fields 0 and 1 are the two words and field 3
    is parsed as the reference score. A pair is skipped when its line has fewer
    than four fields, when the score is not a number, or when either word has
    no vector.

    Args:
        path: Location of the word-pair file. Defaults to the SimLex-999 file
            used by this notebook.
        keyed_vectors: Object that maps a word to its vector and raises
            ``KeyError`` for unknown words. Defaults to ``model.wv`` of the
            global ``model``, looked up at call time, so the function follows
            whichever model is currently bound to that name.

    Returns:
        The 2x2 matrix produced by ``np.corrcoef``; its off-diagonal entry is
        the Pearson correlation between cosine similarities and file scores.
    """
    if keyed_vectors is None:
        keyed_vectors = model.wv

    cos_arr = []
    simlex_arr = []
    # The context manager closes the file even if something below raises.
    with open(path, "r", encoding="utf-8") as file:
        next(file, None)  # skip the header line
        for line in file:
            fields = line.split()
            if len(fields) < 4:
                continue  # blank or truncated line
            try:
                # Compute both values before appending either one, so the two
                # lists can never get out of step when a lookup or parse fails.
                score = float(fields[3])
                similarity = cos(keyed_vectors[fields[0]], keyed_vectors[fields[1]])
            except (KeyError, ValueError):
                continue  # word without a vector, or unparsable score
            cos_arr.append(similarity)
            simlex_arr.append(score)
    return np.corrcoef(cos_arr, simlex_arr)

# np.corrcoef matrix for the current model; the off-diagonal entry is the correlation
# between embedding cosine similarities and the scores read from the SimLex file.
r = correlation()
r

array([[1.        , 0.04389708],
       [0.04389708, 1.        ]])

Корпус небольшой и специфичный, поэтому doesnt_match и most_similar могут выдавать странные результаты. Косинусная близость между парами эмбеддингов почти не коррелирует с оценками SimLex (r ≈ 0.04), но отсутствие линейной корреляции ещё не означает, что связи нет совсем.

# Классификация

In [18]:
# Peek at the first training documents: each one is a list of cleaned tokens.
X_train[:5]

1826    [url, url, date, tue, number, sep, number, num...
122     [let, write, mind, know, awful, lot, oh, inter...
1726    [tim, peters, write, ve, get, summary, file, c...
1123    [hal, devore, say, brent, say, book, paste, fu...
1836    [url, url, date, supply, carny, concessionaire...
Name: cleaned_text, dtype: object

In [24]:
# Build an embedding for a whole text
def document_vector(doc, keyed_vectors=None):
    """Average the word vectors of the in-vocabulary tokens of ``doc``.

    Args:
        doc: Iterable of token strings.
        keyed_vectors: Object exposing ``key_to_index``, ``vector_size`` and
            ``__getitem__`` (as gensim's keyed vectors do). Defaults to
            ``model.wv`` of the global ``model``, looked up at call time.

    Returns:
        A 1-D NumPy array of length ``keyed_vectors.vector_size``: the mean of
        the known tokens' vectors, or an all-zero vector when no token of
        ``doc`` is in the vocabulary. Both cases return an ndarray.
    """
    if keyed_vectors is None:
        keyed_vectors = model.wv
    vocabulary = keyed_vectors.key_to_index
    vectors = [keyed_vectors[word] for word in doc if word in vocabulary]
    if not vectors:
        # float32 to match the vectors gensim returns.
        return np.zeros(keyed_vectors.vector_size, dtype=np.float32)
    return np.mean(vectors, axis=0)

# One document vector per training / test e-mail, collected into plain lists.
X = list(X_train.apply(document_vector))
X_test_ = list(X_test.apply(document_vector))
# Show the first training vector as a quick visual check.
X[:1]

[array([-3.90341952e-02, -4.37641330e-02, -5.81686106e-03,  4.76925969e-02,
         1.18507251e-01, -2.67319679e-02, -2.23580152e-02, -2.80302584e-01,
        -1.64315239e-01,  1.07235335e-01,  8.68953392e-02, -2.59705663e-01,
        -8.17224830e-02,  5.85795566e-02, -3.00729722e-01,  1.29397407e-01,
        -5.68541512e-02,  2.30266884e-01, -2.56217390e-01,  6.86596856e-02,
         7.08904210e-03, -8.91305506e-02, -1.03342691e-02, -1.20466754e-01,
        -7.20183328e-02, -9.12771672e-02,  1.26936678e-02,  3.31341103e-02,
         3.70740481e-02, -4.12742421e-02, -1.46724638e-02, -1.28047289e-02,
        -1.84561759e-01,  2.82970560e-03,  4.35091481e-02, -1.46791875e-01,
        -2.24418014e-01,  1.44622430e-01,  1.86231524e-01, -8.78849626e-02,
        -5.24196848e-02,  4.16973159e-02,  1.20163128e-01, -1.26813427e-01,
         1.69939712e-01,  1.62201539e-01,  3.55117843e-02, -5.82135543e-02,
        -6.66021928e-02, -1.26794800e-01, -6.65922314e-02,  5.30891232e-02,
         5.6

In [25]:
# Train a logistic-regression classifier with default settings on the document vectors.
clf = LogisticRegression().fit(X, y_train)

# Predict the held-out documents and print per-class precision, recall and F1.
y_pred = clf.predict(X_test_)
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.98      1.00      0.99       616
           1       0.98      0.92      0.95       134

    accuracy                           0.98       750
   macro avg       0.98      0.96      0.97       750
weighted avg       0.98      0.98      0.98       750



# Векторизация: FastText

In [26]:
%%time
# Re-assigned with the same value as in the Word2Vec cell.
VECTOR_SIZE = 256
# `model` is rebound here, so from this cell on the name refers to the FastText model.
# NOTE(review): gensim's documentation says `seed` alone does not make training fully
# reproducible when several worker threads are used; workers=1 (and a fixed
# PYTHONHASHSEED) would be needed for that — confirm whether exact repeatability matters.
model = gensim.models.FastText(
    sentences=X_train,
    vector_size=VECTOR_SIZE, # default = 100
    window=7, # default = 5
    min_count=10, # words seen fewer than 10 times are left out of the vocabulary
    sg=1, # Training algorithm: 1 for skip-gram; otherwise CBOW
    hs=0, #  If 1, hierarchical softmax will be used for model training. If 0, and negative is non-zero, negative sampling will be used.
    negative=5, # If > 0, negative sampling will be used, if set to 0, no negative sampling is used.
    epochs=25, # Number of iterations (epochs) over the corpus
    seed=2023,
)

CPU times: total: 3min 16s
Wall time: 1min 40s


Intrinsic оценка качества эмбеддингов

In [27]:
# 'please' was removed as a stop word during cleaning, yet the FastText lookup succeeds.
# NOTE(review): presumably the vector is composed from character n-grams, which would
# explain the spelling-based neighbours in the output ('pleased', 'lease', 'release').
model.wv.most_similar(['please'])

[('pleased', 0.8277225494384766),
 ('lease', 0.8145703673362732),
 ('release', 0.7282434701919556),
 ('ease', 0.5974870920181274),
 ('taongi', 0.37620288133621216),
 ('silverstein', 0.37528330087661743),
 ('november', 0.37056368589401245),
 ('increase', 0.3670535087585449),
 ('decrease', 0.36150550842285156),
 ('hawaii', 0.36009329557418823)]

In [28]:
# Same probe as for Word2Vec: nearest neighbours of 'year'.
model.wv.most_similar(['year'])

[('ago', 0.4534580111503601),
 ('quarter', 0.3285188376903534),
 ('fortune', 0.31809350848197937),
 ('yeah', 0.31695255637168884),
 ('tear', 0.30521297454833984),
 ('decade', 0.3046306371688843),
 ('fear', 0.30234983563423157),
 ('ear', 0.2993944585323334),
 ('yer', 0.29226720333099365),
 ('roongthip', 0.2916625440120697)]

In [29]:
# Same probe as for Word2Vec: nearest neighbours of 'hi'.
model.wv.most_similar(['hi'])

[('alvie', 0.3735935389995575),
 ('shanumber', 0.37144735455513),
 ('poker', 0.3562904894351959),
 ('numberanumber', 0.35401031374931335),
 ('mplayer', 0.3452795147895813),
 ('libanumber', 0.3444075584411621),
 ('phil', 0.34387990832328796),
 ('yannick', 0.3426068127155304),
 ('numberpm', 0.3407646715641022),
 ('suse', 0.3402622640132904)]

In [30]:
# Odd-one-out probe.
# NOTE(review): 'I' cannot be a trained vocabulary word (lemmas were lower-cased and stop
# words dropped), so FastText presumably builds its vector from n-grams alone — confirm.
model.wv.doesnt_match(['yes', 'no', 'I', 'agree'])

'I'

In [31]:
# Odd-one-out probe: time units plus one unrelated noun ('car').
model.wv.doesnt_match(['week', 'year', 'month', 'car', 'second'])

'car'

In [32]:
# Odd-one-out probe: food words plus one unrelated noun ('sky').
model.wv.doesnt_match(['milk', 'bread', 'sky', 'cherry'])

'bread'

In [33]:
# Re-run the SimLex correlation; correlation() reads the global `model`, which now
# refers to the FastText model.
r = correlation()
r

array([[1.        , 0.02055517],
       [0.02055517, 1.        ]])

In [34]:
cos(model.wv['old'], model.wv['new'])  # author's SimLex reference for this pair: A, 1.58

0.17172729969024658

In [35]:
cos(model.wv['bread'], model.wv['flour'])  # author's SimLex reference for this pair: N, 3.33

0.2806139588356018

In [36]:
cos(model.wv['bad'], model.wv['awful'])  # author's SimLex reference for this pair: A, 8.42

0.17136511206626892

In [37]:
cos(model.wv['large'], model.wv['huge'])  # author's SimLex reference for this pair: A, 9.47

0.17018096148967743

In [38]:
cos(model.wv['cat'], model.wv['dog'])  # author's SimLex reference for this pair: N, 1.75

0.21786004304885864

FastText тоже ошибается, но уже на других примерах, корреляции с SimLex тоже нет.

# Классификация

In [None]:
# Build an embedding for a whole text
def document_vector(doc, keyed_vectors=None):
    """Average the word vectors of the in-vocabulary tokens of ``doc``.

    Args:
        doc: Iterable of token strings.
        keyed_vectors: Object exposing ``key_to_index``, ``vector_size`` and
            ``__getitem__`` (as gensim's keyed vectors do). Defaults to
            ``model.wv`` of the global ``model``, looked up at call time, so
            in this section it uses the FastText model.

    Returns:
        A 1-D NumPy array of length ``keyed_vectors.vector_size``: the mean of
        the known tokens' vectors, or an all-zero vector when no token of
        ``doc`` is in the vocabulary. Both cases return an ndarray.
    """
    if keyed_vectors is None:
        keyed_vectors = model.wv
    vocabulary = keyed_vectors.key_to_index
    vectors = [keyed_vectors[word] for word in doc if word in vocabulary]
    if not vectors:
        # float32 to match the vectors gensim returns.
        return np.zeros(keyed_vectors.vector_size, dtype=np.float32)
    return np.mean(vectors, axis=0)

# Recompute the document vectors with the model currently bound to `model` (FastText here);
# X and X_test_ from the Word2Vec section are overwritten.
X = list(X_train.apply(document_vector))
X_test_ = list(X_test.apply(document_vector))

In [40]:
# Train a logistic-regression classifier with default settings on the document vectors.
clf = LogisticRegression().fit(X, y_train)

# Predict the held-out documents and print per-class precision, recall and F1.
y_pred = clf.predict(X_test_)
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.98      1.00      0.99       616
           1       0.99      0.92      0.95       134

    accuracy                           0.98       750
   macro avg       0.99      0.96      0.97       750
weighted avg       0.98      0.98      0.98       750



И для эмбеддингов Word2Vec (skip-gram, sg=1), и для эмбеддингов FastText логистическая регрессия дала практически одинаковый результат.