*Подключаем необходимые библиотеки*

In [1]:
import warnings
from string import punctuation
import os

import numpy as np
import pandas as pd

from gensim.models import KeyedVectors, Word2Vec

from nltk.corpus import stopwords

from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.model_selection import cross_validate, train_test_split

from tqdm import tqdm

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence
from torch.utils.data import DataLoader, Dataset

import wandb

warnings.simplefilter("ignore")

*Создаём необходимые папки и загружаем в них необходимые данные*

In [2]:
folders = ["spam-emails", "pretrained-w2v", "w2v-text"]
for folder in folders:
    os.makedirs(folder)

In [3]:
!gdown 151X3grlutx4K63yEQ2Pr3yvIF5U--KAd -O spam-emails/

Downloading...
From (original): https://drive.google.com/uc?id=151X3grlutx4K63yEQ2Pr3yvIF5U--KAd
From (redirected): https://drive.google.com/uc?id=151X3grlutx4K63yEQ2Pr3yvIF5U--KAd&confirm=t&uuid=8d402955-8ce2-4e79-9fa4-63cf35b4ac12
To: /kaggle/working/spam-emails/email.csv
100%|████████████████████████████████████████| 140M/140M [00:03<00:00, 40.0MB/s]


In [4]:
!gdown 1Jjhqvzt7lo3_QphDi3W1e5WNHM_265hz -O pretrained-w2v/

Downloading...
From (original): https://drive.google.com/uc?id=1Jjhqvzt7lo3_QphDi3W1e5WNHM_265hz
From (redirected): https://drive.google.com/uc?id=1Jjhqvzt7lo3_QphDi3W1e5WNHM_265hz&confirm=t&uuid=12d68a4b-4edb-4d6a-85c7-1ac28e7a9d15
To: /kaggle/working/pretrained-w2v/GoogleNews-vectors-negative300.bin
100%|██████████████████████████████████████| 3.64G/3.64G [01:12<00:00, 49.9MB/s]


In [5]:
!gdown 1B4a9reN9HB9dPKEPPQrgNraxwiz9OkAs -O w2v-text/

Downloading...
From (original): https://drive.google.com/uc?id=1B4a9reN9HB9dPKEPPQrgNraxwiz9OkAs
From (redirected): https://drive.google.com/uc?id=1B4a9reN9HB9dPKEPPQrgNraxwiz9OkAs&confirm=t&uuid=7843484a-961e-4cef-aef6-98c71df7127a
To: /kaggle/working/w2v-text/train.txt
100%|████████████████████████████████████████| 207M/207M [00:04<00:00, 41.7MB/s]


*Подключаем wandb*

In [6]:
with open("wandb-login/wandb_api.txt", 'r') as file:
    login = file.read()
    wandb.login(key=login)

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mda-shumilin03[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


*Выводим 5 случайных строк из данных, которые будем позже предсказывать, смотрим на размеры данных и соотношение классов*

In [7]:
df = pd.read_csv("/kaggle/input/spam-emails/email.csv")
df.sample(5)

Unnamed: 0,label,text
65703,1,hewi escapelong escapelong escapelong escapelo...
79008,1,comes thats reached truly bought nothing cant ...
7979,1,look at these\nimages . can ' t view them ? to...
42198,1,dear customer tired of paying twice the price ...
65342,0,"there is an electrical problem at the plant , ..."


In [8]:
df.shape

(83448, 2)

In [9]:
df["label"].value_counts(normalize=True)

label
1    0.526196
0    0.473804
Name: proportion, dtype: float64

*Создаём функцию токенизации, удаляя пунктуацию и лишние слова из предложений*

In [10]:
def tokenization(text):
    text = text.lower()
    for p in punctuation:
        text = text.replace(p, '')
    tokins = [word for word in text.strip().split() if word not in stopwords.words("english")]
    return tokins

In [11]:
string = "The mystery of human existence lies not in just staying alive, but in finding something to live for."
tokenization(string)

['mystery',
 'human',
 'existence',
 'lies',
 'staying',
 'alive',
 'finding',
 'something',
 'live']

*Загружаем данные, на которых будем обучать Word2Vec модель*

In [12]:
texts = []
with open("w2v-text/text.txt", 'r', encoding="utf-8", errors="ignore") as f:
    for line in f.readlines():
        texts.append(line)

In [13]:
texts[0]

'Soon we dropped into a living forest, where cold-tolerant evergreens and boreal animals still evoke the Canadian heritage of an ecosystem pushed south by glaciers 20,000 years ago.\n'

*Токенизируем данные*

In [14]:
data = []
for text in tqdm(texts):
    data.append(tokenization(text))

100%|██████████| 1534699/1534699 [51:17<00:00, 498.67it/s] 


In [15]:
data[0]

['soon',
 'dropped',
 'living',
 'forest',
 'coldtolerant',
 'evergreens',
 'boreal',
 'animals',
 'still',
 'evoke',
 'canadian',
 'heritage',
 'ecosystem',
 'pushed',
 'south',
 'glaciers',
 '20000',
 'years',
 'ago']

*Обучаем модель и смотрим на её ответы на стандартных тестах*

In [16]:
my_model = Word2Vec(sentences=data, vector_size=100, window=7, min_count=10, workers=4)

In [17]:
my_model.wv.most_similar("man", topn=5)

[('mans', 0.8031670451164246),
 ('woman', 0.7638831734657288),
 ('jew', 0.738024890422821),
 ('cæsar', 0.7193751931190491),
 ('servant', 0.6952049732208252)]

In [18]:
my_model.wv.most_similar(positive=["king", "woman"], negative=["queen"], topn=5)

[('man', 0.7098705172538757),
 ('teenager', 0.6254411339759827),
 ('girl', 0.6235789060592651),
 ('men', 0.6173486113548279),
 ('husband', 0.6146592497825623)]

In [19]:
df["text"].values[0]

'ounce feather bowl hummingbird opec moment alabaster valkyrie dyad bread flack desperate iambic hadron heft quell yoghurt bunkmate divert afterimage'

*Токенизируем данные, которые будем предсказывать*

In [20]:
email_sentence = []
for text in tqdm(df["text"].values):
    email_sentence.append(tokenization(text))

100%|██████████| 83448/83448 [33:11<00:00, 41.91it/s]  


In [21]:
email_sentence[0], len(email_sentence[0])

(['ounce',
  'feather',
  'bowl',
  'hummingbird',
  'opec',
  'moment',
  'alabaster',
  'valkyrie',
  'dyad',
  'bread',
  'flack',
  'desperate',
  'iambic',
  'hadron',
  'heft',
  'quell',
  'yoghurt',
  'bunkmate',
  'divert',
  'afterimage'],
 20)

*Создаём функцию для получения эмбедингов*

In [22]:
def get_embeddings(model, pretrained):
    emails_embedding = []
    for text in email_sentence:
        sentence = []
        for word in text:
            if (pretrained):
                if (word in model):
                    sentence.append(model[word])
            else:
                if (word in model.wv):
                    sentence.append(model.wv[word])
        emails_embedding.append(sentence.copy())
    return emails_embedding

In [23]:
emails_embedding_my_model = get_embeddings(my_model, pretrained=0)

In [24]:
len(emails_embedding_my_model[0])

14

*Так как для начала мы будем обучать модель, которая принимает лишь вектор в качестве данных, то схлопнем всё предложение в один эмбединг путём усреднения*

In [25]:
def get_mean_embeddings(model, emails_embedding):
    emails_embedding_mean = np.zeros((len(emails_embedding), model.vector_size))
    for index, value in enumerate(emails_embedding):
        emails_embedding_mean[index] = np.mean(value, axis=0)
        
    return emails_embedding_mean

In [26]:
emails_embedding_my_model_mean = get_mean_embeddings(my_model, emails_embedding_my_model)

In [27]:
emails_embedding_my_model_mean[0]

array([-0.06995446, -0.03608357,  0.21911904,  0.02696909, -0.28153101,
       -0.43365574,  0.44424745,  0.34215495,  0.09337538, -0.02183237,
       -0.34769979, -0.05441582,  0.23229744,  0.2380842 ,  0.52173048,
        0.26985884,  0.17581484,  0.01251382,  0.13886508, -0.1977465 ,
        0.57096809,  0.13123146,  0.19768038,  0.36109486,  0.25478584,
        0.03856817, -0.44804293, -0.07417977, -0.40217406, -0.19180344,
        0.53037864,  0.19885656,  0.41454539, -0.46947023,  0.2688303 ,
        0.49610329,  0.36827216, -0.01727464, -0.20326376,  0.26556185,
       -0.18026689,  0.1116833 , -0.41199657, -0.19726343,  0.1108924 ,
        0.25960016, -0.5692293 ,  0.19312397, -0.04917172,  0.17095403,
        0.58106393,  0.19432493, -0.22373864,  0.26796582, -0.25368229,
       -0.2807962 , -0.23114415, -0.13355149, -0.4099535 , -0.01971128,
        0.0444412 ,  0.51702726,  0.41682008,  0.09203938,  0.16826762,
        0.55892646,  0.15382159,  0.11359175, -0.46055284, -0.03

*Создаём функцию для удаления объектов, у которых присутствуют Nan значения*

In [28]:
def remove_nan(emails_embedding):
    nan_indices = np.isnan(emails_embedding).any(axis=1)
    indexs = np.where(nan_indices)[0]
    emails_embedding_not_nan = np.delete(emails_embedding, indexs, axis=0)
    label_not_nan = np.delete(df["label"].values, indexs)
    
    return emails_embedding_not_nan, label_not_nan

In [29]:
emails_embedding_my_model_mean_not_nan, label_my_model_mean_not_nan = remove_nan(emails_embedding_my_model_mean)

In [30]:
emails_embedding_my_model_mean_not_nan.shape, label_my_model_mean_not_nan.shape

((83226, 100), (83226,))

*Обучаем модели логистической регрессии и метода опорных векторов с кросс-валидацией*

In [31]:
scoring = ["accuracy", "precision", "recall", "f1"]

In [32]:
cross_validate(LogisticRegression(), emails_embedding_my_model_mean_not_nan, label_my_model_mean_not_nan, cv=5, scoring=scoring, return_train_score=True)

{'fit_time': array([0.73829365, 0.73351431, 0.71153688, 0.70924282, 0.72319293]),
 'score_time': array([0.01082921, 0.01023531, 0.00960588, 0.01632023, 0.00970125]),
 'test_accuracy': array([0.8909648 , 0.89522379, 0.8916191 , 0.89486332, 0.89258035]),
 'train_accuracy': array([0.89414239, 0.89337799, 0.89406888, 0.89312266, 0.89289737]),
 'test_precision': array([0.89640568, 0.90245025, 0.89698981, 0.89788251, 0.89917317]),
 'train_precision': array([0.8996621 , 0.8983046 , 0.89928181, 0.89887415, 0.89828277]),
 'test_recall': array([0.89589292, 0.89749457, 0.89657934, 0.90240275, 0.89588101]),
 'train_recall': array([0.8986328 , 0.89869001, 0.89894743, 0.89743443, 0.89769185]),
 'test_f1': array([0.89614922, 0.89996558, 0.89678453, 0.90013696, 0.89752407]),
 'train_f1': array([0.89914716, 0.89849726, 0.89911459, 0.89815371, 0.89798721])}

In [33]:
cross_validate(SGDClassifier(), emails_embedding_my_model_mean_not_nan, label_my_model_mean_not_nan, cv=5, scoring=scoring, return_train_score=True)

{'fit_time': array([0.53730083, 0.53835297, 0.50047731, 0.55367112, 0.57649255]),
 'score_time': array([0.01087046, 0.01020241, 0.01080799, 0.01694512, 0.01089573]),
 'test_accuracy': array([0.89108495, 0.89462301, 0.89197957, 0.89390207, 0.88867528]),
 'train_accuracy': array([0.8944578 , 0.89205629, 0.89424911, 0.8915156 , 0.88935282]),
 'test_precision': array([0.89724771, 0.89586402, 0.90361586, 0.90284196, 0.91195119]),
 'train_precision': array([0.90013464, 0.89282381, 0.90572234, 0.90362007, 0.91156118]),
 'test_recall': array([0.89509209, 0.90447317, 0.88914312, 0.89416476, 0.8721968 ]),
 'train_recall': array([0.89871861, 0.90280876, 0.89139637, 0.88813889, 0.87409547]),
 'test_f1': array([0.8961686 , 0.90014801, 0.89632107, 0.89848241, 0.89163109]),
 'train_f1': array([0.89942607, 0.89778852, 0.89850226, 0.8958126 , 0.89243528])}

*Получаем очень неплохие модели, которые не переобучены и на Test дают accuracy около 0.89*

*Попробуем применить модель Word2Vec, обученную на гораздо большем количестве данных*

In [34]:
model_pretrained = KeyedVectors.load_word2vec_format("/kaggle/input/pretrained-w2v/GoogleNews-vectors-negative300.bin", binary=True)

In [35]:
model_pretrained.most_similar("man", topn=5)

[('woman', 0.7664011716842651),
 ('boy', 0.6824870109558105),
 ('teenager', 0.6586930155754089),
 ('teenage_girl', 0.6147903203964233),
 ('girl', 0.5921714305877686)]

In [36]:
model_pretrained.most_similar(positive=["king", "woman"], negative=["queen"], topn=5)

[('man', 0.72110915184021),
 ('boy', 0.5595242977142334),
 ('teenage_girl', 0.513959527015686),
 ('girl', 0.4972155690193176),
 ('teenager', 0.4869248569011688)]

In [37]:
emails_embedding_pretrained_model = get_embeddings(model_pretrained, pretrained=1)

In [38]:
emails_embedding_pretrained_model_mean = get_mean_embeddings(model_pretrained, emails_embedding_pretrained_model)

In [39]:
emails_embedding_pretrained_model_mean_not_nan, label_pretrained_model_mean_not_nan = remove_nan(emails_embedding_pretrained_model_mean)

In [40]:
emails_embedding_pretrained_model_mean_not_nan.shape, label_pretrained_model_mean_not_nan.shape

((83252, 300), (83252,))

In [41]:
cross_validate(LogisticRegression(), emails_embedding_pretrained_model_mean_not_nan, label_pretrained_model_mean_not_nan, cv=5, scoring=scoring, return_train_score=True)

{'fit_time': array([1.76535702, 1.46164632, 1.76450443, 1.82566357, 1.83776927]),
 'score_time': array([0.01412535, 0.01392889, 0.01350951, 0.01414227, 0.01410055]),
 'test_accuracy': array([0.93988349, 0.94198547, 0.9412012 , 0.94510511, 0.94132132]),
 'train_accuracy': array([0.94352938, 0.94288374, 0.94304976, 0.94249422, 0.94300471]),
 'test_precision': array([0.94496151, 0.94743501, 0.94459073, 0.94753686, 0.94653943]),
 'train_precision': array([0.94689531, 0.94652269, 0.94692345, 0.94656117, 0.94715109]),
 'test_recall': array([0.94031557, 0.94180197, 0.9434027 , 0.94797027, 0.94145226]),
 'train_recall': array([0.94551484, 0.94462867, 0.94451432, 0.94380127, 0.94417288]),
 'test_f1': array([0.94263282, 0.94461009, 0.94399634, 0.94775352, 0.94398899]),
 'train_f1': array([0.94620457, 0.94557473, 0.94571735, 0.94517921, 0.94565964])}

In [42]:
cross_validate(SGDClassifier(), emails_embedding_pretrained_model_mean_not_nan, label_pretrained_model_mean_not_nan, cv=5, scoring=scoring, return_train_score=True)

{'fit_time': array([0.60127854, 0.56091547, 0.55026007, 0.64906001, 0.5532558 ]),
 'score_time': array([0.01396632, 0.01379633, 0.014184  , 0.01400757, 0.01408863]),
 'test_accuracy': array([0.93898264, 0.94198547, 0.94024024, 0.94426426, 0.94216216]),
 'train_accuracy': array([0.9424333 , 0.9425384 , 0.94231404, 0.94103781, 0.94048227]),
 'test_precision': array([0.93682188, 0.94867359, 0.94933333, 0.95400163, 0.93847194]),
 'train_precision': array([0.93867951, 0.94869948, 0.95174955, 0.95168422, 0.93532995]),
 'test_recall': array([0.94774754, 0.94042991, 0.93619941, 0.93916524, 0.95231561]),
 'train_recall': array([0.95263278, 0.94151278, 0.93771082, 0.93522568, 0.95254838]),
 'test_f1': array([0.94225304, 0.94453376, 0.94272063, 0.9465253 , 0.9453431 ]),
 'train_f1': array([0.94560468, 0.94509247, 0.94467803, 0.94338317, 0.94386064])}

*Получаем модель лучше, accuracy возросло до 0.94*

*Теперь попробуем подготовить данные и построить модель рекуррентной нейронной сети*

In [43]:
indexs_nan = []
indexs_null = []
for index, value in enumerate(emails_embedding_my_model):
    if (len(value) != 0):
        if (len(np.where(np.isnan(value).any(axis=1))[0]) != 0):
            indexs_nan.append(index)
    else:
        indexs_null.append(index)

In [44]:
len(indexs_nan), len(indexs_null)

(0, 222)

In [45]:
emails_embedding_not_null = [value for index, value in enumerate(emails_embedding_my_model) if index not in indexs_null]
label_not_null = np.delete(df["label"].values, indexs_null)

In [46]:
len(emails_embedding_not_null), len(label_not_null)

(83226, 83226)

In [47]:
class MyDataSet(Dataset):

    def __init__(self, x, y):
        self.x = [torch.tensor(seq, dtype=torch.float32) for seq in x]
        self.y = torch.tensor(y, dtype=torch.float32)

    def __getitem__(self, index):
        return self.x[index], self.y[index]

    def __len__(self):
        return len(self.x)

In [48]:
X_train, X_test, y_train, y_test = train_test_split(emails_embedding_not_null, label_not_null, test_size=0.2, random_state=42)

In [49]:
X_train[0][0], y_train[0]

(array([ 0.0098148 ,  0.07363529,  0.22662139, -0.5966263 , -0.03104346,
         0.05114244,  0.5776808 ,  0.5548911 , -0.08006153,  0.5252783 ,
         0.08469971, -0.6534043 ,  0.17199656, -0.05298574, -0.08600964,
        -0.58952886,  0.591573  , -0.6982628 ,  0.02286943, -0.21059637,
        -0.15980433, -0.01931535,  0.1540565 , -0.36712375, -0.040148  ,
        -0.05691309, -0.02951199,  0.2025809 , -0.09830411,  0.22504479,
         0.1615242 ,  0.2751402 ,  0.13595317, -0.1297282 , -0.00433401,
        -0.0978229 ,  0.19165467,  0.09623048, -0.3340152 , -0.3692188 ,
         0.08176481, -0.09517123, -0.335281  , -0.33291808,  0.36042994,
         0.14975278, -0.34116626,  0.18094395,  0.16744809,  0.23360017,
         0.04645053,  0.13872704, -0.44288316,  0.05418905, -0.46764252,
        -0.00385386,  0.2569195 ,  0.13988818, -0.12370247,  0.09431571,
         0.16898341,  0.14625715, -0.29357985, -0.00227948, -0.3319331 ,
         0.30886388,  0.26787835,  0.2123492 ,  0.2

In [50]:
train_dataset = MyDataSet(X_train, y_train)
test_dataset = MyDataSet(X_test, y_test)

In [51]:
len(train_dataset), len(test_dataset)

(66580, 16646)

*Так как последовательности разной длины, то нужно их подготовить, сделаем это с помощью добавления падингов*

In [52]:
def collate_fn(batch, max_len=30000):
    sequences, labels = zip(*batch)

    sequences = [seq[:max_len] for seq in sequences]
    
    lengths = torch.tensor([len(seq) for seq in sequences])
    sorted_indices = torch.argsort(lengths, descending=True)
    sequences = [sequences[i] for i in sorted_indices]
    labels = torch.tensor([labels[i] for i in sorted_indices])
    lengths = lengths[sorted_indices]

    sequences_padded = pad_sequence(sequences, batch_first=True, padding_value=0)

    sequences_packed = pack_padded_sequence(sequences_padded, lengths, batch_first=True, enforce_sorted=True)

    return sequences_packed, labels, lengths

In [53]:
train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True, collate_fn=collate_fn)
test_dataloader = DataLoader(test_dataset, batch_size=32, shuffle=False, collate_fn=collate_fn)

In [54]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

device(type='cuda', index=0)

*Создаём модель, для начала будет обрабатываться последовательность с помощью RNN, а последнее скрытое состояние подаваться на линейный слой*

In [55]:
class RNNEmailClassifier(nn.Module):
    def __init__(self):
        super().__init__()
        self.rnn = nn.RNN(100, 128, 1, batch_first=True, nonlinearity="tanh")
        self.fc = nn.Linear(128, 2)
    
    def forward(self, x, lengths):
        packed_out, hidden = self.rnn(x)
        hidden = hidden[-1]
        output = self.fc(hidden)
        return output

In [56]:
wandb.init(project="Email-Classification", name="RNN")

*Инициализируем модель, оптимизатор, оптимизатор для шага обучения и обучаем модель*

In [57]:
model = RNNEmailClassifier().to(device)
wandb.watch(model)
optimizer = torch.optim.AdamW(model.parameters())
scheduler = torch.optim.lr_scheduler.OneCycleLR(optimizer, max_lr=1e-3, total_steps=int(10 * len(train_dataset) / 16))

In [58]:
epochs = 10
for epoch in tqdm(range(epochs)):
    model.train()
    for x_train, y_train, lengths in train_dataloader:
        x_train, y_train = x_train.to(device), y_train.to(device).long()
        y_pred = model(x_train, lengths)
        loss = F.cross_entropy(y_pred, y_train)
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

    scheduler.step()

    accuracy_array_for_iter = np.zeros(len(test_dataloader))
    model.eval()
    with torch.no_grad():
        for index, value in enumerate(test_dataloader):
            x_test, y_test, lengths = value[0].to(device), value[1].to(device).long(), value[2]
            y_pred = model(x_test, lengths)
            loss = F.cross_entropy(y_pred, y_test)

            accuracy_array_for_iter[index] = (torch.argmax(y_pred, dim=-1) == y_test).cpu().numpy().mean()

        wandb.log(
            {
                "mean accuracy test": accuracy_array_for_iter.mean()
            }
        )
wandb.finish()

100%|██████████| 10/10 [12:15<00:00, 73.52s/it]


0,1
mean accuracy test,▁▄▄▅▇▆▇██▇

0,1
mean accuracy test,0.93016


*С помощью эмбедингов, которые были обучены на собственных данных получилось выбить accuracy 0.93 за 10 эпох обучения*