In [None]:
!pip install --upgrade torch

In [1]:
import os
import random

import numpy as np
import pandas as pd
import torch
from nltk.tokenize import wordpunct_tokenize
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, roc_auc_score
from torch import nn
from torch.autograd import Variable
from torch.nn import functional as F
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm

In [2]:
# Value substituted into the `rst_{}` part of the CSV file names below.
RST = 0


def _load_split(path_template):
    """Read the CSV located at `path_template` with `RST` filled into the placeholder."""
    return pd.read_csv(path_template.format(RST))


# Judging by the file names, train mixes manual and semi-automatic annotation while
# val/test are manual-only -- TODO confirm against the data description.
df = _load_split("./compare_manual_manuallVSsemiauto/train_manualVSsemiauto_rst_{}.csv")
df_val = _load_split("./compare_manual_manuallVSsemiauto/val_manual_only_rst_{}.csv")
df_test = _load_split("./compare_manual_manuallVSsemiauto/test_manual_only_rst_{}.csv")

In [3]:
# Every column except the first one is treated as a per-topic flag column.
necessary_columns = df.columns[1:].tolist()
# 'none' is the extra label given to samples that have no topic flag set.
topics_list = [*necessary_columns, 'none']
topics_list

['offline_crime',
 'online_crime',
 'drugs',
 'gambling',
 'pornography',
 'prostitution',
 'slavery',
 'suicide',
 'terrorism',
 'weapons',
 'body_shaming',
 'health_shaming',
 'politics',
 'racism',
 'religion',
 'sexual_minorities',
 'sexism',
 'social_injustice',
 'none']

In [5]:
def get_labels(dataframe, columns=None):
    """Build one comma-joined label string per row of `dataframe`.

    A topic is active for a row when its column equals 1. Active topic names are
    joined with ',' in column order; rows without any active topic get 'none'.

    Args:
        dataframe: frame containing the topic flag columns.
        columns: topic column names to inspect. Defaults to the notebook-level
            `necessary_columns`, which keeps existing one-argument calls working.

    Returns:
        A list of label strings, one per row, in row order.
    """
    if columns is None:
        columns = necessary_columns
    columns = list(columns)

    # Compare the whole block of flag columns at once instead of going through
    # DataFrame.iterrows(), which builds a Series object for every row.
    active_flags = dataframe[columns].eq(1).to_numpy()

    labels = []
    for row_flags in active_flags:
        active_topics = [name for name, is_on in zip(columns, row_flags) if is_on]
        labels.append(','.join(active_topics) if active_topics else 'none')
    return labels
# Label strings for each split, in the row order of the corresponding frame.
train_labels, val_labels, test_labels = (
    get_labels(split_frame) for split_frame in (df, df_val, df_test)
)

In [6]:
def _text_with_labels(frame, labels):
    """Return a new two-column frame: the `text` column of `frame` plus `labels`."""
    return pd.DataFrame({'text': list(frame['text']), 'labels': labels})


# NOTE: these assignments rebind df / df_val / df_test, so the topic flag columns are
# no longer available afterwards; re-run the loading cell before re-running earlier cells.
df = _text_with_labels(df, train_labels)
df_val = _text_with_labels(df_val, val_labels)
df_test = _text_with_labels(df_test, test_labels)

In [7]:
# Label string -> integer class id. 'none' is pinned to id 0; every other label string
# gets the next free id in order of first appearance (train, then test, then val).
# Note that test/val label strings also receive ids, so the class count covers
# combinations that may never occur in the training split.
mapping = {'none': 0}

for split_labels in (train_labels, test_labels, val_labels):
    for label in split_labels:
        # len(mapping) is evaluated before a possible insertion, i.e. the next free id.
        mapping.setdefault(label, len(mapping))

In [8]:
# Add the integer class id of every label string as a `class` column on all three splits.
for split_frame in (df, df_test, df_val):
    split_frame['class'] = split_frame['labels'].apply(lambda label: mapping[label])

In [9]:
# train_val = pd.concat([df,df_val])

In [10]:
# train_val.head()

In [11]:
def _texts_and_classes(frame):
    """Return the `text` and `class` columns of `frame` as two plain Python lists."""
    return frame['text'].tolist(), frame['class'].tolist()


x_train, y_train = _texts_and_classes(df)
x_val, y_val = _texts_and_classes(df_val)
x_test, y_test = _texts_and_classes(df_test)

In [None]:
!wget https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ru.300.vec.gz

In [None]:
!gunzip cc.ru.300.vec.gz


In [12]:
# Token reserved for padding.
pad_token = 'PAD'

# Upper bound on the number of vocabulary entries (padding token included)
# and the length of each word vector.
vocab_size = 400000
embedding_dim = 300

# The padding token is registered first, so it gets index 0 and an all-zero vector.
vocab = {pad_token: 0}
embeddings = [np.zeros(embedding_dim)]

In [13]:
# Fill `vocab` / `embeddings` from the fastText text file until `vocab_size` entries
# (padding token included) are present.
# The encoding is passed explicitly: the file contains Cyrillic tokens, and without it
# open() falls back to the platform's locale encoding.
with open('cc.ru.300.vec', 'r', encoding='utf-8') as f:
    # The first line is skipped, exactly as before (in the .vec format it is a header,
    # not a word vector).
    next(f, None)
    for line in f:
        # Checked before reading more so that re-running this cell with a full
        # vocabulary stops immediately instead of scanning the whole file.
        if len(vocab) >= vocab_size:
            break

        parts = line.strip().split()
        # A valid line has a token followed by `embedding_dim` numbers; skip anything
        # shorter so a malformed line cannot produce a wrong-length vector.
        if len(parts) <= embedding_dim:
            continue

        # The token is everything before the last `embedding_dim` fields.
        token = ' '.join(parts[:-embedding_dim])
        if token in vocab:
            continue
        word_vector = np.array(list(map(float, parts[-embedding_dim:])))

        # Index in `vocab` == row position in `embeddings`.
        vocab[token] = len(vocab)
        embeddings.append(word_vector)

In [14]:
# Stack the per-token vectors into one 2-D array; row i belongs to the token whose
# vocab index is i. This rebinds `embeddings` from a list to an ndarray, so the cell
# that fills the list must be re-run before this one is run again.
embeddings = np.stack(embeddings)
embeddings.shape

(400000, 300)

In [15]:
# Fixed sequence length in tokens: UnsafeData.padding truncates longer inputs to this
# length and pads shorter ones with the pad index.
max_len = 45

In [16]:
class UnsafeData(Dataset):
    """Dataset that turns raw texts into fixed-length index tensors paired with a target.

    Args:
        texts: sequence of raw strings.
        targets: sequence of numeric targets, aligned with `texts`.
        vocab: mapping from token to integer index.
        max_len: number of indices every sample is truncated/padded to.
        pad_index: index used to fill samples shorter than `max_len`.
    """

    def __init__(self, texts, targets, vocab, max_len, pad_index = 0):
        super().__init__()

        self.vocab = vocab
        self.texts, self.targets = texts, targets
        self.max_len, self.pad_index = max_len, pad_index

    def __len__(self):
        """Number of samples (one per text)."""
        return len(self.texts)

    def tokenization(self, text):
        """Tokenize `text` and return the vocab indices of the tokens found in `vocab`.

        Tokens that are missing from the vocabulary are dropped.
        """
        known_indices = []
        for tok in wordpunct_tokenize(text):
            if tok in self.vocab:
                known_indices.append(self.vocab[tok])
        return known_indices

    def padding(self, text):
        """Return `text` (a list of indices) cut or right-padded to exactly `max_len` items."""
        clipped = text[:self.max_len]
        n_missing = self.max_len - len(clipped)
        return clipped + [self.pad_index] * n_missing

    def __getitem__(self, index):
        """Return `(x, y)`: an int64 tensor of `max_len` indices and a float32 target tensor."""
        token_indices = self.padding(self.tokenization(self.texts[index]))
        target = self.targets[index]

        return torch.tensor(token_indices).long(), torch.tensor(target).float()

In [17]:
# One dataset per split; all three share the same vocabulary and sequence length.
train_dataset, val_dataset, test_dataset = (
    UnsafeData(texts, targets, vocab, max_len)
    for texts, targets in ((x_train, y_train), (x_val, y_val), (x_test, y_test))
)

In [18]:
test_dataset.__getitem__(0)

(tensor([   18, 30755,    78,    12,  5738,    28,   202, 87566, 49430,    57,
          2148,     3,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0]), tensor(1.))

In [19]:
len(train_dataset), len(test_dataset)

(33100, 1585)

In [20]:
# Same batch size for every split; only the training loader shuffles its samples.
loader_batch_size = 32

train_loader = DataLoader(train_dataset, shuffle=True, batch_size=loader_batch_size)
val_loader = DataLoader(val_dataset, shuffle=False, batch_size=loader_batch_size)
test_loader = DataLoader(test_dataset, shuffle=False, batch_size=loader_batch_size)

In [21]:
# Pull a single batch from the training loader to sanity-check tensor shapes.
# The loop exits after the first batch, leaving it bound to the notebook-level `x`, `y`.
for x, y in train_loader:
    break
x.shape, y.shape

(torch.Size([32, 45]), torch.Size([32]))

In [22]:
# Pull a single batch from the validation loader to sanity-check tensor shapes.
# This rebinds the notebook-level `x`, `y`; the later cell that moves `x` and `y` to
# the device and calls `model(x)` uses whichever batch was bound last.
for x, y in val_loader:
    break
x.shape, y.shape

(torch.Size([32, 45]), torch.Size([32]))

In [23]:
# Convert the NumPy embedding matrix to a float tensor; it is later handed to the model
# as its pretrained embedding weights. Rebinds `embeddings` (ndarray -> tensor).
embeddings = torch.tensor(embeddings).float()

In [24]:
class RCNN(nn.Module):
    """Text classifier: pretrained embeddings -> BiLSTM -> linear -> max-pool -> linear.

    The BiLSTM output of every position is concatenated with that position's
    embedding, projected with a linear layer, max-pooled over the sequence axis
    and mapped to `class_num` raw scores (logits).

    Args:
        embeddings: float tensor |vocab_size, embedding_dim| of pretrained vectors;
            row 0 is the padding vector.
        embedding_dim: size of one embedding vector.
        hidden_size: LSTM hidden size per direction.
        hidden_size_linear: output size of the projection applied before pooling.
        class_num: number of output classes.
        dropout: dropout probability between stacked LSTM layers.
        n_layers: number of stacked LSTM layers.
    """

    def __init__(self, embeddings, embedding_dim, hidden_size, hidden_size_linear, class_num, dropout, n_layers):
        super(RCNN, self).__init__()
        # from_pretrained is called with its defaults apart from padding_idx.
        self.embedding = nn.Embedding.from_pretrained(embeddings, padding_idx=0)
        # nn.LSTM applies dropout only between stacked layers and warns when a non-zero
        # value is passed with a single layer, so it is zeroed in that case.
        self.lstm = nn.LSTM(
            embedding_dim,
            hidden_size,
            batch_first=True,
            bidirectional=True,
            dropout=dropout if n_layers > 1 else 0.0,
            num_layers=n_layers,
        )
        self.W = nn.Linear(embedding_dim + 2*hidden_size, hidden_size_linear)
        self.fc = nn.Linear(hidden_size_linear, class_num)
        # Not applied in forward() (which returns logits). The class axis is given
        # explicitly because relying on Softmax's implicit dim is deprecated.
        self.act = nn.Softmax(dim=1)

    def forward(self, x):
        """Return logits |bs, class_num| for a batch of token indices |bs, seq_len|."""
        # x = |bs, seq_len|
        x_emb = self.embedding(x)
        # x_emb = |bs, seq_len, embedding_dim|
        output, _ = self.lstm(x_emb)
        # output = |bs, seq_len, 2*hidden_size|
        output = torch.cat([output, x_emb], 2)
        # output = |bs, seq_len, embedding_dim + 2*hidden_size|
        output = self.W(output).transpose(1, 2)
        # output = |bs, seq_len, hidden_size_linear| -> |bs, hidden_size_linear, seq_len|
        # Kernel size == seq_len, i.e. a max over all positions of the sequence.
        output = F.max_pool1d(output, output.size(2)).squeeze(2)
        # output = |bs, hidden_size_linear|
        output = self.fc(output)
        # output = |bs, class_num|
        return output

In [25]:
# Model hyper-parameters. `class_num` is the number of distinct label strings in
# `mapping`, i.e. one output per label combination rather than per topic.
rcnn_kwargs = dict(
    embeddings=embeddings,
    embedding_dim=300,
    hidden_size=300,
    hidden_size_linear=128,
    class_num=len(mapping),
    dropout=0.5,
    n_layers=4,
)
model = RCNN(**rcnn_kwargs)

In [26]:
# Choose the device: the originally used GPU (index 7) when the machine has it,
# otherwise the first GPU, otherwise the CPU, so the notebook also runs elsewhere.
preferred_cuda_index = 7
if torch.cuda.is_available():
    if torch.cuda.device_count() > preferred_cuda_index:
        cuda_index = preferred_cuda_index
    else:
        cuda_index = 0
    device = torch.device('cuda:{}'.format(cuda_index))
else:
    device = torch.device('cpu')

# Move the model before building the optimizer (the order recommended by torch.optim).
model.to(device);

optimizer = torch.optim.Adam(model.parameters())
criterion = nn.CrossEntropyLoss()

# Smoke test on the batch left bound to `x`, `y` by the loader-peek cells above.
x = x.to(device)
# Targets come out of the dataset as floats; cast to int64 class indices.
y = y.to(device).long()
# No .squeeze(): the model already returns |bs, class_num|.
out = model(x)

In [27]:
int(x[0][0].cpu())

2555

In [28]:
# Inverse of `mapping`: integer class id -> comma-joined label string.
# (The spelling "vaiables" is kept because other cells reference this name.)
target_vaiables_id2topic_dict = {val:key for key, val in mapping.items()}

def adjust_multilabel_onedim(y, is_pred, id2label=None, topics=None):
    """Expand class ids (or per-class scores) into multi-hot topic rows.

    Every class id stands for a comma-joined label string such as 'drugs,weapons';
    the result has one row per sample with a 1 at the position of each topic
    named in that string.

    Args:
        y: when `is_pred` is False, a tensor of class ids, one per sample.
            When `is_pred` is True, a tensor of per-class scores |bs, class_num|;
            a 1-D score tensor is treated as a single sample.
        is_pred: whether `y` holds scores (True) or class ids (False).
        id2label: class id -> label string. Defaults to the notebook-level
            `target_vaiables_id2topic_dict`.
        topics: ordered topic names defining the columns of the result.
            Defaults to the notebook-level `topics_list`.

    Returns:
        A list of lists of 0/1 ints, shape |bs, len(topics)|.
    """
    if id2label is None:
        id2label = target_vaiables_id2topic_dict
    if topics is None:
        topics = topics_list

    # Row width follows the topic list instead of a hard-coded 19.
    n_topics = len(topics)
    # First position of each topic name, matching what list.index would return.
    topic_position = {}
    for position, topic in enumerate(topics):
        topic_position.setdefault(topic, position)

    values = torch.as_tensor(y)
    if values.numel() == 0:
        return []

    if is_pred:
        if values.dim() == 1:
            values = values.unsqueeze(0)
        # Softmax does not change which entry is largest, so argmax is taken
        # on the raw scores directly.
        class_ids = values.argmax(dim=1).tolist()
    else:
        class_ids = [int(class_id) for class_id in values.reshape(-1).tolist()]

    y_adjusted = []
    for class_id in class_ids:
        row = [0] * n_topics
        for tag in id2label[class_id].split(","):
            row[topic_position[tag]] = 1
        y_adjusted.append(row)
    return y_adjusted

def metrics(labels, pred):
    """Score one batch of predictions against its targets on the multi-hot topic rows.

    Args:
        labels: tensor of true class ids.
        pred: tensor of per-class scores for the same samples.

    Returns:
        Dict with 'accuracy', 'f1', 'precision' and 'recall'. Precision, recall and F1
        use weighted averaging with zero_division=0; accuracy comes from
        `accuracy_score` on the multi-hot rows (presumably exact row match -- confirm
        in the scikit-learn docs).
    """
    true_rows = adjust_multilabel_onedim(labels, is_pred=False)
    pred_rows = adjust_multilabel_onedim(pred, is_pred=True)

    prf_support = precision_recall_fscore_support(
        true_rows, pred_rows, average='weighted', zero_division=0
    )
    precision, recall, f1 = prf_support[:3]
    acc = accuracy_score(true_rows, pred_rows)

    return dict(accuracy=acc, f1=f1, precision=precision, recall=recall)

In [31]:
len(target_vaiables_id2topic_dict)

337

In [29]:
def train(model, loader, optimizer, criterion, last_n_losses=200, verbose=True):
    """Run one training epoch over `loader` and return batch-averaged metrics.

    Uses the notebook-level `device` and `metrics`.

    Args:
        model: module returning per-class scores |bs, class_num|.
        loader: iterable of `(x, y)` batches.
        optimizer: optimizer over the model parameters.
        criterion: loss taking `(scores, int64 targets)`.
        last_n_losses: window size of the running means shown in the progress bar.
        verbose: show the progress bar when True.

    Returns:
        Dict with 'loss', 'f_score', 'accuracy', 'precision', 'recall'. Each value is
        the sum of the per-batch values divided by `len(loader)` (a mean over batches,
        not a score computed over all samples at once).
    """
    losses = []
    f_scores = []
    accuracy_scores = []
    precision_scores = []
    recall_scores = []

    progress_bar = tqdm(total=len(loader), disable=not verbose, desc='Train')

    model.train()

    # try/finally so the bar is closed even when the epoch is interrupted or fails.
    try:
        for x, y in loader:

            x = x.to(device)
            # Targets are produced as floats by the dataset; the loss needs int64 ids.
            y = y.to(device).long()

            optimizer.zero_grad()

            # No .squeeze() here: it would collapse a batch holding a single sample to
            # |class_num| and break both the loss and the metrics.
            yhat = model(x)

            loss = criterion(yhat, y)
            loss.backward()
            optimizer.step()

            cur_metrics = metrics(y.cpu(), yhat.detach().cpu())

            losses.append(loss.item())
            f_scores.append(cur_metrics['f1'])
            accuracy_scores.append(cur_metrics['accuracy'])
            precision_scores.append(cur_metrics['precision'])
            recall_scores.append(cur_metrics['recall'])

            progress_bar.set_postfix(loss=np.mean(losses[-last_n_losses:]), f1=np.mean(f_scores[-last_n_losses:]),
                                    accuracy=np.mean(accuracy_scores[-last_n_losses:]))

            progress_bar.update()
    finally:
        progress_bar.close()

    n_batches = len(loader)
    return {'loss': np.sum(losses)/n_batches, 'f_score': np.sum(f_scores)/n_batches, 'accuracy': np.sum(accuracy_scores)/n_batches,
           'precision': np.sum(precision_scores)/n_batches, 'recall': np.sum(recall_scores)/n_batches}

In [None]:
train_metrics = train(model, train_loader, optimizer, criterion)

In [37]:
tqdm._instances.clear()


In [34]:
def evaluate(model, loader, criterion, last_n_losses=200, verbose=True):
    """Evaluate `model` on `loader` without gradient tracking; return batch-averaged metrics.

    Uses the notebook-level `device` and `metrics`.

    Args:
        model: module returning per-class scores |bs, class_num|.
        loader: iterable of `(x, y)` batches.
        criterion: loss taking `(scores, int64 targets)`.
        last_n_losses: window size of the running means shown in the progress bar.
        verbose: show the progress bar when True.

    Returns:
        Dict with 'loss', 'f_score', 'accuracy', 'precision', 'recall'. Each value is
        the sum of the per-batch values divided by `len(loader)` (a mean over batches,
        not a score computed over all samples at once).
    """
    losses = []
    f_scores = []
    accuracy_scores = []
    precision_scores = []
    recall_scores = []

    progress_bar = tqdm(total=len(loader), disable=not verbose, desc='Eval')

    model.eval()

    # try/finally so the bar is closed even when evaluation is interrupted or fails.
    try:
        with torch.no_grad():

            for x, y in loader:

                x = x.to(device)
                # Targets are produced as floats by the dataset; the loss needs int64 ids.
                y = y.to(device).long()

                # No .squeeze() here: it would collapse a batch holding a single sample
                # to |class_num| and break both the loss and the metrics.
                yhat = model(x)

                loss = criterion(yhat, y)

                cur_metrics = metrics(y.cpu(), yhat.detach().cpu())

                losses.append(loss.item())
                f_scores.append(cur_metrics['f1'])
                accuracy_scores.append(cur_metrics['accuracy'])
                precision_scores.append(cur_metrics['precision'])
                recall_scores.append(cur_metrics['recall'])

                progress_bar.set_postfix(loss=np.mean(losses[-last_n_losses:]), f1=np.mean(f_scores[-last_n_losses:]),
                                    accuracy=np.mean(accuracy_scores[-last_n_losses:]))

                progress_bar.update()
    finally:
        progress_bar.close()

    n_batches = len(loader)
    return {'loss': np.sum(losses)/n_batches, 'f_score': np.sum(f_scores)/n_batches, 'accuracy': np.sum(accuracy_scores)/n_batches,
           'precision': np.sum(precision_scores)/n_batches, 'recall': np.sum(recall_scores)/n_batches}

In [35]:
# Where the training loop stores the state dicts of the best model (lowest validation
# loss) and of the optimizer at that point.
save_best_model_path, save_best_optimizer_path = (
    './ft_model/best_model_state_dict.pth',
    './ft_model/best_optimizer_state_dict.pth',
)

In [36]:
n_epochs = 7
best_valid_loss = float('inf')
# Number of consecutive epochs without a new best validation loss.
patience = 0

# torch.save does not create missing directories; make sure they exist up front so a
# finished epoch is not lost to a failing checkpoint write.
for checkpoint_path in (save_best_model_path, save_best_optimizer_path):
    os.makedirs(os.path.dirname(checkpoint_path) or '.', exist_ok=True)

for epoch in range(n_epochs):

    # train the model
    train_metrics = train(model, train_loader, optimizer, criterion)

    # evaluate the model
    valid_metrics = evaluate(model, val_loader, criterion)

    print(valid_metrics)

    # save the best model
    if valid_metrics['loss'] < best_valid_loss:
        best_valid_loss = valid_metrics['loss']
        # An improvement resets the counter, so early stopping reacts to a streak of
        # bad epochs rather than to their total number over the whole run.
        patience = 0
        torch.save(model.state_dict(), save_best_model_path)
        torch.save(optimizer.state_dict(), save_best_optimizer_path)
    else:
        patience += 1
        # Stop after more than 3 epochs in a row without improvement.
        if patience > 3:
            break

Train: 100%|██████████| 1035/1035 [00:42<00:00, 24.48it/s, accuracy=0.665, f1=0.665, loss=1.36]
Eval: 100%|██████████| 46/46 [00:00<00:00, 65.81it/s, accuracy=0.468, f1=0.648, loss=2.29]
Train:  78%|███████▊  | 811/1035 [02:24<00:39,  5.63it/s, accuracy=0.376, f1=0.304, loss=2.44]


{'loss': 2.2942951464134715, 'f_score': 0.6476643121452252, 'accuracy': 0.468070652173913, 'precision': 0.9055675643453895, 'recall': 0.5534243581898879}


Train: 100%|██████████| 1035/1035 [00:42<00:00, 24.08it/s, accuracy=0.691, f1=0.69, loss=1.22] 
Eval: 100%|██████████| 46/46 [00:00<00:00, 55.16it/s, accuracy=0.478, f1=0.667, loss=2.18]


{'loss': 2.1787300032118093, 'f_score': 0.6672042743688312, 'accuracy': 0.47758152173913043, 'precision': 0.9094117877113268, 'recall': 0.5640021778689861}


Train: 100%|██████████| 1035/1035 [00:41<00:00, 25.03it/s, accuracy=0.704, f1=0.709, loss=1.12]
Eval: 100%|██████████| 46/46 [00:00<00:00, 64.44it/s, accuracy=0.469, f1=0.67, loss=2.19] 
Train:   0%|          | 3/1035 [00:00<00:41, 24.62it/s, accuracy=0.734, f1=0.713, loss=1.1]

{'loss': 2.1890755820533503, 'f_score': 0.6701343793667188, 'accuracy': 0.46875, 'precision': 0.9215122212688925, 'recall': 0.5614271161580843}


Train: 100%|██████████| 1035/1035 [00:41<00:00, 24.92it/s, accuracy=0.711, f1=0.72, loss=1.09] 
Eval: 100%|██████████| 46/46 [00:00<00:00, 62.17it/s, accuracy=0.493, f1=0.689, loss=2.12] 


{'loss': 2.1203996234613918, 'f_score': 0.6888646255161361, 'accuracy': 0.49252717391304346, 'precision': 0.9248144741245328, 'recall': 0.5855448220162642}


Train: 100%|██████████| 1035/1035 [00:43<00:00, 23.53it/s, accuracy=0.73, f1=0.734, loss=0.97]  
Eval: 100%|██████████| 46/46 [00:00<00:00, 66.39it/s, accuracy=0.503, f1=0.693, loss=2.09]


{'loss': 2.085290964530862, 'f_score': 0.692756547981403, 'accuracy': 0.5027173913043478, 'precision': 0.9191347107315866, 'recall': 0.5929792098328497}


Train: 100%|██████████| 1035/1035 [00:46<00:00, 22.39it/s, accuracy=0.744, f1=0.752, loss=0.931]
Eval: 100%|██████████| 46/46 [00:00<00:00, 62.73it/s, accuracy=0.51, f1=0.703, loss=2.05] 


{'loss': 2.0507029152434804, 'f_score': 0.7030529231299978, 'accuracy': 0.5101902173913043, 'precision': 0.9247956187285027, 'recall': 0.6060868953703629}


Train: 100%|██████████| 1035/1035 [00:42<00:00, 24.46it/s, accuracy=0.744, f1=0.755, loss=0.914]
Eval: 100%|██████████| 46/46 [00:00<00:00, 63.56it/s, accuracy=0.499, f1=0.684, loss=2.11]

{'loss': 2.1127007376888525, 'f_score': 0.6835244690323752, 'accuracy': 0.4986413043478261, 'precision': 0.9224296665111421, 'recall': 0.5848369636503938}





In [None]:
{'loss': 1.4320457201936971, 'f_score': 0.6779545482652473, 'accuracy': 0.6073369565217391, 'precision': 0.8145186363681608, 'recall': 0.6073369565217391}

In [42]:
# Restore the best checkpoint. map_location remaps the stored tensors onto the current
# `device`, so the checkpoint also loads when the GPU it was saved from is unavailable.
model.load_state_dict(torch.load(save_best_model_path, map_location=device))

<All keys matched successfully>

In [38]:
evaluate(model, test_loader, criterion)

Eval: 100%|██████████| 50/50 [00:00<00:00, 59.96it/s, accuracy=0.478, f1=0.673, loss=2.53]


{'loss': 2.5312773221731186,
 'f_score': 0.6730060751423094,
 'accuracy': 0.4779044117647059,
 'precision': 0.8930062045970715,
 'recall': 0.5808135202265351}

In [None]:
# Many samples per class (original note: "много семплов на класс")
{'loss': 2.5312773221731186,
 'f_score': 0.6730060751423094,
 'accuracy': 0.4779044117647059,
 'precision': 0.8930062045970715,
 'recall': 0.5808135202265351}

In [None]:
# Test with 1 sample - 1 class (original note: "test на 1семпл - 1 класс")
{'loss': 1.4456687247753144,
 'f_score': 0.640342492941638,
 'accuracy': 0.5839338235294117,
 'precision': 0.7558313916440501,
 'recall': 0.5839338235294117