In [None]:
import pandas as pd
import os
import numpy as np

import pandas as pd
from nltk.tokenize import wordpunct_tokenize
import numpy as np
from tqdm import tqdm
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, roc_auc_score


import torch
from torch.utils.data import Dataset, DataLoader
from torch import nn
from torch.autograd import Variable
from torch.nn import functional as F

In [None]:
# Load the full labelled dataset; later cells read its 'inappropriate' column.
df = pd.read_csv("Inappropriate_09_top_vs_one_with_multi.csv")


In [None]:
# Preview the first rows of the loaded frame.
df.head()

In [None]:
# Partition the rows by their 'inappropriate' score: strictly above the
# threshold counts as unsafe, everything at or below it as safe.
THRESHOLD = 0.75
unsafe_mask = df['inappropriate'] > THRESHOLD
safe_mask = df['inappropriate'] <= THRESHOLD
df_unsafe = df.loc[unsafe_mask]
df_safe = df.loc[safe_mask]

In [None]:
# Write four 80/10/10 train/val/test splits (one per random seed) to ./splits.
# The safe and unsafe rows are split separately and then concatenated, so each
# split keeps the same safe/unsafe proportion as the full dataset.
SPLITS_DIR = "./splits"

# DataFrame.to_csv does not create missing directories, so make sure it exists.
os.makedirs(SPLITS_DIR, exist_ok=True)


def _split_80_10_10(frame, random_state):
    """Shuffle `frame` with the given seed and cut it into 80% / 10% / 10% parts.

    Returns a (train, val, test) tuple of DataFrames.
    """
    shuffled = frame.sample(frac=1, random_state=random_state).reset_index(drop=True)
    split_train = int(len(shuffled) * 0.8)
    split_val = int(len(shuffled) * 0.9)
    return shuffled[:split_train], shuffled[split_train:split_val], shuffled[split_val:]


for rs in [1, 2, 3, 4]:
    df_unsafe_shuf_train, df_unsafe_shuf_val, df_unsafe_shuf_test = _split_80_10_10(df_unsafe, rs)
    df_safe_shuf_train, df_safe_shuf_val, df_safe_shuf_test = _split_80_10_10(df_safe, rs)

    df_tr = pd.concat([df_unsafe_shuf_train, df_safe_shuf_train])
    df_val = pd.concat([df_unsafe_shuf_val, df_safe_shuf_val])
    df_test = pd.concat([df_unsafe_shuf_test, df_safe_shuf_test])

    train_path = os.path.join(SPLITS_DIR, "train_randst{}.csv".format(rs))
    val_path = os.path.join(SPLITS_DIR, "val_randst{}.csv".format(rs))
    test_path = os.path.join(SPLITS_DIR, "test_randst{}.csv".format(rs))

    df_tr.to_csv(train_path, index=None)
    df_val.to_csv(val_path, index=None)
    df_test.to_csv(test_path, index=None)


In [None]:
# !wget https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ru.300.vec.gz
# !gunzip cc.ru.300.vec.gz


In [None]:
# Load the train/val/test CSVs written for one random seed.
RST = 1
train_df = pd.read_csv('./splits/train_randst{}.csv'.format(RST))
val_df = pd.read_csv('./splits/val_randst{}.csv'.format(RST))
test_df = pd.read_csv('./splits/test_randst{}.csv'.format(RST))

# Keep only the model input and the label. Each frame is reduced from ITSELF
# (the validation frame used to be overwritten with the test rows), and
# .copy() makes the label rewrite below operate on an independent frame
# instead of a column subset, which avoids pandas' SettingWithCopyWarning.
train_df = train_df[["text", "inappropriate"]].copy()
val_df = val_df[["text", "inappropriate"]].copy()
test_df = test_df[["text", "inappropriate"]].copy()

# Turn the continuous score into a 0/1 label with Python's built-in round().
for d in [train_df, val_df, test_df]:
    d['inappropriate'] = d['inappropriate'].apply(round)


In [None]:
# Settings for the embedding table built in the following cells.
pad_token = 'PAD'
vocab_size = 400000
embedding_dim = 300

# The padding token takes index 0 and is represented by an all-zero vector.
vocab = {pad_token: 0}
embeddings = [np.zeros(embedding_dim)]

In [None]:

# Fill `vocab` / `embeddings` from the fastText text-format vector file.
# The encoding is given explicitly: without it open() falls back to the
# platform locale, which can fail or mis-decode the tokens on some systems.
with open('cc.ru.300.vec', 'r', encoding='utf-8') as f:
    f.readline()  # the first line is skipped; it is not parsed as a vector
    for line in f:
        parts = line.strip().split()
        # A usable line has at least one token field in front of the
        # `embedding_dim` numbers; anything shorter has no token to index.
        if len(parts) <= embedding_dim:
            continue
        # Everything before the last `embedding_dim` fields is the token
        # (it may itself contain spaces).
        token = ' '.join(parts[:-embedding_dim])
        if token in vocab:
            continue
        word_vector = np.array(list(map(float, parts[-embedding_dim:])))

        vocab[token] = len(vocab)
        embeddings.append(word_vector)

        # Stop once the table holds `vocab_size` entries (padding included).
        if len(vocab) >= vocab_size:
            break

In [None]:

# Turn the list of per-token vectors into a single 2-D array and show its shape.
embeddings = np.stack(embeddings)
embeddings.shape

In [None]:
# Number of token ids per example; UnsafeData truncates or pads every text to this length.
max_len = 128


In [None]:
# Plain Python lists of texts and labels for the train and test frames.
x_train, y_train = (train_df[column].tolist() for column in ('text', 'inappropriate'))
x_test, y_test = (test_df[column].tolist() for column in ('text', 'inappropriate'))

In [None]:
class UnsafeData(Dataset):
    """Dataset that maps raw texts to fixed-length sequences of vocabulary ids.

    Args:
        texts: sequence of strings.
        targets: sequence of labels, aligned with `texts`.
        vocab: mapping from token string to integer id.
        max_len: length every id sequence is cut or padded to.
        pad_index: id used to fill sequences shorter than `max_len`.
    """

    def __init__(self, texts, targets, vocab, max_len, pad_index = 0):
        super().__init__()

        self.vocab = vocab
        self.max_len = max_len
        self.pad_index = pad_index
        self.texts = texts
        self.targets = targets

    def __len__(self):
        """Number of examples."""
        return len(self.texts)

    def tokenization(self, text):
        """Split `text` with wordpunct_tokenize and return the ids of known tokens.

        Tokens that are missing from the vocabulary are dropped.
        """
        token_indices = []
        for piece in wordpunct_tokenize(text):
            if piece in self.vocab:
                token_indices.append(self.vocab[piece])
        return token_indices

    def padding(self, text):
        """Return a new list of exactly `max_len` ids: truncated, then right-padded."""
        clipped = text[:self.max_len]
        missing = self.max_len - len(clipped)
        return clipped + [self.pad_index] * missing

    def __getitem__(self, index):
        """Return `(ids, target)` as a long tensor and a float tensor."""
        ids = self.padding(self.tokenization(self.texts[index]))
        target = self.targets[index]

        return torch.tensor(ids).long(), torch.tensor(target).float()

In [None]:
# One dataset per split. The validation set is built from val_df; it must not
# reuse the training texts, otherwise "validation" only measures training fit.
train_dataset = UnsafeData(x_train, y_train, vocab, max_len)
val_dataset = UnsafeData(
    val_df['text'].tolist(),
    val_df['inappropriate'].tolist(),
    vocab,
    max_len,
)
test_dataset = UnsafeData(x_test, y_test, vocab, max_len)

In [None]:
# Batches of 8 examples; only the training data is reshuffled on each pass.
train_loader = DataLoader(train_dataset, batch_size=8, shuffle = True)
test_loader = DataLoader(test_dataset, batch_size=8, shuffle = False)

In [None]:
# Take the first batch from the test loader and stop, just to inspect
# the shapes of the token-id tensor (x) and the label tensor (y).
for x, y in test_loader:
    break

x.shape, y.shape

In [None]:
# Convert the embedding matrix to a float tensor for nn.Embedding.from_pretrained.
embeddings = torch.tensor(embeddings).float()


In [None]:

class RCNN(nn.Module):
    """Bidirectional-LSTM text classifier with max-pooling over time.

    Each token's pretrained embedding is concatenated with its BiLSTM output,
    projected by a linear layer, max-pooled across the sequence and mapped to
    `class_num` sigmoid outputs.

    Args:
        embeddings: 2-D float tensor of pretrained vectors; row 0 is treated
            as the padding row.
        embedding_dim: size of one embedding vector.
        hidden_size: LSTM hidden size per direction.
        hidden_size_linear: width of the projection applied before pooling.
        class_num: number of output units.
        dropout: dropout probability between stacked LSTM layers.
        n_layers: number of stacked LSTM layers.
    """

    def __init__(self, embeddings, embedding_dim, hidden_size, hidden_size_linear, class_num, dropout, n_layers):
        super().__init__()
        self.embedding = nn.Embedding.from_pretrained(embeddings, padding_idx=0)
        # nn.LSTM applies dropout only BETWEEN layers; with a single layer a
        # non-zero value has no effect and triggers a UserWarning, so pass 0.
        lstm_dropout = dropout if n_layers > 1 else 0.0
        self.lstm = nn.LSTM(
            embedding_dim,
            hidden_size,
            batch_first=True,
            bidirectional=True,
            dropout=lstm_dropout,
            num_layers=n_layers,
        )
        self.W = nn.Linear(embedding_dim + 2*hidden_size, hidden_size_linear)
        self.fc = nn.Linear(hidden_size_linear, class_num)

        self.act = nn.Sigmoid()

    def forward(self, x):
        """Map token ids |bs, seq_len| to sigmoid outputs |bs, class_num|."""
        # x = |bs, seq_len|
        x_emb = self.embedding(x)
        # x_emb = |bs, seq_len, embedding_dim|
        output, _ = self.lstm(x_emb)
        # output = |bs, seq_len, 2*hidden_size|
        output = torch.cat([output, x_emb], 2)
        # output = |bs, seq_len, embedding_dim + 2*hidden_size|
        output = self.W(output).transpose(1, 2)
        # output = |bs, seq_len, hidden_size_linear| -> |bs, hidden_size_linear, seq_len|
        # Pool with a window as wide as the sequence, i.e. one max per feature.
        output = F.max_pool1d(output, output.size(2)).squeeze(2)
        # output = |bs, hidden_size_linear|
        output = self.fc(output)
        # output = |bs, class_num|
        return self.act(output)

In [None]:
# Instantiate the classifier on top of the loaded embedding matrix:
# a 4-layer BiLSTM with hidden size 300 and a single output unit.
model = RCNN(
    embeddings = embeddings,
    embedding_dim = 300,
    hidden_size = 300,
    hidden_size_linear = 128,
    class_num = 1,
    n_layers = 4,
    dropout = 0.5
)

In [None]:
# Adam with its default hyperparameters. BCELoss expects probabilities,
# which matches the sigmoid applied at the end of RCNN.forward.
optimizer = torch.optim.Adam(model.parameters())
criterion = nn.BCELoss()

In [None]:
def metrics(true, predictions):
    """Score a batch of predicted probabilities against the true labels.

    Probabilities are rounded to hard labels before scoring. Precision,
    recall and F1 are weighted averages; undefined values count as 0.

    Returns:
        dict with keys 'accuracy', 'f1', 'precision' and 'recall'.
    """
    hard_labels = torch.round(predictions)

    prf = precision_recall_fscore_support(
        true,
        hard_labels,
        average='weighted',
        zero_division = 0,
    )
    accuracy = accuracy_score(true, hard_labels)

    # ROC AUC is not reported here.
    scores = {}
    scores['accuracy'] = accuracy
    scores['f1'] = prf[2]
    scores['precision'] = prf[0]
    scores['recall'] = prf[1]
    return scores

In [None]:
# Pick the compute device. The second GPU is still preferred when the machine
# has one; otherwise fall back to the first GPU, and to the CPU when CUDA is
# not available, instead of failing on a hard-coded 'cuda:1'.
if torch.cuda.is_available():
    device = torch.device('cuda:1' if torch.cuda.device_count() > 1 else 'cuda:0')
else:
    device = torch.device('cpu')
model.to(device);

In [None]:
def train(model, loader, optimizer, criterion, last_n_losses=200, verbose=True):
    """Train `model` for one pass over `loader`.

    Args:
        model: network called as `model(x)`; expected to return |bs, 1|.
        loader: iterable of `(x, y)` batches with a defined `len()`.
        optimizer: optimizer stepped once per batch.
        criterion: loss called as `criterion(yhat, y)`.
        last_n_losses: window size of the running averages on the progress bar.
        verbose: show the progress bar when True.

    Returns:
        dict with the per-batch averages of 'loss', 'f_score', 'accuracy',
        'precision' and 'recall'.
    """
    history = {'loss': [], 'f_score': [], 'accuracy': [], 'precision': [], 'recall': []}

    model.train()

    # The context manager closes the bar even if a batch raises.
    with tqdm(total=len(loader), disable=not verbose, desc='Train') as progress_bar:

        for x, y in loader:

            x = x.to(device)
            y = y.to(device)

            optimizer.zero_grad()

            # Drop only the trailing output dimension. A bare squeeze() would
            # also remove the batch dimension of a size-1 batch, leaving a
            # 0-dim prediction that no longer matches y.
            yhat = model(x).squeeze(-1)

            loss = criterion(yhat, y)
            loss.backward()
            optimizer.step()

            cur_metrics = metrics(y.cpu(), yhat.detach().cpu())

            history['loss'].append(loss.item())
            history['f_score'].append(cur_metrics['f1'])
            history['accuracy'].append(cur_metrics['accuracy'])
            history['precision'].append(cur_metrics['precision'])
            history['recall'].append(cur_metrics['recall'])

            progress_bar.set_postfix(
                loss=np.mean(history['loss'][-last_n_losses:]),
                f1=np.mean(history['f_score'][-last_n_losses:]),
                accuracy=np.mean(history['accuracy'][-last_n_losses:]),
            )

            progress_bar.update()

    n_batches = len(loader)
    return {name: np.sum(values) / n_batches for name, values in history.items()}

In [None]:
# Empty tqdm's internal set of progress-bar instances (a private attribute).
# NOTE(review): presumably used to reset bars left behind by an interrupted run - confirm.
tqdm._instances.clear()


In [None]:

def evaluate(model, loader, criterion, last_n_losses=200, verbose=True):
    """Evaluate `model` on `loader` without updating its weights.

    Args:
        model: network called as `model(x)`; expected to return |bs, 1|.
        loader: iterable of `(x, y)` batches with a defined `len()`.
        criterion: loss called as `criterion(yhat, y)`.
        last_n_losses: window size of the running averages on the progress bar.
        verbose: show the progress bar when True.

    Returns:
        dict with the per-batch averages of 'loss', 'f_score', 'accuracy',
        'precision' and 'recall'.
    """
    history = {'loss': [], 'f_score': [], 'accuracy': [], 'precision': [], 'recall': []}

    model.eval()

    # The context manager closes the bar even if a batch raises.
    with torch.no_grad(), tqdm(total=len(loader), disable=not verbose, desc='Eval') as progress_bar:

        for x, y in loader:

            x = x.to(device)
            y = y.to(device)

            # Drop only the trailing output dimension. A bare squeeze() would
            # also remove the batch dimension of a size-1 batch, leaving a
            # 0-dim prediction that no longer matches y.
            yhat = model(x).squeeze(-1)

            loss = criterion(yhat, y)

            cur_metrics = metrics(y.cpu(), yhat.detach().cpu())

            history['loss'].append(loss.item())
            history['f_score'].append(cur_metrics['f1'])
            history['accuracy'].append(cur_metrics['accuracy'])
            history['precision'].append(cur_metrics['precision'])
            history['recall'].append(cur_metrics['recall'])

            progress_bar.set_postfix(
                loss=np.mean(history['loss'][-last_n_losses:]),
                f1=np.mean(history['f_score'][-last_n_losses:]),
                accuracy=np.mean(history['accuracy'][-last_n_losses:]),
            )

            progress_bar.update()

    n_batches = len(loader)
    return {name: np.sum(values) / n_batches for name, values in history.items()}

In [None]:
# Files that receive the model and optimizer state dicts of the best epoch.
save_best_model_path = 'best_model_state_dict.pth'
save_best_optimizer_path = 'best_optimizer_state_dict.pth'

In [None]:
# Train for up to n_epochs, checkpointing whenever the validation loss improves
# and stopping early after too many epochs in a row without improvement.
n_epochs = 7
max_bad_epochs = 3
best_valid_loss = float('inf')
patience = 0

# Checkpoint selection and early stopping are driven by val_dataset rather
# than the test loader, so the test set stays unseen during model selection.
val_loader = DataLoader(val_dataset, batch_size=8, shuffle=False)

for epoch in range(n_epochs):

    #train the model
    train_metrics = train(model, train_loader, optimizer, criterion)

    #evaluate the model
    valid_metrics = evaluate(model, val_loader, criterion)

    print(train_metrics)
    print(valid_metrics)

    #save the best model
    if valid_metrics['loss'] < best_valid_loss:
        best_valid_loss = valid_metrics['loss']
        # An improvement restarts the count of consecutive bad epochs.
        patience = 0
        torch.save(model.state_dict(), save_best_model_path)
        torch.save(optimizer.state_dict(), save_best_optimizer_path)
    else:
        patience += 1
        if patience > max_bad_epochs:
            break