In [None]:
import torch
from torchtext import data
from torchtext import datasets
import random
import numpy as np

# Single seed shared by every random number generator used in this notebook.
SEED = 1234

# Seed Python's, NumPy's and PyTorch's generators and request deterministic cuDNN kernels.
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

# Shown as the cell output: whether PyTorch can see a CUDA device.
torch.cuda.is_available()

True

In [None]:
!pip install transformers -qq
!pip install sentencepiece -qq
!pip install numpy requests nlpaug

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m10.9 MB/s[0m eta [36m0:00:00[0m
Collecting nlpaug
  Downloading nlpaug-1.1.11-py3-none-any.whl (410 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m410.5/410.5 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: nlpaug
Successfully installed nlpaug-1.1.11


In [None]:
# Mount Google Drive under /content/drive (Colab only); the dataset CSV is read from there.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Load Dataset

In [None]:
import random
import nlpaug.augmenter.char as nac
import nlpaug.augmenter.word as naw

from nlpaug.util import Action

In [None]:
SEED = 1234

In [None]:
# Passed as `aug_word_p` to the character-level augmenters and as `aug_p` to the word splitter.
alpha_common_error = 0.10
# Passed as `aug_char_p` to the four random-character augmenters.
alpha_common_error_char = 0.05
# One augmenter per noise type; `text2augment` selects among them by position (0-6).
# The RandomCharAug variants are limited to a single character per word (aug_char_min = aug_char_max = 1).
aug1_OCR = nac.OcrAug(aug_word_p=alpha_common_error)
aug2_Rins = nac.RandomCharAug(action="insert", aug_word_p=alpha_common_error, aug_char_min=1, aug_char_max=1, aug_char_p=alpha_common_error_char)
aug3_Rsub = nac.RandomCharAug(action="substitute", aug_word_p=alpha_common_error, aug_char_min=1, aug_char_max=1, aug_char_p=alpha_common_error_char)
aug4_Rswa = nac.RandomCharAug(action="swap", aug_word_p=alpha_common_error,aug_char_min=1, aug_char_max=1, aug_char_p=alpha_common_error_char) #
aug5_Rdel = nac.RandomCharAug(action="delete", aug_word_p=alpha_common_error, aug_char_min=1, aug_char_max=1, aug_char_p=alpha_common_error_char)
aug6_Kb = nac.KeyboardAug(aug_word_p=alpha_common_error)
aug7_Split = naw.SplitAug(aug_p=alpha_common_error)

In [None]:
random.sample(range(0, 7), 2 - 1)

[6]

In [None]:
def text2augment(text, m):
    """Return ``text`` followed by ``m - 1`` noisy variants of it.

    ``m - 1`` distinct augmenters are drawn at random from the seven
    module-level augmenters. The variants are appended in augmenter order
    (OCR, insert, substitute, swap, delete, keyboard, split), not in the
    order in which they were drawn.

    Args:
        text: Sentence to augment.
        m: Total number of strings wanted, the original included.

    Returns:
        A list of ``m`` strings whose first element is the untouched ``text``.
    """
    augmenters = (
        aug1_OCR,
        aug2_Rins,
        aug3_Rsub,
        aug4_Rswa,
        aug5_Rdel,
        aug6_Kb,
        aug7_Split,
    )
    picked = random.sample(range(0, 7), m - 1)

    variants = [text, ]
    for position, augmenter in enumerate(augmenters):
        if position in picked:
            # The result is unpacked into ``append``, so ``augment`` must yield exactly one string.
            variants.append(*augmenter.augment(text))

    return variants

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
def to_df(x, y):
    """Bundle texts and labels into a DataFrame with columns ``text`` and ``label``.

    Args:
        x: Texts (any sequence or Series accepted by ``pd.DataFrame``).
        y: Labels aligned with ``x``.

    Returns:
        A ``pd.DataFrame`` with ``x`` under ``"text"`` and ``y`` under ``"label"``.
    """
    return pd.DataFrame({"text": x, "label": y})

def split_3(df, test_size=0.2, valid_size=0.2):
    """Shuffle ``df`` and split it into stratified train / valid / test frames.

    Args:
        df: Frame with at least the columns ``text`` and ``label``.
        test_size: Fraction of all rows held out for the test set.
        valid_size: Fraction of the remaining (non-test) rows used for validation.

    Returns:
        ``(train, valid, test)`` DataFrames built with ``to_df``.
    """
    shuffled = df.copy().sample(frac=1).reset_index()
    shuffled = shuffled[["text", "label"]]

    texts = shuffled["text"].copy()
    labels = shuffled["label"].copy()

    # First cut: separate the test set from everything else.
    rest_x, x_test, rest_y, y_test = train_test_split(
        texts, labels, test_size=test_size, stratify=labels, random_state=SEED
    )
    # Second cut: carve the validation set out of what is left.
    x_train, x_valid, y_train, y_valid = train_test_split(
        rest_x, rest_y, test_size=valid_size, stratify=rest_y, random_state=SEED
    )
    return to_df(x_train, y_train), to_df(x_valid, y_valid), to_df(x_test, y_test)

In [None]:
from itertools import chain

In [None]:
def aug_replicate(y_labels):
    """Repeat each label so the result lines up with the texts produced by ``aug_text``.

    A label equal to 0 is emitted once; any other label is emitted twice.

    Args:
        y_labels: Iterable of labels.

    Returns:
        A flat list of labels in the original order, with repetitions.
    """
    # Other (label 0, other label) copy counts tried previously: (2, 4), (1, 3), (2, 3).
    replicated = []
    for label in y_labels:
        copies = 1 if label == 0 else 2
        replicated.extend([label] * copies)
    return replicated


def aug_text(x_text, y_labels):
    """Expand every text into itself plus its augmented copies.

    Texts labelled 0 are kept as a single string; every other text is
    passed to ``text2augment`` with ``m=2`` (original plus one noisy copy).

    Args:
        x_text: Iterable of sentences.
        y_labels: Labels aligned with ``x_text``.

    Returns:
        A ``pd.Series`` (default index) with all resulting strings in input order.
    """
    # Other (label 0, other label) values of ``m`` tried previously: (2, 4), (1, 3), (2, 3).
    flattened = []
    for sentence, label in zip(x_text, y_labels):
        flattened.extend(text2augment(sentence, 1 if label == 0 else 2))
    return pd.Series(flattened, index=None)

def split_3_aug(df, test_size=0.2, valid_size=0.2):
    """Stratified train / valid / test split with augmentation of train and valid.

    The test portion is returned untouched. The train and validation
    portions are expanded with ``aug_text`` and their labels repeated with
    ``aug_replicate``. The sizes of both augmented sets are printed.

    Args:
        df: Frame with at least the columns ``text`` and ``label``.
        test_size: Fraction of all rows held out for the test set.
        valid_size: Fraction of the remaining (non-test) rows used for validation.

    Returns:
        ``(train, valid, test)`` DataFrames built with ``to_df``.
    """
    shuffled = df.copy().sample(frac=1).reset_index()
    shuffled = shuffled[["text", "label"]]

    texts = shuffled["text"].copy()
    labels = shuffled["label"].copy()

    # Hold out the test set first; it is never augmented.
    rest_x, x_test, rest_y, y_test = train_test_split(
        texts, labels, test_size=test_size, stratify=labels, random_state=SEED
    )
    # Then separate validation from training.
    x_train, x_valid, y_train, y_valid = train_test_split(
        rest_x, rest_y, test_size=valid_size, stratify=rest_y, random_state=SEED
    )

    # Augment train before valid so the random draws happen in the same order as before.
    x_train = aug_text(x_train, y_train)
    y_train = aug_replicate(y_train)
    x_valid = aug_text(x_valid, y_valid)
    y_valid = aug_replicate(y_valid)

    # Report the augmented sizes: validation first, then training.
    for part_texts, part_labels in ((x_valid, y_valid), (x_train, y_train)):
        print(part_texts.shape)
        print("DONE")
        print(len(part_labels))

    return to_df(x_train, y_train), to_df(x_valid, y_valid), to_df(x_test, y_test)

In [None]:
import numpy as np
import pandas as pd

# Path (without extension) of the cleaned, lower-cased dataset on the mounted Drive.
tname_data = "/content/drive/MyDrive/TP/hsd_merge_cleaned_lowered"
# NOTE: this rebinds `data`, which the first cell imported from torchtext.
data = pd.read_csv(f"{tname_data}.csv")

# Augmented train/valid frames plus an untouched test frame.
# NOTE: `train` is rebound to the training function further down, so this cell and the
# cells that read `train[...]` must run before that definition.
train, valid, test = split_3_aug(data)

X_train = train['text']
y_train = train['label']

X_valid = valid['text']
y_valid = valid['label']

X_test = test['text']
y_test = test['label']

(3362,)
DONE
3362
(13443,)
DONE
13443


In [None]:
# temp = "/content/drive/MyDrive/TP1-AntiDiscriminator/datasets/final/augmented/hsd_merge_aug_train.csv"
# train.to_csv(temp,  sep=",", index=False)

# temp = "/content/drive/MyDrive/TP1-AntiDiscriminator/datasets/final/augmented/hsd_merge_aug_valid.csv"
# valid.to_csv(temp,  sep=",", index=False)

# temp = "/content/drive/MyDrive/TP1-AntiDiscriminator/datasets/final/augmented/hsd_merge_aug_test.csv"
# test.to_csv(temp,  sep=",", index=False)

In [None]:
data[data["label"] == 1]

Unnamed: 0,text,label
0,pillaron al presunto es de nacionalidad colomb...,1
1,cuando se den cuenta que eran homicidas o narc...,1
2,jajaajja ahí estuviste bien chouza se nota tu ...,1
3,acá podemos ver a un alemán de cabello rubio o...,1
4,chile murieron 8 compatriotas por culpa de un ...,1
...,...,...
17778,marinovic hay alguna mujer del movimiento femi...,1
17782,leí tu descripción ahora entiendo todo el daño...,1
17816,🤮🤮🤮🤮una 💩 de debate solo orrego se presta para...,1
17822,los delincuentes de la primers linea queman ig...,1


In [None]:
train[train["label"] == 1]

Unnamed: 0,text,label
4,para pagar pensiones 4 detenidos por introduci...,1
5,apra apgar pensiones 4 detenidos por introduci...,1
11,que se vaya ya el puto negro de mierda,1
12,que se v aya ya el puto negro de mierda,1
22,que bien haces de maricon ¿por qué será homose...,1
...,...,...
13419,los menas pueden estar en la calle a la hora q...,1
13420,que me llamen racista pero esta gentuza deberí...,1
13421,que me llamen racista pero e8ta gentuza deberí...,1
13436,totalmente de acuerdo muchos de los emigrantes...,1


In [None]:
valid[valid["label"] == 1]

Unnamed: 0,text,label
2,molecular pero si los de derecha son gente re ...,1
3,molecular pero si los de derecha son gente re ...,1
10,maricon como insulto un saludo,1
11,Nmaricon como insulto un saludo,1
14,¡vallekanos yonkis y gitanos orgullosos de nue...,1
...,...,...
3346,yvnalesca tu si que no te salvas vcuando te en...,1
3353,el que tiene que parar el juego es el árbitro ...,1
3354,el que tiene que parar el juego es el árbitro ...,1
3356,un subnormal menos en twitter grande irene,1


In [None]:
test[test["label"] == 1]

Unnamed: 0,text,label
11451,que le jodan al mongolo de los huevos,1
5514,y digo yo ¿no sera la dimisión de o del entero...,1
15747,tienen para 140 000 € y pactar 100 plazas de j...,1
8352,hay un puto subnormal pitando en mi calle todo...,1
16499,tomasroncero estas jodido eh puto gordo maricon,1
...,...,...
12964,ok puto moro la proxima trae un contrato de in...,1
12393,aunque tu foto de perfil parece un marica el d...,1
5811,maigi si algo he aprendido es que el que tenga...,1
1706,tu prima es guarrilla,1


In [None]:

# import numpy as np
# import pandas as pd

# tname_data = "/content/drive/MyDrive/TP1-AntiDiscriminator/datasets/original/dataset_9834_cleaned_lowered"
# data = pd.read_csv(f"{tname_data}.csv")
# # test = pd.read_csv(f"{tname_data}_test.csv")

# data = data.sample(frac=1).reset_index()
# data = data[["text", "label"]]

# train = data[: int(len(data) * 0.7)]
# valid = data[int(len(data) * 0.7) : int(len(data) * 0.9)]
# test = data[int(len(data) * 0.9) :]

# X_train = train['text']
# y_train = train['label']

# X_valid = valid['text']
# y_valid = valid['label']

# X_test = test['text']
# y_test = test['label']

In [None]:

# import numpy as np
# import pandas as pd

# tname_data = "hsd_merged_AUG"

# data = pd.read_csv(f"{tname_data}_no_test.csv")
# test = pd.read_csv(f"{tname_data}_test.csv")


# train = data[: int(len(data) * 0.9)]
# valid = data[int(len(data) * 0.9) :]

# X_train = train['text']
# y_train = train['label']

# X_valid = valid['text']
# y_valid = valid['label']

# X_test = test['text']
# y_test = test['label']


## Set Cuda

In [None]:
# import torch
# from torchtext import data
# from torchtext import datasets
# import random
# import numpy as np

# SEED = 1234

# random.seed(SEED)
# np.random.seed(SEED)
# torch.manual_seed(SEED)
# torch.backends.cudnn.deterministic = True

# torch.cuda.is_available()

# Extract features using BETO

In [None]:
#train_sentences

In [None]:
import pandas as pd
from glob import glob

# Plain Python lists of texts and labels for each split, as consumed by the tokenizer step.
train_sentences = list(train['text'].values)
train_labels = list(train['label'].values)

valid_sentences = list(valid['text'].values)
valid_labels = list(valid['label'].values)

test_sentences = list(test['text'].values)
test_labels = list(test['label'].values)

In [None]:
# Persist the test labels (in `test` frame order) to the working directory.
test_labelsdf = pd.DataFrame(test_labels)
test_labelsdf.to_csv('test_labels.csv',index=False)

Load the BETO tokenizer

In [None]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('dccuchile/bert-base-spanish-wwm-cased')

Downloading (…)okenizer_config.json:   0%|          | 0.00/364 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/648 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/242k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/480k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/134 [00:00<?, ?B/s]

In [None]:
tokenizer.vocab_size

31002

In [None]:
#choose max_length for beto model based on the input length

max_length = 0
list_len=[]
for sentence in train_sentences:
    length = len(tokenizer.tokenize(sentence))
    list_len.append(length)

from collections import Counter
Counter(list_len).most_common(100)

[(17, 451),
 (12, 413),
 (21, 412),
 (18, 406),
 (23, 400),
 (16, 393),
 (13, 387),
 (15, 385),
 (24, 384),
 (22, 380),
 (14, 379),
 (26, 361),
 (25, 359),
 (19, 356),
 (27, 348),
 (20, 347),
 (10, 343),
 (11, 339),
 (9, 321),
 (28, 299),
 (29, 294),
 (30, 271),
 (32, 266),
 (33, 246),
 (31, 241),
 (8, 224),
 (7, 212),
 (34, 195),
 (36, 186),
 (35, 163),
 (6, 150),
 (37, 147),
 (40, 146),
 (53, 139),
 (39, 138),
 (41, 131),
 (38, 130),
 (44, 128),
 (49, 126),
 (45, 122),
 (56, 121),
 (42, 120),
 (47, 119),
 (5, 118),
 (50, 117),
 (51, 115),
 (54, 114),
 (48, 113),
 (57, 111),
 (46, 111),
 (43, 107),
 (52, 105),
 (55, 100),
 (59, 90),
 (58, 81),
 (60, 74),
 (62, 67),
 (4, 67),
 (61, 61),
 (64, 59),
 (66, 44),
 (63, 43),
 (65, 36),
 (3, 33),
 (68, 31),
 (69, 30),
 (67, 28),
 (70, 19),
 (72, 12),
 (76, 11),
 (71, 10),
 (2, 10),
 (74, 8),
 (73, 7),
 (80, 5),
 (75, 5),
 (77, 4),
 (79, 4),
 (1, 4),
 (81, 4),
 (78, 3),
 (85, 2),
 (83, 2)]

In [None]:
# Encode the labels of all three splits with an encoder fitted on the training labels only.

from sklearn import preprocessing

le = preprocessing.LabelEncoder()
le.fit(train_labels)
encoded_train_labels = le.transform(train_labels)
encoded_valid_labels = le.transform(valid_labels)
encoded_test_labels = le.transform(test_labels)

In [None]:
# Tokens IDs tensor

def encoder_generator(sentences,labels, max_length=50):
    """Tokenize sentences with the module-level ``tokenizer`` and build model-ready tensors.

    Every sentence is encoded with special tokens, then padded or truncated
    to ``max_length`` token ids. The resulting tensors are placed on the CUDA
    device when one is available and on the CPU otherwise, so the function no
    longer crashes on machines without a GPU.

    Args:
        sentences: Iterable of raw strings.
        labels: Integer-encoded labels aligned with ``sentences``.
        max_length: Number of token ids kept per sentence. Defaults to 50,
            the value that used to be hard-coded.

    Returns:
        Tuple ``(sent_index, input_ids, attention_masks, labels)`` of tensors.
        ``sent_index`` holds the running position of each sentence;
        ``input_ids`` and ``attention_masks`` have one row per sentence.
    """
    target_device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    sent_index = []
    input_ids = []
    attention_masks = []

    for index, sent in enumerate(sentences):
        sent_index.append(index)

        encoded_dict = tokenizer.encode_plus(sent,
                                             add_special_tokens=True,
                                             max_length=max_length,
                                             padding='max_length',
                                             truncation=True,
                                             return_attention_mask=True,
                                             return_tensors='pt')
        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])

    # Each encoding is a one-row tensor; stack them along the batch dimension.
    input_ids = torch.cat(input_ids, dim=0).to(target_device)
    attention_masks = torch.cat(attention_masks, dim=0).to(target_device)
    labels = torch.tensor(labels).to(target_device)
    sent_index = torch.tensor(sent_index).to(target_device)

    return sent_index,input_ids,attention_masks,labels

# Tokenize the train and validation splits, then show one example as a sanity check.
train_sent_index,train_input_ids,train_attention_masks,train_encoded_label_tensors = encoder_generator(train_sentences,encoded_train_labels)
valid_sent_index,valid_input_ids,valid_attention_masks,valid_encoded_label_tensors = encoder_generator(valid_sentences,encoded_valid_labels)
print('Original: ', train_sentences[0])
print('Token IDs:', train_input_ids[0])

Original:  pero es que tu eres la oveja negra y tu solito te lías
Token IDs: tensor([    4,  1355,  1058,  1038,  1294,  2149,  1030, 27836,  9715,  1042,
         1294,  1505,  1806,  1240,  2836,  1021,     5,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1],
       device='cuda:0')


In [None]:
# Convert the train and validation tensors into TensorDatasets of (input ids, attention mask, label).

# NOTE(review): `random_split` is imported here but not used in the visible cells.
from torch.utils.data import TensorDataset,random_split

train_dataset = TensorDataset(train_input_ids,train_attention_masks,train_encoded_label_tensors)
valid_dataset = TensorDataset(valid_input_ids,valid_attention_masks,valid_encoded_label_tensors)

print('train data samples is {}'.format(len(train_dataset)))
print("valid data samples is {}".format(len(valid_dataset)))

train data samples is 13443
valid data samples is 3362


In [None]:
# Choose the compute device and build the train / validation data loaders.

from torch.utils.data import DataLoader,RandomSampler,SequentialSampler

# Batch size shared by every loader in the notebook.
bs=128

# CUDA when available, CPU otherwise; the models and each batch are moved to this device.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

train_data_loader = DataLoader(train_dataset,
                              sampler=RandomSampler(train_dataset),
                              batch_size=bs)
# NOTE(review): validation batches are also drawn in random order; the metrics computed
# on them do not depend on order, and `SequentialSampler` is imported but unused here.
valid_data_loader = DataLoader(valid_dataset,
                              sampler=RandomSampler(valid_dataset),
                              batch_size=bs)

Load the BETO model

In [None]:
from transformers import AutoModel

# Pretrained Spanish BERT encoder (same checkpoint name as the tokenizer), moved to `device`.
beto = AutoModel.from_pretrained('dccuchile/bert-base-spanish-wwm-cased')
beto = beto.to(device)

Downloading pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertModel were not initialized from the model checkpoint at dccuchile/bert-base-spanish-wwm-cased and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# Build CNN

In [None]:
# Build class CNN

import torch.nn as nn
import torch.nn.functional as F
from transformers import AutoModel

class CNN(nn.Module):
    """Multi-width 1-D convolutional classifier applied to per-token embeddings.

    The input goes through a linear projection, then through one ``Conv1d``
    branch per entry of ``filter_sizes``. Each branch is ReLU-activated and
    max-pooled over the whole sequence; the pooled features are concatenated,
    passed through dropout and classified by a final linear layer.

    The branches are registered as ``conv_0``, ``conv_1``, ... so that, for
    four filter sizes, the parameter names are the same as in the earlier
    fixed four-branch version of this class.

    Args:
        embedding_dim: Size of each token embedding (input channels).
        n_filters: Number of output channels of every convolution.
        filter_sizes: Kernel sizes, one convolution branch per entry.
        output_dim: Number of output classes.
        dropout: Dropout probability applied to the concatenated features.
        pad_idx: Accepted for interface compatibility; not used by the model.
    """

    def __init__(self, embedding_dim, n_filters, filter_sizes, output_dim,
                 dropout, pad_idx):

        super().__init__()

        self.fc_input = nn.Linear(embedding_dim, embedding_dim)

        # One branch per kernel size, so the number of branches always
        # matches the input width of ``self.fc`` below.
        for position, kernel_size in enumerate(filter_sizes):
            self.add_module(
                f"conv_{position}",
                nn.Conv1d(in_channels=embedding_dim,
                          out_channels=n_filters,
                          kernel_size=kernel_size),
            )

        self.fc = nn.Linear(len(filter_sizes) * n_filters, output_dim)

        self.dropout = nn.Dropout(dropout)

    def _conv_branches(self):
        """Return the convolution branches in registration order."""
        return [module for name, module in self.named_children()
                if name.startswith("conv_")]

    def forward(self, encoded):
        """Classify a batch of embedded sentences.

        Args:
            encoded: Tensor of shape [batch size, sent len, emb dim]; the
                sentence length must be at least the largest kernel size.

        Returns:
            Tensor of shape [batch size, output_dim] with unnormalized scores.
        """
        # embedded = [batch size, sent len, emb dim]
        embedded = self.fc_input(encoded)

        # Conv1d expects channels first: [batch size, emb dim, sent len]
        embedded = embedded.permute(0, 2, 1)

        pooled = []
        for conv in self._conv_branches():
            # conved = [batch size, n_filters, sent len - kernel size + 1]
            conved = F.relu(conv(embedded))
            # Max over the whole sequence -> [batch size, n_filters]
            pooled.append(F.max_pool1d(conved, conved.shape[2]).squeeze(2))

        # The pooled features already live on the input's device, so no
        # explicit device transfer is needed (this keeps the model usable on CPU).
        # cat = [batch size, n_filters * len(filter_sizes)]
        cat = self.dropout(torch.cat(pooled, dim=1))

        return self.fc(cat)

In [None]:
# Hyperparameters of the CNN head

# Size of the embeddings fed to the CNN (its `embedding_dim`).
EMBEDDING_DIM = 768
N_FILTERS = 32
# Kernel widths of the convolution branches.
FILTER_SIZES = [1,2,3,5]
# One output per class seen by the label encoder.
OUTPUT_DIM = len(le.classes_)
DROPOUT = 0.2
# Passed to the CNN constructor, which accepts it but does not use it.
PAD_IDX = tokenizer.pad_token_id

cnn = CNN(EMBEDDING_DIM, N_FILTERS, FILTER_SIZES, OUTPUT_DIM, DROPOUT, PAD_IDX)
cnn = cnn.to(device)

In [None]:
cnn

CNN(
  (fc_input): Linear(in_features=768, out_features=768, bias=True)
  (conv_0): Conv1d(768, 32, kernel_size=(1,), stride=(1,))
  (conv_1): Conv1d(768, 32, kernel_size=(2,), stride=(1,))
  (conv_2): Conv1d(768, 32, kernel_size=(3,), stride=(1,))
  (conv_3): Conv1d(768, 32, kernel_size=(5,), stride=(1,))
  (fc): Linear(in_features=128, out_features=2, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
)

In [None]:
0.00002 == 2e-5

True

In [None]:
# Optimizer and criterion

import torch.optim as optim

# BETO's parameters are optimized together with the CNN's, so the encoder is fine-tuned too.
model_parameters = list(beto.parameters())+list(cnn.parameters())

optimizer = optim.Adam(model_parameters,lr=2e-5,eps=1e-8)
# optimizer = optim.Adam(model_parameters,lr=1e-3,eps=1e-8)
criterion = nn.CrossEntropyLoss()
criterion = criterion.to(device)

In [None]:
# Calculate accuracy per batch during train

def categorical_accuracy(preds, y):
    """
    Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8

    Args:
        preds: Tensor of per-class scores, one row per example.
        y: Tensor of true class indices, one per example.

    Returns:
        A one-element float tensor on the same device as ``preds``.
    """
    max_preds = preds.argmax(dim=1)  # index of the highest score per example
    correct = max_preds.eq(y)
    # Build the denominator on the data's own device instead of forcing CUDA,
    # so the function also works on a CPU-only machine.
    batch_size = torch.tensor([y.shape[0]], dtype=torch.float, device=correct.device)
    return correct.sum() / batch_size

In [None]:
# Def for training

from tqdm import tqdm

def train():
    """Run one training epoch of BETO + CNN over ``train_data_loader``.

    Uses the module-level ``beto``, ``cnn``, ``optimizer``, ``criterion`` and
    ``device``. Both models are switched to training mode and updated after
    every batch.

    Returns:
        Tuple ``(mean batch loss, mean batch accuracy)`` for the epoch.
    """
    beto.train()
    cnn.train()

    running_loss = 0
    running_acc = 0

    for batch in tqdm(train_data_loader):
        # Each batch is (input ids, attention mask, labels).
        input_ids, input_mask, labels = (batch[i].to(device) for i in range(3))

        optimizer.zero_grad()

        # First output of BETO: per-token hidden states, fed to the CNN head.
        token_states = beto(input_ids, input_mask)[0]
        logits = cnn(token_states)

        batch_loss = criterion(logits, labels)
        batch_acc = categorical_accuracy(logits, labels)

        batch_loss.backward()
        optimizer.step()

        running_loss += batch_loss.item()
        running_acc += batch_acc.item()

    n_batches = len(train_data_loader)
    return running_loss / n_batches, running_acc / n_batches

In [None]:
# Class for predict label

import numpy as np

def predictions_labels(preds,labels):
    """Turn per-class scores into predicted class ids and flatten the labels.

    Args:
        preds: Array-like of per-class scores, one row per example.
        labels: NumPy array of true labels.

    Returns:
        Tuple ``(predicted ids, flattened labels)`` as 1-D arrays.
    """
    predicted_ids = np.argmax(preds, axis=1)
    return predicted_ids.flatten(), labels.flatten()

In [None]:
# Evaluate loss, acc  and f1-macro

from sklearn.metrics import classification_report,accuracy_score,f1_score
def eval():
    """Evaluate BETO + CNN on ``valid_data_loader`` without gradient tracking.

    Uses the module-level ``beto``, ``cnn``, ``criterion`` and ``device``.
    Prints a classification report and the accuracy.

    Note: the name shadows Python's built-in ``eval``; it is kept because the
    training cell calls this function by that name.

    Returns:
        Tuple ``(mean batch loss, accuracy, macro F1)`` on the validation set.
    """
    epoch_loss = 0

    all_true_labels = []
    all_pred_labels = []

    beto.eval()
    cnn.eval()

    with torch.no_grad():

        for batch in tqdm(valid_data_loader):
            b_input_ids = batch[0].to(device)
            b_input_mask = batch[1].to(device)
            b_labels = batch[2].to(device)

            embedded = beto(b_input_ids,b_input_mask)[0]
            predictions = cnn(embedded)

            loss = criterion(predictions, b_labels)
            epoch_loss += loss.item()

            predictions = predictions.detach().cpu().numpy()

            label_ids = b_labels.to('cpu').numpy()

            pred,true = predictions_labels(predictions,label_ids)

            all_pred_labels.extend(pred)
            all_true_labels.extend(true)

    # scikit-learn metrics take (y_true, y_pred). With the arguments swapped the
    # report showed precision and recall exchanged and "support" counted
    # predictions instead of true examples.
    print(classification_report(all_true_labels,all_pred_labels))
    avg_val_accuracy = accuracy_score(all_true_labels,all_pred_labels)
    macro_f1_score = f1_score(all_true_labels,all_pred_labels,average='macro')

    avg_val_loss = epoch_loss/len(valid_data_loader)

    print("accuracy = {0:.2f}".format(avg_val_accuracy))

    return avg_val_loss,avg_val_accuracy,macro_f1_score

In [None]:
# Time for training

import time
def epoch_time(start_time, end_time):
    """Split the duration between two timestamps into whole minutes and seconds.

    Args:
        start_time: Start timestamp in seconds.
        end_time: End timestamp in seconds.

    Returns:
        Tuple ``(minutes, seconds)`` of ints, both truncated toward zero.
    """
    span = end_time - start_time
    minutes = int(span / 60)
    seconds = int(span - (minutes * 60))
    return minutes, seconds

In [None]:
# # Set device and gpu

# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# n_gpu = torch.cuda.device_count()
# torch.cuda.get_device_name(0)

# beto.cuda()

# Training

In [None]:
tname = "/content/beto_cnn_model_home"

In [None]:
 epochs = 20
# epochs = 10
# epochs = 5

# Best validation macro-F1 seen so far; starts at 0.0 so any positive score triggers a save.
best_macro_f1 = float('0')

for epoch in range(epochs):

    # Time one full pass: training followed by validation.
    start_time = time.time()
    train_loss,train_acc = train()
    valid_loss,valid_acc,macro_f1 = eval()
    end_time = time.time()


    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    # Checkpoint both models (as whole pickled modules) whenever validation macro-F1 improves.
    if macro_f1 > best_macro_f1:
        best_macro_f1 = macro_f1
        torch.save(beto,f'{tname}_part1_train.pt')
        torch.save(cnn,f'{tname}_part2_train.pt')
        print("MODEL SAVED")

    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. acc: {valid_acc*100:.2f}%')
    print(f'\t Val. F1: {macro_f1*100:.2f}%')
    print('=============Epoch Ended==============\n')

100%|██████████| 106/106 [01:51<00:00,  1.05s/it]
100%|██████████| 27/27 [00:09<00:00,  2.92it/s]


              precision    recall  f1-score   support

           0       0.91      0.87      0.89      2446
           1       0.70      0.78      0.73       916

    accuracy                           0.85      3362
   macro avg       0.81      0.83      0.81      3362
weighted avg       0.85      0.85      0.85      3362

accuracy = 0.85
MODEL SAVED
Epoch: 01 | Epoch Time: 2m 0s
	Train Loss: 0.445 | Train acc: 79.42%
	 Val. Loss: 0.344 |  Val. acc: 84.77%
	 Val. F1: 81.41%



100%|██████████| 106/106 [01:47<00:00,  1.01s/it]
100%|██████████| 27/27 [00:09<00:00,  2.92it/s]


              precision    recall  f1-score   support

           0       0.96      0.85      0.90      2666
           1       0.60      0.88      0.72       696

    accuracy                           0.86      3362
   macro avg       0.78      0.86      0.81      3362
weighted avg       0.89      0.86      0.86      3362

accuracy = 0.86
Epoch: 02 | Epoch Time: 1m 56s
	Train Loss: 0.274 | Train acc: 88.97%
	 Val. Loss: 0.344 |  Val. acc: 85.54%
	 Val. F1: 80.96%



100%|██████████| 106/106 [01:47<00:00,  1.01s/it]
100%|██████████| 27/27 [00:09<00:00,  2.91it/s]


              precision    recall  f1-score   support

           0       0.90      0.90      0.90      2361
           1       0.76      0.78      0.77      1001

    accuracy                           0.86      3362
   macro avg       0.83      0.84      0.84      3362
weighted avg       0.86      0.86      0.86      3362

accuracy = 0.86
MODEL SAVED
Epoch: 03 | Epoch Time: 1m 56s
	Train Loss: 0.183 | Train acc: 92.89%
	 Val. Loss: 0.338 |  Val. acc: 86.23%
	 Val. F1: 83.60%



100%|██████████| 106/106 [01:47<00:00,  1.01s/it]
100%|██████████| 27/27 [00:09<00:00,  2.88it/s]


              precision    recall  f1-score   support

           0       0.97      0.86      0.91      2648
           1       0.62      0.89      0.73       714

    accuracy                           0.86      3362
   macro avg       0.80      0.87      0.82      3362
weighted avg       0.89      0.86      0.87      3362

accuracy = 0.86
Epoch: 04 | Epoch Time: 1m 57s
	Train Loss: 0.104 | Train acc: 96.34%
	 Val. Loss: 0.455 |  Val. acc: 86.32%
	 Val. F1: 82.10%



100%|██████████| 106/106 [01:47<00:00,  1.01s/it]
100%|██████████| 27/27 [00:09<00:00,  2.90it/s]


              precision    recall  f1-score   support

           0       0.76      0.94      0.84      1911
           1       0.88      0.62      0.72      1451

    accuracy                           0.80      3362
   macro avg       0.82      0.78      0.78      3362
weighted avg       0.81      0.80      0.79      3362

accuracy = 0.80
Epoch: 05 | Epoch Time: 1m 56s
	Train Loss: 0.072 | Train acc: 97.66%
	 Val. Loss: 0.683 |  Val. acc: 79.80%
	 Val. F1: 78.26%



100%|██████████| 106/106 [01:47<00:00,  1.01s/it]
100%|██████████| 27/27 [00:09<00:00,  2.92it/s]


              precision    recall  f1-score   support

           0       0.93      0.88      0.91      2495
           1       0.70      0.82      0.76       867

    accuracy                           0.86      3362
   macro avg       0.82      0.85      0.83      3362
weighted avg       0.87      0.86      0.87      3362

accuracy = 0.86
Epoch: 06 | Epoch Time: 1m 56s
	Train Loss: 0.111 | Train acc: 95.87%
	 Val. Loss: 0.509 |  Val. acc: 86.41%
	 Val. F1: 83.15%



100%|██████████| 106/106 [01:47<00:00,  1.01s/it]
100%|██████████| 27/27 [00:09<00:00,  2.92it/s]


              precision    recall  f1-score   support

           0       0.94      0.88      0.91      2523
           1       0.69      0.84      0.76       839

    accuracy                           0.87      3362
   macro avg       0.82      0.86      0.84      3362
weighted avg       0.88      0.87      0.87      3362

accuracy = 0.87
Epoch: 07 | Epoch Time: 1m 56s
	Train Loss: 0.025 | Train acc: 99.34%
	 Val. Loss: 0.599 |  Val. acc: 86.82%
	 Val. F1: 83.51%



100%|██████████| 106/106 [01:47<00:00,  1.01s/it]
100%|██████████| 27/27 [00:09<00:00,  2.90it/s]


              precision    recall  f1-score   support

           0       0.95      0.87      0.91      2568
           1       0.67      0.86      0.75       794

    accuracy                           0.87      3362
   macro avg       0.81      0.86      0.83      3362
weighted avg       0.89      0.87      0.87      3362

accuracy = 0.87
Epoch: 08 | Epoch Time: 1m 56s
	Train Loss: 0.024 | Train acc: 99.30%
	 Val. Loss: 0.627 |  Val. acc: 86.67%
	 Val. F1: 83.07%



100%|██████████| 106/106 [01:46<00:00,  1.01s/it]
100%|██████████| 27/27 [00:09<00:00,  2.93it/s]


              precision    recall  f1-score   support

           0       0.91      0.89      0.90      2409
           1       0.74      0.79      0.76       953

    accuracy                           0.86      3362
   macro avg       0.83      0.84      0.83      3362
weighted avg       0.86      0.86      0.86      3362

accuracy = 0.86
Epoch: 09 | Epoch Time: 1m 56s
	Train Loss: 0.015 | Train acc: 99.57%
	 Val. Loss: 0.662 |  Val. acc: 86.11%
	 Val. F1: 83.23%



100%|██████████| 106/106 [01:46<00:00,  1.01s/it]
100%|██████████| 27/27 [00:09<00:00,  2.91it/s]


              precision    recall  f1-score   support

           0       0.92      0.89      0.91      2414
           1       0.75      0.80      0.77       948

    accuracy                           0.87      3362
   macro avg       0.83      0.85      0.84      3362
weighted avg       0.87      0.87      0.87      3362

accuracy = 0.87
MODEL SAVED
Epoch: 10 | Epoch Time: 1m 56s
	Train Loss: 0.024 | Train acc: 99.13%
	 Val. Loss: 0.603 |  Val. acc: 86.67%
	 Val. F1: 83.89%



100%|██████████| 106/106 [01:46<00:00,  1.01s/it]
100%|██████████| 27/27 [00:09<00:00,  2.89it/s]


              precision    recall  f1-score   support

           0       0.95      0.87      0.91      2559
           1       0.68      0.86      0.76       803

    accuracy                           0.87      3362
   macro avg       0.81      0.86      0.83      3362
weighted avg       0.89      0.87      0.87      3362

accuracy = 0.87
Epoch: 11 | Epoch Time: 1m 56s
	Train Loss: 0.017 | Train acc: 99.48%
	 Val. Loss: 0.679 |  Val. acc: 86.82%
	 Val. F1: 83.31%



100%|██████████| 106/106 [01:46<00:00,  1.01s/it]
100%|██████████| 27/27 [00:09<00:00,  2.88it/s]


              precision    recall  f1-score   support

           0       0.96      0.85      0.90      2670
           1       0.60      0.88      0.71       692

    accuracy                           0.85      3362
   macro avg       0.78      0.86      0.81      3362
weighted avg       0.89      0.85      0.86      3362

accuracy = 0.85
Epoch: 12 | Epoch Time: 1m 56s
	Train Loss: 0.009 | Train acc: 99.71%
	 Val. Loss: 0.801 |  Val. acc: 85.25%
	 Val. F1: 80.54%



100%|██████████| 106/106 [01:46<00:00,  1.01s/it]
100%|██████████| 27/27 [00:09<00:00,  2.89it/s]


              precision    recall  f1-score   support

           0       0.91      0.89      0.90      2398
           1       0.74      0.78      0.76       964

    accuracy                           0.86      3362
   macro avg       0.83      0.84      0.83      3362
weighted avg       0.86      0.86      0.86      3362

accuracy = 0.86
Epoch: 13 | Epoch Time: 1m 56s
	Train Loss: 0.009 | Train acc: 99.72%
	 Val. Loss: 0.691 |  Val. acc: 86.02%
	 Val. F1: 83.18%



100%|██████████| 106/106 [01:46<00:00,  1.00s/it]
100%|██████████| 27/27 [00:09<00:00,  2.91it/s]


              precision    recall  f1-score   support

           0       0.94      0.87      0.90      2528
           1       0.68      0.83      0.75       834

    accuracy                           0.86      3362
   macro avg       0.81      0.85      0.83      3362
weighted avg       0.87      0.86      0.87      3362

accuracy = 0.86
Epoch: 14 | Epoch Time: 1m 55s
	Train Loss: 0.009 | Train acc: 99.71%
	 Val. Loss: 0.764 |  Val. acc: 86.08%
	 Val. F1: 82.55%



100%|██████████| 106/106 [01:46<00:00,  1.01s/it]
100%|██████████| 27/27 [00:09<00:00,  2.91it/s]


              precision    recall  f1-score   support

           0       0.94      0.87      0.90      2546
           1       0.67      0.84      0.74       816

    accuracy                           0.86      3362
   macro avg       0.81      0.85      0.82      3362
weighted avg       0.88      0.86      0.87      3362

accuracy = 0.86
Epoch: 15 | Epoch Time: 1m 56s
	Train Loss: 0.013 | Train acc: 99.54%
	 Val. Loss: 0.764 |  Val. acc: 86.08%
	 Val. F1: 82.44%



100%|██████████| 106/106 [01:46<00:00,  1.01s/it]
100%|██████████| 27/27 [00:09<00:00,  2.88it/s]


              precision    recall  f1-score   support

           0       0.98      0.83      0.90      2768
           1       0.53      0.91      0.67       594

    accuracy                           0.84      3362
   macro avg       0.76      0.87      0.79      3362
weighted avg       0.90      0.84      0.86      3362

accuracy = 0.84
Epoch: 16 | Epoch Time: 1m 55s
	Train Loss: 0.010 | Train acc: 99.68%
	 Val. Loss: 1.098 |  Val. acc: 84.35%
	 Val. F1: 78.52%



100%|██████████| 106/106 [01:46<00:00,  1.00s/it]
100%|██████████| 27/27 [00:09<00:00,  2.91it/s]


              precision    recall  f1-score   support

           0       0.96      0.85      0.91      2650
           1       0.62      0.88      0.73       712

    accuracy                           0.86      3362
   macro avg       0.79      0.87      0.82      3362
weighted avg       0.89      0.86      0.87      3362

accuracy = 0.86
Epoch: 17 | Epoch Time: 1m 55s
	Train Loss: 0.016 | Train acc: 99.45%
	 Val. Loss: 0.905 |  Val. acc: 86.02%
	 Val. F1: 81.70%



100%|██████████| 106/106 [01:47<00:00,  1.01s/it]
100%|██████████| 27/27 [00:09<00:00,  2.89it/s]


              precision    recall  f1-score   support

           0       0.97      0.85      0.90      2687
           1       0.59      0.89      0.71       675

    accuracy                           0.86      3362
   macro avg       0.78      0.87      0.81      3362
weighted avg       0.89      0.86      0.86      3362

accuracy = 0.86
Epoch: 18 | Epoch Time: 1m 56s
	Train Loss: 0.010 | Train acc: 99.68%
	 Val. Loss: 0.946 |  Val. acc: 85.51%
	 Val. F1: 80.76%



100%|██████████| 106/106 [01:46<00:00,  1.00s/it]
100%|██████████| 27/27 [00:09<00:00,  2.90it/s]


              precision    recall  f1-score   support

           0       0.94      0.87      0.90      2525
           1       0.67      0.82      0.74       837

    accuracy                           0.86      3362
   macro avg       0.80      0.84      0.82      3362
weighted avg       0.87      0.86      0.86      3362

accuracy = 0.86
Epoch: 19 | Epoch Time: 1m 55s
	Train Loss: 0.011 | Train acc: 99.59%
	 Val. Loss: 0.773 |  Val. acc: 85.63%
	 Val. F1: 82.01%



100%|██████████| 106/106 [01:46<00:00,  1.01s/it]
100%|██████████| 27/27 [00:09<00:00,  2.91it/s]

              precision    recall  f1-score   support

           0       0.93      0.88      0.91      2495
           1       0.70      0.82      0.76       867

    accuracy                           0.86      3362
   macro avg       0.82      0.85      0.83      3362
weighted avg       0.87      0.86      0.87      3362

accuracy = 0.86
Epoch: 20 | Epoch Time: 1m 55s
	Train Loss: 0.007 | Train acc: 99.73%
	 Val. Loss: 0.727 |  Val. acc: 86.47%
	 Val. F1: 83.22%






In [None]:
# Save BETO and CNN as they are after the last epoch.
# These `_module_part*.pt` files are separate from the best-F1 checkpoints
# (`_part*_train.pt`) written inside the training loop.

torch.save(beto, tname + '_module_part1.pt')
torch.save(cnn,  tname + '_module_part2.pt')

In [None]:
# # Save BETO and CNN
# tname = "/content/drive/MyDrive/TP1-AntiDiscriminator/weights/beto_cnn"
# torch.save(beto, tname + '_part1_train.pt')
# torch.save(cnn,  tname + '_part2_train.pt')

## EVALUATING

In [None]:
# Load the best-F1 BETO and CNN checkpoints written by the training loop.
# torch.load unpickles whole modules here: only load checkpoint files you created yourself.
# NOTE(review): recent PyTorch releases default `torch.load` to `weights_only=True`, which
# rejects whole-module pickles; pass `weights_only=False` there if loading fails.

import torch
# map_location lets checkpoints saved on a GPU load on whatever `device` is in use now.
beto = torch.load(f'{tname}_part1_train.pt', map_location=device)
cnn = torch.load(f'{tname}_part2_train.pt', map_location=device)
beto.eval()
cnn.eval()


CNN(
  (fc_input): Linear(in_features=768, out_features=768, bias=True)
  (conv_0): Conv1d(768, 32, kernel_size=(1,), stride=(1,))
  (conv_1): Conv1d(768, 32, kernel_size=(2,), stride=(1,))
  (conv_2): Conv1d(768, 32, kernel_size=(3,), stride=(1,))
  (conv_3): Conv1d(768, 32, kernel_size=(5,), stride=(1,))
  (fc): Linear(in_features=128, out_features=2, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
)

Predict labels for the test set and collect them alongside the true labels

In [None]:
# Encode the held-out test sentences and wrap the tensors in a dataset.
test_sent_index, test_input_ids, test_attention_masks, test_encoded_label_tensors = encoder_generator(test_sentences, encoded_test_labels)
test_dataset = TensorDataset(test_input_ids, test_attention_masks, test_encoded_label_tensors)

# Evaluation gains nothing from shuffling: iterate in dataset order so the
# collected predictions stay aligned with `test_sentences` and the pass is
# deterministic.
test_data_loader = DataLoader(test_dataset,
                              batch_size=bs,
                              shuffle=False)

# Make sure dropout is off even when this cell is run on its own.
beto.eval()
cnn.eval()

all_pred_labels = []
all_true_labels = []

with torch.no_grad():
  for batch in tqdm(test_data_loader):
    b_input_ids = batch[0].to(device)
    b_input_mask = batch[1].to(device)

    # BETO's first output is fed to the CNN classification head.
    embedded = beto(b_input_ids, b_input_mask)[0]
    predictions = cnn(embedded)

    # No autograd graph exists under no_grad, so .detach() is not needed.
    predictions = predictions.cpu().numpy()

    # Labels are only consumed as a NumPy array, so they never need to
    # leave the CPU.
    label_ids = batch[2].numpy()

    pred, true = predictions_labels(predictions, label_ids)

    all_pred_labels.extend(pred)
    all_true_labels.extend(true)

100%|██████████| 28/28 [00:09<00:00,  2.88it/s]


In [None]:
# Final test-set metrics (per-class precision / recall / F1).
# Argument order matters: ground truth first, predictions second.

print(classification_report(y_true=all_true_labels,
                            y_pred=all_pred_labels,
                            digits=4))

              precision    recall  f1-score   support

           0     0.9151    0.9042    0.9096      2932
           1     0.5806    0.6126    0.5962       635

    accuracy                         0.8523      3567
   macro avg     0.7478    0.7584    0.7529      3567
weighted avg     0.8555    0.8523    0.8538      3567



In [None]:
# Test-set confusion matrix: rows are the true classes, columns the
# predicted classes.

from sklearn.metrics import confusion_matrix

cm = confusion_matrix(y_true=all_true_labels, y_pred=all_pred_labels)
cm

array([[2651,  281],
       [ 246,  389]])