In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

In [None]:
!pip install transformers -qq
# !pip install sentencepiece -qq
# !pip install tokenizer -qq
!pip install nlpaug

## Set Cuda

In [None]:
import torch
from torchtext import data
from torchtext import datasets
import random
import numpy as np

# Single seed shared by every random number generator used in this notebook.
SEED = 1234

# Seed Python's `random`, NumPy and PyTorch, in that order.
for _seed_rng in (random.seed, np.random.seed, torch.manual_seed):
    _seed_rng(SEED)

# Restrict cuDNN to deterministic algorithms.
torch.backends.cudnn.deterministic = True

# Shown as the cell output: whether a CUDA device is visible.
torch.cuda.is_available()

True

## Load Dataset

In [None]:
from sklearn.model_selection import train_test_split

def to_df(x, y):
    """Assemble texts and labels into a two-column DataFrame.

    Args:
        x: values for the ``text`` column.
        y: values for the ``label`` column.

    Returns:
        pd.DataFrame with the columns ``text`` and ``label``, in that order.
    """
    return pd.DataFrame({"text": x, "label": y})

def split_3(df, test_size=0.2, valid_size=0.2):
    """Shuffle ``df`` and split it into stratified train/validation/test frames.

    Args:
        df: DataFrame with at least the columns ``text`` and ``label``.
        test_size: share of all rows held out for the test set.
        valid_size: share of the remaining (non-test) rows held out for
            validation.

    Returns:
        Tuple ``(train, valid, test)`` of DataFrames with the columns ``text``
        and ``label``.
    """
    # Seed the shuffle so re-running the cell reproduces the same partition, and
    # drop the old index instead of carrying it along as an extra column.
    _df = df[["text", "label"]].sample(frac=1, random_state=SEED).reset_index(drop=True)

    # Split the shuffled copy (previously it was built and then ignored).
    x = _df["text"].copy()
    y = _df["label"].copy()
    # split train-test
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=test_size, stratify=y, random_state=SEED)
    # split train-valid
    x, y = x_train, y_train
    x_train, x_valid, y_train, y_valid = train_test_split(
        x, y, test_size=valid_size, stratify=y, random_state=SEED)
    return to_df(x_train, y_train), to_df(x_valid, y_valid), to_df(x_test, y_test)

In [None]:
import random
from itertools import chain
import nlpaug.augmenter.char as nac
import nlpaug.augmenter.word as naw
from nlpaug.util import Action


# Rates handed to the augmenters: `alpha_common_error` is used as aug_word_p /
# aug_p, `alpha_common_error_char` as aug_char_p.
alpha_common_error = 0.10
alpha_common_error_char = 0.05

# Keyword arguments shared by the four RandomCharAug variants; each one edits
# exactly one character (aug_char_min == aug_char_max == 1).
_random_char_kwargs = dict(
    aug_word_p=alpha_common_error,
    aug_char_min=1,
    aug_char_max=1,
    aug_char_p=alpha_common_error_char,
)

aug1_OCR = nac.OcrAug(aug_word_p=alpha_common_error)
aug2_Rins = nac.RandomCharAug(action="insert", **_random_char_kwargs)
aug3_Rsub = nac.RandomCharAug(action="substitute", **_random_char_kwargs)
aug4_Rswa = nac.RandomCharAug(action="swap", **_random_char_kwargs)
aug5_Rdel = nac.RandomCharAug(action="delete", **_random_char_kwargs)
aug6_Kb = nac.KeyboardAug(aug_word_p=alpha_common_error)
aug7_Split = naw.SplitAug(aug_p=alpha_common_error)


def text2augment(text, m, augmenters=None):
    """Return ``text`` followed by ``m - 1`` noisy variants of it.

    ``m - 1`` distinct augmenters are drawn at random and applied in their
    fixed pipeline order. The result always has exactly ``m`` entries, so it
    stays aligned with the labels produced by ``aug_replicate``.

    Args:
        text: the sentence to augment.
        m: total number of strings to return (the original plus ``m - 1``
            variants).
        augmenters: optional sequence of objects exposing ``augment(text)``;
            defaults to the seven augmenters configured in this notebook.

    Returns:
        List of length ``m`` whose first element is ``text``.

    Raises:
        ValueError: if ``m - 1`` is negative or larger than the number of
            augmenters (raised by ``random.sample``).
    """
    if augmenters is None:
        augmenters = (aug1_OCR, aug2_Rins, aug3_Rsub, aug4_Rswa,
                      aug5_Rdel, aug6_Kb, aug7_Split)

    chosen = random.sample(range(len(augmenters)), m - 1)

    output = [text]
    for position in sorted(chosen):
        augmented = augmenters[position].augment(text)
        # augment() may hand back a list of results or a single string,
        # depending on the nlpaug release in use.
        if isinstance(augmented, (list, tuple)):
            # An empty result (e.g. for empty input) falls back to the original
            # text so the number of returned strings never drifts from `m`.
            augmented = augmented[0] if augmented else text
        output.append(augmented)

    return output


def aug_replicate(y_labels):
    """Repeat every label as often as ``aug_text`` expands its sentence.

    Label ``0`` is emitted twice, any other label four times.

    Args:
        y_labels: iterable of labels.

    Returns:
        Flat list of the repeated labels, in input order.
    """
    replicated = []
    for label in y_labels:
        copies = 2 if label == 0 else 4
        replicated.extend([label] * copies)
    return replicated

def aug_text(x_text, y_labels):
    """Expand every sentence into itself plus augmented variants.

    Sentences labelled ``0`` yield 2 strings, all others 4 (via
    ``text2augment``).

    Args:
        x_text: iterable of sentences.
        y_labels: iterable of labels, paired with ``x_text`` by position.

    Returns:
        pd.Series holding all generated strings, in input order.
    """
    expanded = []
    for sentence, label in zip(x_text, y_labels):
        n_versions = 2 if label == 0 else 4
        expanded.extend(text2augment(sentence, n_versions))
    return pd.Series(expanded, index=None)

def split_3_aug(df, test_size=0.2, valid_size=0.2):
    """Stratified train/validation/test split with augmentation of train and validation.

    The test portion is returned without augmentation. After augmenting, the
    sizes of the validation and training portions are printed.

    Args:
        df: DataFrame with at least the columns ``text`` and ``label``.
        test_size: share of all rows held out for the test set.
        valid_size: share of the remaining (non-test) rows held out for
            validation.

    Returns:
        Tuple ``(train, valid, test)`` of DataFrames with the columns ``text``
        and ``label``.
    """
    shuffled = df.copy().sample(frac=1).reset_index()[["text", "label"]]

    all_texts = shuffled["text"].copy()
    all_labels = shuffled["label"].copy()

    # Hold out the test set first.
    x_rest, x_test, y_rest, y_test = train_test_split(
        all_texts, all_labels, test_size=test_size, stratify=all_labels, random_state=SEED)

    # Carve the validation set out of what is left.
    x_train, x_valid, y_train, y_valid = train_test_split(
        x_rest, y_rest, test_size=valid_size, stratify=y_rest, random_state=SEED)

    # Augment the training portion first, then the validation portion; in each
    # pair the texts are expanded before the labels are replicated.
    x_train, y_train = aug_text(x_train, y_train), aug_replicate(y_train)
    x_valid, y_valid = aug_text(x_valid, y_valid), aug_replicate(y_valid)

    # Report the augmented sizes: validation first, then training.
    for texts_aug, labels_aug in ((x_valid, y_valid), (x_train, y_train)):
        print(texts_aug.shape)
        print("DONE")
        print(len(labels_aug))

    return to_df(x_train, y_train), to_df(x_valid, y_valid), to_df(x_test, y_test)

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
import numpy as np
import pandas as pd

# Base name (without extension) of the CSV file holding the corpus.
tname_data = "./hsd_merge_cleaned_lowered"
data = pd.read_csv(f"{tname_data}.csv")

# Stratified split; the train and validation parts come back augmented.
# NOTE: the name `train` is rebound further down by `def train(model)`, so this
# cell has to run before that definition.
train, valid, test = split_3_aug(data)

X_train, y_train = train['text'], train['label']
X_valid, y_valid = valid['text'], valid['label']
X_test, y_test = test['text'], test['label']

(6724,)
DONE
6724
(26886,)
DONE
26886


# Extract features using BETO

In [None]:
import pandas as pd
from glob import glob

def _texts_and_labels(frame):
    """Return the ``text`` and ``label`` columns of ``frame`` as two plain lists."""
    return list(frame['text'].values), list(frame['label'].values)


train_sentences, train_labels = _texts_and_labels(train)
valid_sentences, valid_labels = _texts_and_labels(valid)
test_sentences, test_labels = _texts_and_labels(test)

Load the BETO tokenizer

In [None]:
from transformers import AutoTokenizer

# Tokenizer belonging to the `dccuchile/bert-base-spanish-wwm-cased` checkpoint;
# used by `encoder_generator` to turn sentences into token ids.
tokenizer = AutoTokenizer.from_pretrained('dccuchile/bert-base-spanish-wwm-cased')

In [None]:
# # Choose max_length for the BETO model based on the input length

# max_length = 0
# list_len=[]
# for sentence in train_sentences:
#     length = len(tokenizer.tokenize(sentence))
#     list_len.append(length)

# from collections import Counter
# Counter(list_len).most_common(100)

[(23, 827),
 (21, 773),
 (22, 770),
 (24, 756),
 (17, 744),
 (25, 737),
 (15, 728),
 (20, 720),
 (26, 705),
 (19, 704),
 (12, 703),
 (18, 700),
 (11, 694),
 (16, 689),
 (14, 670),
 (13, 666),
 (27, 639),
 (29, 619),
 (28, 618),
 (10, 594),
 (31, 585),
 (9, 576),
 (30, 548),
 (33, 545),
 (32, 510),
 (34, 483),
 (36, 386),
 (37, 378),
 (8, 378),
 (35, 372),
 (7, 356),
 (39, 323),
 (38, 307),
 (41, 286),
 (42, 269),
 (40, 264),
 (43, 242),
 (6, 239),
 (49, 238),
 (46, 234),
 (53, 234),
 (44, 233),
 (47, 231),
 (51, 231),
 (50, 230),
 (45, 224),
 (48, 217),
 (56, 214),
 (54, 211),
 (52, 208),
 (57, 208),
 (55, 198),
 (58, 193),
 (60, 183),
 (59, 181),
 (5, 177),
 (61, 175),
 (62, 159),
 (66, 143),
 (64, 142),
 (65, 140),
 (63, 132),
 (67, 118),
 (68, 114),
 (69, 108),
 (4, 97),
 (72, 77),
 (70, 72),
 (71, 54),
 (73, 54),
 (74, 46),
 (3, 45),
 (76, 44),
 (75, 43),
 (77, 27),
 (79, 19),
 (80, 19),
 (78, 19),
 (81, 17),
 (82, 12),
 (2, 12),
 (83, 8),
 (89, 6),
 (87, 6),
 (85, 5),
 (1, 4),
 (8

In [None]:
# Encode train label

from sklearn import preprocessing

# Fit the encoder on the training labels only, then apply it to both splits.
le = preprocessing.LabelEncoder().fit(train_labels)
encoded_labels = le.transform(train_labels)
# NOTE: despite its name, this holds the encoded *validation* labels.
encoded_test_labels = le.transform(valid_labels)

In [None]:
# Tokens IDs tensor

def encoder_generator(sentences, labels, max_length=50, device=None, tok=None):
    """Tokenize sentences and return index, id, mask and label tensors.

    Every sentence is truncated or padded to exactly ``max_length`` tokens
    (special tokens included).

    Args:
        sentences: iterable of strings to tokenize.
        labels: integer labels, one per sentence (anything ``torch.tensor``
            accepts).
        max_length: fixed token length of every encoded sentence.
        device: where the returned tensors are placed; defaults to CUDA when
            available and to the CPU otherwise.
        tok: tokenizer to call; defaults to the notebook-level ``tokenizer``.

    Returns:
        Tuple ``(sent_index, input_ids, attention_masks, labels)`` of tensors on
        ``device``. ``input_ids`` and ``attention_masks`` have the shape
        ``(len(sentences), max_length)``.
    """
    if tok is None:
        tok = tokenizer
    if device is None:
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    sentences = list(sentences)

    # One batched call replaces the per-sentence `encode_plus` loop;
    # `padding='max_length'` is the replacement for the deprecated
    # `pad_to_max_length=True`.
    encoded = tok(sentences,
                  add_special_tokens=True,
                  max_length=max_length,
                  padding='max_length',
                  truncation=True,
                  return_attention_mask=True,
                  return_tensors='pt')

    input_ids = encoded['input_ids'].to(device)
    attention_masks = encoded['attention_mask'].to(device)
    labels = torch.tensor(labels).to(device)
    # Position of every sentence in the input, kept alongside the encodings.
    sent_index = torch.arange(len(sentences), device=device)

    return sent_index, input_ids, attention_masks, labels

# Encode the training split.
(train_sent_index,
 train_input_ids,
 train_attention_masks,
 train_encoded_label_tensors) = encoder_generator(train_sentences, encoded_labels)

# Encode the validation split (`encoded_test_labels` holds the validation labels).
(valid_sent_index,
 valid_input_ids,
 valid_attention_masks,
 valid_encoded_label_tensors) = encoder_generator(valid_sentences, encoded_test_labels)

# Sanity check: first training sentence next to its token ids.
print('Original: ', train_sentences[0])
print('Token IDs:', train_input_ids[0])



Original:  ojalá cachen los comunistas de cartón de chile como son los comunistas de verdad para q dejen de andar haciendo el loco
Token IDs: tensor([    4, 29596, 14676,  1014,  1065, 18411,  1008, 23638,  1008,  9899,
        30931,  1184,  1404,  1065, 18411,  1008,  1836,  1110,  1033, 13812,
         1008, 12329,  2391,  1040,  4478,     5,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1],
       device='cuda:0')


In [None]:
# Convert train and validation inputs into TensorDatasets

from torch.utils.data import TensorDataset,random_split

# Each dataset item is an (input ids, attention mask, label) triple.
train_dataset = TensorDataset(
    train_input_ids, train_attention_masks, train_encoded_label_tensors)
valid_dataset = TensorDataset(
    valid_input_ids, valid_attention_masks, valid_encoded_label_tensors)

for _message, _dataset in (('train data samples is {}', train_dataset),
                           ("valid data samples is {}", valid_dataset)):
    print(_message.format(len(_dataset)))

train data samples is 26886
valid data samples is 6724


In [None]:
# Set cuda by using device

from torch.utils.data import DataLoader,RandomSampler,SequentialSampler

# Mini-batch size used by the DataLoaders of this notebook.
bs = 128

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Training batches are drawn in random order.
train_data_loader = DataLoader(train_dataset,
                               sampler=RandomSampler(train_dataset),
                               batch_size=bs)
# Validation is read in a fixed order: shuffling it cannot improve the metrics,
# but it makes the evaluation order non-deterministic and consumes random state.
valid_data_loader = DataLoader(valid_dataset,
                               sampler=SequentialSampler(valid_dataset),
                               batch_size=bs)

Load the BETO model

In [None]:
from transformers import AutoModel

# Load the pretrained BETO encoder and place it on the active device.
# NOTE(review): no later cell visible in this notebook references `beto`; the
# CNN defined below learns its own nn.Embedding. Confirm whether this model is
# still needed.
beto = AutoModel.from_pretrained('dccuchile/bert-base-spanish-wwm-cased').to(device)

Some weights of the model checkpoint at dccuchile/bert-base-spanish-wwm-cased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


# Build CNN

In [None]:
import torch
import torch.nn as nn

import torch
import torch.nn as nn

class CNNForNLP(nn.Module):
    """1-D convolutional text classifier operating on token ids.

    Token ids are embedded, passed through one Conv1d + ReLU + max-over-time
    branch per filter size, concatenated, passed through dropout and mapped to
    class logits by a linear layer.

    Args:
        vocab_size: number of rows of the embedding table.
        embedding_dim: size of every embedding vector (Conv1d input channels).
        num_classes: number of output logits.
        num_filters: output channels of every convolution.
        filter_sizes: iterable of kernel widths, one convolution each.
        dropout: dropout probability applied before the linear layer.
        padding_idx: optional token id forwarded to ``nn.Embedding`` as
            ``padding_idx``; ``None`` (default) keeps the previous behaviour.
    """

    def __init__(self, vocab_size, embedding_dim, num_classes, num_filters,
                 filter_sizes, dropout=0.1, padding_idx=None):
        super().__init__()
        filter_sizes = list(filter_sizes)
        # Sub-modules are created in the same order as before so that a seeded
        # initialisation produces the same weights.
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=padding_idx)
        self.convs = nn.ModuleList([
            nn.Conv1d(embedding_dim, num_filters, filter_size)
            for filter_size in filter_sizes
        ])
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(len(filter_sizes) * num_filters, num_classes)

    def forward(self, x, attention_mask=None):
        """Compute class logits for a batch of token ids.

        Args:
            x: integer tensor of shape (batch_size, sequence_length);
                sequence_length must be at least the largest filter size.
            attention_mask: accepted so the model can be called like a
                transformer, but not used by this architecture.

        Returns:
            Tensor of shape (batch_size, num_classes) with unnormalised logits.
        """
        # (batch, seq, emb) -> (batch, emb, seq): Conv1d convolves over the last axis.
        embedded = self.embedding(x).permute(0, 2, 1)
        # Max over time turns every feature map into a (batch, num_filters) vector.
        pooled = [torch.relu(conv(embedded)).max(dim=2)[0] for conv in self.convs]
        combined = self.dropout(torch.cat(pooled, dim=1))
        return self.fc(combined)


In [None]:

# Model hyper-parameters
vocab_size = 31002    # vocabulary size
embedding_dim = 768   # dimension of the embedding vectors
num_classes = 2       # number of target classes
num_filters = 32      # number of convolutional filters
filter_sizes = [3]    # widths of the convolutional filters

CNNmodel = CNNForNLP(
    vocab_size=vocab_size,
    embedding_dim=embedding_dim,
    num_classes=num_classes,
    num_filters=num_filters,
    filter_sizes=filter_sizes,
)


In [None]:

# Print the model architecture
print(CNNmodel)

CNNForNLP(
  (embedding): Embedding(31002, 768)
  (convs): ModuleList(
    (0): Conv1d(768, 32, kernel_size=(3,), stride=(1,))
  )
  (dropout): Dropout(p=0.1, inplace=False)
  (fc): Linear(in_features=32, out_features=2, bias=True)
)


In [None]:
# Optimizer and criterion

import torch.optim as optim

# Collect every parameter of the CNN for the optimizer.
model_parameters = [*CNNmodel.parameters()]

optimizer = optim.Adam(model_parameters, lr=2e-5, eps=1e-8)
# Cross-entropy over the class logits, placed on the active device.
criterion = nn.CrossEntropyLoss().to(device)

In [None]:
# Calculate accuracy per batch during train

def categorical_accuracy(preds, y):
    """
    Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8

    Args:
        preds: tensor of shape (batch_size, num_classes) with class scores.
        y: tensor of shape (batch_size,) with the true class indices.

    Returns:
        One-element float tensor on the device of ``y``.
    """
    predicted = preds.argmax(dim=1)  # index of the highest score per row
    correct = predicted.eq(y)
    # Build the denominator on the labels' device instead of forcing CUDA, so
    # the function also works on CPU-only machines.
    batch_size = torch.tensor([y.shape[0]], dtype=torch.float, device=y.device)
    return correct.sum() / batch_size

In [None]:
# Def for training

from tqdm import tqdm

def train(model):
    """Run one optimisation pass over ``train_data_loader``.

    Relies on the notebook-level ``optimizer``, ``criterion`` and ``device``.

    Args:
        model: module called as ``model(input_ids, attention_mask)`` that
            returns class logits.

    Returns:
        Tuple ``(mean batch loss, mean batch accuracy)`` for the epoch.
    """
    model.train()

    running_loss, running_acc = 0, 0
    for batch in tqdm(train_data_loader):
        # Each batch holds (input ids, attention mask, labels).
        input_ids, input_mask, targets = (batch[i].to(device) for i in range(3))

        optimizer.zero_grad()

        logits = model(input_ids, input_mask)
        batch_loss = criterion(logits, targets)
        batch_acc = categorical_accuracy(logits, targets)

        batch_loss.backward()
        optimizer.step()

        running_loss += batch_loss.item()
        running_acc += batch_acc.item()

    n_batches = len(train_data_loader)
    return running_loss / n_batches, running_acc / n_batches

In [None]:
# Helper that extracts predicted and true labels

import numpy as np

def predictions_labels(preds, labels):
    """Turn per-class scores into predicted class indices and flatten the labels.

    Args:
        preds: array of shape (n_samples, n_classes) with class scores.
        labels: NumPy array with the true labels.

    Returns:
        Tuple ``(pred, label)`` of 1-D arrays.
    """
    return np.argmax(preds, axis=1).flatten(), labels.flatten()

In [None]:
# Evaluate loss, acc  and f1-macro

from sklearn.metrics import classification_report,accuracy_score,f1_score
def eval(model):
    """Evaluate ``model`` on ``valid_data_loader``.

    Prints a classification report and the accuracy. Relies on the
    notebook-level ``criterion`` and ``device``.

    Args:
        model: module called as ``model(input_ids, attention_mask)`` that
            returns class logits.

    Returns:
        Tuple ``(mean batch loss, accuracy, macro F1)`` on the validation data.
    """
    epoch_loss = 0

    all_true_labels = []
    all_pred_labels = []

    model.eval()

    with torch.no_grad():

        for batch in tqdm(valid_data_loader):
            b_input_ids = batch[0].to(device)
            b_input_mask = batch[1].to(device)
            b_labels = batch[2].to(device)

            predictions = model(b_input_ids, b_input_mask)

            loss = criterion(predictions, b_labels)
            epoch_loss += loss.item()

            predictions = predictions.detach().cpu().numpy()
            label_ids = b_labels.to('cpu').numpy()

            pred, true = predictions_labels(predictions, label_ids)

            all_pred_labels.extend(pred)
            all_true_labels.extend(true)

    # scikit-learn metrics take (y_true, y_pred) in that order. With the
    # arguments swapped the report exchanges precision and recall, and its
    # support column counts predictions instead of true labels.
    print(classification_report(all_true_labels, all_pred_labels))
    avg_val_accuracy = accuracy_score(all_true_labels, all_pred_labels)
    macro_f1_score = f1_score(all_true_labels, all_pred_labels, average='macro')

    avg_val_loss = epoch_loss / len(valid_data_loader)

    print("accuracy = {0:.2f}".format(avg_val_accuracy))

    return avg_val_loss, avg_val_accuracy, macro_f1_score

In [None]:
# Time for training

import time
def epoch_time(start_time, end_time):
    """Split the time between two timestamps into whole minutes and seconds.

    Args:
        start_time: start timestamp in seconds.
        end_time: end timestamp in seconds.

    Returns:
        Tuple ``(minutes, seconds)`` of ints, both truncated.
    """
    elapsed = end_time - start_time
    minutes = int(elapsed / 60)
    return minutes, int(elapsed - (minutes * 60))

In [None]:
# Set device and gpu

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()
# Only query the GPU name when a GPU exists; the unguarded call raises on
# CPU-only machines.
if torch.cuda.is_available():
    torch.cuda.get_device_name(0)

# Move the model to the selected device (falls back to the CPU instead of
# failing like a hard-coded `.cuda()`); the returned module is the cell output.
CNNmodel.to(device)

CNNForNLP(
  (embedding): Embedding(31002, 768)
  (convs): ModuleList(
    (0): Conv1d(768, 32, kernel_size=(3,), stride=(1,))
  )
  (dropout): Dropout(p=0.1, inplace=False)
  (fc): Linear(in_features=32, out_features=2, bias=True)
)

# Training

In [None]:
# Path prefix of the model checkpoint; '_task2a_2.pt' is appended to it when
# the model is saved and when it is loaded again.
tempname = "./cnn_aug_model2-4"

In [None]:
epochs = 20

# Start below any attainable score so the first epoch is always checkpointed.
best_macro_f1 = float('-inf')

for epoch in range(epochs):

    start_time = time.time()
    train_loss, train_acc = train(CNNmodel)
    valid_loss, valid_acc, macro_f1 = eval(CNNmodel)
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    # Overwrite the checkpoint only when the validation macro-F1 improves.
    # Saving outside this branch kept the model of the last epoch, not the best.
    if macro_f1 > best_macro_f1:
        best_macro_f1 = macro_f1
        torch.save(CNNmodel, tempname + '_task2a_2.pt')
        print("model saved")

    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. acc: {valid_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. F1: {macro_f1*100:.2f}%')
    print('=============Epoch Ended==============')

100%|██████████| 211/211 [00:04<00:00, 45.47it/s]
100%|██████████| 53/53 [00:00<00:00, 557.92it/s]


              precision    recall  f1-score   support

           0       0.98      0.71      0.83      6456
           1       0.09      0.69      0.16       268

    accuracy                           0.71      6724
   macro avg       0.54      0.70      0.49      6724
weighted avg       0.95      0.71      0.80      6724

accuracy = 0.71
model saved
Epoch: 01 | Epoch Time: 0m 4s
	Train Loss: 0.617 | Train acc: 66.56%
	 Val. Loss: 0.573 |  Val. acc: 71.33%
	 Val. Loss: 0.573 |  Val. F1: 49.44%


100%|██████████| 211/211 [00:02<00:00, 95.36it/s]
100%|██████████| 53/53 [00:00<00:00, 490.74it/s]


              precision    recall  f1-score   support

           0       0.98      0.72      0.83      6334
           1       0.14      0.71      0.23       390

    accuracy                           0.72      6724
   macro avg       0.56      0.72      0.53      6724
weighted avg       0.93      0.72      0.80      6724

accuracy = 0.72
model saved
Epoch: 02 | Epoch Time: 0m 2s
	Train Loss: 0.563 | Train acc: 71.60%
	 Val. Loss: 0.549 |  Val. acc: 72.19%
	 Val. Loss: 0.549 |  Val. F1: 52.92%


100%|██████████| 211/211 [00:02<00:00, 92.47it/s]
100%|██████████| 53/53 [00:00<00:00, 435.97it/s]


              precision    recall  f1-score   support

           0       0.97      0.73      0.83      6234
           1       0.17      0.72      0.28       490

    accuracy                           0.73      6724
   macro avg       0.57      0.73      0.56      6724
weighted avg       0.91      0.73      0.79      6724

accuracy = 0.73
model saved
Epoch: 03 | Epoch Time: 0m 2s
	Train Loss: 0.534 | Train acc: 73.41%
	 Val. Loss: 0.529 |  Val. acc: 72.99%
	 Val. Loss: 0.529 |  Val. F1: 55.69%


100%|██████████| 211/211 [00:02<00:00, 92.11it/s]
100%|██████████| 53/53 [00:00<00:00, 554.68it/s]


              precision    recall  f1-score   support

           0       0.96      0.75      0.84      6009
           1       0.26      0.74      0.39       715

    accuracy                           0.75      6724
   macro avg       0.61      0.75      0.61      6724
weighted avg       0.89      0.75      0.79      6724

accuracy = 0.75
model saved
Epoch: 04 | Epoch Time: 0m 2s
	Train Loss: 0.511 | Train acc: 75.07%
	 Val. Loss: 0.514 |  Val. acc: 74.91%
	 Val. Loss: 0.514 |  Val. F1: 61.41%


100%|██████████| 211/211 [00:02<00:00, 92.30it/s]
100%|██████████| 53/53 [00:00<00:00, 557.90it/s]


              precision    recall  f1-score   support

           0       0.96      0.76      0.85      5913
           1       0.30      0.74      0.42       811

    accuracy                           0.76      6724
   macro avg       0.63      0.75      0.63      6724
weighted avg       0.88      0.76      0.79      6724

accuracy = 0.76
model saved
Epoch: 05 | Epoch Time: 0m 2s
	Train Loss: 0.493 | Train acc: 76.17%
	 Val. Loss: 0.502 |  Val. acc: 75.62%
	 Val. Loss: 0.502 |  Val. F1: 63.45%


100%|██████████| 211/211 [00:02<00:00, 96.32it/s]
100%|██████████| 53/53 [00:00<00:00, 542.90it/s]


              precision    recall  f1-score   support

           0       0.96      0.76      0.85      5905
           1       0.30      0.75      0.43       819

    accuracy                           0.76      6724
   macro avg       0.63      0.76      0.64      6724
weighted avg       0.88      0.76      0.80      6724

accuracy = 0.76
model saved
Epoch: 06 | Epoch Time: 0m 2s
	Train Loss: 0.480 | Train acc: 76.95%
	 Val. Loss: 0.495 |  Val. acc: 75.89%
	 Val. Loss: 0.495 |  Val. F1: 63.92%


100%|██████████| 211/211 [00:02<00:00, 92.68it/s]
100%|██████████| 53/53 [00:00<00:00, 563.81it/s]


              precision    recall  f1-score   support

           0       0.94      0.77      0.85      5701
           1       0.37      0.73      0.49      1023

    accuracy                           0.77      6724
   macro avg       0.65      0.75      0.67      6724
weighted avg       0.85      0.77      0.79      6724

accuracy = 0.77
model saved
Epoch: 07 | Epoch Time: 0m 2s
	Train Loss: 0.467 | Train acc: 77.67%
	 Val. Loss: 0.485 |  Val. acc: 76.67%
	 Val. Loss: 0.485 |  Val. F1: 66.77%


100%|██████████| 211/211 [00:02<00:00, 93.39it/s]
100%|██████████| 53/53 [00:00<00:00, 499.69it/s]


              precision    recall  f1-score   support

           0       0.93      0.78      0.85      5610
           1       0.39      0.72      0.51      1114

    accuracy                           0.77      6724
   macro avg       0.66      0.75      0.68      6724
weighted avg       0.84      0.77      0.79      6724

accuracy = 0.77
model saved
Epoch: 08 | Epoch Time: 0m 2s
	Train Loss: 0.455 | Train acc: 78.59%
	 Val. Loss: 0.482 |  Val. acc: 76.92%
	 Val. Loss: 0.482 |  Val. F1: 67.80%


100%|██████████| 211/211 [00:02<00:00, 94.05it/s]
100%|██████████| 53/53 [00:00<00:00, 514.56it/s]


              precision    recall  f1-score   support

           0       0.94      0.78      0.85      5660
           1       0.39      0.74      0.51      1064

    accuracy                           0.77      6724
   macro avg       0.66      0.76      0.68      6724
weighted avg       0.85      0.77      0.80      6724

accuracy = 0.77
model saved
Epoch: 09 | Epoch Time: 0m 2s
	Train Loss: 0.443 | Train acc: 79.14%
	 Val. Loss: 0.476 |  Val. acc: 77.33%
	 Val. Loss: 0.476 |  Val. F1: 68.03%


100%|██████████| 211/211 [00:02<00:00, 95.49it/s]
100%|██████████| 53/53 [00:00<00:00, 499.03it/s]


              precision    recall  f1-score   support

           0       0.93      0.79      0.85      5569
           1       0.41      0.73      0.53      1155

    accuracy                           0.78      6724
   macro avg       0.67      0.76      0.69      6724
weighted avg       0.84      0.78      0.80      6724

accuracy = 0.78
model saved
Epoch: 10 | Epoch Time: 0m 2s
	Train Loss: 0.434 | Train acc: 79.55%
	 Val. Loss: 0.472 |  Val. acc: 77.62%
	 Val. Loss: 0.472 |  Val. F1: 69.05%


100%|██████████| 211/211 [00:02<00:00, 94.68it/s]
100%|██████████| 53/53 [00:00<00:00, 552.10it/s]


              precision    recall  f1-score   support

           0       0.94      0.78      0.85      5620
           1       0.40      0.74      0.52      1104

    accuracy                           0.78      6724
   macro avg       0.67      0.76      0.69      6724
weighted avg       0.85      0.78      0.80      6724

accuracy = 0.78
model saved
Epoch: 11 | Epoch Time: 0m 2s
	Train Loss: 0.424 | Train acc: 80.37%
	 Val. Loss: 0.468 |  Val. acc: 77.57%
	 Val. Loss: 0.468 |  Val. F1: 68.64%


100%|██████████| 211/211 [00:02<00:00, 92.64it/s]
100%|██████████| 53/53 [00:00<00:00, 524.37it/s]


              precision    recall  f1-score   support

           0       0.93      0.79      0.86      5549
           1       0.43      0.74      0.54      1175

    accuracy                           0.78      6724
   macro avg       0.68      0.76      0.70      6724
weighted avg       0.84      0.78      0.80      6724

accuracy = 0.78
model saved
Epoch: 12 | Epoch Time: 0m 2s
	Train Loss: 0.414 | Train acc: 81.13%
	 Val. Loss: 0.464 |  Val. acc: 78.00%
	 Val. Loss: 0.464 |  Val. F1: 69.72%


100%|██████████| 211/211 [00:02<00:00, 92.04it/s]
100%|██████████| 53/53 [00:00<00:00, 524.75it/s]


              precision    recall  f1-score   support

           0       0.93      0.80      0.86      5472
           1       0.45      0.73      0.55      1252

    accuracy                           0.78      6724
   macro avg       0.69      0.76      0.71      6724
weighted avg       0.84      0.78      0.80      6724

accuracy = 0.78
model saved
Epoch: 13 | Epoch Time: 0m 2s
	Train Loss: 0.405 | Train acc: 81.54%
	 Val. Loss: 0.461 |  Val. acc: 78.26%
	 Val. Loss: 0.461 |  Val. F1: 70.55%


100%|██████████| 211/211 [00:02<00:00, 93.31it/s]
100%|██████████| 53/53 [00:00<00:00, 587.65it/s]


              precision    recall  f1-score   support

           0       0.93      0.79      0.86      5500
           1       0.44      0.74      0.55      1224

    accuracy                           0.78      6724
   macro avg       0.69      0.77      0.71      6724
weighted avg       0.84      0.78      0.80      6724

accuracy = 0.78
model saved
Epoch: 14 | Epoch Time: 0m 2s
	Train Loss: 0.398 | Train acc: 82.14%
	 Val. Loss: 0.458 |  Val. acc: 78.41%
	 Val. Loss: 0.458 |  Val. F1: 70.58%


100%|██████████| 211/211 [00:02<00:00, 94.58it/s]
100%|██████████| 53/53 [00:00<00:00, 546.18it/s]


              precision    recall  f1-score   support

           0       0.93      0.79      0.86      5501
           1       0.44      0.74      0.56      1223

    accuracy                           0.78      6724
   macro avg       0.69      0.77      0.71      6724
weighted avg       0.84      0.78      0.80      6724

accuracy = 0.78
model saved
Epoch: 15 | Epoch Time: 0m 2s
	Train Loss: 0.392 | Train acc: 82.62%
	 Val. Loss: 0.456 |  Val. acc: 78.48%
	 Val. Loss: 0.456 |  Val. F1: 70.67%


100%|██████████| 211/211 [00:02<00:00, 94.19it/s]
100%|██████████| 53/53 [00:00<00:00, 539.28it/s]


              precision    recall  f1-score   support

           0       0.93      0.80      0.86      5510
           1       0.45      0.75      0.56      1214

    accuracy                           0.79      6724
   macro avg       0.69      0.77      0.71      6724
weighted avg       0.85      0.79      0.81      6724

accuracy = 0.79
model saved
Epoch: 16 | Epoch Time: 0m 2s
	Train Loss: 0.384 | Train acc: 83.02%
	 Val. Loss: 0.453 |  Val. acc: 78.70%
	 Val. Loss: 0.453 |  Val. F1: 70.92%


100%|██████████| 211/211 [00:02<00:00, 96.48it/s]
100%|██████████| 53/53 [00:00<00:00, 540.78it/s]


              precision    recall  f1-score   support

           0       0.93      0.80      0.86      5482
           1       0.45      0.74      0.56      1242

    accuracy                           0.79      6724
   macro avg       0.69      0.77      0.71      6724
weighted avg       0.84      0.79      0.81      6724

accuracy = 0.79
model saved
Epoch: 17 | Epoch Time: 0m 2s
	Train Loss: 0.376 | Train acc: 83.36%
	 Val. Loss: 0.451 |  Val. acc: 78.79%
	 Val. Loss: 0.451 |  Val. F1: 71.21%


100%|██████████| 211/211 [00:02<00:00, 96.94it/s]
100%|██████████| 53/53 [00:00<00:00, 518.24it/s]


              precision    recall  f1-score   support

           0       0.93      0.80      0.86      5460
           1       0.46      0.74      0.57      1264

    accuracy                           0.79      6724
   macro avg       0.70      0.77      0.72      6724
weighted avg       0.84      0.79      0.81      6724

accuracy = 0.79
model saved
Epoch: 18 | Epoch Time: 0m 2s
	Train Loss: 0.365 | Train acc: 84.29%
	 Val. Loss: 0.450 |  Val. acc: 78.94%
	 Val. Loss: 0.450 |  Val. F1: 71.55%


100%|██████████| 211/211 [00:02<00:00, 95.79it/s]
100%|██████████| 53/53 [00:00<00:00, 563.80it/s]


              precision    recall  f1-score   support

           0       0.93      0.80      0.86      5475
           1       0.46      0.75      0.57      1249

    accuracy                           0.79      6724
   macro avg       0.70      0.78      0.72      6724
weighted avg       0.85      0.79      0.81      6724

accuracy = 0.79
model saved
Epoch: 19 | Epoch Time: 0m 2s
	Train Loss: 0.360 | Train acc: 84.80%
	 Val. Loss: 0.449 |  Val. acc: 79.13%
	 Val. Loss: 0.449 |  Val. F1: 71.72%


100%|██████████| 211/211 [00:02<00:00, 95.01it/s]
100%|██████████| 53/53 [00:00<00:00, 569.90it/s]


              precision    recall  f1-score   support

           0       0.94      0.80      0.86      5492
           1       0.46      0.75      0.57      1232

    accuracy                           0.79      6724
   macro avg       0.70      0.78      0.72      6724
weighted avg       0.85      0.79      0.81      6724

accuracy = 0.79
model saved
Epoch: 20 | Epoch Time: 0m 2s
	Train Loss: 0.349 | Train acc: 85.44%
	 Val. Loss: 0.446 |  Val. acc: 79.09%
	 Val. Loss: 0.446 |  Val. F1: 71.56%


In [None]:
# Save BETO and CNN

# torch.save(CNNmodel, tempname + 'module2_part1.pt')


## EVALUATING

In [None]:
# Load the CNN checkpoint written by the training loop

import torch
# `map_location` places the loaded tensors on the active device, so a checkpoint
# saved from a GPU can also be opened on a CPU-only machine.
# NOTE(review): the file is a pickle of the whole module, so the CNNForNLP class
# must be defined before loading, and only self-created checkpoints should be
# opened. Recent PyTorch releases default `torch.load` to `weights_only=True`,
# which rejects such files; TODO confirm whether `weights_only=False` is needed
# in the target environment.
CNNmodel = torch.load(tempname + '_task2a_2.pt', map_location=device)
CNNmodel.eval()



CNNForNLP(
  (embedding): Embedding(31002, 768)
  (convs): ModuleList(
    (0): Conv1d(768, 32, kernel_size=(3,), stride=(1,))
  )
  (dropout): Dropout(p=0.1, inplace=False)
  (fc): Linear(in_features=32, out_features=2, bias=True)
)

Predict labels for the test set and collect them alongside the true labels

In [None]:
# Encode the test labels with the encoder fitted on the training labels, the
# same way the training and validation labels were encoded.
test_encoded_labels = le.transform(test_labels)

test_sent_index, test_input_ids, test_attention_masks, test_encoded_label_tensors = encoder_generator(test_sentences, test_encoded_labels)
test_dataset = TensorDataset(test_input_ids, test_attention_masks, test_encoded_label_tensors)

# Read the test set in its original order so predictions stay aligned with
# `test_sentences`.
test_data_loader = DataLoader(test_dataset,
                              sampler=SequentialSampler(test_dataset),
                              batch_size=bs)

all_pred_labels = []
all_true_labels = []

# Make sure dropout is disabled even if this cell is run on its own.
CNNmodel.eval()

with torch.no_grad():
    for batch in tqdm(test_data_loader):
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        predictions = CNNmodel(b_input_ids, b_input_mask)

        predictions = predictions.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        pred, true = predictions_labels(predictions, label_ids)

        all_pred_labels.extend(pred)
        all_true_labels.extend(true)

100%|██████████| 28/28 [00:00<00:00, 560.01it/s]


In [None]:
# Final scores on the test set (classification report).
# classification_report is called as (true labels, predicted labels); the
# commented-out call below passes them in the opposite order.

# print(classification_report(all_pred_labels,all_true_labels, digits = 4))
print(classification_report(all_true_labels,all_pred_labels, digits = 4))

              precision    recall  f1-score   support

           0     0.8887    0.9263    0.9071      2932
           1     0.5773    0.4646    0.5148       635

    accuracy                         0.8441      3567
   macro avg     0.7330    0.6954    0.7110      3567
weighted avg     0.8333    0.8441    0.8373      3567



In [None]:
# Confusion matrix on the test set (rows: true labels, columns: predicted labels)

from sklearn.metrics import confusion_matrix
cm = confusion_matrix(all_true_labels, all_pred_labels)
cm

array([[2745,  187],
       [ 342,  293]], dtype=int64)