The code presented here is based on https://mccormickml.com/2019/07/22/BERT-fine-tuning/

In [1]:
import numpy as np
import random
import pandas as pd
import time
import datetime
from sklearn.model_selection import train_test_split
import nlpaug.augmenter.word as naw

In [1]:
!pip install adapter-transformers

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Collecting adapter-transformers
  Downloading adapter_transformers-3.1.0-py3-none-any.whl (4.8 MB)
     ---------------------------------------- 4.8/4.8 MB 6.6 MB/s eta 0:00:00
Collecting numpy>=1.17
  Downloading numpy-1.24.1-cp39-cp39-win_amd64.whl (14.9 MB)
     ---------------------------------------- 14.9/14.9 MB 8.0 MB/s eta 0:00:00
Collecting regex!=2019.12.17
  Downloading regex-2022.10.31-cp39-cp39-win_amd64.whl (267 kB)
     -------------------------------------- 267.8/267.8 KB 8.3 MB/s eta 0:00:00
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.11.1-py3-none-any.whl (182 kB)
     -------------------------------------- 182.4/182.4 KB 5.6 MB/s eta 0:00:00
Collecting filelock
  Downloading filelock-3.9.0-py3-none-any.whl (9.7 kB)
Collecting tqdm>=4.27
  Downloading tqdm-4.64.1-py2.py3-none-any.whl (78 kB)
     ---------------------------------------- 78.5/78.5 KB 4.5 MB/s eta 0:00:00

You should consider upgrading via the 'C:\Users\Priscila\AppData\Local\Programs\Python\Python39\python.exe -m pip install --upgrade pip' command.


In [10]:
modelPath = "Model/BertBaseMultUncased/"

In [2]:
cleaning_data = pd.read_csv('Data/Tweet_Processed_DataCleaning_Done.csv')

In [3]:
cleaning_data.head()

Unnamed: 0,Tweet,Label
0,euedsonduarte lilovlog jairbolsonaro exatamen...,0.0
1,a china fecha o primeiro laboratorio do mundo ...,0.0
2,janeiro china mente sobre a de mortos nos caso...,0.0
3,nivel de poluicao na china cai drasticamente a...,0.0
4,eikebatista os que cruzam os oceanos trazem u...,0.0


In [4]:
x_data = cleaning_data['Tweet']
y_data = cleaning_data['Label']

In [None]:
x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.2, stratify=y_data, random_state=43)

In [None]:
x_train, x_valid, y_train, y_valid = train_test_split(x_train, y_train, test_size=0.2, stratify=y_train, random_state=43)

### Data Augmentation

In [None]:
#!pip install nlpaug

In [None]:
np.count_nonzero(y_train == 1)

In [None]:
np.count_nonzero(y_train == 0)

In [5]:
# Positional indices (1-D integer array) of every sample labelled 1.
positive_mask = np.asarray(y_data == 1)
indices_y = np.flatnonzero(positive_mask)

In [6]:
# Pick the tweets located at the positive-label positions computed in indices_y.
# NOTE(review): the selection is made on the full x_data, not on x_train, so the
# augmentation source also includes tweets that belong to the validation/test splits.
x_train_aug = np.take(x_data, indices_y)
x_train_aug.shape

(2034,)

In [7]:
x_train_aug

10       a teoria da conspiracao de hoje ta sendo a chi...
19       sem comentarios - video - pastor diz que china...
22        biakicis gleisi vai para china sua coronaviru...
35       melhor explicacao - coronavirus e igual macarr...
37       c a china enviar coronavirus pra ca, enviaremo...
                               ...                        
24131     rodrigomaia voce nasceu no chile deveria volt...
24139    mano imagina gestar uma crianca por nove meses...
24144     jbsantz covid xing ling kkkkk essa prof gosta...
24179    covid- o virus chines ou o crime chines? veja ...
24188     dmaia jairbolsonaro vou deixar o mesmo recado...
Name: Tweet, Length: 2034, dtype: object

In [8]:
# Labels that correspond to x_train_aug: they must be read from y_data (the label
# column), at the same positions that were used to select the tweets.
y_train_aug = np.take(y_data, indices_y)

In [11]:
aug_syn = naw.SynonymAug(aug_src='wordnet', model_path=modelPath, lang ='por')

In [12]:
# Create 10 synonym-augmented variants of every positive tweet and add them,
# labelled 1, to x_data / y_data.
# np.append returns a NEW array and never modifies its argument, so the augmented
# samples are collected in a list and assigned back explicitly.
augmented_texts = []
for sentence in x_train_aug:
    for _ in range(10):
        augmented = aug_syn.augment(sentence)
        # Depending on the nlpaug version, augment() returns a str or a list of str.
        if isinstance(augmented, str):
            augmented_texts.append(augmented)
        else:
            augmented_texts.extend(augmented)

# NOTE: re-running this cell appends another round of augmented samples.
x_data = pd.concat(
    [x_data, pd.Series(augmented_texts, name=x_data.name, dtype=object)],
    ignore_index=True,
)
y_data = pd.concat(
    [y_data, pd.Series(np.ones(len(augmented_texts)), name=y_data.name)],
    ignore_index=True,
)

In [None]:
x_train_aug

In [13]:
# Training frame with the column names expected further down ('text', 'labels').
# The labels are read from the CSV as floats (0.0 / 1.0); a 2-class classification
# head needs integer class ids, so they are cast to int64 here.
# NOTE(review): x_data / y_data are the full dataset, so this frame also contains
# the rows that were split off into the validation and test sets.
train_data = pd.DataFrame({'text': x_data, 'labels': y_data.astype('int64')})

In [14]:
train_data.to_csv('DataAugmentation.csv')

In [None]:
train_data.count()

In [None]:
train_data.iloc[[2840]].text

In [None]:
# Validation frame with the same columns as the training frame.
# The labels are read from the CSV as floats (0.0 / 1.0); a 2-class classification
# head needs integer class ids, so they are cast to int64 here.
valid_data = pd.DataFrame({'text': x_valid, 'labels': y_valid.astype('int64')})

In [None]:
valid_data.count()

In [None]:
valid_data.head()

In [None]:
#y_train = y_train.values

In [None]:
#x_test = x_test.values
#y_test = y_test.values

#x_valid = x_valid.values
#y_valid = y_valid.values

In [None]:
import torch
from transformers import BertTokenizer

In [None]:
# Drop any cached CUDA allocations before picking the device.
torch.cuda.empty_cache()

gpu_found = torch.cuda.is_available()
if not gpu_found:
    # No CUDA device: run everything on the CPU.
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
else:
    # Tell PyTorch to use the GPU and report which one.
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))

In [None]:
def preProcessing(data, maxLength, tokenizer):
    """Tokenize every tweet in ``data`` and collect the encodings.

    Each tweet is encoded with ``tokenizer.encode_plus``. The resulting
    ``input_ids`` and ``attention_mask`` entries are appended to the
    module-level lists ``input_ids`` and ``attention_masks``; the caller must
    create (and reset) those two lists before calling this function.

    Args:
        data: iterable of tweet strings.
        maxLength: length every encoded sequence is padded or truncated to.
        tokenizer: object exposing ``encode_plus`` (e.g. a ``BertTokenizer``).

    Returns:
        None. The results are delivered through the two module-level lists.
    """
    for tweet in data:
        encoded = tokenizer.encode_plus(
            tweet,                       # Sentence to be tokenized
            add_special_tokens=True,     # Add [CLS] at the beginning and [SEP] at the end of the sentence
            max_length=maxLength,        # Maximum size of the encoded sentence
            truncation=True,             # Cut sentences longer than max_length; without this they keep
                                         # their full length and cannot be concatenated afterwards
            padding='max_length',        # Fill shorter sentences with [PAD] tokens up to max_length
            return_attention_mask=True,  # Array of 0/1 telling [PAD] tokens apart from real tokens
            return_tensors='pt',         # Return PyTorch tensors
        )

        input_ids.append(encoded['input_ids'])
        attention_masks.append(encoded['attention_mask'])

In [None]:
# Load the BERT tokenizer from the local model directory.
bert_tokenizer = BertTokenizer.from_pretrained(modelPath, local_files_only=True, do_lower_case=True)

# ---- Training split ----
# preProcessing appends to these module-level lists, so start from empty ones.
input_ids = []
attention_masks = []

# Tokenize the sentences of the training dataset
preProcessing(x_train, 512, bert_tokenizer)

# torch.cat joins the per-tweet tensors along dimension 0
inputIdTrain = torch.cat(input_ids, dim=0)
attentionMaskTrain = torch.cat(attention_masks, dim=0)
# Convert through a plain array so the result does not depend on the (shuffled)
# pandas index, then shape the labels as a column of int64 class ids.
labelsTrain = torch.tensor(np.asarray(y_train)).unsqueeze(1)
labelsTrain = labelsTrain.to(torch.int64)


# ---- Validation split ----
# Reset the accumulators so the validation encodings are not mixed with the training ones.
input_ids = []
attention_masks = []

# Tokenize the sentences of the validation dataset
preProcessing(x_valid, 512, bert_tokenizer)

# torch.cat joins the per-tweet tensors along dimension 0
inputIdValid = torch.cat(input_ids, dim=0)
attentionMaskValid = torch.cat(attention_masks, dim=0)
labelsValid = torch.tensor(np.asarray(y_valid)).unsqueeze(1)
labelsValid = labelsValid.to(torch.int64)


In [None]:
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler, random_split

In [None]:
#!pip install datasets

In [None]:
# NOTE: this rebinds the name `Dataset`, replacing the torch.utils.data.Dataset
# imported in the previous import cell. The `Dataset.from_pandas(...)` calls below
# rely on this (Hugging Face `datasets`) class.
from datasets import Dataset

In [None]:
#TensorDataset encapsulates the data
#train_data = TensorDataset(inputIdTrain, attentionMaskTrain, labelsTrain)
#valid_data = TensorDataset(inputIdValid, attentionMaskValid, labelsValid)

In [None]:
train_dataset = Dataset.from_pandas(train_data)

In [None]:
valid_dataset = Dataset.from_pandas(valid_data)

In [None]:
def processingToken(batch):
  """Tokenize the ``text`` entries of a batch with ``bert_tokenizer``.

  Every text is truncated or padded to 512 tokens. The tokenizer's output
  is returned unchanged.
  """
  tokenizer_options = {"max_length": 512, "truncation": True, "padding": "max_length"}
  texts = batch["text"]
  return bert_tokenizer(texts, **tokenizer_options)

In [None]:
# Tokenize the training examples batch-wise with processingToken
train_dataset = train_dataset.map(processingToken, batched=True)
# Return PyTorch tensors and expose only the columns listed here
train_columns = ["input_ids", "attention_mask", "labels"]
train_dataset.set_format(type="torch", columns=train_columns)

In [None]:
# Tokenize the validation examples batch-wise with processingToken
valid_dataset = valid_dataset.map(processingToken, batched=True)
# Return PyTorch tensors and expose only the columns listed here
valid_columns = ["input_ids", "attention_mask", "labels"]
valid_dataset.set_format(type="torch", columns=valid_columns)

In [None]:
from transformers import BertConfig, BertModelWithHeads

In [None]:
# Load the BERT configuration from the local model directory (local_files_only=True)
# and set it up for two output labels.
config = BertConfig.from_pretrained(
    modelPath, 
    local_files_only=True,
    num_labels=2,
)

In [None]:
# Load the pretrained BERT weights from the same local directory, using the
# configuration created above.
model = BertModelWithHeads.from_pretrained(
    modelPath, 
    local_files_only=True,
    config=config,
)

In [None]:
# One name is shared by the adapter and its classification head.
adapter_name = "rotten_tomatoes"

# Register the new adapter on the model
model.add_adapter(adapter_name)
# Attach a 2-label classification head under the same name
model.add_classification_head(adapter_name, num_labels=2)
# Activate the adapter for training
model.train_adapter(adapter_name)

In [None]:
import numpy as np
from transformers import TrainingArguments, AdapterTrainer, EvalPrediction

In [None]:
# Training hyper-parameters: learning rate 1e-4, 6 epochs, batches of 16 per device
# for both training and evaluation, logging every 200 steps. Outputs go to
# ./training_output, and an existing output directory is overwritten.
training_args = TrainingArguments(
    learning_rate=1e-4,
    num_train_epochs=6,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    logging_steps=200,
    output_dir="./training_output",
    overwrite_output_dir=True,
    # The next line is important to ensure the dataset labels are properly passed to the model
    remove_unused_columns=False,
)

In [None]:
def compute_accuracy(p: EvalPrediction):
  """Accuracy metric: share of samples whose arg-max prediction equals the label.

  Reads ``p.predictions`` (per-class scores, arg-max taken over axis 1) and
  ``p.label_ids``; returns a dict with the single key ``"acc"``.
  """
  predicted_classes = np.argmax(p.predictions, axis=1)
  hits = predicted_classes == p.label_ids
  return {"acc": hits.mean()}

In [None]:
# Bundle the model, the training arguments, the training and validation datasets
# and the accuracy metric into a single AdapterTrainer.
trainer = AdapterTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    compute_metrics=compute_accuracy,
)

In [None]:
import torch
torch.cuda.empty_cache()

In [None]:
import gc
gc.collect()

In [None]:
trainer.train()

### Test