In [1]:
import numpy as np
import random
import pandas as pd
import time
import datetime
from sklearn.model_selection import train_test_split

In [2]:
#!pip install transformers
!pip install -U adapter-transformers
!pip install datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting adapter-transformers
  Downloading adapter_transformers-3.1.0-py3-none-any.whl (4.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.8/4.8 MB[0m [31m43.6 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.11.1-py3-none-any.whl (182 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m182.4/182.4 KB[0m [31m25.9 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp38-cp38-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.6/6.6 MB[0m [31m107.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, adapter-transformers
Successfully installed adapter-transformers-3.1.0 huggingface-hub-0.11.1 tokenizers-0.12.1
Looking in indexes: https

In [3]:
# Read the augmented corpus; the path is relative to the working directory.
csv_path = 'DataAugmentation.csv'
train_data = pd.read_csv(csv_path)

In [4]:
# Keep only the two columns used downstream: the input text and its label.
kept_columns = ['text', 'labels']
train_data = train_data[kept_columns]

In [5]:
# Inspect column dtypes and non-null counts of the loaded frame.
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24200 entries, 0 to 24199
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   text    24200 non-null  object 
 1   labels  24200 non-null  float64
dtypes: float64(1), object(1)
memory usage: 378.2+ KB


In [6]:
# `labels` is read from the CSV as float64 (see the .info() output above);
# convert it to integers before splitting.
train_data['labels'] = train_data['labels'].astype(int)

In [7]:
# Separate the inputs (text) from the targets (labels) for the split below.
x_data, y_data = train_data['text'], train_data['labels']

In [8]:
# Hold out 20% for validation, stratified on the labels; the fixed
# random_state makes the split reproducible.
split_parts = train_test_split(
    x_data,
    y_data,
    test_size=0.2,
    stratify=y_data,
    random_state=43,
)
x_train, x_valid, y_train, y_valid = split_parts

In [9]:
# Reassemble the validation texts and labels into a single frame.
valid_columns = {'text': x_valid, 'labels': y_valid}
valid_data = pd.DataFrame(data=valid_columns)

In [10]:
# Reassemble the training texts and labels into a single frame.
# From here on `train_data` refers to the training split only, no longer to
# the full frame loaded from the CSV.
train_columns = {'text': x_train, 'labels': y_train}
train_data = pd.DataFrame(data=train_columns)

In [11]:
# NOTE(review): `labels` was already cast to int before the split, so this
# second cast looks redundant — confirm before removing.
valid_data['labels'] = valid_data['labels'].astype(int)

In [12]:
# Number of non-null values per column in the validation split.
valid_data.count()

text      4840
labels    4840
dtype: int64

In [13]:
import torch 
from transformers import BertTokenizer

In [14]:
# Release any cached, unused GPU memory held by PyTorch's allocator.
torch.cuda.empty_cache()

# Prefer the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
gpu_present = torch.cuda.is_available()
device = torch.device("cuda" if gpu_present else "cpu")

if not gpu_present:
    print('No GPU available, using the CPU instead.')
else:
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))

There are 1 GPU(s) available.
We will use the GPU: A100-SXM4-40GB


In [15]:
# Load the tokenizer that belongs to the multilingual uncased BERT checkpoint.
checkpoint_name = "bert-base-multilingual-uncased"
bert_tokenizer = BertTokenizer.from_pretrained(checkpoint_name)

Downloading vocab.txt:   0%|          | 0.00/851k [00:00<?, ?B/s]

Downloading tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

In [16]:
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler, random_split

In [17]:
from datasets import Dataset

In [18]:
# Build the Hugging Face training set from the pandas training split.
# preserve_index=False keeps the pandas index from being stored as an extra
# `__index_level_0__` column, so the training set matches the validation set,
# where that column is removed before training.
train_dataset = Dataset.from_pandas(train_data, preserve_index=False)

In [19]:
# Build the Hugging Face validation set from the pandas validation split.
# The pandas index comes along as an `__index_level_0__` column, which a later
# cell removes.
valid_dataset = Dataset.from_pandas(valid_data)

In [20]:
def processingToken(batch):
  """Tokenize the ``text`` field of a batch with ``bert_tokenizer``.

  Every sequence is truncated or padded to exactly 512 tokens.
  """
  tokenizer_options = {"max_length": 512, "truncation": True, "padding": "max_length"}
  texts = batch["text"]
  return bert_tokenizer(texts, **tokenizer_options)

In [21]:
# Tokenize the whole training set, one batch of rows at a time.
train_dataset = train_dataset.map(function=processingToken, batched=True)
# Expose only the model inputs and the labels, returned as PyTorch tensors.
tensor_columns = ["input_ids", "attention_mask", "labels"]
train_dataset.set_format(type="torch", columns=tensor_columns)

  0%|          | 0/20 [00:00<?, ?ba/s]

In [22]:
# Tokenize the whole validation set, one batch of rows at a time.
valid_dataset = valid_dataset.map(function=processingToken, batched=True)
# Expose only the model inputs and the labels, returned as PyTorch tensors.
tensor_columns = ["input_ids", "attention_mask", "labels"]
valid_dataset.set_format(type="torch", columns=tensor_columns)

  0%|          | 0/5 [00:00<?, ?ba/s]

In [23]:
# Drop the pandas index column that Dataset.from_pandas carried over.
index_column = '__index_level_0__'
valid_dataset = valid_dataset.remove_columns(index_column)

In [24]:
from transformers import BertConfig, BertAdapterModel

In [25]:
# Configuration of the multilingual uncased BERT checkpoint, with two labels.
config = BertConfig.from_pretrained("bert-base-multilingual-uncased", num_labels=2)

In [26]:
# Load the pretrained BERT weights into an adapter-capable model.
model = BertAdapterModel.from_pretrained("bert-base-multilingual-uncased", config=config)

Downloading pytorch_model.bin:   0%|          | 0.00/641M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertAdapterModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertAdapterModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertAdapterModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [27]:
# The adapter and its classification head share one name.
adapter_name = "sent_ana_port"

# Register a new adapter and a matching two-label classification head.
model.add_adapter(adapter_name)
model.add_classification_head(adapter_name, num_labels=2)
# Select this adapter for training.
model.train_adapter(adapter_name)

In [28]:
import numpy as np
from transformers import TrainingArguments, AdapterTrainer, EvalPrediction

In [29]:
# Hyper-parameters and output settings for the training run.
trainer_options = dict(
    output_dir="./training_output",
    overwrite_output_dir=True,
    num_train_epochs=6,
    learning_rate=1e-4,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    logging_steps=200,
    # Important: keep the dataset columns as they are so the labels are
    # properly passed to the model.
    remove_unused_columns=False,
)
training_args = TrainingArguments(**trainer_options)

In [30]:
def compute_accuracy(p: EvalPrediction):
  """Return the fraction of correctly classified examples as ``{"acc": value}``.

  The predicted class of each row is the index of its largest score in
  ``p.predictions``; it is compared element-wise with ``p.label_ids``.
  """
  scores, gold = p.predictions, p.label_ids
  predicted = np.argmax(scores, axis=1)
  is_correct = predicted == gold
  return {"acc": is_correct.mean()}

In [31]:
# Wire the model, hyper-parameters, data splits and accuracy metric together.
trainer_inputs = {
    "model": model,
    "args": training_args,
    "train_dataset": train_dataset,
    "eval_dataset": valid_dataset,
    "compute_metrics": compute_accuracy,
}
trainer = AdapterTrainer(**trainer_inputs)

In [32]:
# Train with the settings in `training_args`; the loss is logged every 200 steps.
trainer.train()

***** Running training *****
  Num examples = 19360
  Num Epochs = 6
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 7260


Step,Training Loss
200,0.273
400,0.1899
600,0.1788
800,0.183
1000,0.1635
1200,0.1668
1400,0.1673
1600,0.1481
1800,0.1424
2000,0.1464


Saving model checkpoint to ./training_output/checkpoint-500
Configuration saved in ./training_output/checkpoint-500/sent_ana_port/adapter_config.json
Module weights saved in ./training_output/checkpoint-500/sent_ana_port/pytorch_adapter.bin
Configuration saved in ./training_output/checkpoint-500/sent_ana_port/head_config.json
Module weights saved in ./training_output/checkpoint-500/sent_ana_port/pytorch_model_head.bin
Configuration saved in ./training_output/checkpoint-500/sent_ana_port/head_config.json
Module weights saved in ./training_output/checkpoint-500/sent_ana_port/pytorch_model_head.bin
Saving model checkpoint to ./training_output/checkpoint-1000
Configuration saved in ./training_output/checkpoint-1000/sent_ana_port/adapter_config.json
Module weights saved in ./training_output/checkpoint-1000/sent_ana_port/pytorch_adapter.bin
Configuration saved in ./training_output/checkpoint-1000/sent_ana_port/head_config.json
Module weights saved in ./training_output/checkpoint-1000/sent_an

TrainOutput(global_step=7260, training_loss=0.14227075786958385, metrics={'train_runtime': 2024.5813, 'train_samples_per_second': 57.375, 'train_steps_per_second': 3.586, 'total_flos': 3.109293563904e+16, 'train_loss': 0.14227075786958385, 'epoch': 6.0})

In [33]:
# Evaluate on the trainer's eval_dataset (valid_dataset); the result includes
# the loss and the `acc` metric from compute_accuracy.
trainer.evaluate()

***** Running Evaluation *****
  Num examples = 4840
  Batch size = 16


{'eval_loss': 0.1486884206533432,
 'eval_acc': 0.9409090909090909,
 'eval_runtime': 38.4482,
 'eval_samples_per_second': 125.884,
 'eval_steps_per_second': 7.881,
 'epoch': 6.0}

In [34]:
model.save_adapter("./final_adapter", "sent_ana_port")

!ls -lh final_adapter

Configuration saved in ./final_adapter/adapter_config.json
Module weights saved in ./final_adapter/pytorch_adapter.bin
Configuration saved in ./final_adapter/head_config.json
Module weights saved in ./final_adapter/pytorch_model_head.bin


total 5.7M
-rw-r--r-- 1 root root 1.1K Jan 23 22:08 adapter_config.json
-rw-r--r-- 1 root root  417 Jan 23 22:08 head_config.json
-rw-r--r-- 1 root root 3.5M Jan 23 22:08 pytorch_adapter.bin
-rw-r--r-- 1 root root 2.3M Jan 23 22:08 pytorch_model_head.bin
