In [1]:
!nvidia-smi

Fri May 20 12:47:33 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 512.15       Driver Version: 512.15       CUDA Version: 11.6     |
|-------------------------------+----------------------+----------------------+
| GPU  Name            TCC/WDDM | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ... WDDM  | 00000000:01:00.0 Off |                  N/A |
| N/A   63C    P8     9W /  N/A |      0MiB /  6144MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

## Imports

In [2]:
from datasets import Dataset, DatasetDict, load_dataset
import warnings
import numpy as np
import pandas as pd
import json
from transformers.adapters.composition import Fuse
from transformers import (
    AutoTokenizer,
    AutoModelForQuestionAnswering,
    AutoModelWithHeads,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding,
    default_data_collator,
    PreTrainedTokenizerFast,
    AutoAdapterModel,
    AdapterConfig,
    AutoModelForQuestionAnswering,
)
from transformers import AdapterConfig
from transformers.adapters.composition import Stack

warnings.filterwarnings("ignore")


## Load Dataset

In [3]:
def squad_json_to_dataframe_train(input_file_path, record_path = ['data','paragraphs','qas','answers'],
                           verbose = 1):
    """
    input_file_path: path to the squad json file.
    record_path: path to deepest level in json file default value is
    ['data','paragraphs','qas','answers']
    verbose: 0 to suppress it default is 1
    """
    if verbose:
        print("Reading the json file")    
    file = json.loads(open(input_file_path).read())
    if verbose:
        print("processing...")
    # parsing different level's in the json file
    js = pd.io.json.json_normalize(file , record_path )
    m = pd.io.json.json_normalize(file, record_path[:-1] )
    r = pd.io.json.json_normalize(file,record_path[:-2])
    
    #combining it into single dataframe
    idx = np.repeat(r['context'].values, r.qas.str.len())
    ndx  = np.repeat(m['id'].values,m['answers'].str.len())
    m['context'] = idx
    js['q_idx'] = ndx
    main = pd.concat([ m[['id','question','context']].set_index('id'),js.set_index('q_idx')],1,sort=False).reset_index()
    main['c_id'] = main['context'].factorize()[0]
    if verbose:
        print("shape of the dataframe is {}".format(main.shape))
        print("Done")
    return main

In [4]:
# training data
language = 'vi'
input_file_path = f'../SQuAD/translate-train/squad.translate.train.en-{language}.json'
record_path = ['data','paragraphs','qas','answers']
train = squad_json_to_dataframe_train(input_file_path=input_file_path,record_path=record_path,verbose=0)

def get_answers(x):
    start = x[0]
    text = x[1]
    return {
        'answer_start': [start],
        'text': [text]
    }

train['answers'] = train[['answer_start', 'text']].apply(get_answers, axis=1)
pd.set_option('display.max_colwidth',None)
train.head(1)

Unnamed: 0,index,question,context,answer_start,text,c_id,answers
0,5726e1755951b619008f8141,Tên của nhà vật lý người Hà Lan đã phát minh ra Leyden Jar là gì?,"Vào tháng 10 năm 1745, Ewald Georg von Kleist ở Pomerania, Đức, nhận thấy rằng điện tích có thể được lưu trữ bằng cách kết nối một máy phát tĩnh điện cao áp bằng một sợi dây với một thể tích nước trong một lọ thủy tinh cầm tay. Tay và nước của Von Kleist đóng vai trò là chất dẫn điện và bình như một chất điện môi (mặc dù chi tiết về cơ chế được xác định không chính xác tại thời điểm đó). Von Kleist nhận thấy rằng việc chạm vào dây dẫn đến một tia lửa mạnh mẽ, đau đớn hơn nhiều so với việc lấy từ máy tĩnh điện. Năm sau, nhà vật lý người Hà Lan Pieter van Musschenbroek đã phát minh ra một tụ điện tương tự, được đặt tên là bình Leyden, sau Đại học Leiden nơi ông làm việc. Ông cũng bị ấn tượng bởi sức mạnh của cú sốc mà ông nhận được, viết: ""Tôi sẽ không chịu cú sốc thứ hai cho vương quốc Pháp.""",549,Pieter van Musschenbroek,0,"{'answer_start': [549], 'text': ['Pieter van Musschenbroek']}"


In [5]:
# Validation data
language = 'hi'
input_file_path = f'../SQuAD/translate-dev/squad.translate.dev.en-{language}.json'
record_path = ['data','paragraphs','qas','answers']
dev = squad_json_to_dataframe_train(input_file_path=input_file_path,record_path=record_path,verbose=0)
dev['answers'] = dev[['answer_start', 'text']].apply(get_answers, axis=1)
pd.set_option('display.max_colwidth',None)
dev.head(1)

Unnamed: 0,index,question,context,answer_start,text,c_id,answers
0,56e1c9bfe3433e1400423196,बहुपद समय में कमी क्या इसका एक उदाहरण है?,"कमी की अवधारणा का उपयोग करके कई जटिलता वर्गों को परिभाषित किया गया है। एक कमी एक समस्या का दूसरी समस्या में परिवर्तन है। यह एक समस्या की अनौपचारिक धारणा को कम से कम एक और समस्या के रूप में मुश्किल बनाता है। उदाहरण के लिए, यदि कोई समस्या Y के लिए एल्गोरिथ्म का उपयोग करके X को हल किया जा सकता है, तो X, Y से अधिक कठिन नहीं है, और हम कहते हैं कि X, Y को कम कर देता है। कई अलग-अलग प्रकार के कटौती हैं, जिनके आधार पर कटौती की विधि, जैसे कि कुक रिडक्शन, कार्प रिडक्शन और लेविन रिडक्शन, और पोलिनेशन-टाइम रिडक्शन या लॉग-स्पेस रिडक्शन जैसी कटौती की जटिलता पर बाध्य होती है।",378,प्रकार के कटौती,0,"{'answer_start': [378], 'text': ['प्रकार के कटौती']}"


In [6]:
tds = Dataset.from_pandas(train.sample(100))
vds = Dataset.from_pandas(dev.sample(100))


squad = DatasetDict()

squad['train'] = tds
squad['validation'] = vds

In [7]:
squad

DatasetDict({
    train: Dataset({
        features: ['index', 'question', 'context', 'answer_start', 'text', 'c_id', 'answers', '__index_level_0__'],
        num_rows: 100
    })
    validation: Dataset({
        features: ['index', 'question', 'context', 'answer_start', 'text', 'c_id', 'answers', '__index_level_0__'],
        num_rows: 100
    })
})

In [8]:
# squad_en = load_dataset("squad")
# squad_en

## Create QA Features

In [9]:

tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")
assert isinstance(tokenizer, PreTrainedTokenizerFast)
pad_on_right = tokenizer.padding_side == "right"

In [10]:
max_length = 384 # The maximum length of a feature (question and context)
doc_stride = 128 # The authorized overlap between two part of the context when splitting it is needed.
def prepare_train_features(examples):
    # Some of the questions have lots of whitespace on the left, which is not useful and will make the
    # truncation of the context fail (the tokenized question will take a lots of space). So we remove that
    # left whitespace
    examples["question"] = [q.lstrip() for q in examples["question"]]

    # Tokenize our examples with truncation and padding, but keep the overflows using a stride. This results
    # in one example possible giving several features when a context is long, each of those features having a
    # context that overlaps a bit the context of the previous feature.
    tokenized_examples = tokenizer(
        examples["question" if pad_on_right else "context"],
        examples["context" if pad_on_right else "question"],
        truncation="only_second" if pad_on_right else "only_first",
        max_length=max_length,
        stride=doc_stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    # Since one example might give us several features if it has a long context, we need a map from a feature to
    # its corresponding example. This key gives us just that.
    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
    # The offset mappings will give us a map from token to character position in the original context. This will
    # help us compute the start_positions and end_positions.
    offset_mapping = tokenized_examples.pop("offset_mapping")

    # Let's label those examples!
    tokenized_examples["start_positions"] = []
    tokenized_examples["end_positions"] = []

    for i, offsets in enumerate(offset_mapping):
        # We will label impossible answers with the index of the CLS token.
        input_ids = tokenized_examples["input_ids"][i]
        cls_index = input_ids.index(tokenizer.cls_token_id)

        # Grab the sequence corresponding to that example (to know what is the context and what is the question).
        sequence_ids = tokenized_examples.sequence_ids(i)

        # One example can give several spans, this is the index of the example containing this span of text.
        sample_index = sample_mapping[i]
        answers = examples["answers"][sample_index]
        # If no answers are given, set the cls_index as answer.
        if len(answers["answer_start"]) == 0:
            tokenized_examples["start_positions"].append(cls_index)
            tokenized_examples["end_positions"].append(cls_index)
        else:
            # Start/end character index of the answer in the text.
            start_char = answers["answer_start"][0]
            end_char = start_char + len(answers["text"][0])

            # Start token index of the current span in the text.
            token_start_index = 0
            while sequence_ids[token_start_index] != (1 if pad_on_right else 0):
                token_start_index += 1

            # End token index of the current span in the text.
            token_end_index = len(input_ids) - 1
            while sequence_ids[token_end_index] != (1 if pad_on_right else 0):
                token_end_index -= 1

            # Detect if the answer is out of the span (in which case this feature is labeled with the CLS index).
            if not (offsets[token_start_index][0] <= start_char and offsets[token_end_index][1] >= end_char):
                tokenized_examples["start_positions"].append(cls_index)
                tokenized_examples["end_positions"].append(cls_index)
            else:
                # Otherwise move the token_start_index and token_end_index to the two ends of the answer.
                # Note: we could go after the last offset if the answer is the last word (edge case).
                while token_start_index < len(offsets) and offsets[token_start_index][0] <= start_char:
                    token_start_index += 1
                tokenized_examples["start_positions"].append(token_start_index - 1)
                while offsets[token_end_index][1] >= end_char:
                    token_end_index -= 1
                tokenized_examples["end_positions"].append(token_end_index + 1)

    return tokenized_examples

def prepare_validation_features(examples):
    # Some of the questions have lots of whitespace on the left, which is not useful and will make the
    # truncation of the context fail (the tokenized question will take a lots of space). So we remove that
    # left whitespace
    examples["question"] = [q.lstrip() for q in examples["question"]]

    # Tokenize our examples with truncation and maybe padding, but keep the overflows using a stride. This results
    # in one example possible giving several features when a context is long, each of those features having a
    # context that overlaps a bit the context of the previous feature.
    tokenized_examples = tokenizer(
        examples["question" if pad_on_right else "context"],
        examples["context" if pad_on_right else "question"],
        truncation="only_second" if pad_on_right else "only_first",
        max_length=max_length,
        stride=doc_stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    # Since one example might give us several features if it has a long context, we need a map from a feature to
    # its corresponding example. This key gives us just that.
    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")

    # We keep the example_id that gave us this feature and we will store the offset mappings.
    tokenized_examples["example_id"] = []

    for i in range(len(tokenized_examples["input_ids"])):
        # Grab the sequence corresponding to that example (to know what is the context and what is the question).
        sequence_ids = tokenized_examples.sequence_ids(i)
        context_index = 1 if pad_on_right else 0

        # One example can give several spans, this is the index of the example containing this span of text.
        sample_index = sample_mapping[i]
        tokenized_examples["example_id"].append(examples["index"][sample_index])

        # Set to None the offset_mapping that are not part of the context so it's easy to determine if a token
        # position is part of the context or not.
        tokenized_examples["offset_mapping"][i] = [
            (o if sequence_ids[k] == context_index else None)
            for k, o in enumerate(tokenized_examples["offset_mapping"][i])
        ]

    return tokenized_examples

In [11]:
squad = squad.map(
    prepare_train_features, batched=True, remove_columns=squad["train"].column_names
)

# squad_en = squad_en.map(
#     prepare_train_features, batched=True, remove_columns=squad_en["train"].column_names
# )

# squad_vi = squad_vi.map(
#     prepare_train_features, batched=True, remove_columns=squad_vi["train"].column_names
# )


  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

## Load Model and Adapters

### Enable EN only for training

In [12]:
language = 'hi'
model = AutoModelForQuestionAnswering.from_pretrained('xlm-roberta-base')
config = AdapterConfig.load("pfeiffer",non_linearity="relu", reduction_factor=2)
adapter_name_1 = model.load_adapter("en/wiki@ukp", config=config,model_name='xlm-roberta-base')
adapter_name_2 = model.load_adapter("hi/wiki@ukp",config=config)
task_adapter = model.load_adapter("AdapterHub/roberta-base-pf-squad", source="hf",load_as = 'pfeiffer_xlm_base')
model.set_active_adapters(adapter_name_2,task_adapter)
model.train_adapter('pfeiffer_xlm_base')

Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaForQuestionAnswering: ['lm_head.decoder.weight', 'lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing XLMRobertaForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaForQuestionAnswering were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream tas

## Training



In [13]:
batch_size = 4

In [14]:
args = TrainingArguments(
    f"./{language}-adapter-{batch_size}",
    evaluation_strategy = "epoch",
    save_strategy= "epoch",
    learning_rate = 3e-5,
    warmup_ratio = 0.1,
    gradient_accumulation_steps = 8,
    num_train_epochs = 5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    seed=42,
    overwrite_output_dir=True,
    save_total_limit=1,
    load_best_model_at_end=True)

In [15]:
data_collator = default_data_collator

In [16]:
trainer = Trainer(
    model,
    args,
    train_dataset=squad["train"],
    eval_dataset=squad["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer
)

In [17]:
trainer.train()

***** Running training *****
  Num examples = 103
  Num Epochs = 5
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 8
  Total optimization steps = 15


  0%|          | 0/15 [00:00<?, ?it/s]

***** Running Evaluation *****
  Num examples = 110
  Batch size = 4


  0%|          | 0/28 [00:00<?, ?it/s]

Saving model checkpoint to ./hi-adapter-4\checkpoint-3
Configuration saved in ./hi-adapter-4\checkpoint-3\config.json


{'eval_loss': 5.965855598449707, 'eval_runtime': 2.9885, 'eval_samples_per_second': 36.808, 'eval_steps_per_second': 9.369, 'epoch': 0.92}


Model weights saved in ./hi-adapter-4\checkpoint-3\pytorch_model.bin
tokenizer config file saved in ./hi-adapter-4\checkpoint-3\tokenizer_config.json
Special tokens file saved in ./hi-adapter-4\checkpoint-3\special_tokens_map.json
***** Running Evaluation *****
  Num examples = 110
  Batch size = 4


  0%|          | 0/28 [00:00<?, ?it/s]

Saving model checkpoint to ./hi-adapter-4\checkpoint-6
Configuration saved in ./hi-adapter-4\checkpoint-6\config.json


{'eval_loss': 5.961741924285889, 'eval_runtime': 2.488, 'eval_samples_per_second': 44.212, 'eval_steps_per_second': 11.254, 'epoch': 1.92}


Model weights saved in ./hi-adapter-4\checkpoint-6\pytorch_model.bin
tokenizer config file saved in ./hi-adapter-4\checkpoint-6\tokenizer_config.json
Special tokens file saved in ./hi-adapter-4\checkpoint-6\special_tokens_map.json
***** Running Evaluation *****
  Num examples = 110
  Batch size = 4


  0%|          | 0/28 [00:00<?, ?it/s]

Saving model checkpoint to ./hi-adapter-4\checkpoint-9
Configuration saved in ./hi-adapter-4\checkpoint-9\config.json


{'eval_loss': 5.958675384521484, 'eval_runtime': 2.4795, 'eval_samples_per_second': 44.364, 'eval_steps_per_second': 11.293, 'epoch': 2.92}


Model weights saved in ./hi-adapter-4\checkpoint-9\pytorch_model.bin
tokenizer config file saved in ./hi-adapter-4\checkpoint-9\tokenizer_config.json
Special tokens file saved in ./hi-adapter-4\checkpoint-9\special_tokens_map.json
Deleting older checkpoint [hi-adapter-4\checkpoint-3] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 110
  Batch size = 4


  0%|          | 0/28 [00:00<?, ?it/s]

Saving model checkpoint to ./hi-adapter-4\checkpoint-12
Configuration saved in ./hi-adapter-4\checkpoint-12\config.json


{'eval_loss': 5.9567437171936035, 'eval_runtime': 2.4794, 'eval_samples_per_second': 44.366, 'eval_steps_per_second': 11.293, 'epoch': 3.92}


Model weights saved in ./hi-adapter-4\checkpoint-12\pytorch_model.bin
tokenizer config file saved in ./hi-adapter-4\checkpoint-12\tokenizer_config.json
Special tokens file saved in ./hi-adapter-4\checkpoint-12\special_tokens_map.json
Deleting older checkpoint [hi-adapter-4\checkpoint-6] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 110
  Batch size = 4


  0%|          | 0/28 [00:00<?, ?it/s]

Saving model checkpoint to ./hi-adapter-4\checkpoint-15
Configuration saved in ./hi-adapter-4\checkpoint-15\config.json


{'eval_loss': 5.955974102020264, 'eval_runtime': 2.509, 'eval_samples_per_second': 43.842, 'eval_steps_per_second': 11.16, 'epoch': 4.92}


Model weights saved in ./hi-adapter-4\checkpoint-15\pytorch_model.bin
tokenizer config file saved in ./hi-adapter-4\checkpoint-15\tokenizer_config.json
Special tokens file saved in ./hi-adapter-4\checkpoint-15\special_tokens_map.json
Deleting older checkpoint [hi-adapter-4\checkpoint-9] due to args.save_total_limit


Training completed. Do not forget to share your model on huggingface.co/models =)


Loading best model from ./hi-adapter-4\checkpoint-15 (score: 5.955974102020264).


{'train_runtime': 180.5387, 'train_samples_per_second': 2.853, 'train_steps_per_second': 0.083, 'train_loss': 6.3682708740234375, 'epoch': 4.92}


TrainOutput(global_step=15, training_loss=6.3682708740234375, metrics={'train_runtime': 180.5387, 'train_samples_per_second': 2.853, 'train_steps_per_second': 0.083, 'train_loss': 6.3682708740234375, 'epoch': 4.92})

In [18]:
trainer.model

XLMRobertaForQuestionAnswering(
  (shared_parameters): ModuleDict()
  (roberta): RobertaModel(
    (shared_parameters): ModuleDict()
    (invertible_adapters): ModuleDict(
      (en): NICECouplingBlock(
        (F): Sequential(
          (0): Linear(in_features=384, out_features=192, bias=True)
          (1): Activation_Function_Class(
            (f): ReLU()
          )
          (2): Linear(in_features=192, out_features=384, bias=True)
        )
        (G): Sequential(
          (0): Linear(in_features=384, out_features=192, bias=True)
          (1): Activation_Function_Class(
            (f): ReLU()
          )
          (2): Linear(in_features=192, out_features=384, bias=True)
        )
      )
      (hi): NICECouplingBlock(
        (F): Sequential(
          (0): Linear(in_features=384, out_features=192, bias=True)
          (1): Activation_Function_Class(
            (f): ReLU()
          )
          (2): Linear(in_features=192, out_features=384, bias=True)
        )
        (G)

In [19]:
trainer.save_model(f"./{language}-adapter-{batch_size}")

Saving model checkpoint to ./hi-adapter-4
Configuration saved in ./hi-adapter-4\config.json
Model weights saved in ./hi-adapter-4\pytorch_model.bin
tokenizer config file saved in ./hi-adapter-4\tokenizer_config.json
Special tokens file saved in ./hi-adapter-4\special_tokens_map.json


In [20]:
model.save_adapter('./hi-adapter-4/qa-adapter',"pfeiffer_xlm_base")

Configuration saved in ./hi-adapter-4/qa-adapter\adapter_config.json
Module weights saved in ./hi-adapter-4/qa-adapter\pytorch_adapter.bin
Configuration saved in ./hi-adapter-4/qa-adapter\head_config.json
Module weights saved in ./hi-adapter-4/qa-adapter\pytorch_model_head.bin


In [22]:
model.save_adapter('./hi-adapter-4/hi-adapter',"hi")

Configuration saved in ./hi-adapter-4/hi-adapter\adapter_config.json
Module weights saved in ./hi-adapter-4/hi-adapter\pytorch_adapter.bin
Configuration saved in ./hi-adapter-4/hi-adapter\head_config.json
Module weights saved in ./hi-adapter-4/hi-adapter\pytorch_model_head.bin


In [23]:
model.save_adapter('./hi-adapter-4/en-adapter',"en")

Configuration saved in ./hi-adapter-4/en-adapter\adapter_config.json
Module weights saved in ./hi-adapter-4/en-adapter\pytorch_adapter.bin
Configuration saved in ./hi-adapter-4/en-adapter\head_config.json
Module weights saved in ./hi-adapter-4/en-adapter\pytorch_model_head.bin


: 