In [2]:
# Mount Google Drive at /content/drive; later cells read the dataset files and
# adapter checkpoints through paths under ./drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [1]:
# Report the GPU attached to this runtime (model, driver, memory usage).
!nvidia-smi

Sat May 21 07:22:02 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   50C    P0    29W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [3]:
!pip install datasets adapter-transformers

Collecting datasets
  Downloading datasets-2.2.2-py3-none-any.whl (346 kB)
[?25l[K     |█                               | 10 kB 37.4 MB/s eta 0:00:01[K     |██                              | 20 kB 26.0 MB/s eta 0:00:01[K     |██▉                             | 30 kB 12.3 MB/s eta 0:00:01[K     |███▉                            | 40 kB 10.0 MB/s eta 0:00:01[K     |████▊                           | 51 kB 3.8 MB/s eta 0:00:01[K     |█████▊                          | 61 kB 4.5 MB/s eta 0:00:01[K     |██████▋                         | 71 kB 4.7 MB/s eta 0:00:01[K     |███████▋                        | 81 kB 4.4 MB/s eta 0:00:01[K     |████████▌                       | 92 kB 4.9 MB/s eta 0:00:01[K     |█████████▌                      | 102 kB 4.3 MB/s eta 0:00:01[K     |██████████▍                     | 112 kB 4.3 MB/s eta 0:00:01[K     |███████████▍                    | 122 kB 4.3 MB/s eta 0:00:01[K     |████████████▎                   | 133 kB 4.3 MB/s eta 0:00:01

In [None]:
# pip install --upgrade --force-reinstall pyarrow

In [19]:
# Authenticate with the Hugging Face Hub; the TrainingArguments further down set
# push_to_hub=True, so a stored token is required before training starts.
from huggingface_hub import notebook_login
notebook_login()

Login successful
Your token has been saved to /root/.huggingface/token
[1m[31mAuthenticated through git-credential store but this isn't the helper defined on your machine.
You might have to re-authenticate when pushing to the Hugging Face Hub. Run the following command in your terminal in case you want to set this credential helper as the default

git config --global credential.helper store[0m


## Imports

In [5]:
from datasets import Dataset, DatasetDict, load_dataset
import warnings
import numpy as np
import pandas as pd
import json
from transformers.adapters.composition import Fuse
from transformers import (
    AutoTokenizer,
    AutoModelForQuestionAnswering,
    AutoModelWithHeads,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding,
    default_data_collator,
    PreTrainedTokenizerFast,
    AutoAdapterModel,
    AdapterConfig,
    AutoModelForQuestionAnswering,
)
from transformers import AdapterConfig
from transformers.adapters.composition import Stack

# Silence every warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by the libraries used
# below — consider narrowing the filter to specific categories.
warnings.filterwarnings("ignore")

## Load Dataset

In [6]:
def squad_json_to_dataframe_train(input_file_path, record_path = ['data','paragraphs','qas','answers'],
                           verbose = 1):
    """
    Flatten a SQuAD-format json file into a DataFrame with one row per answer.

    input_file_path: path to the squad json file (read as UTF-8).
    record_path: path to deepest level in json file default value is
    ['data','paragraphs','qas','answers']
    verbose: 0 to suppress progress messages, default is 1

    Returns a DataFrame whose columns are ``index`` (the question id),
    ``question``, ``context``, the fields of each answer record
    (e.g. ``answer_start`` and ``text``) and ``c_id``, an integer code that is
    shared by all rows with the same context string.
    """
    if verbose:
        print("Reading the json file")
    # Context manager closes the handle; explicit UTF-8 keeps non-ASCII text
    # readable regardless of the platform's default encoding.
    with open(input_file_path, encoding="utf-8") as handle:
        file = json.load(handle)
    if verbose:
        print("processing...")

    # json_normalize expects a list for record_path; copying also protects the
    # shared default argument.
    record_path = list(record_path)

    # parsing different levels in the json file
    js = pd.json_normalize(file, record_path)        # one row per answer
    m = pd.json_normalize(file, record_path[:-1])    # one row per question
    r = pd.json_normalize(file, record_path[:-2])    # one row per paragraph

    # combining it into a single dataframe:
    # repeat each context once per question, and each question id once per answer
    idx = np.repeat(r['context'].values, r['qas'].str.len())
    ndx = np.repeat(m['id'].values, m['answers'].str.len())
    m['context'] = idx
    js['q_idx'] = ndx
    # The two frames are aligned on the question id. Their index names differ
    # ('id' vs 'q_idx'), so the combined index is unnamed and reset_index()
    # exposes it as the 'index' column.
    main = pd.concat(
        [m[['id','question','context']].set_index('id'), js.set_index('q_idx')],
        axis=1,
        sort=False,
    ).reset_index()
    main['c_id'] = main['context'].factorize()[0]
    if verbose:
        print("shape of the dataframe is {}".format(main.shape))
        print("Done")
    return main

In [8]:
# List the working directory to confirm the Drive mount ("drive") is visible.
!ls

drive  sample_data


In [9]:
# training data
# Read the en-{language} translated SQuAD training file from Drive and flatten it
# into a DataFrame. `language` is reused by later cells to build model and output paths.
language = 'hi'
input_file_path = f'./drive/MyDrive/Colab Files/squad.translate.train.en-{language}.json'
record_path = ['data','paragraphs','qas','answers']
train = squad_json_to_dataframe_train(input_file_path=input_file_path,record_path=record_path,verbose=0)

def get_answers(x):
    """
    Build the SQuAD-style ``answers`` record for a single row.

    x: a two-item row whose first item is the answer start offset and whose
       second item is the answer text. May be a pandas Series (as passed by
       ``DataFrame.apply(..., axis=1)``) or a plain list/tuple.

    Returns ``{'answer_start': [start], 'text': [text]}``.
    """
    # On a label-indexed Series, integer keys are a deprecated positional
    # fallback; .iloc is the explicit positional accessor. Plain sequences are
    # indexed directly.
    positional = x.iloc if hasattr(x, "iloc") else x
    start = positional[0]
    text = positional[1]
    return {
        'answer_start': [start],
        'text': [text]
    }

# Attach a SQuAD-style answers dict to every row, built from its answer_start/text columns.
train['answers'] = train[['answer_start', 'text']].apply(get_answers, axis=1)
# Show full cell contents instead of truncating long context strings.
pd.set_option('display.max_colwidth',None)
train.head(1)

Unnamed: 0,index,question,context,answer_start,text,c_id,answers
0,57283b4c3acd2414000df76d,जब लंदन यहूदी फोरम स्थापित किया गया था?,"उत्तरी लंदन में स्टैमफोर्ड हिल, स्टैनमोर, गोल्डर्स ग्रीन, फिंचली, हैम्पस्टेड, हेंडन और एडगवेयर में महत्वपूर्ण यहूदी समुदायों के साथ अधिकांश ब्रिटिश यहूदी लंदन में रहते हैं। लंदन शहर के Bevis मार्क्स आराधनालय लंदन के ऐतिहासिक Sephardic यहूदी समुदाय से सम्बद्ध है। यह यूरोप का एकमात्र ऐसा आराधनालय है जिसने 300 वर्षों से लगातार नियमित सेवाओं का आयोजन किया है। स्टैनमोर और कैनन्स पार्क सिनेगॉग की 1998 में पूरे यूरोप में किसी भी ऑर्थोडॉक्स आराधनालय की सबसे बड़ी सदस्यता है, 1998 में इलफ़र्ड सिनेगॉग (लंदन में भी) से आगे निकल गया। समुदाय ने जवाब में 2006 में लंदन यहूदी फोरम की स्थापना की। विकसित लंदन सरकार के बढ़ते महत्व के लिए।",546,2006,0,"{'answer_start': [546], 'text': ['2006']}"


In [10]:
# Validation data
# Same steps as for the training split, applied to the dev file: flatten the json
# with the shared loader, then add the SQuAD-style answers column.
language = 'hi'
input_file_path = f'./drive/MyDrive/Colab Files/squad.translate.dev.en-{language}.json'
record_path = ['data','paragraphs','qas','answers']
dev = squad_json_to_dataframe_train(input_file_path=input_file_path,record_path=record_path,verbose=0)
dev['answers'] = dev[['answer_start', 'text']].apply(get_answers, axis=1)
# Show full cell contents instead of truncating long context strings.
pd.set_option('display.max_colwidth',None)
dev.head(1)

Unnamed: 0,index,question,context,answer_start,text,c_id,answers
0,56e1c9bfe3433e1400423196,बहुपद समय में कमी क्या इसका एक उदाहरण है?,"कमी की अवधारणा का उपयोग करके कई जटिलता वर्गों को परिभाषित किया गया है। एक कमी एक समस्या का दूसरी समस्या में परिवर्तन है। यह एक समस्या की अनौपचारिक धारणा को कम से कम एक और समस्या के रूप में मुश्किल बनाता है। उदाहरण के लिए, यदि कोई समस्या Y के लिए एल्गोरिथ्म का उपयोग करके X को हल किया जा सकता है, तो X, Y से अधिक कठिन नहीं है, और हम कहते हैं कि X, Y को कम कर देता है। कई अलग-अलग प्रकार के कटौती हैं, जिनके आधार पर कटौती की विधि, जैसे कि कुक रिडक्शन, कार्प रिडक्शन और लेविन रिडक्शन, और पोलिनेशन-टाइम रिडक्शन या लॉग-स्पेस रिडक्शन जैसी कटौती की जटिलता पर बाध्य होती है।",378,प्रकार के कटौती,0,"{'answer_start': [378], 'text': ['प्रकार के कटौती']}"


In [25]:
# Convert both DataFrames to Hugging Face Datasets and bundle them into a single
# DatasetDict keyed by split name.
tds = Dataset.from_pandas(train)
vds = Dataset.from_pandas(dev)

squad_hi = DatasetDict({'train': tds, 'validation': vds})

In [26]:
# Inspect the splits: column names and row counts.
squad_hi

DatasetDict({
    train: Dataset({
        features: ['index', 'question', 'context', 'answer_start', 'text', 'c_id', 'answers'],
        num_rows: 85804
    })
    validation: Dataset({
        features: ['index', 'question', 'context', 'answer_start', 'text', 'c_id', 'answers'],
        num_rows: 34111
    })
})

In [27]:
# squad_en = load_dataset("squad")
# squad_en

## Create QA Features

In [28]:

# Tokenizer of the xlm-roberta-base checkpoint.
tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")
# The feature-preparation functions below request offset mappings and call
# sequence_ids(); fail early if a fast tokenizer was not loaded.
assert isinstance(tokenizer, PreTrainedTokenizerFast)
# The padding side decides whether the question or the context is passed first
# to the tokenizer in the feature-preparation functions.
pad_on_right = tokenizer.padding_side == "right"

Could not locate the tokenizer configuration file, will try to use the model config instead.
loading configuration file https://huggingface.co/xlm-roberta-base/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/87683eb92ea383b0475fecf99970e950a03c9ff5e51648d6eee56fb754612465.dfaaaedc7c1c475302398f09706cbb21e23951b73c6e2b3162c1c8a99bb3b62a
Model config XLMRobertaConfig {
  "_name_or_path": "xlm-roberta-base",
  "architectures": [
    "XLMRobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "xlm-roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_past": true,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.1

In [29]:
max_length = 384 # Maximum length of one feature (question and context together), passed to the tokenizer as max_length.
doc_stride = 128 # Overlap between two consecutive parts of a context when it has to be split, passed to the tokenizer as stride.
def prepare_train_features(examples):
    """
    Tokenize a batch of question/context pairs and label every resulting feature
    with the token span of its answer.

    examples: batch (mapping of column name to list) with "question", "context"
        and "answers" columns; each answers entry holds "answer_start" and "text" lists.

    Returns the tokenizer output with "start_positions" and "end_positions" added.
    A long context produces several overlapping features. A feature whose window
    does not fully contain the answer, or whose example has no answer, is
    labelled with the index of the CLS token.
    """
    # Leading whitespace in a question wastes token budget and can make the
    # truncation of the context fail, so strip it first.
    examples["question"] = [question.lstrip() for question in examples["question"]]

    # The padding side decides which text is passed first, which one may be
    # truncated, and which sequence id marks context tokens.
    if pad_on_right:
        first_column, second_column = "question", "context"
        truncation_mode = "only_second"
        context_id = 1
    else:
        first_column, second_column = "context", "question"
        truncation_mode = "only_first"
        context_id = 0

    # Truncate and pad, keeping the overflow as extra features whose context
    # windows overlap by doc_stride.
    tokenized_examples = tokenizer(
        examples[first_column],
        examples[second_column],
        truncation=truncation_mode,
        max_length=max_length,
        stride=doc_stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    # Feature index -> index of the example it was cut from.
    feature_to_example = tokenized_examples.pop("overflow_to_sample_mapping")
    # Per feature: token index -> (start, end) character offsets in the original text.
    all_offsets = tokenized_examples.pop("offset_mapping")

    # Label lists, filled in feature order.
    start_positions = []
    end_positions = []
    tokenized_examples["start_positions"] = start_positions
    tokenized_examples["end_positions"] = end_positions

    for feature_idx, offsets in enumerate(all_offsets):
        input_ids = tokenized_examples["input_ids"][feature_idx]
        cls_index = input_ids.index(tokenizer.cls_token_id)

        # Tells, for each token, whether it belongs to the question or the context.
        sequence_ids = tokenized_examples.sequence_ids(feature_idx)

        answers = examples["answers"][feature_to_example[feature_idx]]

        # Default label: the CLS token (no answer in this feature).
        answer_start_token = cls_index
        answer_end_token = cls_index

        if len(answers["answer_start"]) != 0:
            # Character span of the answer in the text.
            start_char = answers["answer_start"][0]
            end_char = start_char + len(answers["text"][0])

            # First context token of this feature.
            first_context_token = 0
            while sequence_ids[first_context_token] != context_id:
                first_context_token += 1

            # Last context token of this feature.
            last_context_token = len(input_ids) - 1
            while sequence_ids[last_context_token] != context_id:
                last_context_token -= 1

            window_has_answer = (
                offsets[first_context_token][0] <= start_char
                and offsets[last_context_token][1] >= end_char
            )
            if window_has_answer:
                # Walk both cursors inwards to the two ends of the answer.
                # Note: the start cursor may run past the last offset when the
                # answer is the last word (edge case), hence the bounds check.
                while first_context_token < len(offsets) and offsets[first_context_token][0] <= start_char:
                    first_context_token += 1
                answer_start_token = first_context_token - 1

                while offsets[last_context_token][1] >= end_char:
                    last_context_token -= 1
                answer_end_token = last_context_token + 1

        start_positions.append(answer_start_token)
        end_positions.append(answer_end_token)

    return tokenized_examples

def prepare_validation_features(examples):
    """
    Tokenize a batch of question/context pairs for evaluation.

    examples: batch (mapping of column name to list) with "question", "context"
        and "index" columns.

    Returns the tokenizer output, keeping "offset_mapping" and adding
    "example_id" (the "index" value of the example each feature was cut from).
    Offsets of tokens that are not part of the context are replaced by None so a
    token position can easily be tested for belonging to the context.
    """
    # Leading whitespace in a question wastes token budget and can make the
    # truncation of the context fail, so strip it first.
    examples["question"] = [question.lstrip() for question in examples["question"]]

    # The padding side decides which text is passed first, which one may be
    # truncated, and which sequence id marks context tokens.
    if pad_on_right:
        first_column, second_column = "question", "context"
        truncation_mode = "only_second"
        context_index = 1
    else:
        first_column, second_column = "context", "question"
        truncation_mode = "only_first"
        context_index = 0

    # Truncate and pad, keeping the overflow as extra features whose context
    # windows overlap by doc_stride.
    tokenized_examples = tokenizer(
        examples[first_column],
        examples[second_column],
        truncation=truncation_mode,
        max_length=max_length,
        stride=doc_stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    # Feature index -> index of the example it was cut from.
    feature_to_example = tokenized_examples.pop("overflow_to_sample_mapping")

    # Filled in feature order with the id of the originating example.
    example_ids = []
    tokenized_examples["example_id"] = example_ids

    for feature_idx, _ in enumerate(tokenized_examples["input_ids"]):
        # Tells, for each token, whether it belongs to the question or the context.
        sequence_ids = tokenized_examples.sequence_ids(feature_idx)

        example_ids.append(examples["index"][feature_to_example[feature_idx]])

        # Keep offsets only for context tokens; everything else becomes None.
        context_only_offsets = []
        for token_idx, span in enumerate(tokenized_examples["offset_mapping"][feature_idx]):
            if sequence_ids[token_idx] == context_index:
                context_only_offsets.append(span)
            else:
                context_only_offsets.append(None)
        tokenized_examples["offset_mapping"][feature_idx] = context_only_offsets

    return tokenized_examples

In [30]:
# squad = squad.map(
#     prepare_train_features, batched=True, remove_columns=squad["train"].column_names
# )

# squad_en = squad_en.map(
#     prepare_train_features, batched=True, remove_columns=squad_en["train"].column_names
# )

# Build the tokenized, labelled features from the raw splits (tds / vds) rather
# than from squad_hi itself: once squad_hi has been replaced by its tokenized
# version the raw "question"/"context" columns are gone, so mapping squad_hi
# onto itself fails when this cell is run a second time.
# Both splits go through prepare_train_features, so the validation split also
# carries start/end positions.
squad_hi = DatasetDict({'train': tds, 'validation': vds}).map(
    prepare_train_features, batched=True, remove_columns=tds.column_names
)


  0%|          | 0/86 [00:00<?, ?ba/s]

  0%|          | 0/35 [00:00<?, ?ba/s]

## Load Model and Adapters

### Load the language adapters and train only the task adapter

In [31]:
# Base QA model restored from the Drive checkpoint.
model = AutoModelForQuestionAnswering.from_pretrained(f'./drive/MyDrive/Colab Files/models/{language}-adapter-en-trained')
# Adapter configuration used when loading the two language adapters.
config = AdapterConfig.load("pfeiffer",non_linearity="relu", reduction_factor=2)
# Language adapters stored next to the checkpoint ("en" and "hi" sub-directories).
adapter_name_1 = model.load_adapter(f'./drive/MyDrive/Colab Files/models/{language}-adapter-en-trained/en', config=config,model_name='xlm-roberta-base')
adapter_name_2 = model.load_adapter(f'./drive/MyDrive/Colab Files/models/{language}-adapter-en-trained/hi',config=config)
# Task (SQuAD) adapter from the Hub, registered under the name 'pfeiffer_xlm_base'.
# NOTE(review): the checkpoint's config already lists an adapter called
# 'pfeiffer_xlm_base'; loading the Hub adapter under the same name presumably
# replaces it — confirm this is intended.
task_adapter = model.load_adapter("AdapterHub/roberta-base-pf-squad", source="hf",load_as = 'pfeiffer_xlm_base')
# Mark only the task adapter as trainable. train_adapter() also resets the active
# setup to the adapters being trained, so it has to run BEFORE the stack is
# activated.
model.train_adapter(task_adapter)
# Activate the language adapter stacked under the task adapter. The setup must be
# passed as ONE composition object: the second positional parameter of
# set_active_adapters() is `skip_layers`, not a second adapter name.
model.set_active_adapters(Stack(adapter_name_2, task_adapter))

loading configuration file ./drive/MyDrive/Colab Files/models/hi-adapter-en-trained/config.json
Model config XLMRobertaConfig {
  "_name_or_path": "./drive/MyDrive/Colab Files/models/hi-adapter-en-trained",
  "adapters": {
    "adapters": {
      "en": "16eaa0b5fae9ed68",
      "hi": "16eaa0b5fae9ed68",
      "pfeiffer_xlm_base": "9076f36a74755ac4"
    },
    "config_map": {
      "16eaa0b5fae9ed68": {
        "adapter_residual_before_ln": false,
        "cross_adapter": false,
        "factorized_phm_W": true,
        "factorized_phm_rule": false,
        "hypercomplex_nonlinearity": "glorot-uniform",
        "init_weights": "bert",
        "inv_adapter": "nice",
        "inv_adapter_reduction_factor": 2,
        "is_parallel": false,
        "learn_phm": true,
        "leave_out": [],
        "ln_after": false,
        "ln_before": false,
        "mh_adapter": false,
        "non_linearity": "relu",
        "original_ln_after": true,
        "original_ln_before": true,
        "outpu

## Training



In [32]:
# Per-device train/eval batch size; also embedded in the output directory name.
batch_size = 32

In [33]:
# Training configuration.
args = TrainingArguments(
    # Output directory; with push_to_hub=True the Trainer log shows it is also a
    # clone of the Hub repository of the same name.
    f"./{language}-adapter-{batch_size}",
    # Evaluate and checkpoint once per epoch (the two strategies match, which
    # load_best_model_at_end below relies on).
    evaluation_strategy = "epoch",
    save_strategy= "epoch",
    learning_rate = 3e-5,
    warmup_ratio = 0.1,
    # 32 examples x 8 accumulation steps = 256 examples per optimizer step on one device.
    gradient_accumulation_steps = 8,
    num_train_epochs = 5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    seed=42,
    # Mixed-precision training.
    fp16=True,
    overwrite_output_dir=True,
    # Keep a single checkpoint on disk.
    save_total_limit=1,
    load_best_model_at_end=True,
    push_to_hub=True)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [34]:
# The features are already padded to max_length by the tokenizer call
# (padding="max_length"), so the default collator is used as-is.
data_collator = default_data_collator

In [35]:
# Both splits were produced by prepare_train_features, so the validation split
# carries start/end positions as well.
trainer = Trainer(
    model,
    args,
    train_dataset=squad_hi["train"],
    eval_dataset=squad_hi["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer
)

/content/./hi-adapter-32 is already a clone of https://huggingface.co/subhasisj/hi-adapter-32. Make sure you pull the latest changes with `repo.git_pull()`.
Using amp half precision backend


In [None]:
# Run training; with the arguments above, evaluation and checkpointing happen once per epoch.
trainer.train()

***** Running training *****
  Num examples = 91554
  Num Epochs = 5
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 256
  Gradient Accumulation steps = 8
  Total optimization steps = 1785


Epoch,Training Loss,Validation Loss


In [None]:
# Display the model object held by the trainer.
trainer.model

In [None]:
# trainer.save_model(f"./{language}-adapter-{batch_size}")

In [None]:
# model.save_adapter('./hi-adapter-16/runs',"pfeiffer_xlm_base")

In [None]:
# import os
# os.makedirs('./hi-adapter-16/all')

In [None]:
# model.save_adapter('./hi-adapter-16/en-adapter',"en")

In [None]:
# Save the model's adapters (save_all_adapters) into the "all" folder of the Drive checkpoint directory.
model.save_all_adapters(f'./drive/MyDrive/Colab Files/models/{language}-adapter-en-trained/all')

In [None]:
# %%capture
# wandb.init(mode="disabled")
# wandb.init(mode="offline")

In [None]:
# !zip -r   hi-adapters-16.zip  /kaggle/working/hi-adapter-16/all

In [None]:
# import os
# os.chdir(r'/kaggle/working')
# from IPython.display import FileLink

# FileLink(r'./hi-adapters-16.zip')

In [None]:
# !zip -r   hi-model-checkpoint-16.zip  /kaggle/working/hi-adapter-16/checkpoint-3500

In [None]:
# FileLink(r'./hi-model-checkpoint-16.zip')