In [5]:
# Show the GPU(s) visible to this session before any heavy work starts.
!nvidia-smi

In [1]:
# Use the %pip magic so the packages are installed into the environment of the running kernel
# (a `!pip` subprocess may resolve to a different interpreter).
# NOTE(review): versions are not pinned; pin them once the known-good versions are confirmed.
%pip install datasets adapter-transformers

In [3]:
# Explicit %pip magic instead of relying on IPython automagic for a bare `pip` line.
# NOTE(review): presumably works around a pyarrow/datasets binary mismatch — confirm it is still needed.
%pip install --upgrade --force-reinstall pyarrow

## Imports

In [5]:
from datasets import Dataset, DatasetDict, load_dataset
import warnings
import numpy as np
import pandas as pd
import json
from transformers.adapters.composition import Fuse
from transformers import (
    AutoTokenizer,
    AutoModelForQuestionAnswering,
    AutoModelWithHeads,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding,
    default_data_collator,
    PreTrainedTokenizerFast,
    AutoAdapterModel,
    AdapterConfig,
    AutoModelForQuestionAnswering,
)
from transformers import AdapterConfig
from transformers.adapters.composition import Stack

# Suppress every Python warning for the rest of the session.
warnings.filterwarnings("ignore")
import wandb
# NOTE(review): two consecutive init calls with different modes ("disabled", then "offline").
# Which mode ends up in effect depends on wandb's re-init behaviour — confirm the intended
# mode and keep a single call.
wandb.init(mode="disabled")
wandb.init(mode="offline")

## Load Dataset

In [6]:
def squad_json_to_dataframe_train(input_file_path, record_path = ['data','paragraphs','qas','answers'],
                           verbose = 1):
    """
    Flatten a SQuAD-format json file into a DataFrame with one row per answer.

    input_file_path: path to the squad json file.
    record_path: path to deepest level in json file default value is
    ['data','paragraphs','qas','answers']
    verbose: 0 to suppress it default is 1

    Returns a DataFrame whose columns are:
      - 'index': the question id (the index produced by the column-wise concat, reset),
      - 'question', 'context': question text and the paragraph it belongs to,
      - the answer fields found at the deepest level of the json (e.g. 'answer_start', 'text'),
      - 'c_id': an integer code per distinct context (from `factorize`).

    NOTE(review): questions and answers are aligned on the question id by a column-wise
    concat; this presumably expects one answer per question — verify for multi-answer files.
    """
    if verbose:
        print("Reading the json file")
    # Context manager guarantees the handle is closed; explicit UTF-8 so non-ASCII text
    # is decoded the same way regardless of the platform default encoding.
    with open(input_file_path, encoding="utf-8") as json_file:
        file = json.load(json_file)
    if verbose:
        print("processing...")
    # Parse the different nesting levels of the json file.
    # `pd.json_normalize` is the public entry point; `pd.io.json.json_normalize` is a
    # deprecated alias that newer pandas releases no longer provide.
    answers_df = pd.json_normalize(file, record_path)
    questions_df = pd.json_normalize(file, record_path[:-1])
    paragraphs_df = pd.json_normalize(file, record_path[:-2])

    # Repeat each context once per question, and each question id once per answer,
    # so the parent value can be attached to every child row.
    context_per_question = np.repeat(paragraphs_df['context'].values, paragraphs_df.qas.str.len())
    question_id_per_answer = np.repeat(questions_df['id'].values, questions_df['answers'].str.len())
    questions_df['context'] = context_per_question
    answers_df['q_idx'] = question_id_per_answer

    # Combine into a single dataframe. `axis` must be passed by keyword: newer pandas
    # releases reject positional arguments after the list of objects.
    main = pd.concat(
        [
            questions_df[['id', 'question', 'context']].set_index('id'),
            answers_df.set_index('q_idx'),
        ],
        axis=1,
        sort=False,
    ).reset_index()
    main['c_id'] = main['context'].factorize()[0]
    if verbose:
        print("shape of the dataframe is {}".format(main.shape))
        print("Done")
    return main

In [7]:
# training data
# `language` is interpolated into the input file name here and into the output directory
# names used by the training/saving cells further down.
language = 'hi'
input_file_path = f'../input/squadenhi/squad.translate.train.en-{language}.json'
# Same nesting path as the function's default; passed explicitly for readability.
record_path = ['data','paragraphs','qas','answers']
train = squad_json_to_dataframe_train(input_file_path=input_file_path,record_path=record_path,verbose=0)

def get_answers(x):
    """
    Pack one (answer_start, text) pair into the SQuAD-style answers dict.

    x: a row holding the answer start offset first and the answer text second
       (a 2-column DataFrame row from `apply(axis=1)`, or any sequence).

    Returns {'answer_start': [start], 'text': [text]}.
    """
    # Take the first two values by position via iteration. Integer keys on a
    # label-indexed Series (`x[0]`) rely on a deprecated positional fallback.
    start, text = tuple(x)[:2]
    return {
        'answer_start': [start],
        'text': [text]
    }

# Add an 'answers' column in the {'answer_start': [...], 'text': [...]} layout that
# prepare_train_features reads.
train['answers'] = train[['answer_start', 'text']].apply(get_answers, axis=1)
# Do not truncate long text columns (e.g. 'context') when displaying frames.
pd.set_option('display.max_colwidth',None)
train.head(1)

In [8]:
# Validation data
# Re-assigns `language`, `input_file_path` and `record_path` with the same roles as in the
# training cell, this time pointing at the dev split.
language = 'hi'
input_file_path = f'../input/squadenhi/squad.translate.dev.en-{language}.json'
record_path = ['data','paragraphs','qas','answers']
dev = squad_json_to_dataframe_train(input_file_path=input_file_path,record_path=record_path,verbose=0)
# Same 'answers' layout as the training frame.
dev['answers'] = dev[['answer_start', 'text']].apply(get_answers, axis=1)
pd.set_option('display.max_colwidth',None)
dev.head(1)

In [9]:
# Small random subsets of the Hindi train/dev frames. A fixed random_state (same value as
# the Trainer seed) makes Restart & Run All pick the same 100 rows every time.
tds = Dataset.from_pandas(train.sample(100, random_state=42))
vds = Dataset.from_pandas(dev.sample(100, random_state=42))


squad_hi = DatasetDict()

squad_hi['train'] = tds
squad_hi['validation'] = vds

In [10]:
# Display the splits and columns of the Hindi DatasetDict.
squad_hi

In [11]:
# Load the English SQuAD dataset; its "train" and "validation" splits are the ones
# passed to the Trainer further down.
squad_en = load_dataset("squad")
squad_en

## Create QA Features

In [12]:

tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")
# The feature functions below request offset mappings and call `sequence_ids`;
# presumably this assert guards those fast-tokenizer features — TODO confirm.
assert isinstance(tokenizer, PreTrainedTokenizerFast)
# True when padding is appended on the right; decides whether the question or the
# context is passed first to the tokenizer in the feature functions.
pad_on_right = tokenizer.padding_side == "right"

In [13]:
max_length = 384 # The maximum length of a feature (question and context)
doc_stride = 128 # The authorized overlap between two parts of the context when splitting is needed.
def prepare_train_features(examples):
    """Tokenize a batch of QA examples and label every feature with answer token positions.

    Reads the notebook-level `tokenizer`, `pad_on_right`, `max_length` and `doc_stride`.

    examples: batch with "question", "context" and "answers" columns; each answers entry
        holds "answer_start" and "text" lists, of which only the first answer is used.
        The "question" column is left-stripped in place.

    Returns the tokenizer output with "overflow_to_sample_mapping" and "offset_mapping"
    removed and "start_positions" / "end_positions" added (one entry per feature).
    Features without an answer, or whose context span does not contain the whole
    answer, are labeled with the index of the CLS token.
    """
    # Some of the questions have lots of whitespace on the left, which is not useful and will make the
    # truncation of the context fail (the tokenized question will take a lot of space). So we remove that
    # left whitespace.
    examples["question"] = [q.lstrip() for q in examples["question"]]

    # Tokenize our examples with truncation and padding, but keep the overflows using a stride. This results
    # in one example possibly giving several features when a context is long, each of those features having a
    # context that overlaps a bit the context of the previous feature.
    tokenized_examples = tokenizer(
        examples["question" if pad_on_right else "context"],
        examples["context" if pad_on_right else "question"],
        truncation="only_second" if pad_on_right else "only_first",
        max_length=max_length,
        stride=doc_stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    # Since one example might give us several features if it has a long context, we need a map from a feature to
    # its corresponding example. This key gives us just that.
    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
    # The offset mappings will give us a map from token to character position in the original context. This will
    # help us compute the start_positions and end_positions.
    offset_mapping = tokenized_examples.pop("offset_mapping")

    # Sequence id carried by context tokens (the context is the second sequence when padding on the right).
    context_id = 1 if pad_on_right else 0

    # Let's label those examples!
    tokenized_examples["start_positions"] = []
    tokenized_examples["end_positions"] = []

    for i, offsets in enumerate(offset_mapping):
        # We will label impossible answers with the index of the CLS token.
        input_ids = tokenized_examples["input_ids"][i]
        cls_index = input_ids.index(tokenizer.cls_token_id)

        # Grab the sequence corresponding to that example (to know what is the context and what is the question).
        sequence_ids = tokenized_examples.sequence_ids(i)

        # One example can give several spans, this is the index of the example containing this span of text.
        sample_index = sample_mapping[i]
        answers = examples["answers"][sample_index]

        # Default label: CLS (no answer given, or answer not fully inside this span).
        start_position = cls_index
        end_position = cls_index

        if len(answers["answer_start"]) > 0:
            # Start/end character index of the answer in the text.
            start_char = answers["answer_start"][0]
            end_char = start_char + len(answers["text"][0])

            # First and last token index of the context inside this feature.
            context_start = 0
            while sequence_ids[context_start] != context_id:
                context_start += 1
            context_end = len(input_ids) - 1
            while sequence_ids[context_end] != context_id:
                context_end -= 1

            # Only label the feature when the whole answer lies inside its context span.
            if offsets[context_start][0] <= start_char and offsets[context_end][1] >= end_char:
                # Move the two indices to the two ends of the answer. Both searches are bounded by the
                # context span: special and padding tokens have offset (0, 0), so an unbounded forward
                # search would run over them to the end of the sequence whenever the answer starts on
                # the last context token.
                token_start_index = context_start
                while token_start_index <= context_end and offsets[token_start_index][0] <= start_char:
                    token_start_index += 1
                start_position = token_start_index - 1

                token_end_index = context_end
                while token_end_index >= context_start and offsets[token_end_index][1] >= end_char:
                    token_end_index -= 1
                end_position = token_end_index + 1

        tokenized_examples["start_positions"].append(start_position)
        tokenized_examples["end_positions"].append(end_position)

    return tokenized_examples

def prepare_validation_features(examples):
    """Tokenize a batch of QA examples for evaluation, keeping what post-processing needs.

    Reads the notebook-level `tokenizer`, `pad_on_right`, `max_length` and `doc_stride`.

    examples: batch with "question", "context" and "index" columns. The "question"
        column is left-stripped in place.

    Returns the tokenizer output without "overflow_to_sample_mapping", with an added
    "example_id" list (the "index" value of the example each feature came from) and with
    every non-context entry of "offset_mapping" replaced by None.
    """
    # Leading whitespace in a question wastes tokens and can make the context truncation fail.
    examples["question"] = [text.lstrip() for text in examples["question"]]

    # The context is always the sequence that gets truncated; its position in the pair
    # depends on the padding side.
    if pad_on_right:
        first_texts = examples["question"]
        second_texts = examples["context"]
        truncation_mode = "only_second"
    else:
        first_texts = examples["context"]
        second_texts = examples["question"]
        truncation_mode = "only_first"

    # Long contexts overflow into several overlapping features (overlap = doc_stride).
    encoded = tokenizer(
        first_texts,
        second_texts,
        truncation=truncation_mode,
        max_length=max_length,
        stride=doc_stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    # feature index -> index of the example it was cut from
    feature_to_example = encoded.pop("overflow_to_sample_mapping")
    context_index = 1 if pad_on_right else 0

    example_ids = []
    encoded["example_id"] = example_ids

    for feature_idx in range(len(encoded["input_ids"])):
        sequence_ids = encoded.sequence_ids(feature_idx)
        example_ids.append(examples["index"][feature_to_example[feature_idx]])

        # Blank out offsets of tokens outside the context so that "is this token part of
        # the context?" becomes a simple None check.
        masked_offsets = []
        for token_idx, offset in enumerate(encoded["offset_mapping"][feature_idx]):
            if sequence_ids[token_idx] == context_index:
                masked_offsets.append(offset)
            else:
                masked_offsets.append(None)
        encoded["offset_mapping"][feature_idx] = masked_offsets

    return encoded

In [14]:
# Replace the raw columns of both DatasetDicts with tokenized, labeled training features.
# NOTE(review): each name is re-bound to its mapped version and the raw columns are removed,
# so re-running this cell without re-creating the datasets is expected to fail (the
# "question" column read by prepare_train_features is gone).

squad_en = squad_en.map(
    prepare_train_features, batched=True, remove_columns=squad_en["train"].column_names
)

# NOTE(review): squad_hi is tokenized here, but the Trainer below is only given squad_en.
squad_hi = squad_hi.map(
    prepare_train_features, batched=True, remove_columns=squad_hi["train"].column_names
)


## Load Model and Adapters

### Enable EN only for training

In [15]:
model = AutoModelForQuestionAnswering.from_pretrained('xlm-roberta-base')
config = AdapterConfig.load("pfeiffer",non_linearity="relu", reduction_factor=2)
# Language adapters (English and Hindi), both loaded with the same adapter config.
adapter_name_1 = model.load_adapter("en/wiki@ukp", config=config,model_name='xlm-roberta-base')
adapter_name_2 = model.load_adapter("hi/wiki@ukp",config=config)
# Task adapter, registered under the name 'pfeiffer_xlm_base'.
# NOTE(review): the checkpoint name says roberta-base while the model is xlm-roberta-base — confirm this is intended.
task_adapter = model.load_adapter("AdapterHub/roberta-base-pf-squad", source="hf",load_as = 'pfeiffer_xlm_base')
# Order matters: train_adapter() freezes everything except the task adapter AND makes its
# argument the active setup, so it has to run before the final setup is activated.
model.train_adapter('pfeiffer_xlm_base')
# Activate the EN language adapter stacked under the task adapter. The composition must be a
# single Stack block: a second positional argument to set_active_adapters is not an adapter name.
model.set_active_adapters(Stack(adapter_name_1, task_adapter))

## Training



In [16]:
# Per-device train/eval batch size; also embedded in the output directory name.
batch_size = 16

In [17]:
args = TrainingArguments(
    # Output directory, derived from the language code and batch size.
    f"./{language}-adapter-{batch_size}",
    # Evaluate and checkpoint once per epoch.
    evaluation_strategy = "epoch",
    save_strategy= "epoch",
    learning_rate = 3e-5,
    warmup_ratio = 0.1,
    # With batch_size = 16 this accumulates 16 * 8 = 128 examples per optimizer step per device.
    gradient_accumulation_steps = 8,
    num_train_epochs = 5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    seed=42,
    # Mixed precision; assumes a GPU is available — TODO confirm for the target runtime.
    fp16=True,
    overwrite_output_dir=True,
    # Keep a single checkpoint on disk and reload the best one when training ends.
    save_total_limit=1,
    load_best_model_at_end=True)

In [18]:
# Features are already padded to max_length by the tokenizer call (padding="max_length"),
# so presumably no dynamic padding collator is needed — TODO confirm.
data_collator = default_data_collator

In [19]:
# Train and evaluate on the English SQuAD features only; the Hindi features (squad_hi)
# are not passed to this Trainer.
trainer = Trainer(
    model,
    args,
    train_dataset=squad_en["train"],
    eval_dataset=squad_en["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer
)

In [20]:
# Run training with the arguments and datasets configured above.
trainer.train()

In [31]:
# Display the full module tree of the trained model (large output).
# NOTE(review): this cell's execution count is out of sequence with its neighbours,
# i.e. it was run after the later cells; re-check with Restart & Run All.
trainer.model

In [22]:
# Save the model into the same directory that was given to TrainingArguments.
trainer.save_model(f"./{language}-adapter-{batch_size}")

In [23]:
# Save only the task adapter registered as "pfeiffer_xlm_base".
# NOTE(review): the path is hard-coded; it matches the f-string output directory only
# while language == 'hi' and batch_size == 16.
model.save_adapter('./hi-adapter-16/runs',"pfeiffer_xlm_base")

In [24]:
import os
# exist_ok=True keeps the cell re-runnable: without it a second run raises FileExistsError.
os.makedirs('./hi-adapter-16/all', exist_ok=True)

In [25]:
# Save the adapter named "en" — presumably the name returned for the "en/wiki@ukp"
# language adapter (adapter_name_1); verify.
model.save_adapter('./hi-adapter-16/en-adapter',"en")

In [26]:
# Save every adapter currently loaded in the model into the directory created above.
model.save_all_adapters("./hi-adapter-16/all")

In [27]:
%%capture
# NOTE(review): repeats the two wandb.init calls from the imports cell, with output captured.
# This runs after training and saving — confirm whether it is still needed.
wandb.init(mode="disabled")
wandb.init(mode="offline")

In [32]:
# Archive the saved adapters for download. The archive is written relative to the current
# working directory; the source path is an absolute Kaggle path.
!zip -r   hi-adapters-16.zip  /kaggle/working/hi-adapter-16/all

In [33]:
import os
# Switch to the Kaggle working directory so the relative archive path below resolves there.
os.chdir(r'/kaggle/working')
from IPython.display import FileLink

# Render a download link for the adapters archive.
FileLink(r'./hi-adapters-16.zip')

In [34]:
# Archive one Trainer checkpoint for download.
# NOTE(review): the step number (3500) is hard-coded; with per-epoch saving and
# save_total_limit=1 the checkpoint left on disk may carry a different step — verify it exists.
!zip -r   hi-model-checkpoint-16.zip  /kaggle/working/hi-adapter-16/checkpoint-3500

In [35]:
# Render a download link for the checkpoint archive (relies on FileLink imported in an earlier cell).
FileLink(r'./hi-model-checkpoint-16.zip')