# ChAII: training Hindi and Tamil separately


I am interested in whether Hindi and Tamil should be trained separately. (I have started a [discussion thread](https://www.kaggle.com/c/chaii-hindi-and-tamil-question-answering/discussion/264749) about this.)

This notebook trains a separate model for each language (Hindi and Tamil).

Before sharing this code, I trained a single model on the mixed Hindi and Tamil data.
The results were:

- best validation loss: 0.3081
- best jaccard score: 0.5141
- LB score: 0.580 (This is my best.)

This notebook uses the same parameters, fold, and random seed as that run.

I'll check which is better: mixing the two languages or training them separately.

Install and update the libraries.

In [None]:
# IPython shell escape: upgrade torch, wandb and transformers to their latest releases.
# NOTE(review): versions are not pinned, so the installed releases depend on when the notebook is run.
!pip install -U torch wandb transformers

Set up wandb to record logs during training.

In [None]:
# Fetch the secret stored under the label "wandb" from Kaggle's secret store.
from kaggle_secrets import UserSecretsClient
secret_label = "wandb"
secret_value = UserSecretsClient().get_secret(secret_label)
# IPython shell escape: pass the fetched secret to the wandb CLI login command.
!wandb login $secret_value

In [None]:
import json
import math
import numpy as np
import pandas as pd 
import random
import os
from tqdm.notebook import tqdm
from matplotlib import pyplot as plt
import wandb

from sklearn.model_selection import StratifiedKFold
from sklearn.utils import shuffle

import torch
import torch.nn as nn
from torch.nn import functional as F
from torch.utils.data import Sampler, Dataset, DataLoader
import torch.nn.init as init
from torch.nn import Parameter
from torch.autograd.function import InplaceFunction

from transformers import get_cosine_schedule_with_warmup
from transformers import AutoConfig
from transformers import AutoTokenizer
from transformers import AutoModel
from transformers import AdamW

import shutil
import collections

# Device that the model and every batch are moved to; hard-coded to CUDA.
device = torch.device("cuda")
# Module-level gradient scaler shared by every training run in this notebook
# (used together with torch.cuda.amp.autocast() in train_each_lang).
scaler = torch.cuda.amp.GradScaler()

In [None]:
class config:
    """Static settings for paths, tokenization, post-processing and training.

    ``train_each_lang`` copies every non-dunder attribute that is not listed
    in ``NOT_WATCH_PARAM`` into the wandb run config.
    """
    # Directory that train.csv is read from.
    INPUT_DIR = "/kaggle/input/chaii-hindi-and-tamil-question-answering"
    # Root directory for saved model weights and validation result CSVs.
    OUTPUT_DIR = "/kaggle/working"
    # Default seed for set_seed() and random_state of the StratifiedKFold split.
    SEED = 0
    # Number of cross-validation folds (also the number of context-length bins).
    N_FOLDS = 5
    SKIP_FOLD = [1, 2, 3, 4]  # Only fold-0
    # Checkpoint name used for the tokenizer, the model config and the model weights.
    MODEL_NAME = "deepset/xlm-roberta-base-squad2"
    # tokenizer (loaded once, when this class body is executed)
    TOKENIZER = AutoTokenizer.from_pretrained(MODEL_NAME)
    # Attribute names that are NOT copied into the wandb run config.
    NOT_WATCH_PARAM = ["NOT_WATCH_PARAM", "TOKENIZER", "INPUT_DIR", "OUTPUT_DIR"]
    # Passed to the tokenizer as max_length; inputs are padded/truncated to it.
    MAX_LEN = 384
    # Passed to the tokenizer as stride for the overflowing context windows.
    STRIDE = 128
    # post_processing drops spans where end_index - start_index exceeds this.
    MAX_ANSWER_LEN = 30
    # sequence_ids value that marks context tokens in post_processing.
    CONTENT_ID= 1
    # Number of top start / end logits considered per feature in post_processing.
    N_BEST = 20
    # param
    # Training batch size.
    BS = 16
    # Learning rate given to AdamW.
    LR = 2e-5
    # Number of passes over the training loader.
    N_EPOCHS = 1
    # Fraction of the optimizer steps used as scheduler warm-up.
    WARM_UP_RATIO = 0.1
    # Overrides hidden_dropout_prob in the transformer config.
    HIDDEN_DROPOUT_PROB = 0.1
    # Overrides layer_norm_eps in the transformer config.
    LAYER_NORM_EPS = 1e-5
    # Weight decay given to AdamW.
    WEIGHT_DECAY = 1e-6
    # Number of batches whose gradients are accumulated per optimizer step.
    ACCUMULATE = 2
    # Validation runs every EVAL_STEP training batches.
    EVAL_STEP = 50

In [None]:
def set_seed(seed=config.SEED):
    """Seed Python, NumPy and PyTorch RNGs and switch cuDNN to deterministic mode.

    Args:
        seed: integer seed applied to every generator (defaults to ``config.SEED``).
    """
    os.environ["PYTHONHASHSEED"] = str(seed)

    # Every seeding function takes the same single integer argument.
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

    # Disable cuDNN auto-tuning and request deterministic kernels.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

### Dataloader

In [None]:
# cf: https://www.kaggle.com/thedrcat/chaii-eda-baseline
def encode_train_example(example):
    """Tokenize one question/context pair into overlapping windows with answer labels.

    Args:
        example: mapping with ``question``, ``context``, ``answer_text`` and
            ``answer_start`` (character offset of the answer in the context).

    Returns:
        The tokenizer output, extended with per-window ``start_positions``,
        ``end_positions`` (token indices, or the CLS index when the answer is
        not fully inside the window) and ``sequence_ids``.
    """
    question = example['question'].lstrip()
    context = example['context']
    answer_len = len(example["answer_text"])
    answer_begin = example["answer_start"]
    answer_stop = answer_begin + answer_len

    encoded = config.TOKENIZER(
        question,
        context,
        truncation="only_second",
        max_length=config.MAX_LEN,
        stride=config.STRIDE,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        return_token_type_ids=True,
        padding="max_length",
    )

    # The window -> sample mapping is dropped from the output; its value is not needed here.
    encoded.pop("overflow_to_sample_mapping")
    all_offsets = encoded["offset_mapping"]

    start_positions, end_positions, window_sequence_ids = [], [], []
    encoded["start_positions"] = start_positions
    encoded["end_positions"] = end_positions
    encoded['sequence_ids'] = window_sequence_ids

    cls_token_id = config.TOKENIZER.cls_token_id
    for window, offsets in enumerate(all_offsets):
        ids = encoded["input_ids"][window]
        cls_pos = ids.index(cls_token_id)
        seq_ids = encoded.sequence_ids(window)

        # First and last token of the window that belong to the context (sequence id 1).
        first_ctx = 0
        while seq_ids[first_ctx] != 1:
            first_ctx += 1
        last_ctx = len(ids) - 1
        while seq_ids[last_ctx] != 1:
            last_ctx -= 1

        answer_inside = (
            offsets[first_ctx][0] <= answer_begin
            and offsets[last_ctx][1] >= answer_stop
        )
        if answer_inside:
            # Walk inwards from both ends until the tokens bracket the answer characters.
            while first_ctx < len(offsets) and offsets[first_ctx][0] <= answer_begin:
                first_ctx += 1
            start_positions.append(first_ctx - 1)
            while offsets[last_ctx][1] >= answer_stop:
                last_ctx -= 1
            end_positions.append(last_ctx + 1)
        else:
            # Answer not fully covered by this window: label it with the CLS token.
            start_positions.append(cls_pos)
            end_positions.append(cls_pos)

        window_sequence_ids.append(seq_ids)

    return encoded


def extract_feature_example(df):
    """Encode every row of ``df`` and concatenate the resulting windows.

    Args:
        df: DataFrame whose rows are accepted by ``encode_train_example`` and
            that also carries an ``id`` column.

    Returns:
        Dict of parallel lists (one entry per tokenized window), including
        ``example_id`` with the ``id`` of the row each window came from.
    """
    copied_fields = (
        'input_ids',
        'attention_mask',
        'token_type_ids',
        'start_positions',
        'end_positions',
        'offset_mapping',
        'sequence_ids',
    )
    features = {field: [] for field in copied_fields}
    features['example_id'] = []

    for _, record in df.iterrows():
        encoded = encode_train_example(record)
        for field in copied_fields:
            features[field].extend(encoded[field])
        # Tag every window produced from this row with the row's id.
        n_windows = len(encoded['input_ids'])
        features['example_id'].extend(record['id'] for _ in range(n_windows))

    return features


class ChAIIDataset(Dataset):
    """Dataset of tokenized QA windows built from a DataFrame of examples."""

    def __init__(self, df, train):
        """Tokenize ``df`` up front; ``train`` controls whether items carry ``sequence_ids``."""
        self.train = train
        self.feature_examples = extract_feature_example(df)

    def __len__(self):
        """Return the number of tokenized windows."""
        return len(self.feature_examples['input_ids'])

    def __getitem__(self, item):
        """Return window ``item`` as a dict of tensors plus its ``example_id``.

        When the dataset was created with ``train=False`` the raw
        ``sequence_ids`` list is included as well.
        """
        features = self.feature_examples
        tensor_fields = (
            'input_ids',
            'attention_mask',
            'token_type_ids',
            'start_positions',
            'end_positions',
            'offset_mapping',
        )
        sample = {name: torch.tensor(features[name][item]) for name in tensor_fields}
        sample['example_id'] = features['example_id'][item]
        if self.train:
            return sample
        sample['sequence_ids'] = features['sequence_ids'][item]
        return sample

### functions for evaluation

In [None]:
def jaccard(str1, str2):
    """Return the word-level Jaccard similarity of two strings.

    Both strings are lower-cased and split on whitespace; the score is
    ``|A & B| / |A | B|`` over the resulting token sets.

    Args:
        str1: first string.
        str2: second string.

    Returns:
        A float in ``[0.0, 1.0]``. When neither string contains a token the
        two (empty) token sets are identical and ``1.0`` is returned instead
        of dividing by zero.
    """
    a = set(str1.lower().split())
    b = set(str2.lower().split())
    union = a | b
    if not union:
        # Both inputs are empty or whitespace-only: avoid 0 / 0.
        return 1.0
    return len(a & b) / len(union)


def eval_model(model, dset):
    """Run ``model`` over every item of ``dset`` one at a time, without gradients.

    Args:
        model: callable returning a dict with ``loss``, ``start_logits`` and
            ``end_logits``.
        dset: iterable of feature dicts holding un-batched tensors.

    Returns:
        Tuple ``(all_logits, mean_loss)`` where ``all_logits`` is a list of
        ``(start_logits, end_logits)`` CPU tensors in dataset order and
        ``mean_loss`` is the average of the per-item losses.
    """
    def as_batch(tensor):
        # Add a leading batch axis of size 1 and move the tensor to the device.
        return tensor.unsqueeze(0).to(device)

    model.eval()
    logit_pairs = []
    loss_values = []
    with torch.no_grad():
        for feature in dset:
            result = model(
                as_batch(feature['input_ids']),
                as_batch(feature['attention_mask']),
                as_batch(feature['token_type_ids']),
                start_positions=as_batch(feature['start_positions']),
                end_positions=as_batch(feature['end_positions']),
            )
            loss_values.append(result['loss'].item())
            logit_pairs.append((result['start_logits'].cpu(), result['end_logits'].cpu()))
    return logit_pairs, np.array(loss_values).mean()


def post_processing(all_logits, train_df, dset):
    """Convert per-feature start/end logits into one predicted answer per example.

    For every example the ``config.N_BEST`` highest start and end logits of
    each of its features are combined; spans that touch non-context tokens,
    run backwards or span more than ``config.MAX_ANSWER_LEN`` tokens are
    skipped, and the span with the highest ``start + end`` logit sum wins.

    Args:
        all_logits: sequence indexed like ``dset``; each item is a
            ``(start_logits, end_logits)`` pair of CPU tensors of which row 0
            is read.
        train_df: DataFrame with ``id`` and ``context`` columns.
        dset: dataset exposing ``feature_examples`` with parallel
            ``example_id``, ``offset_mapping`` and ``sequence_ids`` lists.

    Returns:
        DataFrame with columns ``predict_text``, ``score`` and ``id``, one row
        per example in first-appearance order. Examples without any valid
        span get an empty ``predict_text`` and a score of ``0.0``.

    Raises:
        KeyError: if an example id of ``dset`` is missing from ``train_df``.
    """
    feature_examples = dset.feature_examples

    # Group feature indices by example straight from the stored ids; iterating
    # the dataset itself would build tensors for every item for no benefit.
    features_per_example = collections.defaultdict(list)
    for feature_index, example_id in enumerate(feature_examples['example_id']):
        features_per_example[example_id].append(feature_index)

    # One pass over the frame instead of one DataFrame.query per example.
    # setdefault keeps the first row for a duplicated id.
    context_by_id = {}
    for example_id, context in zip(train_df['id'], train_df['context']):
        context_by_id.setdefault(example_id, context)

    predicts = []
    for example_id, feature_indices in features_per_example.items():
        context = context_by_id[example_id]
        predict_answers = []
        for feature_index in feature_indices:
            start_logits, end_logits = all_logits[feature_index]
            sequence_ids = feature_examples["sequence_ids"][feature_index]
            # Blank out offsets of tokens that are not part of the context.
            offset_mapping = [
                offset if sequence_id == config.CONTENT_ID else None
                for sequence_id, offset in zip(sequence_ids, feature_examples["offset_mapping"][feature_index])
            ]
            n_tokens = len(offset_mapping)

            start_indexes = np.argsort(start_logits[0].numpy())[-config.N_BEST:]
            end_indexes = np.argsort(end_logits[0].numpy())[-config.N_BEST:]
            for start_index in start_indexes:
                # Range check first, so the lookup that follows cannot raise.
                if start_index >= n_tokens or offset_mapping[start_index] is None:
                    continue
                for end_index in end_indexes:
                    if end_index >= n_tokens or offset_mapping[end_index] is None:
                        continue
                    if start_index > end_index or (end_index - start_index) > config.MAX_ANSWER_LEN:
                        continue

                    start_char = offset_mapping[start_index][0]
                    end_char = offset_mapping[end_index][1]
                    score = start_logits[0, start_index] + end_logits[0, end_index]

                    predict_answers.append({
                        'predict_text': context[start_char:end_char],
                        'score': score.item()
                    })

        if predict_answers:
            # max() returns the first of equally scored candidates, exactly
            # like a stable descending sort followed by [0].
            best_answer = max(predict_answers, key=lambda x: x["score"])
        else:
            best_answer = {"predict_text": "", "score": 0.0}

        best_answer['id'] = example_id
        predicts.append(best_answer)

    # Explicit columns keep the frame mergeable on 'id' even when it is empty.
    return pd.DataFrame(predicts, columns=['predict_text', 'score', 'id'])

### Model

In [None]:
class ChaiiModel(nn.Module):
    """Pretrained transformer encoder followed by a linear start/end span head."""

    def __init__(self):
        """Load the ``config.MODEL_NAME`` encoder and create the 2-output QA head."""
        super().__init__()
        config_overrides = {
            "output_hidden_states": True,
            "hidden_dropout_prob": config.HIDDEN_DROPOUT_PROB,
            "layer_norm_eps": config.LAYER_NORM_EPS,
            "add_pooling_layer": False,
        }
        self.transformer_config = AutoConfig.from_pretrained(config.MODEL_NAME)
        self.transformer_config.update(config_overrides)
        self.transformer = AutoModel.from_pretrained(config.MODEL_NAME, config=self.transformer_config)
        self.qa_outputs = nn.Linear(self.transformer_config.hidden_size, 2)
        self.__init_weights(self.qa_outputs)

    def __init_weights(self, module):
        """Re-initialise a linear layer: normal weights (config std), zero bias."""
        if not isinstance(module, nn.Linear):
            return
        module.weight.data.normal_(mean=0.0, std=self.transformer_config.initializer_range)
        if module.bias is not None:
            module.bias.data.zero_()

    def forward(self, input_ids, attention_mask, token_type_ids=None, start_positions=None, end_positions=None):
        """Compute start/end logits per token and, when labels are given, the loss.

        Returns:
            Dict with ``start_logits``, ``end_logits`` and ``loss`` (``None``
            unless both ``start_positions`` and ``end_positions`` are passed).
        """
        encoder_out = self.transformer(input_ids, attention_mask, token_type_ids)
        # Available keys: 'last_hidden_state', 'pooler_output', 'hidden_states'.
        token_states = encoder_out['last_hidden_state']
        span_logits = self.qa_outputs(token_states)
        # Split the 2-wide head output into separate start and end logits.
        start_logits, end_logits = (
            part.squeeze(-1).contiguous() for part in span_logits.split(1, dim=-1)
        )

        has_labels = start_positions is not None and end_positions is not None
        loss = self.loss_fn(start_logits, end_logits, start_positions, end_positions) if has_labels else None

        return {
            'start_logits': start_logits,
            'end_logits': end_logits,
            'loss': loss,
        }

    def loss_fn(self, start_logits, end_logits, start_positions, end_positions):
        """Return the mean of the start and end cross-entropy losses.

        Positions are clamped to ``[0, seq_len]``; a position equal to
        ``seq_len`` is ignored by the loss.
        """
        seq_len = start_logits.size(1)
        criterion = nn.CrossEntropyLoss(ignore_index=seq_len)
        start_loss = criterion(start_logits, start_positions.clamp(0, seq_len))
        end_loss = criterion(end_logits, end_positions.clamp(0, seq_len))
        return (start_loss + end_loss) / 2

### Functions for training each language separately

In [None]:
def _validate_fold(model, train_df, valid_dataset):
    """Score ``model`` on ``valid_dataset``; return (mean loss, mean Jaccard, per-example results)."""
    all_logits, valid_loss_avg = eval_model(model, valid_dataset)
    predict_df = post_processing(all_logits, train_df, valid_dataset)
    result_df = predict_df.merge(train_df, how='left', on='id')
    result_df['jaccard'] = result_df.apply(lambda x: jaccard(x['answer_text'], x['predict_text']), axis=1)
    return valid_loss_avg, result_df['jaccard'].mean(), result_df


def train_each_lang(exp_no, lang):
    """Fine-tune and validate one QA model per non-skipped fold on a single language.

    Reads ``train.csv`` from ``config.INPUT_DIR``, keeps the rows whose
    ``language`` equals ``lang``, assigns folds stratified on context-length
    quantiles and trains every fold not listed in ``config.SKIP_FOLD``.
    Validation runs on the first batch, every ``config.EVAL_STEP`` batches and
    at the end of each epoch; metrics are logged to wandb.

    Side effects: writes the best-Jaccard weights and result CSV to
    ``{config.OUTPUT_DIR}/{exp_no}_{lang}/`` and creates one wandb run per fold.

    Args:
        exp_no: experiment label used for the wandb group, run names and the
            output directory.
        lang: value of the ``language`` column to train on.
    """
    train_df = pd.read_csv(f'{config.INPUT_DIR}/train.csv')
    train_df = train_df.query(f'language=="{lang}"').reset_index(drop=True)

    # Stratify the folds on context-length quantile bins.
    context_len_bins = pd.qcut(train_df['context'].map(len), config.N_FOLDS, labels=range(config.N_FOLDS)).tolist()
    skf = StratifiedKFold(n_splits=config.N_FOLDS, shuffle=True, random_state=config.SEED)
    for f, (_, v_) in enumerate(skf.split(X=train_df, y=context_len_bins)):
        train_df.loc[v_, 'kfold'] = f

    for fold in range(config.N_FOLDS):
        if fold in config.SKIP_FOLD:
            continue

        train_dataset = ChAIIDataset(train_df.query(f'kfold!={fold}'), train=True)
        valid_dataset = ChAIIDataset(train_df.query(f'kfold=={fold}'), train=False)

        set_seed()
        train_loader = DataLoader(train_dataset, batch_size=config.BS,
                                  pin_memory=True, shuffle=True, drop_last=True,
                                  # os.cpu_count() may return None; fall back to loading in the main process.
                                  num_workers=os.cpu_count() or 0,
                                  worker_init_fn=lambda x: set_seed())
        steps_per_epoch = len(train_loader)

        model = ChaiiModel()
        model.to(device)

        optimizer = AdamW(model.parameters(), lr=config.LR, weight_decay=config.WEIGHT_DECAY)
        # The scheduler advances once per optimizer step, i.e. once per ACCUMULATE batches.
        max_train_steps = config.N_EPOCHS * steps_per_epoch // config.ACCUMULATE
        warmup_steps = int(max_train_steps * config.WARM_UP_RATIO)
        scheduler = get_cosine_schedule_with_warmup(
            optimizer,
            num_warmup_steps=warmup_steps,
            num_training_steps=max_train_steps
        )

        # wandb
        unique_exp_name = f"{exp_no}_{lang}_f{fold}"
        output_dir = f'{config.OUTPUT_DIR}/{exp_no}_{lang}'
        wandb.init(project='ChAII', entity='trtd56', name=unique_exp_name, group=exp_no)
        bar = None
        # Pre-bound so the cleanup below works even if the loader yields no batch.
        loss = None
        try:
            wandb_config = wandb.config
            wandb_config.fold = fold
            for k, v in dict(vars(config)).items():
                if k[:2] == "__" or k in config.NOT_WATCH_PARAM:
                    continue
                wandb_config[k] = v
            wandb.watch(model)
            os.makedirs(output_dir, exist_ok=True)

            set_seed()
            valid_best_loss, jaccard_best_score, step_i = 999, 0, 0
            # Sum of un-scaled batch losses and batch count since the last wandb log.
            train_loss_sum, train_loss_steps = 0.0, 0
            optimizer.zero_grad()
            bar = tqdm(total=max_train_steps)
            bar.set_description(f'{unique_exp_name}')
            for epoch in range(config.N_EPOCHS):
                for d in train_loader:
                    step_i += 1
                    model.train()
                    with torch.cuda.amp.autocast():
                        outputs = model(
                            d['input_ids'].to(device),
                            d['attention_mask'].to(device),
                            d['token_type_ids'].to(device),
                            start_positions=d['start_positions'].to(device),
                            end_positions=d['end_positions'].to(device)
                        )
                        batch_loss = outputs['loss']
                        # Scale so the accumulated gradient is the mean over ACCUMULATE batches.
                        loss = batch_loss / config.ACCUMULATE

                    step_lr = np.array([param_group["lr"] for param_group in optimizer.param_groups]).mean()
                    # Track the un-scaled loss so train_loss is comparable with valid_loss.
                    train_loss_sum += batch_loss.item()
                    train_loss_steps += 1

                    scaler.scale(loss).backward()
                    if step_i % config.ACCUMULATE == 0:
                        scaler.step(optimizer)
                        scaler.update()
                        optimizer.zero_grad()
                        scheduler.step()
                        bar.update(1)

                    # Validate on the first batch, every EVAL_STEP batches and at each epoch end.
                    if step_i % config.EVAL_STEP == 0 or step_i == 1 or step_i % steps_per_epoch == 0:
                        valid_loss_avg, jaccard_score_avg, result_df = _validate_fold(model, train_df, valid_dataset)

                        if valid_loss_avg < valid_best_loss:
                            valid_best_loss = valid_loss_avg

                        if jaccard_score_avg > jaccard_best_score:
                            jaccard_best_score = jaccard_score_avg
                            torch.save(model.state_dict(), f"{output_dir}/chaii_f{fold}_best_jaccard_model.bin")
                            result_df.to_csv(f"{output_dir}/chaii_f{fold}_best_jaccard_result.csv", index=None)

                        wandb.log({
                            # Mean over the batches actually seen since the previous log.
                            "train_loss": train_loss_sum / train_loss_steps,
                            "valid_loss": valid_loss_avg,
                            "valid_best_loss": valid_best_loss,
                            "jaccard_score": jaccard_score_avg,
                            "jaccard_best_score": jaccard_best_score,
                            "learning_rate": step_lr,
                        })
                        train_loss_sum, train_loss_steps = 0.0, 0
        finally:
            # Always release the progress bar and close the wandb run, even on failure.
            if bar is not None:
                bar.close()
            wandb.finish()

        # Drop GPU references before the next fold / language is trained.
        del model, optimizer, scheduler, loss
        torch.cuda.empty_cache()

In [None]:
def main():
    """Train experiment "exp0016" on Tamil first, then on Hindi."""
    experiment = "exp0016"
    for language in ("tamil", "hindi"):
        train_each_lang(experiment, language)

In [None]:
# Start training only when this code is executed as the main module.
if __name__ == "__main__":
    main()