In [None]:
from IPython.core.magic import register_cell_magic
import os
from pathlib import Path

## Define a custom cell magic that saves a cell's source to a file and then runs the cell,
## so the most useful classes can be reused in the inference notebook
## instead of copying the code by hand every time the classes change.
@register_cell_magic
def write_and_run(line, cell):
    """Cell magic: write the cell's source to a file, then execute the cell.

    Usage::

        %%write_and_run [-a] path/to/file.py

    The last whitespace-separated token of ``line`` is the target file. The file is
    overwritten unless the line consists of exactly two tokens and the first one is
    ``-a``, in which case the cell is appended instead.

    Args:
        line: Text following the magic name on the first line of the cell.
        cell: Source of the cell body.

    Raises:
        ValueError: If no target file is given on the magic line.
    """
    argz = line.split()
    if not argz:
        # Previously this surfaced as a bare IndexError from argz[-1].
        raise ValueError("usage: %%write_and_run [-a] <file>")
    file = argz[-1]
    mode = 'a' if len(argz) == 2 and argz[0] == '-a' else 'w'
    # Make sure the destination directory exists regardless of the current working directory.
    Path(file).parent.mkdir(parents=True, exist_ok=True)
    # Python source is UTF-8 by default, so do not depend on the locale's encoding.
    with open(file, mode, encoding='utf-8') as f:
        f.write(cell)
    get_ipython().run_cell(cell)
    
# Directory that receives the scripts written by %%write_and_run; only the leaf
# directory is created (the parent must already exist), and an existing one is fine.
Path('/kaggle/working/scripts').mkdir(parents=False, exist_ok=True)


In [None]:
import os
import gc
gc.enable()
import math
import json
import time
import random
import multiprocessing
import warnings
warnings.filterwarnings("ignore", category=UserWarning)

import numpy as np
import pandas as pd
from tqdm import tqdm, trange
from sklearn import model_selection

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import Parameter
import torch.optim as optim

import matplotlib.pyplot as plt
import transformers
from transformers import (
    WEIGHTS_NAME,
    AutoConfig,
    AutoTokenizer,

)

import pytorch_lightning as pl
from tqdm import tqdm
tqdm.pandas()
import pickle 

class Config:
    """Notebook-wide settings shared by seeding, fold creation and feature extraction."""
    # Random seed: passed to seed_everything and used as random_state of StratifiedKFold.
    seed = 42
    # Identifier handed to AutoTokenizer.from_pretrained; its last path component also
    # prefixes the names of the saved tokenizer and feature files.
    model_type = "deepset/xlm-roberta-base-squad2"
    # Passed as `max_length` to the tokenizer call in FeatureExtractor.extract_features.
    max_seq_length = 384
    # Passed as `stride` to the tokenizer call in FeatureExtractor.extract_features.
    doc_stride = 128
    
# Seed the RNGs through the top-level API: the `pl.utilities.seed.seed_everything` path is
# a deprecated location that newer PyTorch Lightning releases no longer provide, while
# `pl.seed_everything` accepts the same arguments.
pl.seed_everything(Config.seed, workers=True)


In [None]:
def create_folds(data, num_splits):
    """Assign a stratified validation fold to every row of ``data``.

    A ``kfold`` column is added to ``data`` in place (and the same frame is returned).
    Rows are stratified on the ``language`` column, shuffled with ``Config.seed``, and
    each row receives the number of the fold in which it is a validation sample.

    Args:
        data: DataFrame with a ``language`` column; any index is supported.
        num_splits: Number of folds.

    Returns:
        The input DataFrame with the ``kfold`` column filled with values in
        ``0 .. num_splits - 1``.
    """
    data["kfold"] = -1
    kfold_col = data.columns.get_loc("kfold")
    kf = model_selection.StratifiedKFold(n_splits=num_splits, shuffle=True, random_state=Config.seed)
    for fold, (_, val_ids) in enumerate(kf.split(X=data, y=data['language'])):
        # split() yields row positions, not index labels, so assign positionally;
        # label-based .loc is only correct when the index happens to be 0..n-1.
        data.iloc[val_ids, kfold_col] = fold
    return data


import re
def find_all_substring_positions(string, substring):
    """Return the start offsets of the literal matches of ``substring`` in ``string``.

    ``substring`` is escaped, so regex metacharacters in it are matched verbatim.
    Matches are the ones produced by ``re.finditer``, i.e. they do not overlap.
    """
    literal = re.compile(re.escape(substring))
    positions = []
    for match in literal.finditer(string):
        positions.append(match.start())
    return positions

def convert_answers(row):
    """Build a SQuAD-style answers dict from an (answer_start, answer_text) pair.

    Args:
        row: Two-element record whose first item is the answer start offset and whose
            second item is the answer text. May be a pandas Series (as produced by
            ``DataFrame.apply(..., axis=1)``) or any plain sequence.

    Returns:
        ``{'answer_start': [start], 'text': [text]}``.
    """
    if hasattr(row, "iloc"):
        # Integer keys on a string-labelled Series rely on a deprecated positional
        # fallback; .iloc states the positional intent explicitly.
        answer_start, answer_text = row.iloc[0], row.iloc[1]
    else:
        answer_start, answer_text = row[0], row[1]
    return {'answer_start': [answer_start], 'text': [answer_text]}


## FINETUNE: load the competition data plus the external Hindi QA sets.
train = pd.read_csv('../input/chaii-hindi-and-tamil-question-answering/train.csv')
test = pd.read_csv('../input/chaii-hindi-and-tamil-question-answering/test.csv')
external_mlqa = pd.read_csv('../input/mlqa-hindi-processed/mlqa_hindi.csv')
external_xquad = pd.read_csv('../input/mlqa-hindi-processed/xquad.csv')
external_train = pd.concat([external_mlqa, external_xquad])

# Only the competition rows get a validation fold; external rows keep kfold == -1
# (presumably so they are never used for validation -- confirm against the training notebook).
train = create_folds(train, num_splits=5)
external_train["kfold"] = -1
# First reset_index() turns the post-concat index into an 'index' column, which is dropped
# together with 'id'; the second reset_index() adds a fresh 'index' column that
# FeatureExtractor copies into every feature.
# NOTE(review): assumes the external CSVs have no 'index' column of their own -- confirm.
train = pd.concat([train, external_train]).reset_index().drop(['id', 'index'], axis=1).reset_index()
# Pack answer start/text into the dict layout read by FeatureExtractor.extract_features.
train['answers'] = train[['answer_start', 'answer_text']].apply(convert_answers, axis=1)





In [None]:
# Gather the index labels of rows whose first answer text is not a string,
# then drop those rows from the training frame.
inds_to_remove = [
    label
    for label, answers in train['answers'].items()
    if not isinstance(answers['text'][0], str)
]
train = train.drop(index=inds_to_remove)

In [None]:
%%write_and_run scripts/preprocess.py

class FeatureExtractor:
    """Convert question/context rows into tokenized windows for extractive QA.

    Every row is tokenized as a (question, context) pair with ``truncation="only_second"``,
    ``stride`` and ``return_overflowing_tokens=True``; each window returned by the tokenizer
    becomes one feature row. Unless ``test`` is set, every window is also labelled with the
    token span of the answer, or with the CLS position when the answer is not fully
    contained in the window.

    ``Config`` (``max_seq_length``, ``doc_stride``), ``tqdm`` and ``pd`` are looked up at
    call time and must exist in the namespace that runs this code.
    """

    OUT_COLUMNS = ['index', 'input_ids', 'attention_mask', 'offset_mapping', 'sequence_ids']
    TRAIN_SPECIFIC_COLUMNS = ['start_position', 'end_position', 'kfold']

    def __init__(self, tokenizer, cls_token, test=False):
        """
        Args:
            tokenizer: Callable tokenizer that also provides ``encode``; its output must
                support ``pop``, item access and ``sequence_ids(i)``.
            cls_token: Text of the CLS token.
            test: When truthy, no answer labels are produced and no ``answers``/``kfold``
                columns are required in the input frame.
        """
        self.tokenizer = tokenizer
        self.test = test
        # Same truthiness test as the labelling code, so columns and features always agree.
        self.columns = self.OUT_COLUMNS + ([] if self.test else self.TRAIN_SPECIFIC_COLUMNS)
        self.cls_token = cls_token
        # NOTE(review): this takes the first id of the encoded text, i.e. whatever token the
        # tokenizer puts first -- confirm that this is always the id of `cls_token`.
        self.cls_token_id = self.tokenizer.encode(self.cls_token)[0]

    def extract_features(self, df):
        """Tokenize every row of ``df`` and return one row per token window.

        Args:
            df: DataFrame with ``index``, ``question`` and ``context`` columns, plus
                ``answers`` (dict with ``answer_start`` and ``text`` lists) and ``kfold``
                when not in test mode.

        Returns:
            DataFrame with ``self.columns``. In training mode, rows whose first answer
            text is not a string are skipped.
        """
        features = []
        for row_pos in tqdm(range(len(df))):
            row = df.iloc[row_pos]

            answers = None
            if not self.test:
                answers = row["answers"]
                texts = answers["text"]
                # Skip unusable answers before paying for tokenization; an empty list is
                # kept and labelled as "no answer" further down.
                if len(texts) > 0 and not isinstance(texts[0], str):
                    continue

            tokenized_sample = self.tokenizer(
                row["question"].strip(),
                row["context"],
                truncation="only_second",
                max_length=Config.max_seq_length,
                stride=Config.doc_stride,
                return_overflowing_tokens=True,
                return_offsets_mapping=True,
                padding="max_length",
            )
            offset_mapping = tokenized_sample.pop("offset_mapping")

            for window_idx, offsets in enumerate(offset_mapping):
                input_ids = tokenized_sample["input_ids"][window_idx]
                sequence_ids = tokenized_sample.sequence_ids(window_idx)
                feature = {
                    'index': row['index'],
                    'input_ids': input_ids,
                    'attention_mask': tokenized_sample["attention_mask"][window_idx],
                    'offset_mapping': offsets,
                    # Positions without a sequence id (None) are stored as 0.
                    'sequence_ids': [0 if sid is None else sid for sid in sequence_ids],
                }
                if not self.test:
                    feature['kfold'] = row['kfold']
                    start_position, end_position = self._locate_answer(
                        answers, input_ids, offsets, sequence_ids
                    )
                    feature['start_position'] = start_position
                    feature['end_position'] = end_position
                features.append(feature)

        features_df = pd.DataFrame(data=features, columns=self.columns)
        return features_df

    def _locate_answer(self, answers, input_ids, offsets, sequence_ids):
        """Return (start_token, end_token) of the answer in one window, or CLS for both."""
        cls_index = input_ids.index(self.cls_token_id)
        if len(answers["answer_start"]) == 0:
            return cls_index, cls_index

        start_char = answers["answer_start"][0]
        end_char = start_char + len(answers["text"][0])

        # First and last token of the window that belong to the context (sequence id 1).
        token_start_index = 0
        while sequence_ids[token_start_index] != 1:
            token_start_index += 1
        token_end_index = len(input_ids) - 1
        while sequence_ids[token_end_index] != 1:
            token_end_index -= 1

        # The answer must lie entirely inside the context part of this window.
        if not (offsets[token_start_index][0] <= start_char and offsets[token_end_index][1] >= end_char):
            return cls_index, cls_index

        # Walk inwards until the tokens just pass the answer boundaries, then step back.
        while token_start_index < len(offsets) and offsets[token_start_index][0] <= start_char:
            token_start_index += 1
        while offsets[token_end_index][1] >= end_char:
            token_end_index -= 1
        return token_start_index - 1, token_end_index + 1

In [None]:
%%time
# Load the tokenizer for the configured model and store the tokenizer object next to the
# notebook; file names are prefixed with the last path component of Config.model_type.
tokenizer = AutoTokenizer.from_pretrained(Config.model_type)
torch.save(tokenizer, f"{Config.model_type.split('/')[-1]}_tokenizer.pt")
feat_extractor = FeatureExtractor(tokenizer, tokenizer.cls_token)
# Keep the raw, fold-annotated rows alongside the extracted features.
train.to_csv('kfold_raw.csv', index=False)

# Tokenize the whole training frame, then pickle (column labels, value array) as a tuple.
features_df = feat_extractor.extract_features(train)
data_to_save = (features_df.columns, features_df.values)
with open(f"{Config.model_type.split('/')[-1]}_features_kfold.pkl", 'wb') as f:
    pickle.dump(data_to_save, f, protocol=4)

In [None]:
# tokenizer = AutoTokenizer.from_pretrained(Config.model_type)
# tokenizer.add_tokens(['<CLS>'])
# print(tokenizer.encode('<CLS>'), tokenizer.decode([48044]))
# print(tokenizer.encode('<CLS> नियम-निष्ठता '))

In [None]:
# import pickle
# import pandas as pd
# with open('./xlm-roberta-base-squad2_features_kfold.pkl_SHORT', 'rb') as f:
#     feat_data = pickle.load(f)
    
# f_df = pd.DataFrame(data=feat_data[1], columns=feat_data[0])
# f_df.head(30)