We want to solve an open-domain QA task.

My process is as follows:
#### 0. [Orientation](https://www.kaggle.com/adldotori/notebook-to-read-before-start-nlp-step-0/)
#### 1. [Tokenization](https://www.kaggle.com/adldotori/tokenizing-hindi-and-tamil-language-nlp-step-1)
   * ver 1 : init (2021/10/03)
   * ver 2 : change transformer version (2021/10/03)
   * ver 3 : update description (2021/10/05)

#### 2. [Demo](https://www.kaggle.com/adldotori/demo-training-nlp-step-2/)
#### 3. Research QA Model
#### 4. Training
#### 5. Inference

In [None]:
!pip3 install transformers==4.11.2

# Save context depending on language

In [None]:
import os
import os.path as osp

import pandas as pd

In [None]:
# Directory that holds the competition CSV files (train.csv, test.csv,
# sample_submission.csv) loaded in the next cell.
INPUT_PATH = '../input/chaii-hindi-and-tamil-question-answering/'

In [None]:
# Load the train / test / sample-submission tables from INPUT_PATH, in that order.
train, test, sub = (
    pd.read_csv(osp.join(INPUT_PATH, csv_name))
    for csv_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

In [None]:
# Preview the first rows whose language column is 'tamil'.
train.loc[train['language'] == 'tamil'].head()

In [None]:
# Preview the first rows whose language column is 'hindi'.
train.loc[train['language'] == 'hindi'].head()

In [None]:
# Join every 'context' value of each language into one newline-separated string.
tamil_context, hindi_context = (
    train.loc[train.language == lang, 'context'].str.cat(sep='\n')
    for lang in ('tamil', 'hindi')
)

In [None]:
# Distinct characters seen in each corpus; each set is built once and reused
# for the size, intersection and difference figures below.
tamil_chars = set(tamil_context)
hindi_chars = set(hindi_context)

print(
    '\nlength of tamil characters : ', len(tamil_chars),
    '\nlength of hindi characters : ', len(hindi_chars),
    '\nlength of hindi & tamil characters : ', len(tamil_chars & hindi_chars),
    '\nlength of only tamil characters : ', len(tamil_chars - hindi_chars),
    '\nlength of only hindi characters : ', len(hindi_chars - tamil_chars)
)

Since only 700 of the 1400 total characters overlap, the two languages are kept separate and a tokenizer is trained for each one.

In [None]:
# Write each language's concatenated contexts to its own text file; these files
# are the training corpora passed to train_tokenizer().
# The encoding is pinned to UTF-8 so the result does not depend on the
# platform's default locale encoding, which may be unable to represent
# Hindi/Tamil characters.
with open("tamil.txt", "w", encoding="utf-8") as f:
    print(tamil_context, file=f)

with open("hindi.txt", "w", encoding="utf-8") as f:
    print(hindi_context, file=f)

# Train BertWordPieceTokenizer

In [None]:
from tokenizers import BertWordPieceTokenizer

# Output directory for the trained tokenizer files.
os.makedirs('vocab', exist_ok=True)


def train_tokenizer(language: str):
    """
    Train a BertWordPieceTokenizer on `{language}.txt` and save it to
    `vocab/{language}`.

    language : language name; selects both the corpus file and the output path
    """
    print(f'>>> Training {language}...')

    corpus_files = [f'{language}.txt']
    output_path = f'vocab/{language}'

    wordpiece = BertWordPieceTokenizer(
        clean_text=True,
        handle_chinese_chars=True,
        strip_accents=False,  # Must be False if cased model
        lowercase=False,
        wordpieces_prefix="##",
    )
    wordpiece.train(
        files=corpus_files,
        vocab_size=30000,
        min_frequency=5,
        limit_alphabet=6000,
        show_progress=True,
    )

    # Second positional argument is passed through unchanged; presumably the
    # `pretty` flag of save() - confirm against the installed tokenizers version.
    wordpiece.save(output_path, True)

In [None]:
# Train one tokenizer per language, Tamil first.
for lang in ('tamil', 'hindi'):
    train_tokenizer(lang)

In [None]:
import json

def save_vocab(language : str):
    """
    Convert the tokenizer JSON at `vocab/{language}` into a plain vocab file
    `vocab/{language}.txt` with one token per line, then print a few entries.

    language : language name; selects the input JSON and the output .txt path

    The JSON is expected to hold a token -> id mapping under
    ["model"]["vocab"]. Tokens are written in ascending id order so that the
    line number of each token equals its id, which is how a one-token-per-line
    vocab file is interpreted when it is loaded back.
    """
    vocab_path = f'vocab/{language}'
    vocab_txt_path = f'vocab/{language}.txt'

    # Read the JSON first so a parse failure cannot leave a truncated,
    # half-written (or unclosed) output file behind.
    with open(vocab_path, encoding='utf-8') as json_file:
        json_data = json.load(json_file)

    # Sort by id instead of relying on the key order of the JSON object.
    ordered_vocab = sorted(json_data["model"]["vocab"].items(), key=lambda kv: kv[1])

    with open(vocab_txt_path, 'w', encoding='utf-8') as f:
        f.writelines(f'{token}\n' for token, _ in ordered_vocab)

    print(f'{language} token Example:\n')
    # Show a slice from the middle of the vocabulary (empty if it is smaller).
    for i, j in ordered_vocab[3000:3020]:
        print(f'{i} => {j}')
    print('\n')

In [None]:
# Export the plain-text vocab file for each language, Tamil first.
for lang in ("tamil", "hindi"):
    save_vocab(lang)

# Test BertTokenizer

In [None]:
from transformers import BertTokenizer

def test_tokenizer(
    language : str, 
    sample : int = 2
):
    """
    Build a BertTokenizer from `vocab/{language}.txt`, print an
    encode/decode round trip for a few contexts, and return the tokenizer.

    language : language name; selects the vocab file and filters the rows of `train`
    sample : number of contexts to show, taken from position 100 onwards among
             the rows of `train` for that language
    """
    print(f'{language} testing...\n')
    vocab_txt_path = f"vocab/{language}.txt"

    tokenizer = BertTokenizer(vocab_file=vocab_txt_path, do_lower_case=False)

    # Filter once instead of re-filtering the dataframe on every iteration.
    contexts = train[train.language == language]['context']

    for i in range(sample):
        # Keep only the first line, including its trailing newline. partition()
        # returns the whole text when there is no newline, whereas slicing with
        # find() + 1 would produce an empty string in that case.
        first_line, newline, _ = contexts.iloc[i+100].partition('\n')
        test_str = first_line + newline

        print(f'{i+1}th  Test Sentence: ',test_str)

        encoded_str = tokenizer.encode(test_str,add_special_tokens=False)
        print(f'{i+1}th Sentence Encoding: ',encoded_str)

        decoded_str = tokenizer.decode(encoded_str)
        print(f'{i+1}th  Sentence Decoding: ',decoded_str, '\n')
    
    print()
    return tokenizer

In [None]:
# Build and smoke-test one tokenizer per language, Tamil first.
tamil_tokenizer, hindi_tokenizer = map(test_tokenizer, ('tamil', 'hindi'))

# Save Checkpoints 

In [None]:
# Save the Tamil tokenizer into ./tamil_checkpoint; that directory is what
# BertTokenizer.from_pretrained() is pointed at later in this notebook.
os.makedirs('tamil_checkpoint', exist_ok=True)
tamil_tokenizer.save_pretrained('tamil_checkpoint')

In [None]:
# Save the Hindi tokenizer into ./hindi_checkpoint; that directory is what
# BertTokenizer.from_pretrained() is pointed at later in this notebook.
os.makedirs('hindi_checkpoint', exist_ok=True)
hindi_tokenizer.save_pretrained('hindi_checkpoint')

# RESULT

In [None]:
from transformers import BertTokenizer

def test_tokenizer_from_pretrained(
    language : str, 
    sample : int = 2
):
    """
    Reload the tokenizer saved in `./{language}_checkpoint` and print an
    encode/decode round trip for a few contexts.

    language : language name; selects the checkpoint directory and filters the rows of `train`
    sample : number of contexts to show, taken from position 100 onwards among
             the rows of `train` for that language
    """
    print(f'{language} testing...\n')

    tokenizer = BertTokenizer.from_pretrained(f'./{language}_checkpoint')

    # Filter once instead of re-filtering the dataframe on every iteration.
    contexts = train[train.language == language]['context']

    for i in range(sample):
        # Keep only the first line, including its trailing newline. partition()
        # returns the whole text when there is no newline, whereas slicing with
        # find() + 1 would produce an empty string in that case.
        first_line, newline, _ = contexts.iloc[i+100].partition('\n')
        test_str = first_line + newline

        print(f'{i+1}th  Test Sentence: ',test_str)

        encoded_str = tokenizer.encode(test_str,add_special_tokens=False)
        print(f'{i+1}th Sentence Encoding: ',encoded_str)

        decoded_str = tokenizer.decode(encoded_str)
        print(f'{i+1}th  Sentence Decoding: ',decoded_str, '\n')
    
    print()

In [None]:
# Check that each saved checkpoint reloads and tokenizes, Tamil first.
for lang in ('tamil', 'hindi'):
    test_tokenizer_from_pretrained(lang)

We will use these tokenizers in the same way as above. Now let's do QA training with them in the next notebook.