In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import tensorflow_datasets as tfds
import tensorflow as tf

import matplotlib.pyplot as plt
import seaborn as sns

import unicodedata
import re
import time

from transformers import TransfoXLConfig, TFTransfoXLModel

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only Kaggle input directory and print the full path of every
# file found, so the available datasets are visible in the cell output.
for input_dir, _, input_files in os.walk('/kaggle/input'):
    for input_file in input_files:
        full_path = os.path.join(input_dir, input_file)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the chaii Hindi/Tamil question-answering training table, print its row
# count, and preview the first rows as the cell output.
train_data = pd.read_csv(
    '/kaggle/input/chaii-hindi-and-tamil-question-answering/train.csv'
)
print(train_data.shape[0])
train_data.head()

In [None]:
# Remove, in place, every row that has at least one missing value, then print
# how many rows are left.
train_data.dropna(axis='index', how='any', inplace=True)
print(train_data.shape[0])

In [None]:
def _unicode_to_ascii(s):
    """Strip nonspacing combining marks from a piece of text.

    The text is decomposed with Unicode NFD normalization and every code point
    whose general category is ``Mn`` (Mark, nonspacing) is dropped. Despite the
    function name the result is not guaranteed to be ASCII: base characters of
    any script are kept, and nonspacing marks of any script (not only Latin
    accents) are removed.

    Parameters
    ----------
    s : str
        Input text.

    Returns
    -------
    str
        The NFD-decomposed text without its ``Mn`` code points.
    """
    decomposed = unicodedata.normalize('NFD', s)
    kept_chars = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(kept_chars)

In [None]:
# Run the mark-stripping normalisation over each text column, row by row,
# overwriting the column with the normalised strings.
for text_column in ('context', 'question', 'answer_text'):
    train_data[text_column] = train_data.apply(
        lambda record, column=text_column: _unicode_to_ascii(record[column]),
        axis=1,
    )

### language类别情况

In [None]:
# Count how many training rows there are for each distinct value of the
# `language` column; the resulting Series is shown as the cell output.
train_data['language'].value_counts()

In [None]:
def include(str1, str2):
    """Return the share of distinct tokens of ``str2`` that also occur in ``str1``.

    Both arguments are converted with ``str()``, lower-cased and split on
    whitespace. The comparison works on the resulting *sets* of tokens, so
    token order and repetitions are ignored.

    Parameters
    ----------
    str1 : object
        Text that is searched (e.g. the context).
    str2 : object
        Text whose tokens are looked up (e.g. the question or the answer).

    Returns
    -------
    float
        ``len(tokens(str1) & tokens(str2)) / len(tokens(str2))`` rounded to
        four decimals. ``0.0`` when ``str2`` contains no tokens at all.
    """
    searched_tokens = set(str(str1).lower().split())
    wanted_tokens = set(str(str2).lower().split())

    # An empty or whitespace-only `str2` yields no tokens; dividing by the
    # empty set's size would raise ZeroDivisionError, so report "no overlap".
    if not wanted_tokens:
        return 0.0

    shared_tokens = searched_tokens & wanted_tokens
    return round(len(shared_tokens) / len(wanted_tokens), 4)

question 与 context 的词集合(按空白切分、转小写)的交集大小,除以 question 的词集合大小,即 question 中有多少比例的词出现在 context 中

In [None]:
# For every training row, measure which share of the question's tokens also
# appears in the context, collect the scores in a frame sorted from highest to
# lowest, and draw how often each score value occurs.
context_question_similar = pd.DataFrame(
    [
        [row.context, row.question, include(row.context, row.question)]
        for _, row in train_data.iterrows()
    ],
    columns=['context', 'question', 'score'],
).sort_values(by='score', ascending=False)

f, ax = plt.subplots(figsize=(6, 15))
sns.set_color_codes("pastel")
sns.countplot(y="score", data=context_question_similar, color="b")

answer_text 与 context 的词集合(按空白切分、转小写)的交集大小,除以 answer_text 的词集合大小,即 answer_text 中有多少比例的词出现在 context 中

In [None]:
# For every training row, measure which share of the answer's tokens also
# appears in the context, then tabulate how often each score value occurs.
context_answer_text_similar = pd.DataFrame(
    [
        [row.context, row.answer_text, include(row.context, row.answer_text)]
        for _, row in train_data.iterrows()
    ],
    columns=['context', 'answer_text', 'score'],
)
context_answer_text_similar['score'].value_counts()

### 句子预处理

In [None]:
# Flag the rows whose answer tokens are all found among the context tokens
# (overlap score exactly 1) and keep only those rows. The flag stays in the
# frame as the boolean column `context_include_answertext`.
train_data['context_include_answertext'] = train_data.apply(
    lambda record: include(record['context'], record['answer_text']) == 1,
    axis=1,
)
answer_fully_covered = train_data['context_include_answertext'] == True
train_data = train_data[answer_fully_covered]

len(train_data)

context句子长度分布

In [None]:
# Whitespace-token count of every context, numbered 1..N in row order, sorted
# from longest to shortest, and drawn as a line over the row number.
index_len = pd.DataFrame(
    [
        [position, len(str(text).split())]
        for position, text in enumerate(train_data['context'], start=1)
    ],
    columns=['index', 'len'],
).sort_values(by='len', ascending=False)

f, ax = plt.subplots(figsize=(15, 6))
sns.lineplot(x="index", y="len", data=index_len)

answer_text句子长度分布

In [None]:
# Whitespace-token count of every answer text, numbered 1..N in row order,
# sorted from longest to shortest, and drawn as a line over the row number.
index_len = pd.DataFrame(
    [
        [position, len(str(text).split())]
        for position, text in enumerate(train_data['answer_text'], start=1)
    ],
    columns=['index', 'len'],
).sort_values(by='len', ascending=False)

f, ax = plt.subplots(figsize=(15, 6))
sns.lineplot(x="index", y="len", data=index_len)

# 数据重构

In [None]:
# Rebuild the training set at segment level.
#
# Each context is split on newlines, semicolons and full stops. Per original
# row this emits:
#   * at most one positive sample (score=1): the first segment that contains
#     the answer text, with the character offsets of the answer inside that
#     segment as answer_start / answer_end;
#   * at most `max_negatives` negative samples (score=0, offsets 0/0): segments
#     that do NOT contain the answer text.
#
# Two defects of the earlier version are fixed here:
#   * its early-exit test (`false_count >= 5`) could never be true because the
#     counter stops at 3, so every row scanned all of its segments;
#   * a later segment that also contains the answer used to be emitted as a
#     negative sample, i.e. with a wrong label. Such segments are now skipped.
max_negatives = 3

restructure_data = []
for index, row in train_data.iterrows():
    answer_text = row['answer_text']
    # The capturing group makes re.split return the delimiters as items too;
    # they are filtered out below by the whitespace / length checks (unless
    # the answer itself is a single character).
    contexts = re.split(r'(\n|;|\.)', row['context'])

    true_flag = False   # True once the positive sample of this row was emitted
    false_count = 0     # number of negative samples emitted for this row
    for context in contexts:
        # Nothing more can be emitted for this row: stop scanning.
        if true_flag and false_count >= max_negatives:
            break

        # Skip whitespace-only segments and segments too short to hold the answer.
        if re.search(r'^\s+$', context) is not None or len(context) < len(answer_text):
            continue

        result = context.find(answer_text)

        if result != -1:
            if true_flag:
                # Contains the answer but the positive sample already exists;
                # never label an answer-bearing segment as a negative.
                continue
            restructure_data.append([
                context, row['question'], answer_text,
                result, result + len(answer_text), 1, row['language'],
            ])
            true_flag = True
        elif false_count < max_negatives:
            false_count += 1
            restructure_data.append([
                context, row['question'], answer_text,
                0, 0, 0, row['language'],
            ])

restructure_data = pd.DataFrame(
    restructure_data,
    columns=['context', 'question', 'answer_text', 'answer_start', 'answer_end', 'score', 'language'],
)


In [None]:
# Number of segment-level samples (positives and negatives) that were produced.
len(restructure_data)

In [None]:
# Preview the first segment-level samples.
restructure_data.head()

In [None]:
# Flag the segments that have fewer than 140 whitespace-separated tokens and
# keep only those. Note that the column `context_len` holds the boolean flag,
# not the length itself.
restructure_data['context_len'] = restructure_data.apply(
    lambda record: len(str(record['context']).split()) < 140,
    axis=1,
)
is_short_enough = restructure_data['context_len'] == True
restructure_data = restructure_data[is_short_enough]

len(restructure_data)

In [None]:
# Whitespace-token count of every remaining segment, numbered 1..N in row
# order and sorted from longest to shortest; only the 1000 longest segments
# are drawn.
index_len = pd.DataFrame(
    [
        [position, len(str(text).split())]
        for position, text in enumerate(restructure_data['context'], start=1)
    ],
    columns=['index', 'len'],
).sort_values(by='len', ascending=False)

f, ax = plt.subplots(figsize=(15, 6))
sns.lineplot(x="index", y="len", data=index_len[:1000])

### tokenize

In [None]:
# Corpus for the subword vocabulary: every context, question and answer string
# of the filtered training table, concatenated into one flat array.
metadata = np.concatenate(
    (train_data['context'], train_data['question'], train_data['answer_text']),
    axis=0,
)
print(np.shape(metadata))

# The text encoders used to live in `tfds.features.text`; newer TFDS releases
# expose them as `tfds.deprecated.text` instead. Use whichever location this
# environment provides so the cell runs on both.
_text_encoders = getattr(tfds.features, 'text', None)
if not hasattr(_text_encoders, 'SubwordTextEncoder'):
    _text_encoders = tfds.deprecated.text

# Learn a subword vocabulary of roughly 2**13 entries from the corpus.
tokenizer = _text_encoders.SubwordTextEncoder.build_from_corpus(metadata, target_vocab_size=2**13)
# sample_string = metadata[0]
# print ('Tokenized sample_string is {}'.format(sample_string))
# tokenized_string = tokenizer.encode(sample_string)
# print ('Tokenized string is {}'.format(tokenized_string))

In [None]:
# bert_layer = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_multi_cased_L-12_H-768_A-12/2",trainable=True)
# vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
# do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
# tokenizer = bert.tokenization.FullTokenizer(vocab_file, do_lower_case)

In [None]:
def encode_sentence(s, tokenizer):
    """Encode one string and terminate it with a separator id.

    The separator is not looked up in the vocabulary; it is the synthetic id
    ``tokenizer.vocab_size + 2`` appended after the encoded tokens.

    Parameters
    ----------
    s : str
        Text to encode.
    tokenizer : object
        Encoder offering ``encode(text)`` (returning a list of ids) and a
        ``vocab_size`` attribute.

    Returns
    -------
    list
        The ids returned by ``tokenizer.encode(s)`` followed by the separator id.
    """
    token_ids = tokenizer.encode(s)
    token_ids += [tokenizer.vocab_size + 2]
    return token_ids

def bert_encode(glue_dict, tokenizer):
    """Turn context/question pairs into BERT-style input tensors.

    Every sample is laid out as ``[CLS] context [SEP] question [SEP]`` where
    ``[CLS]`` is the synthetic id ``tokenizer.vocab_size`` and ``[SEP]`` is the
    id appended by ``encode_sentence``. The ragged per-sample sequences are
    converted to dense tensors with ``to_tensor()``, which pads shorter rows.

    Parameters
    ----------
    glue_dict : mapping
        Must provide the entries ``"context"`` and ``"question"``, each a
        sequence of strings of equal length (e.g. DataFrame columns).
    tokenizer : object
        Encoder passed on to ``encode_sentence``; its ``vocab_size`` is read.

    Returns
    -------
    dict
        ``input_word_ids`` (token ids), ``input_mask`` (1 for real tokens) and
        ``input_type_ids`` (0 for ``[CLS]`` + context, 1 for the question),
        all dense ``tf.int64`` tensors.
    """
    def _encode_column(column):
        # One ragged row of token ids per string of the column.
        return tf.ragged.constant(
            [encode_sentence(text, tokenizer) for text in np.array(glue_dict[column])])

    context_ids = _encode_column("context")
    question_ids = _encode_column("question")

    # One single-token [CLS] row per sample.
    cls_ids = [[tokenizer.vocab_size]] * context_ids.shape[0]
    word_ids = tf.concat([cls_ids, context_ids, question_ids], axis=-1)

    segment_ids = tf.concat(
        [tf.zeros_like(cls_ids), tf.zeros_like(context_ids), tf.ones_like(question_ids)],
        axis=-1,
    )

    return {
        'input_word_ids': tf.cast(word_ids.to_tensor(), dtype=tf.int64),
        'input_mask': tf.cast(tf.ones_like(word_ids).to_tensor(), dtype=tf.int64),
        'input_type_ids': tf.cast(segment_ids.to_tensor(), dtype=tf.int64),
    }

def bert_encode_label(glue_dict):
    """Pack the three label columns into one integer tensor.

    Parameters
    ----------
    glue_dict : mapping
        Must provide the entries ``'answer_start'``, ``'answer_end'`` and
        ``'score'``, each a one-dimensional sequence of numbers.

    Returns
    -------
    tf.Tensor
        ``tf.int64`` tensor with one row per sample and the columns
        ``[answer_start, answer_end, score]`` in that order.
    """
    label_columns = [
        np.expand_dims(glue_dict[name], axis=-1)
        for name in ('answer_start', 'answer_end', 'score')
    ]
    return tf.cast(tf.concat(label_columns, axis=-1), dtype=tf.int64)
        
# Encode the segment-level samples and report the shape of every input tensor.
# `input_shape` ends up holding the shape of the last tensor that was listed.
input_shape = ()
glue_train = bert_encode(restructure_data, tokenizer)
for name, tensor in glue_train.items():
    input_shape = tensor.shape
    print(f'{name:15s} shape: {tensor.shape}')

# Encode the matching labels and report their shape as well.
glue_train_labels = bert_encode_label(restructure_data)
print(f'glue_train_labels shape: {glue_train_labels.shape}')

In [None]:
BUFFER_SIZE = 40000  # shuffle buffer size
BATCH_SIZE = 2       # samples per training batch

# Slice the encoded inputs and labels into per-sample pairs, cache them, then
# shuffle and batch.
train_dataset = (
    tf.data.Dataset.from_tensor_slices((glue_train, glue_train_labels))
    .cache()
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE)
)

# Pull one batch and show it as a sanity check.
train_inp, train_tar = next(iter(train_dataset))
train_inp, train_tar

In [None]:
class TransfoXLQA(tf.keras.Model):
    """Transformer-XL encoder with three dense heads (start, end, score).

    The encoder is built from a default ``TransfoXLConfig``; no pretrained
    weights are loaded here. Its last hidden states are averaged over the
    sequence axis and fed to three independent ``Dense`` layers without
    activation, each producing ``input_shape[-1]`` logits.

    Parameters
    ----------
    input_shape : sequence of int
        Shape of the encoded input tensors; only its last entry is used, as
        the number of units of every head.
    """

    def __init__(self, input_shape):
        super().__init__()
        self.TransfoXL = TFTransfoXLModel(TransfoXLConfig())
        self.gapool1d = tf.keras.layers.GlobalAveragePooling1D()

        head_units = input_shape[-1]
        self.start = tf.keras.layers.Dense(head_units)
        self.end = tf.keras.layers.Dense(head_units)
        self.score = tf.keras.layers.Dense(head_units)

    def call(self, input_word_ids, input_mask, input_type_ids, training=True):
        """Run the encoder and the three heads.

        Parameters
        ----------
        input_word_ids : tf.Tensor
            Token ids, passed to the encoder as ``input_ids``.
        input_mask, input_type_ids : tf.Tensor
            Accepted for interface compatibility; they are not used.
        training : bool, optional
            Forwarded to the encoder. Defaults to ``True`` so existing callers
            that do not pass it keep the previous behaviour; pass ``False``
            when running inference.

        Returns
        -------
        tf.Tensor
            Logits stacked along axis 1 in the order ``[start, end, score]``,
            i.e. one row of ``input_shape[-1]`` logits per head and sample.
        """
        outputs = self.TransfoXL({'input_ids': input_word_ids}, training=training)
        # Average the hidden states over the sequence axis (padding included,
        # since no mask is handed to the pooling layer).
        pool_out = self.gapool1d(outputs.last_hidden_state)

        head_logits = [self.start(pool_out), self.end(pool_out), self.score(pool_out)]
        return tf.stack(head_logits, axis=1)

In [None]:
# Build the model; `input_shape` was set while the encoded inputs were listed
# and its last entry becomes the width of each output head.
transfoXLQA = TransfoXLQA(input_shape)

# Number of full passes over the training dataset.
epochs = 3

# creates an optimizer with learning rate schedule
class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
    """Learning-rate schedule with warm-up followed by inverse-sqrt decay.

    ``lr(step) = d_model**-0.5 * min(step**-0.5, step * warmup_steps**-1.5)``

    Parameters
    ----------
    d_model : int or float
        Scaling dimension; the rate is multiplied by ``d_model**-0.5``.
    warmup_steps : int, optional
        Step at which the two branches of the minimum meet.
    """

    def __init__(self, d_model, warmup_steps=4000):
        super().__init__()
        self.d_model = tf.cast(d_model, tf.float32)
        self.warmup_steps = warmup_steps

    def __call__(self, step):
        # Optimizers may pass the step as an integer tensor, which
        # `tf.math.rsqrt` does not accept; a float step is left unchanged.
        step = tf.cast(step, tf.float32)
        arg1 = tf.math.rsqrt(step)
        arg2 = step * (self.warmup_steps ** -1.5)

        return tf.math.rsqrt(self.d_model) * tf.math.minimum(arg1, arg2)

# Adam driven by the warm-up schedule defined above.
learning_rate = CustomSchedule(d_model=1024)
optimizer = tf.keras.optimizers.Adam(
    learning_rate=learning_rate,
    beta_1=0.9,
    beta_2=0.98,
    epsilon=1e-9,
)

# Targets are integer class ids and the model heads emit raw logits.
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

# Running averages that the training loop reports and resets per epoch.
train_loss = tf.keras.metrics.Mean(name='train_loss')
train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(name='train_accuracy')

In [None]:
# Four rank-2 int64 tensors with free dimensions: word ids, mask, type ids and
# labels. A fixed signature lets tf.function accept batches whose sizes vary.
train_step_signature = [
    tf.TensorSpec(shape=(None, None), dtype=tf.int64) for _ in range(4)
]

@tf.function(input_signature=train_step_signature)
def train_step(input_word_ids, input_mask, input_type_ids, tar):
    """Run one optimisation step on a batch and update the running metrics.

    Only the third label column (``tar[:, 2]``, the score) and the third model
    output (``predictions[:, 2, :]``) take part in the loss; the start/end
    columns and heads are not trained here.

    Parameters
    ----------
    input_word_ids, input_mask, input_type_ids : tf.Tensor
        Encoded inputs of the batch, handed to the model unchanged.
    tar : tf.Tensor
        Labels of the batch with the columns ``[answer_start, answer_end, score]``.
    """
    score_labels = tar[:, 2]
    with tf.GradientTape() as tape:
        score_logits = transfoXLQA(input_word_ids, input_mask, input_type_ids)[:, 2, :]
        loss = loss_fn(score_labels, score_logits)

    trainable = transfoXLQA.trainable_variables
    optimizer.apply_gradients(zip(tape.gradient(loss, trainable), trainable))

    train_loss(loss)
    train_accuracy(score_labels, score_logits)

In [None]:
# Train for `epochs` passes over the dataset. The running loss/accuracy are
# reset at the start of every epoch, reported every 50 batches and once more
# when the epoch ends, together with the elapsed wall-clock time.
for epoch in range(epochs):
    start = time.time()

    for metric in (train_loss, train_accuracy):
        metric.reset_states()

    for batch, (inp, tar) in enumerate(train_dataset):
        train_step(
            inp['input_word_ids'],
            inp['input_mask'],
            inp['input_type_ids'],
            tar,
        )

        if not batch % 50:
            print(f'Epoch {epoch + 1} Batch {batch} Loss {train_loss.result()} Accuracy {train_accuracy.result()}')

    print(f'Epoch {epoch + 1} Loss {train_loss.result()} Accuracy {train_accuracy.result()}')
    print(f'Time taken for 1 epoch: {time.time() - start} secs\n')