In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below /kaggle/input.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Preliminary Preparation

___

## Importing Dependencies

In [None]:
# data visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns

# text-processing libraries
import re
import string
import nltk
from nltk.corpus import stopwords


from transformers import pipeline

## Accelerator Detection

In [None]:
import torch

# Pick the device name: 'cuda' when torch reports a usable GPU, otherwise 'cpu'.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# Echo the choice as the cell output.
device

# Data Preparation

---

## Data Extraction

In [None]:
# Read the three competition CSV files and print the (rows, columns) shape
# of each resulting DataFrame.

# Training split.
train = pd.read_csv(
    "../input/chaii-hindi-and-tamil-question-answering/train.csv"
)
print('train set shape: ', train.shape)

# Test split.
test = pd.read_csv(
    "../input/chaii-hindi-and-tamil-question-answering/test.csv"
)
print('Test set shape: ', test.shape)

# Sample submission file.
sample_submission = pd.read_csv(
    "../input/chaii-hindi-and-tamil-question-answering/sample_submission.csv"
)
print('Sample submission set shape: ', sample_submission.shape)

In [None]:
# Preview the first rows of the training frame.
train.head()

In [None]:
# Print the column summary (dtypes and non-null counts) of the training frame.
train.info()

In [None]:
# Preview the first rows of the test frame.
test.head()

In [None]:
# Print the column summary (dtypes and non-null counts) of the test frame.
test.info()

In [None]:
# Preview the first rows of the sample submission frame.
sample_submission.head()

# Regular Text Processing

---

## Text Cleaning

In [None]:
def text_cleaner(text):
    """Normalise a raw text string before tokenisation.

    Steps, applied in order:
      1. lowercase the text;
      2. drop anything enclosed in square brackets;
      3. drop http(s) and www links;
      4. drop anything enclosed in angle brackets;
      5. drop ASCII punctuation (``string.punctuation``);
      6. turn newlines into spaces;
      7. drop words that contain a digit.

    Args:
        text (str): Text to clean.

    Returns:
        str: The cleaned text. Whitespace is not collapsed, so removed
        spans may leave runs of spaces behind.
    """
    text = text.lower()
    # Raw strings keep the regex escapes from being parsed as (invalid)
    # Python string escapes.
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    # Replace newlines with a space rather than nothing: deleting them glued
    # the last word of a line onto the first word of the next line.
    text = re.sub(r'\n', ' ', text)
    text = re.sub(r'\w*\d\w*', '', text)
    return text

## Word Tokenization

In [None]:
def word_tokenizer(text):
    """Clean ``text`` and return its word tokens joined by single spaces.

    The text is first passed through ``text_cleaner`` and then split into
    tokens with an NLTK ``RegexpTokenizer``. No stop-word filtering is
    applied.

    Args:
        text (str): Raw text to tokenise.

    Returns:
        str: The tokens joined with a single space.
    """
    # Python's `\w` matches letters, digits and '_' but not combining marks,
    # so a bare r'\w+' cuts Hindi/Tamil words at every vowel sign or virama
    # and silently drops those marks. The Devanagari block (minus the danda
    # and double danda, U+0964-U+0965) and the Tamil block are therefore
    # added explicitly so that whole words survive tokenisation.
    tokenizer = nltk.tokenize.RegexpTokenizer(
        r'[\w\u0900-\u0963\u0966-\u097F\u0B80-\u0BFF]+'
    )
    cleaned = text_cleaner(text)
    tokens = tokenizer.tokenize(cleaned)
    return ' '.join(tokens)

## Text Processor

In [None]:
# Add the tokenised context and two size statistics to the training frame.
# NOTE(review): `text_len` is measured on the tokenised text, not on the raw
# `context`, although later plots title it "Original Text Length".
train['tokenized_text'] = train['context'].map(str).map(word_tokenizer)
train['text_len'] = train['tokenized_text'].astype(str).map(len)
train['text_word_count'] = train['tokenized_text'].map(
    lambda tokens: len(str(tokens).split())
)

# Show the raw context next to the derived columns for the first ten rows.
preview_columns = ['context', 'tokenized_text', 'text_len', 'text_word_count']
train[preview_columns].head(10)

In [None]:
# Add the tokenised context and two size statistics to the test frame,
# mirroring the columns derived for the training frame.
test['tokenized_text'] = test['context'].map(str).map(word_tokenizer)
test['text_len'] = test['tokenized_text'].astype(str).map(len)
test['text_word_count'] = test['tokenized_text'].map(
    lambda tokens: len(str(tokens).split())
)

# Show the raw context next to the derived columns.
preview_columns = ['context', 'tokenized_text', 'text_len', 'text_word_count']
test[preview_columns]

# Data Visualization

---

## Pie Chart

In [None]:
# Count the non-null entries of every column, separately for each language.
train_groupby = train.groupby(['language']).count()

train_groupby

In [None]:
# Display the seaborn 'pastel' palette; the pie chart picks its colours from it by index.
sns.color_palette('pastel')

In [None]:
# Build the palette once; every colour in this figure is taken from it by index.
pastel = sns.color_palette('pastel')

# Text styling passed to the pie call via `textprops`.
textprops = {
    'horizontalalignment': 'center',
    'verticalalignment': 'top',
    'rotation': 0,
    'rotation_mode': 'anchor',
    'size': 14,
    'color': pastel[-5],
}

# Square figure with a single axes for the chart.
fig = plt.figure(figsize = (6, 6))
ax = fig.gca()

# One wedge per language, sized by the per-language count of the `id` column.
ax.pie(
    x = train_groupby.id,
    labels = train_groupby.index,
    colors = pastel[2 : 3] + pastel[-1 : ],
    autopct = '%.2f%%',
    explode = [0.02, 0.02],
    startangle = 90,
    pctdistance = 0.4,
    labeldistance = 1.2,
    textprops = textprops,
)

# Legend placed through `bbox_to_anchor`.
# NOTE(review): the labels are hard-coded, so they assume the wedge order is
# hindi then tamil - confirm against `train_groupby.index`.
legend = ax.legend(
    title = 'Distinguish Samples by Language - Pie Chart',
    title_fontsize = 'x-large',
    bbox_to_anchor = (0, -0.15, 0.5, 0.5),
    labels = ['HINDI', 'TAMIL'],
    labelcolor = pastel[-5],
    fontsize = 'large',
    facecolor = '#F6F8ED',
    edgecolor = pastel[1],
)

# Colour the legend title separately from its entries.
legend.get_title().set_color(pastel[3])

# A white disc over the centre turns the pie into a donut.
centre_circle = plt.Circle((0, 0), 0.70, fc = 'white')
ax.add_artist(centre_circle)

plt.show()

## Histogram Charts

In [None]:
# Display the seaborn 'Paired' palette; the histograms pick their colours from it by index.
sns.color_palette('Paired')

In [None]:
# Build the palette once; colours below are taken from it by index.
paired = sns.color_palette('Paired')

plt.figure(figsize = (12, 6))

# Global seaborn style, set before histplot draws on the current axes.
sns.set_style('whitegrid')

# Per-language histogram (with KDE overlay) of the `text_len` column.
ax = sns.histplot(
    data = train,
    x = train.text_len.sort_values(ascending = False),
    hue = 'language',
    bins = 100,
    kde = True,
    element = 'step',
    palette = paired[9 : 10] + paired[1 : 2],
    legend = True,
)

ax.set_xlabel('text length')
ax.set_ylabel('count')
ax.set_title('Original Text Length Distribution')

# NOTE(review): the legend labels are hard-coded, so they depend on the order
# in which the artists were drawn - confirm they match the colours.
legend = ax.legend(
    title = 'Distinguish Samples by Language - Histogram Charts',
    title_fontsize = 12,
    fontsize = 10,
    loc = 'upper right',
    shadow = True,
    facecolor = 'white',
    labels = ['HINDI', 'TAMIL'],
    labelcolor = paired[0],
)

# Colour the legend title separately from its entries.
legend.get_title().set_color(paired[-4])

plt.show()

In [None]:
# Build the palette once; colours below are taken from it by index.
paired = sns.color_palette('Paired')

# `plt.figure` returns a Figure, so it is no longer bound to the name `ax`;
# `ax` is only ever the Axes returned by histplot.
plt.figure(figsize = (12, 6))

# Global seaborn style, set before histplot draws on the current axes.
sns.set_style('darkgrid')

# Per-language histogram (with KDE overlay) of the `text_word_count` column.
ax = sns.histplot(
    data = train,
    x = train.text_word_count.sort_values(ascending = False),
    hue = 'language',
    bins = 100,
    kde = True,
    element = 'step',
    palette = paired[5 : 6] + paired[7 : 8],
)

ax.set_xlabel('tokenized text length')
ax.set_ylabel('count')
ax.set_title('Tokenize Text Length Distribution')

# NOTE(review): the legend labels are hard-coded, so they depend on the order
# in which the artists were drawn - confirm they match the colours.
legend = ax.legend(
    title = 'Distinguish Samples by Language - Histogram Charts',
    title_fontsize = 12,
    fontsize = 10,
    loc = 'upper right',
    shadow = True,
    facecolor = 'white',
    labels = ['HINDI', 'TAMIL'],
    labelcolor = paired[4],
)

# Colour the legend title separately from its entries.
legend.get_title().set_color(paired[6])

plt.show()

In [None]:
# Build the palette once; colours below are taken from it by index.
paired = sns.color_palette('Paired')

# Two panels in one row that share the y axis.
fig, axes = plt.subplots(1, 2, figsize = (20, 5), sharey = True)
fig.suptitle('Text Length Distribution')
left_ax, right_ax = axes

# Options common to both histograms and to both legends.
hist_options = dict(
    data = train,
    hue = 'language',
    bins = 100,
    kde = True,
    element = 'step',
)
legend_options = dict(
    fontsize = 10,
    loc = 'upper right',
    title = 'Distinguish Samples by Language - Histogram Charts',
    title_fontsize = 12,
    shadow = True,
    facecolor = 'white',
    labels = ['HINDI', 'TAMIL'],
)

# Left panel: `text_len` per language.
sns.histplot(
    ax = left_ax,
    x = train.text_len.sort_values(ascending = False),
    palette = paired[9 : 10] + paired[1 : 2],
    legend = True,
    **hist_options,
)
left_ax.set_xlabel('text length')
left_ax.set_ylabel('count')
left_ax.set_title('Original Text Length Distribution')

legend_1 = left_ax.legend(labelcolor = paired[0], **legend_options)
legend_1.get_title().set_color(paired[-4])

# Right panel: `text_word_count` per language.
sns.histplot(
    ax = right_ax,
    x = train.text_word_count.sort_values(ascending = False),
    palette = paired[5 : 6] + paired[7 : 8],
    **hist_options,
)
right_ax.set_xlabel('tokenized text length')
right_ax.set_ylabel('count')
right_ax.set_title('Tokenize Text Length Distribution')

legend_2 = right_ax.legend(labelcolor = paired[4], **legend_options)
legend_2.get_title().set_color(paired[6])

plt.show()

In [None]:
# Build the palette once; colours below are taken from it by index.
paired = sns.color_palette('Paired')

fig = plt.figure(figsize = (15, 12))
title = fig.suptitle(
    'Text Length Distribution',
    fontsize = 'xx-large',
    weight = 'heavy',
    color = paired[-1],
)

# Two rows, one column; the lower axes shares its x axis with the upper one.
gs = fig.add_gridspec(2, 1)

# Options common to both histograms and to both legends.
hist_options = dict(
    data = train,
    hue = 'language',
    bins = 100,
    kde = True,
    element = 'step',
)
legend_options = dict(
    fontsize = 10,
    loc = 'upper right',
    title = 'Distinguish Samples by Language - Histogram Charts',
    title_fontsize = 12,
    shadow = True,
    facecolor = 'white',
    labels = ['HINDI', 'TAMIL'],
)

# Upper panel: `text_len` per language, created and drawn under 'whitegrid'.
with sns.axes_style('whitegrid'):
    ax_0_0 = fig.add_subplot(gs[0, 0])
    sns.histplot(
        ax = ax_0_0,
        x = train.text_len.sort_values(ascending = False),
        palette = paired[9 : 10] + paired[1 : 2],
        legend = True,
        **hist_options,
    )

# Bold axis labels and title.
ax_0_0.set_xlabel('text length', fontweight = 'bold')
ax_0_0.set_ylabel('count', fontweight = 'bold')
ax_0_0.set_title(
    'Original Text Length Distribution',
    fontweight = 'bold',
    fontsize = 'large',
)

legend_0_0 = ax_0_0.legend(labelcolor = paired[0], **legend_options)
legend_0_0.get_title().set_color(paired[-4])

# Lower panel: `text_word_count` per language, created and drawn under 'darkgrid'.
with sns.axes_style('darkgrid'):
    ax_1_0 = fig.add_subplot(gs[1, 0], sharex = ax_0_0)
    sns.histplot(
        ax = ax_1_0,
        x = train.text_word_count.sort_values(ascending = False),
        palette = paired[5 : 6] + paired[7 : 8],
        **hist_options,
    )

# Bold axis labels and title.
ax_1_0.set_xlabel('tokenized text length', fontweight = 'bold')
ax_1_0.set_ylabel('count', fontweight = 'bold')
ax_1_0.set_title(
    'Tokenize Text Length Distribution',
    fontweight = 'bold',
    fontsize = 'large',
)

legend_1_0 = ax_1_0.legend(labelcolor = paired[4], **legend_options)
legend_1_0.get_title().set_color(paired[6])

plt.show()

# Baseline Model Usage

---

## The Most Basic Usage

### Model `bert-base-multilingual-cased-finetuned-squad`

In [None]:
# Checkpoint directory, used for both the model weights and the tokenizer.
model = "../input/bertbasemultilingualcasedfinetunedsquad/bert-base-multilingual-cased-finetuned-squad"

# `pipeline` takes a device ordinal: 0 is the first CUDA GPU, -1 is the CPU.
# A hard-coded 0 fails on a session without a GPU, so fall back to the CPU.
pipeline_device = 0 if torch.cuda.is_available() else -1
qna = pipeline('question-answering', model = model, tokenizer = model, device = pipeline_device)

# One predicted answer string per test row, in test-frame order.
predictions = [
    qna(context = context, question = question)["answer"]
    for question, context in test[["question", "context"]].to_numpy()
]

In [None]:
# Pair each test id with its predicted answer and write the result to CSV
# without the index column.
submission = pd.DataFrame({
    'id': test['id'],
    'PredictionString': predictions,
})
submission.to_csv("submission_1.csv", index = None)

submission.head()

### Model `xlm-roberta-squad2`

In [None]:
# Checkpoint directory, used for both the model weights and the tokenizer.
model = "../input/xlm-roberta-squad2/deepset/xlm-roberta-base-squad2"

# `pipeline` takes a device ordinal: 0 is the first CUDA GPU, -1 is the CPU.
# A hard-coded 0 fails on a session without a GPU, so fall back to the CPU.
pipeline_device = 0 if torch.cuda.is_available() else -1
qna = pipeline('question-answering', model = model, tokenizer = model, device = pipeline_device)

# One predicted answer string per test row, in test-frame order.
predictions = [
    qna(context = context, question = question)["answer"]
    for question, context in test[["question", "context"]].to_numpy()
]

In [None]:
# Pair each test id with its predicted answer and write the result to CSV
# without the index column.
submission = pd.DataFrame({
    'id': test['id'],
    'PredictionString': predictions,
})
submission.to_csv("submission_2.csv", index = None)

submission.head()

In [None]:
# Checkpoint directory, used for both the model weights and the tokenizer.
model = "../input/xlm-roberta-squad2/deepset/xlm-roberta-large-squad2"

# `pipeline` takes a device ordinal: 0 is the first CUDA GPU, -1 is the CPU.
# A hard-coded 0 fails on a session without a GPU, so fall back to the CPU.
pipeline_device = 0 if torch.cuda.is_available() else -1
qna = pipeline('question-answering', model = model, tokenizer = model, device = pipeline_device)

# One predicted answer string per test row, in test-frame order.
predictions = [
    qna(context = context, question = question)["answer"]
    for question, context in test[["question", "context"]].to_numpy()
]

In [None]:
# Pair each test id with its predicted answer and write the result to CSV
# without the index column.
submission = pd.DataFrame({
    'id': test['id'],
    'PredictionString': predictions,
})
submission.to_csv("submission.csv", index = None)

submission.head()