In [None]:
import os 

# Choose between inference and training mode.
# Inference mode is selected only when the pre-trained model dataset is
# attached AND actually contains files; otherwise the notebook trains first.
try:
    tweet_models_dir = os.listdir('/kaggle/input/albert-xl-300/')
except OSError:
    # Dataset not attached (or not readable): nothing to run inference with.
    tweet_models_dir = []

is_inference_flag = len(tweet_models_dir) > 0

In [None]:
# Report the selected mode: True = load the attached model, False = clone, prepare data and train first.
print('Inference flag status :', is_inference_flag)

In [None]:
# Training mode only: fetch the repository that provides the run_squad.py
# script invoked by the training cell further down.
if not is_inference_flag:
    !git clone https://github.com/AnandAwasthi/Closed-domain-Question-Answering-fine-tune-Albert
    

In [None]:

import pandas as pd
import numpy as np
import json
from sklearn.model_selection import train_test_split

In [None]:
# Competition inputs. Later cells read the columns textID, text, sentiment
# (and selected_text for the training data) from these frames.
train_df = pd.read_csv('/kaggle/input/tweet-sentiment-extraction/train.csv')
test_df = pd.read_csv('/kaggle/input/tweet-sentiment-extraction/test.csv')
# Sample submission file; used later when building the submission.
sub_df = pd.read_csv('/kaggle/input/tweet-sentiment-extraction/sample_submission.csv')



In [None]:
def jaccard(str1, str2):
    """Word-level Jaccard similarity between two strings.

    Both strings are lower-cased and split on whitespace; the score is
    ``|A ∩ B| / |A ∪ B|`` over the resulting sets of *unique* words.

    Args:
        str1: First string.
        str2: Second string.

    Returns:
        float: Similarity in ``[0.0, 1.0]``. Two strings without any words
        (empty or whitespace-only) are treated as identical and score ``1.0``.
    """
    a = set(str1.lower().split())
    b = set(str2.lower().split())
    c = a & b
    # |A ∪ B| computed from set sizes; using the raw word lists here would
    # count repeated words several times and deflate the score.
    union_size = len(a) + len(b) - len(c)
    if union_size == 0:
        # Neither string has a word: avoid dividing by zero.
        return 1.0
    return float(len(c)) / union_size

In [None]:
# Remove rows with any missing value, in place. The QA conversion below skips
# rows whose text/answer/question is not a string anyway.
train_df.dropna(inplace=True)

In [None]:
# Hold out 10% of the labelled rows for validation; random_state is fixed so
# the split is the same on every run.
X_train, X_test = train_test_split(train_df,test_size=0.10, random_state=42)

In [None]:
# Convert the three frames to NumPy arrays so the QA converters can read each
# row by position (row[0], row[1], row[2], row[-1]).
train, val, test = (np.array(frame) for frame in (X_train, X_test, test_df))
use_cuda = True

In [None]:
# Working directories: data/ receives the generated JSON files and
# data/models/albert is the --output_dir of the training command.
!mkdir -p data
!mkdir -p data/models/albert

In [None]:
%%time

"""
Prepare training data in QA-compatible format
"""

# Adapted from https://www.kaggle.com/cheongwoongkang/roberta-baseline-starter-simple-postprocessing
def find_all(input_str, search_str):
    """Return the start index of every occurrence of ``search_str`` in ``input_str``.

    Matches may overlap, because each new search begins one character after
    the previous hit. The result is in ascending order and is empty when
    there is no match.
    """
    hits = []
    cursor, end = 0, len(input_str)
    while cursor < end:
        pos = input_str.find(search_str, cursor)
        if pos < 0:
            break
        hits.append(pos)
        cursor = pos + 1
    return hits

def do_qa_train(train):
    """Convert labelled rows into a SQuAD-style training dictionary.

    Rows are read by position: ``line[0]`` is the question id, ``line[1]`` the
    context, ``line[2]`` the answer span and ``line[-1]`` the question.
    Context and answer text are stored lower-cased.

    Args:
        train: Iterable of indexable rows (for example the 2-D array built
            from the training DataFrame).

    Returns:
        dict: ``{'version': 'v1.0', 'data': [{'title': 'None',
        'paragraphs': [...]}]}`` with one paragraph, holding one question,
        per usable row. Rows whose context, answer or question is not a
        string are printed and skipped.
    """
    paragraphs = []
    for line in train:
        qid, context, answer, question = line[0], line[1], line[2], line[-1]
        if not all(isinstance(value, str) for value in (context, answer, question)):
            print(context, type(context))
            print(answer, type(answer))
            print(question, type(question))
            continue

        answers = []
        # Only the first occurrence of the answer is used as the gold span.
        # An empty context never yields a match.
        start = context.find(answer) if context else -1
        if start != -1:
            # The stored context is lower-cased, and str.lower() can change
            # the length of some characters (e.g. 'İ'), so the offset is
            # translated into the lower-cased text it will be used against.
            lowered_start = len(context[:start].lower())
            answers.append({'answer_start': lowered_start, 'text': answer.lower()})
        # NOTE(review): when the answer is not found, the question is still
        # emitted with an empty answer list and is_impossible=False —
        # confirm the training script tolerates that.
        qas = [{'question': question, 'id': qid, 'is_impossible': False, 'answers': answers}]
        paragraphs.append({'context': context.lower(), 'qas': qas})

    return {'version': 'v1.0', 'data': [{'title': 'None', 'paragraphs': paragraphs}]}

# Training mode only: build the SQuAD-style train/validation sets and write
# them where the training command expects to find them.
if not is_inference_flag:
    qa_X_train, qa_X_test = do_qa_train(train), do_qa_train(val)

    for json_path, payload in (('data/train.json', qa_X_train), ('data/val.json', qa_X_test)):
        with open(json_path, 'w') as outfile:
            json.dump(payload, outfile)

In [None]:
%%time

"""
Prepare testing data in QA-compatible format
"""


def convert_test_qa_json(test):
    """Convert unlabelled rows into a SQuAD-style dictionary.

    Rows are read by position: ``line[0]`` is the question id, ``line[1]`` the
    context and ``line[-1]`` the question. No real answer is available, so
    every question carries one placeholder answer.

    Args:
        test: Iterable of indexable rows (for example the 2-D array built
            from the test DataFrame).

    Returns:
        dict: ``{'version': 'v1.0', 'data': [{'title': 'None',
        'paragraphs': [...]}]}`` with one paragraph per usable row. Rows
        whose context or question is not a string are printed and skipped.
    """
    paragraphs = []
    for line in test:
        qid, context, question = line[0], line[1], line[-1]
        if not (isinstance(context, str) and isinstance(question, str)):
            # Only context and question exist here; printing an answer would
            # reference an undefined name and raise NameError.
            print(context, type(context))
            print(question, type(question))
            continue

        # Placeholder answer; the start offset is deliberately out of range.
        answers = [{'answer_start': 1000000, 'text': '__None__'}]
        qas = [{'question': question, 'id': qid, 'is_impossible': False, 'answers': answers}]
        paragraphs.append({'context': context.lower(), 'qas': qas})

    return {'version': 'v1.0', 'data': [{'title': 'None', 'paragraphs': paragraphs}]}

# Convert the test rows and store them next to the train/validation files.
qa_test = convert_test_qa_json(test)

with open('data/test.json', 'w') as outfile:
    json.dump(qa_test, outfile)


# <a name="Training" id="3"></a> 3. Model Training ....

Using the output of my training kernel to save GPU time.<br>
To train instead, run the notebook without the pre-trained model attached so that the training command below is executed.

In [None]:
# Training mode only: fine-tune ALBERT with the cloned run_squad.py script.
# Input is data/train.json (training) and data/val.json (evaluation); the
# starting checkpoint is albert-base-v2 and the result goes to
# data/models/albert.
# NOTE(review): training uses --max_seq_length 192 / --doc_stride 64, while the
# inference cell builds features with 300 / 128 — confirm this is intended.
if not is_inference_flag:
    !export SQUAD_DIR=data \
    && python Closed-domain-Question-Answering-fine-tune-Albert/bsie/transformers/examples/run_squad.py \
      --model_type albert \
      --model_name_or_path albert-base-v2 \
      --do_train \
      --do_eval \
      --do_lower_case \
      --train_file $SQUAD_DIR/train.json \
      --predict_file $SQUAD_DIR/val.json \
      --per_gpu_train_batch_size 12 \
      --learning_rate 5e-5 \
      --num_train_epochs 1.0 \
      --max_seq_length 192 \
      --doc_stride 64 \
      --output_dir $SQUAD_DIR/models/albert \
      --save_steps 100000 \
      --threads 4 \
      --version_2_with_negative \
      --overwrite_output_dir

# <a name="Infer" id="4"></a> 4. Model Inference

In [None]:
import os
import torch
import time
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from collections import OrderedDict
from transformers import (
    AutoConfig,
    AutoModelForQuestionAnswering,
    AutoTokenizer,
    squad_convert_examples_to_features
)

from transformers.data.processors.squad import SquadResult, SquadV2Processor, SquadExample

from transformers.data.metrics.squad_metrics import compute_predictions_logits

# Where the fine-tuned model is loaded from:
#  - training mode: the directory the training command writes to
#    (--output_dir $SQUAD_DIR/models/albert with SQUAD_DIR=data);
#  - inference mode: the attached dataset with a previously trained model.
if not is_inference_flag:
    model_name_or_path = "data/models/albert"
else:
    model_name_or_path = '/kaggle/input/albert-xl-300/'

output_dir = ""

# Config — these values are passed to compute_predictions_logits.
n_best_size = 1
max_answer_length = 254
do_lower_case = True
null_score_diff_threshold = 0.0

def to_list(tensor):
    """Return the tensor's values as a (nested) Python list.

    The tensor is detached and moved to the CPU before conversion.
    """
    host_copy = tensor.detach().cpu()
    return host_copy.tolist()

# Load config, tokenizer and model from the selected directory.
config_class = AutoConfig
model_class = AutoModelForQuestionAnswering
tokenizer_class = AutoTokenizer

config = config_class.from_pretrained(model_name_or_path)
tokenizer = tokenizer_class.from_pretrained(model_name_or_path, do_lower_case=True)
model = model_class.from_pretrained(model_name_or_path, config=config)

# Use the GPU when one is available, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

model.to(device)

processor = SquadV2Processor()

def run_prediction(question_texts, context_text, max_seq_length=300, doc_stride=128,
                   max_query_length=64, batch_size=10):
    """Predict an answer span in ``context_text`` for each question.

    If the first question is ``'neutral'`` the model is not called and the
    whole context is returned as the single prediction under key ``0``.
    Otherwise one SQuAD example is built per question (``qas_id`` is the
    question's position as a string), the module-level ``model`` and
    ``tokenizer`` are run over them, and the logits are decoded with
    ``compute_predictions_logits``.

    Args:
        question_texts: Non-empty sequence of question strings.
        context_text: Text in which the answer is searched.
        max_seq_length: Forwarded to ``squad_convert_examples_to_features``.
        doc_stride: Forwarded to ``squad_convert_examples_to_features``.
        max_query_length: Forwarded to ``squad_convert_examples_to_features``.
        batch_size: Batch size of the evaluation ``DataLoader``.

    Returns:
        Mapping from prediction key to predicted text. For non-neutral input
        this is whatever ``compute_predictions_logits`` returns (presumably
        keyed by ``qas_id`` — confirm against the library).

    Note:
        The names ``predictions.json``, ``nbest_predictions.json`` and
        ``null_predictions.json`` are passed as output paths, so those files
        are presumably rewritten in the working directory on every
        non-neutral call.
    """
    if question_texts[0] == 'neutral':
        return OrderedDict([(0, context_text)])

    examples = [
        SquadExample(
            qas_id=str(position),
            question_text=question_text,
            context_text=context_text,
            answer_text=None,
            start_position_character=None,
            title="Predict",
            is_impossible=False,
            answers=None,
        )
        for position, question_text in enumerate(question_texts)
    ]

    features, dataset = squad_convert_examples_to_features(
        examples=examples,
        tokenizer=tokenizer,
        max_seq_length=max_seq_length,
        doc_stride=doc_stride,
        max_query_length=max_query_length,
        is_training=False,
        return_dataset="pt",
        threads=1,
    )

    eval_dataloader = DataLoader(dataset, sampler=SequentialSampler(dataset), batch_size=batch_size)

    # Switching to eval mode once is enough; it does not depend on the batch.
    model.eval()
    all_results = []

    with torch.no_grad():
        for batch in eval_dataloader:
            batch = tuple(t.to(device) for t in batch)
            outputs = model(
                input_ids=batch[0],
                attention_mask=batch[1],
                token_type_ids=batch[2],
            )
            # Some transformers versions return a dict-like output object;
            # iterating it would yield keys rather than tensors, so convert
            # it to a plain tuple when it offers to_tuple().
            if hasattr(outputs, "to_tuple"):
                outputs = outputs.to_tuple()

            # batch[3] holds, for each row, the index of its feature.
            for row, example_index in enumerate(batch[3]):
                unique_id = int(features[example_index.item()].unique_id)
                start_logits, end_logits = [to_list(per_batch[row]) for per_batch in outputs]
                all_results.append(SquadResult(unique_id, start_logits, end_logits))

    output_prediction_file = "predictions.json"
    output_nbest_file = "nbest_predictions.json"
    output_null_log_odds_file = "null_predictions.json"

    predictions = compute_predictions_logits(
        examples,
        features,
        all_results,
        n_best_size,
        max_answer_length,
        do_lower_case,
        output_prediction_file,
        output_nbest_file,
        output_null_log_odds_file,
        False,  # verbose_logging
        True,  # version_2_with_negative
        null_score_diff_threshold,
        tokenizer,
    )

    return predictions

In [None]:
%%time
# Inferring on the trained model for a Jaccard score (first 10 held-out rows)
jaccard_scores = []
predictions_x_test = []
for _, sample in X_test.head(10).iterrows():
    selected_text = sample['selected_text']
    preds_dict = run_prediction([sample['sentiment']], sample['text'])
    for predicted_text in preds_dict.values():
        predictions_x_test.append({
            'selected_text': selected_text,
            'predicted_text': predicted_text,
            'sentiment': sample['sentiment'],
            'textID': sample['textID'],
        })
        jaccard_scores.append(jaccard(selected_text, predicted_text))
print('Jaccard Score', np.mean(jaccard_scores))

predictions_x_test_df = pd.DataFrame.from_dict(predictions_x_test)


predictions_x_test_df.head()

In [None]:
# (rows, columns) of the test set that is predicted in the next cell.
test_df.shape

In [None]:
%%time
# Inferring on the trained model: one prediction per test tweet.
predictions = []

for _, row in test_df.iterrows():
    preds_dict = run_prediction([row['sentiment']], row['text'])
    for predicted_text in preds_dict.values():
        predictions.append({'textID': row['textID'], 'selected_text': predicted_text})

# Explicit columns keep the frame well-formed even if no prediction was made.
predictions_df = pd.DataFrame(predictions, columns=['textID', 'selected_text'])

# Follow the row order of the sample submission. Only its id column is merged,
# so the placeholder selected_text column does not collide with the
# predictions (which would produce selected_text_x / selected_text_y).
output_df = sub_df[['textID']].merge(predictions_df, on='textID', how='left')
output_df.to_csv('submission.csv', index=False)

output_df.head()