# Introduction

This is based on my Distilbert Question Answering model. Consider this a proof-of-concept model where the problem was converted into a SQuAD v2-like dataset and a Question Answering model was used for prediction. Training was only performed for 3 epochs; the loss was 0.89 and still going down (until I ran out of GPU hours).

The model uses Roberta and uses code to seamlessly convert the long input sequences with strides. Distilbert score was 0.26 and Roberta came out @ 0.4 where further training and code improvement should help.

https://www.kaggle.com/aliasgherman/question-answer-approach-distilbert-1-epoch

* This code converts the problem to a Question/Answer model and uses **strides** in order to be able to use smaller models (models which have input sequence limitations of 512 or lower).

---

# Reference
The only resource you will need to understand the hocus pocus below

https://github.com/huggingface/notebooks/blob/master/examples/question_answering.ipynb

# Change History

* V7 [LB 0.402] Original Roberta-Base Version, Strides = 128, MAX_LEN = 384
* V8 [LB 0.404] Original Roberta-Base Version, Strides = 128, MAX_LEN = 384 with Filter by rules initial version
* V9 [ ? ] Lets try to train Roberta-Base with Strides = 128, MAX_LEN = 384
* V10 [ 0.453 ] Properly introduce the Corrected_train fields. Strides = 128+64 and MAX_LEN = 480 with Filter by rules

# Version **10 and 11, 12** are inference only notebooks. Please refer to V5 till V9 for your reference

In [None]:
#%config Completer.use_jedi = False

In [None]:
from sklearn.model_selection import train_test_split
import numpy as np
#import spacy
import pandas as pd
import os
import re
import torch
from transformers import AutoTokenizer
from datasets import Dataset
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
from transformers import AutoModelForSequenceClassification
from datasets import Dataset, load_metric, DatasetDict
import nltk
import numpy as np
from tqdm.auto import tqdm
import pyarrow.feather as feather
from transformers import default_data_collator

# Constants Below

* These constants govern whether you want to train the notebook Or you want to use it for inference only.
* The loss of the model was decreasing until my GPU hours gave up. You can download the model and continue training if needed.
* squad_v2 should be True so that the model provides answers as **empty** (No Lead or No Rebuttal etc..)

In [None]:
# Global switches and paths for the whole notebook.
ISLOCAL = False  # True for local development, False when running on Kaggle

PREDICTOR = "discourse_type"

squad_v2 = True

####################################################################################
# DATA Preprocessing Switches
####################################################################################
# If True, try to load raw_datasets from a local directory; otherwise build the dataset from scratch.
USE_LOCAL_DATASET = False

# True -> work on a sample of SMALL_DATASET_SIZE rows; False -> use the whole dataset.
USE_SMALL_DATASET = False
SMALL_DATASET_SIZE = 10

DF_TRAIN_P1 = "df_train_01"

# Don't build a dataset with ALL the prediction types (like "Claim 9" etc.); just keep the top columns.
TOP_COLUMNS_TO_KEEP = 13

####################################################################################


####################################################################################
# TOKENIZER SWITCHES
####################################################################################
USE_LOCAL_TOKENIZED_DATA = False

TOKENIZER_NAME = "roberta-base" #"distilbert-base-uncased" #"bert-base-uncased" # "allenai/longformer-base-4096" # "distilbert-base-uncased"
TOKENIZED_PATH = f"{TOKENIZER_NAME}_tokenized"

# Window length (in tokens) and overlap between consecutive windows of a long essay.
MAX_LEN = 480
DOC_STRIDE = 200  # NOTE: not a power of two, unlike the 128 used in earlier versions

####################################################################################


####################################################################################
# Tokenized DataSet Constants
####################################################################################
TOKENIZER_PATH = f"{TOKENIZER_NAME}_er"

####################################################################################


####################################################################################
# Model Constants
####################################################################################
USE_LOCAL_MODEL = False
MODEL_NAME = "roberta-base" #"distilbert-base-uncased" #if training a new model then the type
MODEL_PATH = f"{MODEL_NAME}mdl"  # if training a new model then its save path
CHECKPOINT = "checkpoint-38000"

# A local model is resumed from a specific checkpoint folder inside MODEL_PATH.
if USE_LOCAL_MODEL:
    MODEL_CHECKPOINT = os.path.join(MODEL_PATH, CHECKPOINT)
else:
    MODEL_CHECKPOINT = MODEL_PATH

BATCH_SIZE = 16

EPOCHS = 1

####################################################################################

#CHECKPOINT = "checkpoint-125"

DATASET_PATH = "raw_dataset"

TRAINOUT_PATH = f"{TOKENIZER_NAME}-finetuned"

#SHOULD_I_TRAIN_MODEL = False #if only inference is needed

# Both branches define the same set of directory names so that code further down
# never hits a NameError when switching between local and Kaggle runs.
if ISLOCAL:
    train_directory = "./train"
    test_directory = "./test"
    main_directory = "./"
    output_directory = "./QnA_Model_Output/"
    # NOTE(review): the two values below were missing from the local branch; they
    # mirror the local output folder and the Kaggle dataset folder - confirm.
    output_directory_model = "./QnA_Model_Output/"
    save_directory = "./QnA_Model_Output/"
    save_directory_datasets = "./robertatraindataset/"
else:
    train_directory = "../input/feedback-prize-2021/train"
    test_directory = "../input/feedback-prize-2021/test"
    main_directory = "../input/feedback-prize-2021/"
    output_directory = "../input/robertatrainedmodel/roberta-qa/roberta-op/"
    output_directory_model = "../input/robertatrainedmodel/roberta_trained_results/"
    save_directory = "./robertatrainedmodel/"
    save_directory_datasets = "./robertatraindataset/"

# Helper Functions

plot_displacy is from the excellent notebook https://www.kaggle.com/thedrcat/feedback-prize-eda-with-displacy


In [None]:
def read_train_file(currid = "423A1CA112E2", curr_dir = train_directory):
    """Return the full text of one essay.

    Parameters
    ----------
    currid : str
        Essay id; the file read is ``<curr_dir>/<currid>.txt``.
    curr_dir : str
        Directory holding the essay ``.txt`` files.

    Returns
    -------
    str
        The complete file content.
    """
    essay_path = os.path.join(curr_dir, "{}.txt".format(currid))
    # Pin the encoding: character offsets (discourse_start / discourse_end) are
    # computed on the decoded text, so decoding must not depend on the platform locale.
    with open(essay_path, "r", encoding="utf-8") as f:
        return f.read()


def plot_displacy(df, currid = "423A1CA112E2", curr_dir = train_directory):
    """Render the labelled discourse spans of one essay inline with displaCy.

    Parameters
    ----------
    df : pandas.DataFrame
        Annotation table; the columns read here are ``id``, ``discourse_start``,
        ``discourse_end`` and ``discourse_type``.
    curr_id / currid : str
        Id of the essay to display.
    curr_dir : str
        Directory holding the essay ``.txt`` files.

    Returns
    -------
    None
        The visualisation is rendered as a side effect (``jupyter=True``).
    """
    # The file-level ``import spacy`` is commented out, so import lazily here;
    # spaCy is only required by this optional visual helper.
    from spacy import displacy

    # One displaCy entity per annotated span of the requested essay.
    essay_rows = df[df['id'] == currid]
    ents = [
        {
            'start': int(row['discourse_start']),
            'end': int(row['discourse_end']),
            'label': row['discourse_type'],
        }
        for _, row in essay_rows.iterrows()
    ]

    docData = {
        "text": read_train_file(currid, curr_dir),
        "ents": ents,
    }

    colors = {'Lead': '#EE11D0',
              'Position': '#AB4DE1',
              'Claim': '#1EDE71',
              'Evidence': '#33FAFA',
              'Counterclaim': '#4253C1',
              'Concluding Statement': 'yellow',
              'Rebuttal': 'red'}
    # Labels are taken from the whole table, not only from the selected essay.
    options = {"ents": df.discourse_type.unique().tolist(), "colors": colors}
    displacy.render(docData, style="ent", options=options, manual=True, jupyter=True)

In [None]:
# if ISLOCAL:
#     train = pd.read_csv( os.path.join(main_directory, "corrected_train.csv") )
# else:
#     train = pd.read_csv(  "../input/feedback-prize-corrected-train-csv/corrected_train.csv")

# Use Corrected_Train Fields

In [None]:
# #plot_displacy(train)

# train["discourse_start"] = train["new_start"]
# train["discourse_end"] = train["new_end"]

# train["predictionstring"] = train["new_predictionstring"]
# train["discourse_text"] = train["text_by_new_index"]

In [None]:
# #train[PREDICTOR].unique()

# d_type_num_to_d_type = train[["discourse_type_num", "discourse_type"]].drop_duplicates(["discourse_type_num"]).set_index(["discourse_type_num"]).to_dict()["discourse_type"]

In [None]:
# d_type_num_to_d_type

# Step 1

* Convert the dataset into a pandas table with text from the file and the positions of each discourse types (start and end word position)

In [None]:
# column_to_keep = train[PREDICTOR].value_counts()[:50].index.values
# print("We will only train the QnA data set for the columns : ", column_to_keep)

In [None]:
#from tqdm.notebook import tqdm

In [None]:
# def return_training_dataset(dft, DIR = train_directory):
#     '''
#         This uses Prediction String to get start and end token numbers.
#     '''
#     DIR = train_directory
#     ret = []

#     for i in tqdm(train["id"].unique()):
#         temp = train[ train["id"] == i]
#         row = {"id" : i,
#                "context" : read_train_file(i, DIR)
#               }
#         for j in temp[PREDICTOR]:
#             p_str_beg = temp[temp[PREDICTOR] == j]["predictionstring"].values[0]
#             p_str_beg, p_str_end = p_str_beg.split()[0], p_str_beg.split()[-1]
            
#             row.update( {"start_{}".format(j) : p_str_beg,
#                          "end_{}".format(j) : p_str_end
#                         }) #append the start and end tokens of the current discourse type as a column
            
#         ret.append(row.copy()) #append a single row per file id here
#     df_train = pd.DataFrame(ret)
#     df_train = df_train.rename( columns = { i: i.replace(" ", "_") for i in df_train.columns} )
#     df_train = df_train.fillna(-91) #-91 is arbitrary choice here.
    
#     return df_train

# def return_training_dataset_v2(dft, DIR = train_directory):
#     '''
#         This uses Character positions to get start and end token numbers.
#     '''
#     print("CAUTION : We are NOT USING TOKEN POSITIONS, But character positions in the dataset now.")
#     DIR = train_directory
#     train = dft.copy()
#     ret = []

#     for i in tqdm(train["id"].unique()):
#         temp = train[ train["id"] == i]
#         txt = read_train_file(i, DIR)
        
#         count = 0
#         row = {"id" : i,
#                     "context" : txt}
        
#         for j in temp[temp[PREDICTOR].isin(column_to_keep)].itertuples():
#             p_str_beg = getattr(j , "discourse_start")
#             p_str_end = getattr(j , "discourse_end")
#             field_name = getattr(j, "discourse_type_num")
            
#             field_name = field_name.replace(" ", "|") #later replace the - with space and map to the discourse type when creating QnA dataset.
            
#             row.update({
#                     "start_{}".format(  field_name) : p_str_beg,
#                     "end_{}".format( field_name  ) : p_str_end
#               })
#             count += 1 # Just a reference number to mark each column. Provides easier processing when creating question answering set...
            
#         ret.append(row.copy()) #append a single row per file id here
#     df_train = pd.DataFrame(ret)
    
#     #df_train = df_train.rename( columns = { i: i.replace(" ", "_") for i in df_train.columns} ) Not necessary after - character introduction
    
#     df_train = df_train.fillna(-91) #-91 is arbitrary choice here.
    
#     return df_train

# if USE_LOCAL_DATASET == False:
#     df_train = return_training_dataset_v2(train,
#                                        train_directory)
# else:
#     print("Will use local dataset and not generate right now")

In [None]:
# df_train.head(1)

In [None]:
# df_train.shape

# Step 2

* Remove the spaces in the resulting dataframe columns and fill na with -91

In [None]:
# #df_train.info()
# if USE_LOCAL_DATASET == False:
#     for i in df_train.columns:
#         try:
#             df_train[i] = pd.to_numeric(df_train[i], errors = 'raise')
#         except ValueError as ve:
#             #print(ve, i)
#             continue
# else:
#     print("Will use local dataset and not generate right now")

In [None]:
# DEBUG BLOCK ONLY
#USE_LOCAL_DATASET = True

In [None]:
# if USE_LOCAL_DATASET == False:
#     if os.path.exists(save_directory_datasets) == False:
#         os.makedirs(save_directory_datasets)

#     paths = os.path.join( save_directory_datasets, DF_TRAIN_P1)
#     print("Trying to save the training dataset to {}".format(paths))
#     feather.write_feather( df_train,
#                           paths)
# else:
#     #paths = os.path.join( output_directory, DF_TRAIN_P1)
#     paths = './robertatraindataset/df_train_01'
#     assert os.path.exists( paths ), "Path does not exist for df_train part 01. {}".format(paths)
#     df_train = feather.read_feather(paths)

In [None]:
# print("loaded dataset shape is ", df_train.shape)

# Step 3

* Prepare a question and answer data set where the question is the discourse type num parameter

In [None]:
# def prepare_question_answer_dataset(df_t):
#     X_valid = df_t.copy()
    
#     unique_d = column_to_keep ##train["discourse_type_num"].unique()
#     unique_d = [x.replace(" ", "_") for x in unique_d]
#     #X_valid

#     info_cols = [x for x in X_valid.columns if x.find("start_") > -1]
    
#     ret = []
#     from collections import defaultdict
    
    
#     for i in tqdm(X_valid.iterrows(), total = len(X_valid)):
#         #for j in unique_d:
        
#         #for j, d_type in d_type_num_to_d_type.items():
#         type_counts = defaultdict(int)
        
#         for j in info_cols:
#             current_column = j.split("_")[1]
#             current_question = current_column.replace("|", " ")
#             current_question = d_type_num_to_d_type[current_question]

#             start_e = int (i[1]["start_" + current_column])
#             end_e = int (i[1]["end_" + current_column])
            
#             context = i[1]["context"]
            
            

#             if (end_e == -91) or (start_e == -91):
#                 answer_start = []
#                 answer_text = []
#             else:
                
#                 answer_start = [start_e]
#                 answer_text = [context[start_e : end_e + 1]]
                
#             type_counts[current_question] += 1
#             if answer_start == [] and type_counts[current_question] > 1:
#                 continue
#             else:
#                 ret.append( { "id" : i[1]["id"],
#                              "context" : context,
#                               "question" : current_question,
#                               "answers" : { "text" :  answer_text,
#                                             "answer_start" : answer_start},
#                               "start_position" : start_e,
#                               "end_position" : end_e,

#                             })
#     return pd.DataFrame(ret)

# if USE_LOCAL_DATASET == False:
#     df_train = prepare_question_answer_dataset(df_train)
# else:
#     print("Will use local dataset and not generate right now")

In [None]:
# df_train.head(30)

# Step 3

* Train/Test/Valid split
> * I did not like this way of split because ideally the model should not see any file in the test set. Change this if you would like. 
I changed the split based on 70 % files for training. test will never see the same file as train

In [None]:
# if USE_LOCAL_DATASET == False:
#     train_size = 0.8
    
#     train_id = np.random.choice( df_train["id"].unique(), 
#                                   int(train_size * len(df_train["id"].unique())),
#                                  replace = False)
#     valid_id = df_train.loc[~df_train["id"].isin(train_id), "id"]
#     test_id = df_train.loc[~df_train["id"].isin(train_id), "id"]
    
#     X_train = df_train[ df_train["id"].isin(train_id) ]
#     X_test = df_train[df_train["id"].isin(valid_id) ]
#     X_valid = df_train[df_train["id"].isin(valid_id) ]
    
# #     X_train, X_test = train_test_split( df_train,
# #                                        train_size = 0.7,
# #                                       random_state = 91)

# #     X_test, X_valid = train_test_split( X_test,
# #                                        train_size = 0.5,
# #                                       random_state = 91)
#     del(df_train)
# else:
#     print("Will use local dataset and not generate right now")

In [None]:
# if USE_LOCAL_DATASET == False:
#     train_set = Dataset.from_pandas(X_train)
#     test_set = Dataset.from_pandas(X_test)
#     valid_set = Dataset.from_pandas(X_valid)
# else:
#     print("Will use local dataset and not generate right now")

In [None]:
# if USE_LOCAL_DATASET == False:
#     raw_datasets = DatasetDict( {
#         'train' : train_set,
#         'test': test_set,
#         'validation': valid_set
#     })
# else:
#     print("Will use local dataset and not generate right now")

In [None]:
# import datasets

# if USE_LOCAL_DATASET == False:
#     paths = os.path.join(save_directory_datasets, DATASET_PATH)
#     print("Saving dataset to disk. ", paths)
    
#     raw_datasets.save_to_disk( paths )
#     del(X_train)
#     del(X_test)
#     del(X_valid)
#     del(train_set)
#     del(test_set)
#     del(valid_set)
    
# else:
#     #paths = os.path.join(output_directory, DATASET_PATH)
#     paths = "../input/rawdataset/raw_dataset"
#     assert os.path.exists(paths), "The dataset local path does not exist. Please retrain the notebook with appropriate switches."
#     print("Loading the dataset from local directory. ", paths)
#     raw_datasets = datasets.load_from_disk( paths )

In [None]:
# print("Shape of the raw datasets is : ", raw_datasets.shape)

# Tokenizer Steps

In [None]:
# from transformers import AutoTokenizer

In [None]:
# if USE_LOCAL_TOKENIZED_DATA == False:
#     paths = os.path.join( save_directory_datasets, TOKENIZER_PATH)
    
#     tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_NAME)
#     print("Saving Tokenizer to disk", paths)
#     tokenizer.save_pretrained( paths )
    
# else:
#     #paths = os.path.join( output_directory, TOKENIZER_PATH)
#     paths = "../input/robertabasetokernizer/roberta-base_er"
#     assert os.path.exists(paths), "Tokenizer path does not exist {}".format(paths)
    
#     print("Loading the tokenizer now")
#     tokenizer = AutoTokenizer.from_pretrained( paths )

In [None]:
# import transformers
# assert isinstance(tokenizer, transformers.PreTrainedTokenizerFast), "Tokenizer is not an instance of Fast tokenizers written in Rust. Please check https://huggingface.co/transformers/index.html#bigtable"

In [None]:
# pad_on_right = tokenizer.padding_side == "right"

# Dont Worry

Code is straight from transformers reference given on top of the notebook

In [None]:
# def prepare_train_features(examples):
#     # Some of the questions have lots of whitespace on the left, which is not useful and will make the
#     # truncation of the context fail (the tokenized question will take a lots of space). So we remove that
#     # left whitespace
#     examples["question"] = [q.lstrip() for q in examples["question"]]

#     # Tokenize our examples with truncation and padding, but keep the overflows using a stride. This results
#     # in one example possible giving several features when a context is long, each of those features having a
#     # context that overlaps a bit the context of the previous feature.
#     tokenized_examples = tokenizer(
#         examples["question" if pad_on_right else "context"],
#         examples["context" if pad_on_right else "question"],
#         truncation="only_second" if pad_on_right else "only_first",
#         max_length = MAX_LEN,
#         stride = DOC_STRIDE,
#         return_overflowing_tokens=True,
#         return_offsets_mapping=True,
#         padding="max_length",
#     )

#     # Since one example might give us several features if it has a long context, we need a map from a feature to
#     # its corresponding example. This key gives us just that.
#     sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
#     # The offset mappings will give us a map from token to character position in the original context. This will
#     # help us compute the start_positions and end_positions.
#     offset_mapping = tokenized_examples.pop("offset_mapping")

#     # Let's label those examples!
#     tokenized_examples["start_positions"] = []
#     tokenized_examples["end_positions"] = []

#     for i, offsets in enumerate(offset_mapping):
#         # We will label impossible answers with the index of the CLS token.
#         input_ids = tokenized_examples["input_ids"][i]
#         cls_index = input_ids.index(tokenizer.cls_token_id)

#         # Grab the sequence corresponding to that example (to know what is the context and what is the question).
#         sequence_ids = tokenized_examples.sequence_ids(i)

#         # One example can give several spans, this is the index of the example containing this span of text.
#         sample_index = sample_mapping[i]
#         answers = examples["answers"][sample_index]
#         # If no answers are given, set the cls_index as answer.
#         if len(answers["answer_start"]) == 0:
#             tokenized_examples["start_positions"].append(cls_index)
#             tokenized_examples["end_positions"].append(cls_index)
#         else:
#             # Start/end character index of the answer in the text.
#             start_char = answers["answer_start"][0]
#             end_char = start_char + len(answers["text"][0])

#             # Start token index of the current span in the text.
#             token_start_index = 0
#             while sequence_ids[token_start_index] != (1 if pad_on_right else 0):
#                 token_start_index += 1

#             # End token index of the current span in the text.
#             token_end_index = len(input_ids) - 1
#             while sequence_ids[token_end_index] != (1 if pad_on_right else 0):
#                 token_end_index -= 1

#             # Detect if the answer is out of the span (in which case this feature is labeled with the CLS index).
#             if not (offsets[token_start_index][0] <= start_char and offsets[token_end_index][1] >= end_char):
#                 tokenized_examples["start_positions"].append(cls_index)
#                 tokenized_examples["end_positions"].append(cls_index)
#             else:
#                 # Otherwise move the token_start_index and token_end_index to the two ends of the answer.
#                 # Note: we could go after the last offset if the answer is the last word (edge case).
#                 while token_start_index < len(offsets) and offsets[token_start_index][0] <= start_char:
#                     token_start_index += 1
#                 tokenized_examples["start_positions"].append(token_start_index - 1)
#                 while offsets[token_end_index][1] >= end_char:
#                     token_end_index -= 1
#                 tokenized_examples["end_positions"].append(token_end_index + 1)

#     return tokenized_examples

# Apply tokenizer

In [None]:
# if USE_LOCAL_TOKENIZED_DATA == False:
#     tokenized_datasets = raw_datasets.map(prepare_train_features, 
#                                           batched = True, 
#                                           remove_columns = raw_datasets["train"].column_names)
# else:
#     print("Not tokenizing the dataset as load from local disk is set.")

# Save Tokenized Dataset

In [None]:
# if USE_LOCAL_TOKENIZED_DATA == False:
#     paths = os.path.join( save_directory_datasets, TOKENIZED_PATH)
#     print("Saving the tokenized dataset to local disk for re-usage", paths)
#     tokenized_datasets.save_to_disk(paths)
# else:
#     #paths = os.path.join( output_directory, TOKENIZED_PATH)
#     paths = "../input/robertabasetokenized/roberta-base_tokenized"
#     assert os.path.exists(paths), "Switch set to local tokenized data loader but path does not exist. {}".format(paths)
#     tokenized_datasets = datasets.load_from_disk(paths)

In [None]:
# print("Tokenized dataset shape is :", tokenized_datasets.shape)

# Prepare the Model (Super-Models beware....)

In [None]:
# from transformers import AutoModelForQuestionAnswering, TrainingArguments, Trainer

In [None]:
# # if USE_LOCAL_MODEL == False:
# #     paths = os.path.join( save_directory , MODEL_CHECKPOINT)
# #     print("Trying to load model", MODEL_NAME)
# #     model = AutoModelForQuestionAnswering.from_pretrained(MODEL_NAME)
# # else:
# #     paths = os.path.join( output_directory_model , MODEL_CHECKPOINT)
# #     assert os.path.exists(paths), "Path to the checkpointed model does not exist {}".format(paths)
# #     model = AutoModelForQuestionAnswering.from_pretrained(paths)

# # Below is for inference path
# model = AutoModelForQuestionAnswering.from_pretrained("../input/epoch1/FINAL_MODEL_OUTPUT/")

# Trim Dataset (If required)


In [None]:
# DEBUG BLOCK
#USE_SMALL_DATASET = True

In [None]:
# if USE_SMALL_DATASET:
#     print("CAUTION: Using a small subset of data as per the switches. ")
#     small_train_dataset = tokenized_datasets["train"].shuffle(seed = 91).select(range(SMALL_DATASET_SIZE))
#     small_eval_dataset = tokenized_datasets["validation"].shuffle(seed = 91).select(range(SMALL_DATASET_SIZE))
# #    small_test_dataset = tokenized_datasets["test"].shuffle(seed = 91).select(range(SMALL_DATASET_SIZE))
# else:
#     small_train_dataset = tokenized_datasets["train"]
#     small_eval_dataset = tokenized_datasets["validation"]
# #    small_test_dataset = tokenized_datasets["test"]

# Training Argument Setup

In [None]:
# args = TrainingArguments(
#     output_dir = MODEL_PATH,
#     logging_steps = min(5000, len(small_train_dataset)),
#     save_strategy = "steps",
#     evaluation_strategy = "steps",
#     save_steps = min(5000, len(small_train_dataset)),
#     eval_steps = min(5000, len(small_train_dataset)),
#     learning_rate=2e-5,
#     per_device_train_batch_size = BATCH_SIZE,
#     per_device_eval_batch_size = BATCH_SIZE,
#     num_train_epochs = EPOCHS,
#     weight_decay = 0.01,
#     save_total_limit = 3,
# #     fp16 = True,
# #     fp16_full_eval = True,
    
#     load_best_model_at_end = True,
#     report_to = "none",
# )

# Load DataCollator

* After all no size is perfect

In [None]:
# data_collator = default_data_collator

# Initialize Trainer

* Put those shorts on

In [None]:
# trainer = Trainer(
#     model,
#     args,
#     train_dataset=small_train_dataset,   #tokenized_datasets["train"],
#     eval_dataset=small_eval_dataset,     #tokenized_datasets["validation"],
#     data_collator=data_collator,
#     tokenizer=tokenizer,
# )

# Train Now

In [None]:
# if USE_LOCAL_MODEL == False:
#     trainer.train()
# else:
#     print("No training today for you")

In [None]:
# if USE_LOCAL_MODEL == False:
#     trainer.save_model( "./FINAL_MODEL_OUTPUT" )
# else:
#     print("Cannot train without my training shorts.")



In [None]:
import transformers, datasets
from transformers import AutoModelForQuestionAnswering, TrainingArguments, Trainer

# Inference-only setup: every artefact is loaded from an attached "../input" dataset
# instead of being rebuilt by the (commented-out) cells above.

# Tokenized splits, read back from disk.
tokenized_datasets = datasets.load_from_disk("../input/robertaqnatokenized/roberta-base_tokenized")



small_train_dataset = tokenized_datasets["train"]
# Evaluation subset: shuffle with a fixed seed and keep only the first 10 rows.
small_eval_dataset = tokenized_datasets["validation"].shuffle(seed = 91).select(range(10))

# Tokenizer saved alongside the model inputs.
tokenizer = AutoTokenizer.from_pretrained( "../input/robertabasetokernizer/roberta-base_er" )

# Fine-tuned question-answering model.
model = AutoModelForQuestionAnswering.from_pretrained("../input/roberta-qna-train/FINAL_MODEL_OUTPUT/")

# Untokenized datasets; get_raw_predictions reads the "validation" column names from here.
raw_datasets = datasets.load_from_disk("../input/rawdataset/raw_dataset")


# Trainer built from the model alone (no TrainingArguments passed); it is the
# default `model` of get_raw_predictions, which calls `.predict` on it.
trainer = Trainer(model)

# Decides whether the question or the context goes first when tokenizing pairs.
pad_on_right = tokenizer.padding_side == "right"

# Evaluation

In [None]:
def prepare_validation_features(examples):
    """Tokenize a batch of question/context pairs into overlapping windows.

    Long contexts are split into several features of ``MAX_LEN`` tokens that
    overlap by ``DOC_STRIDE`` tokens. Each returned feature carries:

    * ``example_id`` - the ``id`` of the example the window was cut from;
    * ``offset_mapping`` - character offsets per token, with every token that
      is not part of the context replaced by ``None``.

    ``examples["question"]`` is left-stripped in place, because leading
    whitespace would eat into the token budget available to the context.
    """
    examples["question"] = [question.lstrip() for question in examples["question"]]

    # The padding side fixes which text comes first, which one may be truncated,
    # and which sequence id marks context tokens.
    if pad_on_right:
        first_field, second_field = "question", "context"
        truncation_mode = "only_second"
        context_index = 1
    else:
        first_field, second_field = "context", "question"
        truncation_mode = "only_first"
        context_index = 0

    tokenized_examples = tokenizer(
        examples[first_field],
        examples[second_field],
        truncation=truncation_mode,
        max_length=MAX_LEN,
        stride=DOC_STRIDE,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    # Maps every produced feature back to the index of its source example.
    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")

    tokenized_examples["example_id"] = []

    num_features = len(tokenized_examples["input_ids"])
    for feature_idx in range(num_features):
        sequence_ids = tokenized_examples.sequence_ids(feature_idx)

        source_example = sample_mapping[feature_idx]
        tokenized_examples["example_id"].append(examples["id"][source_example])

        # Blank out offsets of non-context tokens so that later code can tell
        # with a simple ``is None`` check whether a position lies in the context.
        masked_offsets = []
        for token_idx, offset in enumerate(tokenized_examples["offset_mapping"][feature_idx]):
            if sequence_ids[token_idx] == context_index:
                masked_offsets.append(offset)
            else:
                masked_offsets.append(None)
        tokenized_examples["offset_mapping"][feature_idx] = masked_offsets

    return tokenized_examples

# Magic Happens here

* Now the issue with Question Answering trained on Discourse_Type is, that if we select **best answer** only, then we can have 1 Lead, or 1 Evidence etc...

* I have modified the code to churn out all the answers which are better than the **null_score** for a specific question. Hence, we are providing multiple possible answers to a single question. This is how the model churns the outputs

In [None]:
import collections

def calc_word_indices(full_text, discourse_start, discourse_end):
    """Convert a character span into whitespace-token (word) indices.

    The first index is the number of whitespace-separated tokens in
    ``full_text[:discourse_start]``; the span contributes one index per token
    found in ``full_text[discourse_start:discourse_end]``.

    Returns an empty list when the span holds no token (e.g. only ``"\\r"`` or
    ``"\\n"``). If the last index would fall outside the words of ``full_text``,
    the final index is dropped.
    """
    words_before = len(full_text[:discourse_start].split())
    words_inside = len(full_text[discourse_start:discourse_end].split())

    # Nothing but whitespace was selected.
    if words_inside == 0:
        return []

    # Drop the trailing index when it runs past the last word of the text.
    last_index = words_before + words_inside - 1
    if last_index >= len(full_text.split()):
        words_inside -= 1

    return list(range(words_before, words_before + words_inside))
    

def get_raw_predictions( submission_dataset, 
                        model = trainer,
                           column_to_be_removed = raw_datasets["validation"].column_names,
                        n_best_size = 20,
                        max_answer_length = 110 ):
    """Tokenize a dataset, run the model on it and post-process the logits.

    Parameters
    ----------
    submission_dataset :
        Dataset of examples; it is mapped through ``prepare_validation_features``
        and must expose ``len()`` and ``.map``.
    model :
        Object exposing ``predict(dataset).predictions`` (defaults to the
        module-level ``trainer``).
    column_to_be_removed : list
        Columns dropped from the dataset while mapping it to features.
    n_best_size : int
        Forwarded to ``postprocess_qa_predictions``.
    max_answer_length : int
        Forwarded to ``postprocess_qa_predictions``.

    Returns
    -------
    Whatever ``postprocess_qa_predictions`` returns for these examples.
    """
    print("Processing a dataset of length ", len(submission_dataset) )

    validation_features = submission_dataset.map(
        prepare_validation_features,
        batched = True,
        remove_columns = column_to_be_removed
    )

    raw_predictions = model.predict(validation_features).predictions

    # Re-expose every column after predicting: some transformers versions may hide
    # the columns the model does not consume (example_id, offset_mapping), and the
    # post-processing step needs them. This mirrors the order used in the
    # referenced Hugging Face question-answering notebook.
    validation_features.set_format(type=validation_features.format["type"], columns=list(validation_features.features.keys()))

    # The example -> features grouping is built inside postprocess_qa_predictions,
    # so it is not duplicated here.
    return postprocess_qa_predictions(submission_dataset,
                                      validation_features,
                                      raw_predictions,
                                      n_best_size = n_best_size,
                                      max_answer_length = max_answer_length
                                     )


from tqdm.auto import tqdm

def postprocess_qa_predictions(examples, features, raw_predictions, n_best_size = 20, max_answer_length = 110):
    """Convert start/end logits into character-level answer spans.

    Parameters
    ----------
    examples : iterable of rows with "id", "context" and "question";
        `examples["id"]` must also yield all ids in row order.
    features : indexable/iterable rows with "example_id", "offset_mapping"
        and "input_ids".
    raw_predictions : pair `(all_start_logits, all_end_logits)`, each indexed
        by feature row.
    n_best_size : how many top start and top end positions are combined.
    max_answer_length : maximum span length, counted in tokens.

    Returns
    -------
    list of dicts with keys "id", "text", "discourse_start", "discourse_end"
    and "question". An empty answer is encoded as text "" with both offsets
    set to -91.

    Reads the module-level names `tokenizer` (for `cls_token_id`) and
    `squad_v2`. With `squad_v2` falsy one best span is produced per example;
    otherwise every span scoring at least the null score is produced, as long
    as it does not overlap the span kept just before it.
    """
    all_start_logits, all_end_logits = raw_predictions

    # Group feature row numbers under the row number of their source example.
    row_of_example = {ex_id: row for row, ex_id in enumerate(examples["id"])}
    features_per_example = collections.defaultdict(list)
    for feat_row, feat in enumerate(features):
        features_per_example[row_of_example[feat["example_id"]]].append(feat_row)

    predictions = []

    print(f"Post-processing {len(examples)} example predictions split into {len(features)} features.")

    for example_index, example in enumerate(tqdm(examples)):
        context = example["context"]
        question = example["question"]

        # Despite the name, this ends up holding the LARGEST CLS (null) score
        # seen over the example's features. Only consulted when squad_v2 is set.
        min_null_score = None
        valid_answers = []

        for feature_index in features_per_example[example_index]:
            start_logits = all_start_logits[feature_index]
            end_logits = all_end_logits[feature_index]

            feature = features[feature_index]
            # Token position -> (start_char, end_char) in the original context.
            offset_mapping = feature["offset_mapping"]

            # The null answer is scored at the CLS token.
            cls_index = feature["input_ids"].index(tokenizer.cls_token_id)
            feature_null_score = start_logits[cls_index] + end_logits[cls_index]
            if min_null_score is None or min_null_score < feature_null_score:
                min_null_score = feature_null_score

            # Best `n_best_size` start and end positions, highest logit first.
            start_indexes = np.argsort(start_logits)[-1 : -n_best_size - 1 : -1].tolist()
            end_indexes = np.argsort(end_logits)[-1 : -n_best_size - 1 : -1].tolist()
            n_offsets = len(offset_mapping)

            for start_index in start_indexes:
                for end_index in end_indexes:
                    # Positions outside the offset table cannot be answers.
                    if start_index >= n_offsets or end_index >= n_offsets:
                        continue
                    # A None offset marks a token with no span in the context.
                    if offset_mapping[start_index] is None or offset_mapping[end_index] is None:
                        continue
                    # Reject reversed spans and spans longer than allowed.
                    span_tokens = end_index - start_index + 1
                    if span_tokens < 1 or span_tokens > max_answer_length:
                        continue

                    start_char = offset_mapping[start_index][0]
                    end_char = offset_mapping[end_index][1]
                    valid_answers.append(
                        {
                            "score": start_logits[start_index] + end_logits[end_index],
                            "text": context[start_char: end_char],
                            "discourse_start" : start_char,
                            "discourse_end" : end_char,
                            "question" : question
                        }
                    )

        if valid_answers:
            best_answer = sorted(valid_answers, key=lambda x: x["score"], reverse=True)[0]
        else:
            # No candidate span at all: use a placeholder so nothing fails.
            best_answer = {"score": 0.0,
                           "text": "",
                           "discourse_start" : -91,
                           "discourse_end" : -91,
                           "question" : question}

        if not squad_v2:
            # Single-answer mode: always report the top-scoring span.
            predictions.append( {"id" : example["id"],
                                 "text" : best_answer["text"],
                                 "discourse_start" : best_answer["discourse_start"],
                                 "discourse_end" : best_answer["discourse_end"],
                                 "question" : best_answer["question"]
                                }
                              )
            continue

        if min_null_score is None:
            # Reached only when the example had no features; the original
            # author flagged this fallback as a symptom of a bug elsewhere.
            min_null_score = 0

        kept_count = 0
        prev_start, prev_end = 1000, -1000

        # Walk candidates left to right so overlap can be checked against the
        # span kept most recently.
        by_position = sorted(valid_answers, key = lambda x: x["discourse_start"], reverse = False)
        for candidate in by_position:
            if not (candidate["score"] >= min_null_score):
                continue
            curr_start, curr_end = candidate["discourse_start"], candidate["discourse_end"]
            if curr_start <= prev_end and curr_end >= prev_start:
                # Overlaps the previously kept span (not a foolproof check).
                continue
            kept_count += 1
            predictions.append( {"id" : example["id"],
                                 "text" : candidate["text"],
                                 "discourse_start" : curr_start,
                                 "discourse_end" : curr_end,
                                 "question" : question
                                }
                              )
            prev_start = curr_start
            prev_end = curr_end

        if kept_count == 0:
            # Nothing reached the null score: emit the empty answer.
            predictions.append( {"id" : example["id"],
                                 "text" : "",
                                 "discourse_start" : -91,
                                 "discourse_end" : -91,
                                 "question" : question
                                }
                              )

    return predictions

# Predict Test Set

Try to apply the prediction function on the submission set now.

In [None]:
# Load the sample submission from `main_directory`; its "id" column is what
# convert_subm_to_dataset iterates over to build the questions.
df_ss = pd.read_csv( os.path.join( main_directory, "sample_submission.csv" ) )

In [None]:
# 'id', 'context', 'question', 'answers', 'start_position', 'end_position', '__index_level_0__'
def convert_subm_to_dataset(ss_df, experimental = False):
    """Build a question-answering dataset from the submission frame.

    For every unique value of `ss_df["id"]` one row is produced per discourse
    type. The question is the type name with spaces replaced by underscores
    and the context is `read_train_file(id, test_directory)`.

    Parameters
    ----------
    ss_df : DataFrame with an "id" column. It is not modified.
    experimental : when truthy, each row id becomes "<id>_<counter>" (counter
        runs over all rows, starting at 1) so every row has a distinct id;
        otherwise the plain document id is repeated for each question.

    Returns
    -------
    `Dataset.from_pandas` of the rows, with columns "id", "question",
    "context", "answers", "start_position", "end_position" and
    "__index_level_0__" (the last four hold empty lists).
    """
    discourse_types = ['Lead', 'Position', 'Evidence', 'Claim', 'Concluding Statement',
                       'Counterclaim', 'Rebuttal']
    ret = []
    counter = 0
    for i in tqdm(ss_df["id"].unique()):
        # Read the document once per id instead of once per discourse type.
        context = read_train_file( i, test_directory )
        for j in discourse_types:
            counter += 1
            ret.append( {"id" : "{}_{}".format(i, counter) if experimental else i,
                         "question" : j.replace(" ", "_"),
                         "context" : context,
                         "answers" : [],
                         "start_position" : [],
                         "end_position" : [],
                         "__index_level_0__" : []
                        } )

    return Dataset.from_pandas( pd.DataFrame(ret) )

def preds_to_pandas(final_pred,
                   experimental = False,
                   direc = test_directory):
    """Turn post-processed predictions into submission rows.

    Parameters
    ----------
    final_pred : iterable of dicts with "id", "text", "discourse_start",
        "discourse_end" and "question". Entries whose "text" is "" are
        skipped.
    experimental : when truthy, ids are expected in the "<id>_<counter>" form
        produced by `convert_subm_to_dataset(..., experimental=True)` and the
        counter suffix is removed.
    direc : directory passed to `read_train_file`. Defaults to the
        module-level `test_directory` (bound when this function is defined).

    Returns
    -------
    DataFrame with columns "id", "class" and "predictionstring". The columns
    exist even when there are no rows, so callers can index
    "predictionstring" safely.
    """
    texts = {}  # document id -> text, so each file is read only once
    ret = []
    for i in final_pred:
        if i["text"] == "":
            continue

        if experimental:
            # Strip only the trailing "_<counter>" so that an id which itself
            # contains "_" is not truncated.
            fileid = i["id"].rsplit("_", 1)[0]
        else:
            fileid = i["id"]

        if fileid not in texts:
            texts[fileid] = read_train_file( fileid , direc )
        txt = texts[fileid]

        predictionstring = calc_word_indices(txt, i["discourse_start"], i["discourse_end"])
        # The last word index is dropped on purpose: per the original author,
        # the trained model includes one trailing token by mistake.
        ret.append({ "id" : fileid,
                    "class" : i["question"].replace("_", " "),
                    "predictionstring" : " ".join([str(x) for x in predictionstring[:-1]])
                   }
                  )
    return pd.DataFrame(ret, columns = ["id", "class", "predictionstring"])



In [None]:
def get_line_by_line_predictions(df_ss,
                                experimental = False,
                                experimental_batchsize = 1500):
    """Predict discourse spans for every document listed in `df_ss`.

    Parameters
    ----------
    df_ss : DataFrame with an "id" column (passed to
        `convert_subm_to_dataset`).
    experimental : when falsy, each dataset row is predicted on its own;
        when truthy, rows are predicted in consecutive batches.
    experimental_batchsize : number of rows per batch in experimental mode.

    Returns
    -------
    The DataFrame produced by `preds_to_pandas` for all predictions.

    Raises
    ------
    ValueError : if `experimental_batchsize` is smaller than 1 in
        experimental mode.
    """
    aam2 = convert_subm_to_dataset(df_ss,
                                  experimental = experimental)
    preds = []
    if not experimental:
        for i in tqdm(range(len(aam2))):
            p = get_raw_predictions( aam2.select([i]),
                                model = trainer,
                                column_to_be_removed = raw_datasets["validation"].column_names
                               )
            preds.extend(p)
    else:
        if experimental_batchsize < 1:
            raise ValueError("experimental_batchsize must be a positive integer")

        total = len(aam2)
        # Half-open [start, stop) batches. The previous bounds ended the last
        # batch at index len-1 and passed it to range() as an exclusive stop,
        # so the final row was never predicted.
        for last_start in range(0, total, experimental_batchsize):
            batch_end = min(last_start + experimental_batchsize, total)
            print("Processing entries in batches of batches. Current batch is {} to {}".format(last_start, batch_end))
            aam2_batch = aam2.select( range(last_start, batch_end) )
            preds_row = get_raw_predictions( aam2_batch,
                                        model = trainer,
                                        column_to_be_removed = raw_datasets["validation"].column_names
                                       )
            preds.extend(preds_row)

    final_merged = preds_to_pandas(preds,
                                  experimental = experimental)
    return final_merged

# Get Initial Predictions

In [None]:
# experimental = True selects the batched prediction path (rows get unique
# "<id>_<counter>" ids and are predicted in batches rather than one by one).
final_merged = get_line_by_line_predictions(df_ss, 
                                           experimental = True)

In [None]:
# Discard predictions whose token list is blank once whitespace is stripped.
final_merged = final_merged.loc[final_merged["predictionstring"].str.strip().ne("")]

# Only order the rows (by document, then by token string) if any are left.
if not final_merged.empty:
    final_merged = final_merged.sort_values(["id", "predictionstring"])

In [None]:
# final_merged.to_csv("submission.csv",
#                    index = False)

# VERSION 8 Changes

* Filter by rule version 1 implemented to check
* Hand-written rules to improve the predictions if needed. I noticed a small improvement in scores with such functions, but they need further work (for example, discarding an Evidence span that is only a single token).

In [None]:
def _merge_span_into(df, keep_mask, drop_mask):
    """Stretch the kept row over the dropped row's span and flag the latter.

    `keep_mask` and `drop_mask` are boolean row masks over `df`, each expected
    to select one row. The kept row's "endpred" becomes the larger of the two
    ends, and its "totaltokens" and "predictionstring" are rebuilt as the
    contiguous range startpred..endpred so the merge is visible in the
    submitted span. The dropped row gets verdict "delete".
    """
    merged_start = int(df.loc[keep_mask, "startpred"].values[0])
    merged_end = int(max(df.loc[keep_mask, "endpred"].values[0],
                         df.loc[drop_mask, "endpred"].values[0]))
    df.loc[keep_mask, "endpred"] = merged_end
    df.loc[keep_mask, "totaltokens"] = merged_end - merged_start + 1
    df.loc[keep_mask, "predictionstring"] = " ".join(
        str(k) for k in range(merged_start, merged_end + 1)
    )
    df.loc[drop_mask, "verdict"] = "delete"


def filter_by_rules(submission_df,
                   verbose = False):
    """Apply hand-written sanity rules to predictions and mark rows to drop.

    Parameters
    ----------
    submission_df : DataFrame with columns "id", "class" and
        "predictionstring" (space-separated word indices). Not modified.
    verbose : print a message whenever a rule fires.

    Returns
    -------
    A copy of the input with "class" renamed to "discourse_type" and the
    extra columns "startpred", "endpred", "totaltokens", "position" (1-based
    rank within the document after sorting by start, then length) and
    "verdict" ("keep" or "delete").

    Rules, applied per document:
      * More than one Lead: if Leads sit at positions 1 and 2 they are merged
        into the first; if only position 1 is a Lead, Leads beyond position 2
        are deleted; if no Lead is at position 1, only the longest is kept.
      * More than one Concluding Statement: if the last two are at adjacent
        positions they are merged; otherwise only the last one is kept.
      * More Rebuttals than Counterclaims: reported when verbose, no change.

    Reads the module-level `train` frame for the list of discourse types.
    """
    df_temp = submission_df.copy()

    # One placeholder row per known discourse type, so the per-document count
    # table below has a column for every type even if none was predicted.
    d = pd.DataFrame([{"class" : i, "predictionstring" : "0 1", "id" : "UNKNOWN"}
                      for i in train["discourse_type"].unique()])
    df_temp = pd.concat([ df_temp, d], axis = 0)

    df_temp = df_temp.rename(columns = {"class": "discourse_type"})
    df_temp["startpred"] = df_temp["predictionstring"].apply(lambda x: x.split()[0])
    df_temp["endpred"] = df_temp["predictionstring"].apply(lambda x: x.split()[-1])
    df_temp["totaltokens"] = df_temp["predictionstring"].apply(lambda x: len(x.split()))
    df_temp["startpred"] = pd.to_numeric(df_temp["startpred"])
    df_temp["endpred"] = pd.to_numeric(df_temp["endpred"])
    df_temp["totaltokens"] = pd.to_numeric(df_temp["totaltokens"])
    df_temp = df_temp.sort_values(by = ["id", "startpred", "totaltokens"])
    df_temp["position"] = 1
    df_temp["position"] = df_temp.groupby('id')['position'].cumsum()
    df_temp["verdict"] = "keep"

    # Per-document count of each discourse type.
    d_details = df_temp[["discourse_type", "id"]].pivot_table( columns = ["discourse_type"],
                    index = ["id"],
                    aggfunc = len).reset_index()

    d_details = d_details.fillna(0)

    for testing_id in tqdm(df_temp["id"].unique()):
        # "id", "discourse_type" and "position" never change inside the loop,
        # so these masks stay valid for the whole iteration.
        in_doc = df_temp["id"] == testing_id
        is_lead = in_doc & (df_temp["discourse_type"] == "Lead")
        is_conclusion = in_doc & (df_temp["discourse_type"] == "Concluding Statement")

        # Rule 1: at most one Lead per document.
        if d_details.loc[d_details["id"] == testing_id, "Lead"].values[0] > 1:
            lead_first = is_lead & (df_temp["position"] == 1)
            lead_second = is_lead & (df_temp["position"] == 2)
            if lead_first.any(): # we have a Lead at the beginning
                if lead_second.any(): # we also have a Lead at position 2
                    if verbose:
                        print("Lead at position 1 and 2 detected..", testing_id)
                    # Merge the position-1 and position-2 Leads into the first.
                    _merge_span_into(df_temp, lead_first, lead_second)
                else: # remaining Leads are further down: delete them
                    if verbose:
                        print("Lead at position 1 and greater than 2 detected..", testing_id)
                    df_temp.loc[is_lead & (df_temp["position"] > 2), "verdict"] = "delete"
            else: # multiple Leads and none at position 1
                if verbose:
                    print("Multiple leads but none at position 1 detected.", testing_id)
                # Keep only the longest Lead for now.
                all_leads = df_temp.loc[is_lead, :].copy()
                all_leads = all_leads.sort_values(by = ["totaltokens"], ascending = False)
                # Index labels can repeat after the concat above, so also
                # require the row to be a Lead of this document.
                df_temp.loc[is_lead & df_temp.index.isin(all_leads.index[1:]), "verdict"] = "delete"

        # Rule 2: at most one Concluding Statement per document.
        if d_details.loc[d_details["id"] == testing_id, "Concluding Statement"].values[0] > 1:
            if verbose:
                print("Multiple concluding statements detected. ")
            conclusion_positions = df_temp.loc[is_conclusion, "position"].values
            max_pos = conclusion_positions[-1]
            second_pos = conclusion_positions[-2]
            if abs(second_pos - max_pos) == 1:
                # The last two are next to each other: merge them.
                if verbose:
                    print("Two concluding statements next to each other detected. Merging and keeping one of them", testing_id)
                _merge_span_into(df_temp,
                                 is_conclusion & (df_temp["position"] == second_pos),
                                 in_doc & (df_temp["position"] == max_pos))
            else:
                if verbose:
                    print("Two concluding statements found NOT Next to each other. Keeping the last one only.", testing_id)
                df_temp.loc[is_conclusion & (df_temp["position"] != max_pos), "verdict"] = "delete"

        # Rule 3 (report only): Rebuttals should not outnumber Counterclaims.
        if d_details.loc[d_details["id"] == testing_id, "Rebuttal"].values[0] > d_details.loc[d_details["id"] == testing_id, "Counterclaim"].values[0]:
            if verbose:
                print("Number of Rebuttal is greater than Counterclaims.", testing_id)

    df_temp = df_temp[df_temp["id"] != "UNKNOWN"] # remove the placeholder rows
    return df_temp

def get_filtered_data(r):
    """Run `filter_by_rules` on `r` and return only the rows it kept.

    The result has exactly the submission columns "id", "class" and
    "predictionstring".
    """
    judged = filter_by_rules(r, verbose = False)
    survivors = judged.loc[judged["verdict"] == "keep"]
    survivors = survivors.rename(columns = {"discourse_type": "class"})
    return survivors[["id", "class", "predictionstring"]]

# Final Submission

Get the filtered submission and generate submission file now

In [None]:
# Apply the hand-written rules and keep only the surviving predictions.
ret = get_filtered_data( final_merged )

# Write the submission file without the DataFrame index column.
ret.to_csv("submission.csv", index = False)

# Show the first row as a quick sanity check.
ret.head(1)

In [None]:
# ret.head(5)

In [None]:
# df = pd.read_csv("../input/feedback-prize-corrected-train-csv/corrected_train.csv")

In [None]:
# df["discourse_type"].unique()