In [None]:
!pip uninstall fsspec -qq -y
!pip install --no-index --find-links ../input/hf-datasets/wheels datasets -qq
!pip install -U --no-build-isolation --no-deps ../input/transformers-master/


!export CUDA_HOME=/usr/local/cuda-11.0
!env | grep CUDA

!cd ../input/apex-master-10-27-2021/apex-master/ && pip install -v --disable-pip-version-check --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" .

In [None]:
# Flag read later by the per-model inference Config to decide its `fp16` setting.
# It is hard-coded to False here; it is not derived from the apex install step.
APEX_INSTALLED = False
print(APEX_INSTALLED)

In [None]:
import sys
sys.path.append("../input/tez-lib/")
from collections import Counter
import tez 
from functools import partial
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"
import gc
gc.enable()
import math
import json
import time
import random
import multiprocessing
import warnings
warnings.filterwarnings("ignore", category=UserWarning)

import numpy as np
import pandas as pd
from tqdm import tqdm, trange
from sklearn import model_selection
from string import punctuation

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import Parameter
import torch.optim as optim
from torch.utils.data import (
    Dataset, DataLoader,
    SequentialSampler, RandomSampler
)
from torch.utils.data.distributed import DistributedSampler
import sys 
from datasets import Dataset as HFDataset
from transformers import AutoModelForQuestionAnswering, TrainingArguments, Trainer
from transformers import default_data_collator

import transformers
from transformers import (
    WEIGHTS_NAME,
    AdamW,
    AutoConfig,
    AutoModel,
    AutoTokenizer,
    get_cosine_schedule_with_warmup,
    get_linear_schedule_with_warmup,
    logging,
    MODEL_FOR_QUESTION_ANSWERING_MAPPING,
)

In [None]:
def fix_all_seeds(seed):
    """Seed every random number generator this notebook touches.

    Args:
        seed: integer applied to NumPy, Python's `random`, the `PYTHONHASHSEED`
            environment variable and PyTorch (CPU, current CUDA device, all CUDA devices).
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    seeders = (
        np.random.seed,
        random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

def optimal_num_of_loader_workers():
    """Pick a DataLoader worker count from the visible hardware.

    Returns:
        With at least one CUDA device: the smaller of the CPU count and four
        workers per GPU. Without a GPU: the CPU count minus one.
    """
    cpu_total = multiprocessing.cpu_count()
    gpu_total = torch.cuda.device_count()
    if gpu_total:
        return min(cpu_total, gpu_total * 4)
    return cpu_total - 1

#print(f"Apex AMP Installed :: {APEX_INSTALLED}")
# Keys of transformers' question-answering model mapping, and the `model_type`
# attribute of each key. Neither name is referenced again in the visible notebook.
MODEL_CONFIG_CLASSES = list(MODEL_FOR_QUESTION_ANSWERING_MAPPING.keys())
MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES)


# %% [code] {"execution":{"iopub.status.busy":"2021-09-25T15:05:01.923913Z","iopub.execute_input":"2021-09-25T15:05:01.924388Z","iopub.status.idle":"2021-09-25T15:05:01.936163Z","shell.execute_reply.started":"2021-09-25T15:05:01.924353Z","shell.execute_reply":"2021-09-25T15:05:01.935458Z"}}
class DatasetRetriever(Dataset):
    """Map-style dataset over a list of pre-tokenized QA feature dicts.

    In 'train' mode every returned field is a `torch.long` tensor (inputs, offsets
    and answer positions). In any other mode only `input_ids` and `attention_mask`
    are tensors; offsets, sequence ids, example id, context and question are
    returned exactly as stored in the feature.
    """

    def __init__(self, features, mode='train'):
        super().__init__()
        self.features = features
        self.mode = mode

    def __len__(self):
        return len(self.features)

    def __getitem__(self, item):
        feature = self.features[item]

        def as_long(values):
            return torch.tensor(values, dtype=torch.long)

        # Fields shared by both modes come first so key order matches in each branch.
        sample = {
            'input_ids': as_long(feature['input_ids']),
            'attention_mask': as_long(feature['attention_mask']),
        }
        if self.mode == 'train':
            for key in ('offset_mapping', 'start_position', 'end_position'):
                sample[key] = as_long(feature[key])
            return sample

        sample['offset_mapping'] = feature['offset_mapping']
        sample['sequence_ids'] = feature['sequence_ids']
        sample['id'] = feature['example_id']
        sample['context'] = feature['context']
        sample['question'] = feature['question']
        return sample

# %% [code] {"execution":{"iopub.status.busy":"2021-09-25T15:05:01.938192Z","iopub.execute_input":"2021-09-25T15:05:01.938481Z","iopub.status.idle":"2021-09-25T15:05:01.950199Z","shell.execute_reply.started":"2021-09-25T15:05:01.938449Z","shell.execute_reply":"2021-09-25T15:05:01.949435Z"}}
class Model(nn.Module):
    """Span-extraction head on top of a pretrained encoder.

    The encoder is loaded with `AutoModel.from_pretrained`; a linear layer maps
    each token's hidden state to two logits (answer start and answer end).
    """

    def __init__(self, modelname_or_path, config):
        super().__init__()
        self.config = config
        self.xlm_roberta = AutoModel.from_pretrained(modelname_or_path, config=config)
        self.qa_outputs = nn.Linear(config.hidden_size, 2)
        # Registered on the module but not applied in forward().
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self._init_weights(self.qa_outputs)

    def _init_weights(self, module):
        """Normal-initialise a Linear layer's weight and zero its bias; ignore other modules."""
        if not isinstance(module, nn.Linear):
            return
        module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
        if module.bias is not None:
            module.bias.data.zero_()

    def forward(self, input_ids, attention_mask=None):
        """Return `(start_logits, end_logits)`, each with the last logit axis squeezed away."""
        encoder_outputs = self.xlm_roberta(input_ids, attention_mask=attention_mask)
        token_states = encoder_outputs[0]
        qa_logits = self.qa_outputs(token_states)
        start_logits, end_logits = (
            piece.squeeze(-1) for piece in qa_logits.split(1, dim=-1)
        )
        return start_logits, end_logits

# %% [code] {"execution":{"iopub.status.busy":"2021-09-25T15:05:01.951716Z","iopub.execute_input":"2021-09-25T15:05:01.951979Z","iopub.status.idle":"2021-09-25T15:05:01.96162Z","shell.execute_reply.started":"2021-09-25T15:05:01.951948Z","shell.execute_reply":"2021-09-25T15:05:01.96087Z"}}
def make_model(config):
    """Build the tokenizer and QA model described by `config`.

    Args:
        config: object exposing `config_name` and `tokenizer_name`, and optionally
            `model_name_or_path`.

    Returns:
        `(config, tokenizer, model)` — the same `config` object that was passed in
        (not the transformers config), the loaded tokenizer, and a `Model` instance.
    """
    model_config = AutoConfig.from_pretrained(config.config_name)
    tokenizer = AutoTokenizer.from_pretrained(config.tokenizer_name)
    # Backbone weights belong to the model path, not the tokenizer path. Fall back to
    # the tokenizer path (the previous behaviour) for configs that do not define it.
    weights_path = getattr(config, "model_name_or_path", config.tokenizer_name)
    model = Model(weights_path, config=model_config)
    return config, tokenizer, model

# %% [code] {"execution":{"iopub.status.busy":"2021-09-25T15:05:01.962756Z","iopub.execute_input":"2021-09-25T15:05:01.963103Z","iopub.status.idle":"2021-09-25T15:05:01.975451Z","shell.execute_reply.started":"2021-09-25T15:05:01.963068Z","shell.execute_reply":"2021-09-25T15:05:01.97473Z"}}
def prepare_test_features(args, example, tokenizer):
    """Tokenize one question/context pair into overlapping fixed-length features.

    Args:
        args: object exposing `max_seq_length` and `doc_stride`.
        example: mapping (dict or pandas row) with `id`, `question` and `context`.
            It is read only; the caller's object is not modified.
        tokenizer: callable returning an encoding that supports `["input_ids"]`,
            `["attention_mask"]`, `["offset_mapping"]` and `.sequence_ids(i)`.

    Returns:
        A list with one dict per overflow window. Each dict carries the example id,
        the context, the left-stripped question, the window's input ids, attention
        mask and offsets, and `sequence_ids` with every `None` replaced by 0.
    """
    # Strip into a local instead of writing back into the caller's example.
    question = example["question"].lstrip()
    context = example["context"]

    tokenized_example = tokenizer(
        question,
        context,
        truncation="only_second",
        max_length=args.max_seq_length,
        stride=args.doc_stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    features = []
    for window in range(len(tokenized_example["input_ids"])):
        features.append({
            "example_id": example['id'],
            "context": context,
            "question": question,
            "input_ids": tokenized_example['input_ids'][window],
            "attention_mask": tokenized_example['attention_mask'][window],
            "offset_mapping": tokenized_example['offset_mapping'][window],
            # None sequence ids are mapped to 0, so only context tokens carry 1.
            "sequence_ids": [
                0 if seq_id is None else seq_id
                for seq_id in tokenized_example.sequence_ids(window)
            ],
        })
    return features

# %% [code] {"execution":{"iopub.status.busy":"2021-09-25T15:05:01.978299Z","iopub.execute_input":"2021-09-25T15:05:01.97853Z","iopub.status.idle":"2021-09-25T15:05:01.996283Z","shell.execute_reply.started":"2021-09-25T15:05:01.978506Z","shell.execute_reply":"2021-09-25T15:05:01.995601Z"}}
import collections

def _normalize_answer(text):
    """Collapse runs of whitespace, then strip leading/trailing ASCII punctuation."""
    return " ".join(text.split()).strip(punctuation)


def postprocess_qa_predictions(examples, features, raw_predictions, n_best_size = 20, max_answer_length = 30, tokenizer=None):
    """Turn per-feature start/end logits into the two best answer strings per example.

    Args:
        examples: DataFrame with `id` and `context` columns; any index is accepted.
        features: sequence of dicts with `example_id`, `sequence_ids` (1 marks a
            context token) and `offset_mapping`. It is not modified.
        raw_predictions: `(all_start_logits, all_end_logits)`, indexable by feature.
        n_best_size: number of top start and top end positions combined per feature.
        max_answer_length: maximum answer span length in tokens.
        tokenizer: accepted for backward compatibility; not used.

    Returns:
        `(predictions1, predictions2)`: the best and second-best answer text for each
        example, in row order, whitespace-collapsed and punctuation-stripped. An
        empty string is returned where no candidate exists.
    """
    all_start_logits, all_end_logits = raw_predictions

    example_id_to_index = {k: i for i, k in enumerate(examples["id"])}
    features_per_example = collections.defaultdict(list)
    for feature_index, feature in enumerate(features):
        features_per_example[example_id_to_index[feature["example_id"]]].append(feature_index)

    predictions1 = []
    predictions2 = []

    print(f"Post-processing {len(examples)} example predictions split into {len(features)} features.")

    context_index = 1
    # features_per_example is keyed by row position, so enumerate the rows instead of
    # trusting the DataFrame index labels to be 0..n-1.
    for example_position, (_, example) in enumerate(examples.iterrows()):
        context = example["context"]
        valid_answers = []

        for feature_index in features_per_example.get(example_position, []):
            start_logits = all_start_logits[feature_index]
            end_logits = all_end_logits[feature_index]

            feature = features[feature_index]
            sequence_ids = feature["sequence_ids"]
            # Local masked copy: offsets of non-context tokens become None. The
            # feature itself is left untouched so it can be reused safely.
            offset_mapping = [
                (offsets if sequence_ids[k] == context_index else None)
                for k, offsets in enumerate(feature["offset_mapping"])
            ]

            start_indexes = np.argsort(start_logits)[-1 : -n_best_size - 1 : -1].tolist()
            end_indexes = np.argsort(end_logits)[-1 : -n_best_size - 1 : -1].tolist()
            for start_index in start_indexes:
                for end_index in end_indexes:
                    if (
                        start_index >= len(offset_mapping)
                        or end_index >= len(offset_mapping)
                        or offset_mapping[start_index] is None
                        or offset_mapping[end_index] is None
                    ):
                        continue
                    # Don't consider answers with a length that is either < 0 or > max_answer_length.
                    if end_index < start_index or end_index - start_index + 1 > max_answer_length:
                        continue

                    start_char = offset_mapping[start_index][0]
                    end_char = offset_mapping[end_index][1]
                    valid_answers.append(
                        {
                            "score": start_logits[start_index] + end_logits[end_index],
                            "text": context[start_char: end_char]
                        }
                    )

        # Stable sort: candidates with equal scores keep their discovery order.
        answer_candidates = sorted(valid_answers, key=lambda x: x["score"], reverse=True)
        top_texts = [candidate["text"] for candidate in answer_candidates[:2]]
        top_texts += [''] * (2 - len(top_texts))

        predictions1.append(_normalize_answer(top_texts[0]))
        predictions2.append(_normalize_answer(top_texts[1]))

    return predictions1, predictions2



def get_predictions(global_config, checkpoint_path, test_dataloader):
    """Run one checkpoint over the test loader and return stacked start/end logits.

    The model is built with `make_model`, moved to CUDA, loaded from
    `checkpoint_path`, cast to half precision and put in eval mode. Each batch's
    logits are converted to nested Python lists before being stacked with
    `np.vstack`.
    """
    config, tokenizer, model = make_model(global_config)
    model.cuda()
    model.load_state_dict(torch.load(checkpoint_path)) #, map_location='cuda:0'))
    model.half()
    model.eval()
    print(f'Running inference for model: {checkpoint_path}') 

    per_batch_start = []
    per_batch_end = []
    with torch.no_grad():
        for batch in test_dataloader:
            outputs_start, outputs_end = model(batch['input_ids'].cuda(), batch['attention_mask'].cuda())
            per_batch_start.append(outputs_start.cpu().numpy().tolist())
            per_batch_end.append(outputs_end.cpu().numpy().tolist())
            del outputs_start, outputs_end

    # Drop the local references before the next checkpoint is loaded.
    del model, tokenizer, config
    gc.collect()
    return np.vstack(per_batch_start), np.vstack(per_batch_end)



class ChaiiModel(tez.Model):
    """tez wrapper around a pretrained transformer with a linear start/end logit head."""

    def __init__(self, model_name, num_train_steps, steps_per_epoch, learning_rate):
        super().__init__()
        self.learning_rate = learning_rate
        self.steps_per_epoch = steps_per_epoch
        self.model_name = model_name
        self.num_train_steps = num_train_steps
        self.step_scheduler_after = "batch"

        # Overrides applied on top of the pretrained configuration.
        config_overrides = {
            "output_hidden_states": True,
            "hidden_dropout_prob": 0.0,
            "layer_norm_eps": 1e-7,
            "add_pooling_layer": False,
        }
        config = transformers.AutoConfig.from_pretrained(model_name)
        config.update(config_overrides)

        self.transformer = transformers.AutoModel.from_pretrained(model_name, config=config)
        self.output = nn.Linear(config.hidden_size, config.num_labels)

    def forward(self, ids, mask, token_type_ids=None, start_positions=None, end_positions=None):
        """Return `((start_logits, end_logits), 0, {})`; the position arguments are not used."""
        sequence_output = self.transformer(ids, mask)[0]
        logits = self.output(sequence_output)
        start_logits, end_logits = (
            part.squeeze(-1).contiguous() for part in logits.split(1, dim=-1)
        )
        return (start_logits, end_logits), 0, {}


class ChaiiDataset:
    """Indexable view over tokenized features, yielding `ids` and `mask` long tensors."""

    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, item):
        record = self.data[item]
        ids = torch.tensor(record["input_ids"], dtype=torch.long)
        mask = torch.tensor(record["attention_mask"], dtype=torch.long)
        return {"ids": ids, "mask": mask}


def get_predictions_tez(model_path, data_loader, model_name="../input/xlm-roberta-squad2/deepset/xlm-roberta-large-squad2"):
    """Run one tez checkpoint over `data_loader` and return stacked start/end logits.

    Args:
        model_path: path of the saved weights handed to `ChaiiModel.load`.
        data_loader: iterable of dict batches whose keys match the keyword
            arguments of `ChaiiModel.forward`; every value is moved to CUDA.
        model_name: backbone directory used to build `ChaiiModel`. Defaults to the
            path that was previously hard-coded.

    Returns:
        `(start_logits, end_logits)` as NumPy arrays stacked over all batches.
    """
    print(f'Running inference for model {model_path}')
    model = ChaiiModel(model_name=model_name, num_train_steps=0, steps_per_epoch=0, learning_rate=0)
    model.load(model_path, weights_only=True)
    model.half()
    model.to("cuda")
    model.eval()

    start_logits = []
    end_logits = []

    with torch.no_grad():
        for batch in data_loader:
            # Build a new dict rather than rewriting the loader's batch in place.
            cuda_batch = {key: value.to("cuda") for key, value in batch.items()}
            output, _, _ = model(**cuda_batch)
            start_logits.append(output[0].detach().cpu().numpy())
            end_logits.append(output[1].detach().cpu().numpy())

    return np.vstack(start_logits), np.vstack(end_logits)




## Post Processing

In [None]:
def clean_pred(predictions, contexts=None):
    """Trim unwanted leading/trailing punctuation from predicted answers.

    Args:
        predictions: sequence of answer strings.
        contexts: optional source text for the predictions — either one string
            shared by all of them or a sequence with one entry per prediction.
            When given, a trailing "." is restored after trimming if the answer
            ends with one of the listed Tamil/Hindi abbreviations and the dotted
            form occurs in the context. When omitted, nothing is restored.

    Returns:
        List of cleaned strings, in input order.

    Raises:
        ValueError: if `contexts` is a sequence whose length differs from `predictions`.
    """
    bad_starts = (".", ",", "(", ")", "-", "–", ";")
    bad_endings = ("...", "-", "(", ")", "–", ",", ";")
    # Abbreviations that are normally written with a final period.
    dotted_abbreviations = ("கி.பி", "கி.மு", "கி.மீ", "ई", "ई.पू", "ए.डी")

    # The original body read an undefined global `context` and raised NameError as
    # soon as the restore branch was reached; the context is now an explicit input.
    if contexts is None:
        contexts = [None] * len(predictions)
    elif isinstance(contexts, str):
        contexts = [contexts] * len(predictions)
    elif len(contexts) != len(predictions):
        raise ValueError("contexts must have one entry per prediction")

    cleaned_preds = []
    for pred, context in zip(predictions, contexts):
        while pred.startswith(bad_starts):
            pred = pred[1:]

        while pred.endswith(bad_endings):
            pred = pred[:-3] if pred.endswith("...") else pred[:-1]

            if (
                context is not None
                and pred.endswith(dotted_abbreviations)
                and pred + "." in context
            ):
                pred = pred + "."

        cleaned_preds.append(pred)

    return cleaned_preds

## Ensemble configuration and inference

In [None]:
# Backbone directories, one per ensemble slot. Slots 2 and 3 share the same
# XLM-R path, so the same tokenizer files are loaded once for each slot.
model1 = '../input/rembert-pt'
model2 = model3 = '../input/xlm-roberta-squad2/deepset/xlm-roberta-large-squad2'
model4 = '../input/muril-large-pt/muril-large-cased'
model_archs = [model1, model2, model3, model4]

# One tokenizer per entry of model_archs, in the same order.
tokenizers = [AutoTokenizer.from_pretrained(arch_path) for arch_path in model_archs]
    

    
def _fold_dirs(base, folds):
    """Return `<base>checkpoint-fold-<k>` for every k in `folds`, in the given order."""
    return [f'{base}checkpoint-fold-{fold}' for fold in folds]


def _tez_bins(base, folds):
    """Return `<base>xlm-roberta-large-squad2_fold_<k>.bin` for every k in `folds`."""
    return [f'{base}xlm-roberta-large-squad2_fold_{fold}.bin' for fold in folds]


# --- RemBERT checkpoints ---
tk_rembert_base = '../input/d/trushk/chaii-rembert-1024-all/'
tk_rembert_trial = _fold_dirs(tk_rembert_base, (0, 1, 3, 4))  # fold 2 left out

rembert_base = '../input/rembert-5fold-1-epoch/rembert-5fold-1-epoch/'
rembert_best = _fold_dirs(rembert_base, (0, 2, 3, 4))

rembert_trial = (
    _fold_dirs(rembert_base, (2,))
    + _fold_dirs(tk_rembert_base, (3, 4))
    + _fold_dirs(rembert_base, (0,))
)

# --- XLM-R (Hugging Face checkpoints) ---
xlmr_best = _fold_dirs('../input/5foldsroberta/output/', range(5))

xlmr_base = '../input/public-xlm-2pochs/'
xlmr_trial = _fold_dirs(xlmr_base, range(5))

# --- XLM-R (tez checkpoints) ---
tez_base = '../input/xlrm-large-tez-1015-10fold/'
tez_models = _tez_bins(tez_base, range(10))

# Folds 1, 4, 3 and 8 were tried here and left disabled.
tez_best = _tez_bins(tez_base, (7, 0, 9))

# Folds 6, 4, 3 and 8 were tried here and left disabled.
tez_trial = _tez_bins(tez_base, (7, 0, 9, 1))

# --- MuRIL checkpoints ---
muril_1020_base = '../input/chaii-muril-large-1020/'
muril_1111_base = '../input/chaii-muril-large-1111/'

# 1020 fold-4 was dropped to make room for the newer 1111 run; 1111 fold-3 is untested.
muril_best = _fold_dirs(muril_1020_base, (0, 1, 3)) + _fold_dirs(muril_1111_base, (0,))

# 1020 fold-0: val loss 1.29339, CV 0.72.
# rembert-new fold-0: labeled incorrectly, it is a MuRIL model (val loss 1.21).
# 1020 fold-1: dropped to leave more time for the similarity voter or tez.
muril_trial = (
    _fold_dirs(muril_1020_base, (0, 3))
    + _fold_dirs(muril_1111_base, (0,))
    + _fold_dirs('../input/rembert-new/output/', (0,))
)

# Checkpoint ensembles actually used, aligned index-for-index with model_archs.
model_paths = [
    rembert_best,
    xlmr_best,
    tez_trial,
    muril_best,
]

# The bare string below is a disabled debug configuration and is never executed.
# It refers to `tez_new_base`, which is not defined anywhere in the visible notebook.
"""
# For debug
model_paths = [ 
                [rembert_base+'checkpoint-fold-3',],
                [xlmr_base + 'checkpoint-fold-0'],
                #['../input/5foldsroberta/output/checkpoint-fold-0'],
                #[ tez_base + 'xlm-roberta-large-squad2_fold_0.bin'],
                [tez_new_base + 'deepsetxlm-roberta-large-squad2__fold_0.bin'],
                ['../input/chaii-muril-large-1020/checkpoint-fold-0'], 
              ]
"""

def _collapse_whitespace(text):
    """Replace every run of whitespace with a single space and trim both ends."""
    return ' '.join(text.split())


# Test set with whitespace-normalised context and question columns.
test = pd.read_csv('../input/chaii-hindi-and-tamil-question-answering/test.csv')
for text_column in ('context', 'question'):
    test[text_column] = test[text_column].apply(_collapse_whitespace)

# Per architecture: cleaned best (1) and second-best (2) answers for every test row.
all_predictions1 = []
all_predictions2 = []

# Row count of the test frame; the debug prints below and in the voting cell fire when it is 5.
test_shape = test.shape[0]

# Slot in model_archs served by the tez-trained XLM-R checkpoints. That slot does not
# re-tokenize: it reuses the test_features built for the previous slot, which uses the
# same tokenizer path.
tez_index = 2

# Iterate through all models and get features, predictions
for i in range(len(model_archs)):
    tokenizer = tokenizers[i]
    checkpoint_paths = model_paths[i]

    class Config:
        # model
        model_arch = model_archs[i]
        model_name_or_path = model_arch
        config_name = model_arch
        tokenizer_name = model_arch
        fp16 = True if APEX_INSTALLED else False
        fp16_opt_level = "O1"
        gradient_accumulation_steps = 2
        # tokenizer
        max_seq_length = 400
        doc_stride = 135
        eval_batch_size = 128
        # optimizer
        optimizer_type = 'AdamW'
        learning_rate = 1e-5
        weight_decay = 1e-2
        seed = 2021

    args = Config()
    print(f'Batch size: {args.eval_batch_size}')

    if i == tez_index:
        test_dataloader = torch.utils.data.DataLoader(
            ChaiiDataset(test_features),
            batch_size=args.eval_batch_size,
            num_workers=optimal_num_of_loader_workers(),
            pin_memory=True,
            shuffle=False
        )
    else:
        test_features = []
        for _, row in test.iterrows():
            test_features += prepare_test_features(args, row, tokenizer)
        test_dataset = DatasetRetriever(test_features, mode='test')
        test_dataloader = DataLoader(
            test_dataset,
            batch_size=args.eval_batch_size,
            sampler=SequentialSampler(test_dataset),
            num_workers=optimal_num_of_loader_workers(),
            pin_memory=True,
            drop_last=False
        )

    print(f'Running inference for model {model_archs[i]}')

    # The loader is chosen by slot index while the inference routine is chosen by the
    # substring 'tez' in the first checkpoint path; the two must stay in agreement.
    uses_tez = 'tez' in checkpoint_paths[0]

    sum_start_logits = None
    sum_end_logits = None
    for checkpoint in checkpoint_paths:
        if uses_tez:
            start_logits, end_logits = get_predictions_tez(f'{checkpoint}', test_dataloader)
        else:
            start_logits, end_logits = get_predictions(args, f'{checkpoint}/pytorch_model.bin', test_dataloader)

        # Accumulate in float64 copies: the tez path returns half-precision arrays,
        # and summing those in place would round every partial sum to float16.
        start_logits = np.array(start_logits, dtype=np.float64)
        end_logits = np.array(end_logits, dtype=np.float64)
        if sum_start_logits is None:
            sum_start_logits = start_logits
            sum_end_logits = end_logits
        else:
            sum_start_logits += start_logits
            sum_end_logits += end_logits
        gc.collect()
        torch.cuda.empty_cache()

    # Average once, after every checkpoint of this architecture has been added.
    start_logits = sum_start_logits / len(checkpoint_paths)
    end_logits = sum_end_logits / len(checkpoint_paths)

    raw_predictions1, raw_predictions2 = postprocess_qa_predictions(test, test_features, (start_logits, end_logits), tokenizer=tokenizer)

    gc.collect()
    torch.cuda.empty_cache()

    predictions1 = [pred.strip() for pred in raw_predictions1]
    cleaned_predictions1 = clean_pred(predictions1)
    all_predictions1.append(cleaned_predictions1)

    predictions2 = [pred.strip() for pred in raw_predictions2]
    cleaned_predictions2 = clean_pred(predictions2)
    all_predictions2.append(cleaned_predictions2)

    if test_shape == 5:
        print(predictions1)
        print(predictions2)




## Voting Serial


In [None]:
from Levenshtein import ratio

def calc_jaccard(str1, str2):
    """Word-level Jaccard similarity of two strings, case-insensitive.

    Both strings are lower-cased and split on whitespace; the score is the size of
    the shared token set divided by the size of the union. When neither string
    contains a token the two sets are identical, so 1.0 is returned instead of
    dividing by zero. The inputs and the score are printed on every call.

    Returns:
        float in [0.0, 1.0].
    """
    a = set(str1.lower().split())
    b = set(str2.lower().split())
    c = a.intersection(b)
    union_size = len(a) + len(b) - len(c)
    jac = float(len(c)) / union_size if union_size else 1.0
    print(f'str1: {str1} str2:{str2}, jac:{jac}')
    return jac

def check_lev(str1, str2, verbose=False):
    """Return `ratio(str1, str2)` from the Levenshtein package, printing it when `verbose` is set."""
    similarity = ratio(str1, str2)
    if verbose:
        print(f'str1: {str1} str2:{str2}, lev:{similarity}')
    return similarity

## Change vote count if votes are similar

In [None]:
def check_similarity(votes, verbose=False):
    """Break a vote tie by crediting the runner-up with look-alike answers.

    Args:
        votes: non-empty list of `(prediction, count)` pairs, highest count first.
        verbose: print the votes, each comparison and any adjustment.

    Returns:
        The runner-up prediction if its count, plus one for every lower-ranked
        answer whose `check_lev` score against it is at least 0.3, exceeds the
        leader's count; otherwise the leading prediction.
    """
    if verbose:
        print(f'votes {votes}')

    best_pred = votes[0][0]
    best_vote_count = votes[0][1]

    if len(votes) > 1:
        second_best_pred = votes[1][0]
        second_best_vote_count = votes[1][1]
    else:
        second_best_pred = None
        second_best_vote_count = 0

    # Count every answer ranked below the runner-up that resembles it.
    adder = 0
    for other in votes[2:]:
        if check_lev(votes[1][0], other[0], verbose=verbose) >= 0.3:
            adder += 1

    if adder == 0:
        return best_pred

    second_best_vote_count += adder
    if verbose:
        print(f'Increasing vote by {adder}: New second best {second_best_vote_count}')
    if second_best_vote_count > best_vote_count:
        return second_best_pred
    return best_pred


In [None]:
#### Iterate through all predictions and get predictions with max votes
# Stage 1: majority vote over each model's top-1 answer.
# Stage 2 (only when no answer got more than one vote): vote over top-1 and top-2
# answers together; a tie between the two leaders is settled by check_similarity.
final_predictions = []
verbose = test_shape == 5
for i in range(len(test)):
    # One top-1 answer per model, in model order (works for any number of models).
    top1_votes = [model_preds[i] for model_preds in all_predictions1]
    x = Counter(top1_votes)
    if verbose:
        print(f'Most common: {x.most_common(1)[0]}')
        print(f'All predictions {x}')
    prediction, vote_count = x.most_common(1)[0]

    if vote_count > 1:
        # Select majority
        final_pred = prediction
    else:
        # If no majority, widen the ballot with every model's second-best answer.
        predictions = top1_votes + [model_preds[i] for model_preds in all_predictions2]
        x = Counter(predictions)
        if verbose:
            print(f'Most common: {x.most_common(1)[0]}')
            print(f'All predictions {x}')
        votes = x.most_common()
        # Max vote
        prediction, vote_count = votes[0]
        if len(votes) == 1:
            # Every candidate is the same string, so it equals each model's answer.
            final_pred = prediction
        elif vote_count > votes[1][1]:
            final_pred = prediction
        else:
            final_pred = check_similarity(votes, verbose=verbose)
        if verbose:
            print(f'Max votes for {final_pred} with {vote_count} num votes')
    final_predictions.append(final_pred)


print(final_predictions)


## Results

In [None]:
# Attach the voted answers to a fresh copy of the test ids and write the submission file.
cleaned_predictions = final_predictions
test_data = pd.read_csv('../input/chaii-hindi-and-tamil-question-answering/test.csv')
test_data["PredictionString"] = cleaned_predictions

submission = test_data[['id', 'PredictionString']]
submission.to_csv('submission.csv', index=False)

print(test_data["PredictionString"])