In [None]:
import gc
gc.enable()

import sys
sys.path.append("../input/tez-lib/")

import os
from os.path import exists

import pickle
import numpy as np
import pandas as pd
import tez
import torch
import torch.nn as nn
from joblib import Parallel, delayed
import tokenizers
import transformers
print(f"tokenizers.__version__: {tokenizers.__version__}")
print(f"transformers.__version__: {transformers.__version__}")
from transformers import AutoConfig, AutoModel, AutoTokenizer
%env TOKENIZERS_PARALLELISM=true

In [None]:
def save_oof(name, oof):
    """Pickle ``oof`` (out-of-fold predictions) to the file path ``name``.

    Args:
        name: Destination path; an existing file is overwritten.
        oof: Any picklable object.
    """
    target_path = f"{name}"
    print(f"Saving OOF {name}")
    with open(target_path, "wb") as sink:
        pickle.dump(oof, sink)
    
def load_oof(name):
    """Load and return the object pickled at the file path ``name``.

    NOTE: ``pickle.load`` can execute arbitrary code; only use this on cache
    files this notebook wrote itself (see ``save_oof``), never on untrusted
    input.
    """
    source_path = f"{name}"
    print(f"Loading OOF {name}")
    with open(source_path, "rb") as source:
        restored = pickle.load(source)
    return restored

In [None]:
import functools
import datetime

def _Try():
    def deco(func):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            try:
                return func(*args, **kwargs)
            except Exception as e:
                 print('{} {} {} {} {}\n'.format(datetime.datetime.now(), 
                                                        type(e), e, args, kwargs)) 
        return wrapper
    return deco

In [None]:
# Label -> class index. Each discourse type owns two consecutive indices:
# the even one is its "B-" (begin) tag and the odd one its "I-" (inside) tag.
# The order of the tuple below therefore fixes the index of every label.
target_id_map = {
    f"{tag}-{discourse_type}": 2 * position + tag_offset
    for position, discourse_type in enumerate(
        (
            "Lead",
            "Position",
            "Evidence",
            "Claim",
            "Concluding Statement",
            "Counterclaim",
            "Rebuttal",
        )
    )
    for tag_offset, tag in enumerate(("B", "I"))
}
target_id_map["O"] = 14  # token outside every discourse element
target_id_map["PAD"] = -100  # padding label; the model head is sized len(target_id_map) - 1


# Reverse lookup: class index -> label.
id_target_map = {index: label for label, index in target_id_map.items()}




# Settings for model #1. The CV/inference loops look these classes up by name
# via globals()[f"args{n}"], so the class name and attribute names are part of
# the contract.
class args1:
    # Root folder; essays are read from "<input_path>/train/<id>.txt" (or "test").
    input_path = "../input/feedback-prize-2021/"
    # Path handed to AutoTokenizer.from_pretrained / AutoConfig.from_pretrained.
    model = "../input/longformerlarge4096/longformer-large-4096/"
    # Folder holding the fine-tuned weights, named "<prefix><fold>.bin".
    tez_model= "../input/fblongformerlarge1536/"
    # Not read anywhere in the visible part of this notebook.
    output = "."
    # Batch size passed to model.predict.
    batch_size = 8
    # Maximum sequence length, including the CLS and SEP tokens the datasets add.
    max_len = 1600
    # Prefix of the weight files and of the cached OOF pickle names.
    prefix = "model_"
    # Forwarded as-is to model.load(..., weights_only=...).
    weights_only = True
    # Weight folds to run; read only by inference_fold_mix.
    folds_use = [0,1,2,3,4]
# Settings for model #2 (same fields as args1, different weights folder).
# NOTE(review): unlike args1 this class has no `folds_use`, which
# inference_fold_mix reads; it also shares prefix "model_" with args1.
class args2:
    # Root folder; essays are read from "<input_path>/train/<id>.txt" (or "test").
    input_path = "../input/feedback-prize-2021/"
    # Path handed to AutoTokenizer.from_pretrained / AutoConfig.from_pretrained.
    model = "../input/longformerlarge4096/longformer-large-4096/"
    # Folder holding the fine-tuned weights, named "<prefix><fold>.bin".
    tez_model= "../input/tez-fb-large/"
    # Not read anywhere in the visible part of this notebook.
    output = "."
    # Batch size passed to model.predict.
    batch_size = 8
    # Maximum sequence length, including the CLS and SEP tokens the datasets add.
    max_len = 1600
    # Prefix of the weight files and of the cached OOF pickle names.
    prefix = "model_"
    # Forwarded as-is to model.load(..., weights_only=...).
    weights_only = True    
    

In [None]:
class FeedbackDataset:
    """Map-style dataset yielding un-padded token ids and attention masks.

    Each item of ``samples`` must be a dict with an ``"input_ids"`` list that
    holds no special tokens. ``__getitem__`` wraps it in CLS ... SEP and caps
    the total length at ``max_len``; padding is left to the collate function.
    """

    def __init__(self, samples, max_len, tokenizer):
        self.samples = samples
        self.max_len = max_len
        self.tokenizer = tokenizer
        self.length = len(samples)

    def __len__(self):
        return self.length

    def __getitem__(self, idx):
        """Return ``{"ids": [...], "mask": [...]}`` as plain Python lists."""
        with_cls = [self.tokenizer.cls_token_id] + self.samples[idx]["input_ids"]
        # Keep one slot free so the SEP token always fits inside max_len.
        clipped = with_cls[: self.max_len - 1]
        token_ids = clipped + [self.tokenizer.sep_token_id]
        return {
            "ids": token_ids,
            "mask": [1] * len(token_ids),
        }

In [None]:
class Collate:
    """Collate function that pads a batch to its own longest sequence.

    The padding side and pad token id are taken from the tokenizer; the
    attention mask is padded with zeros.
    """

    def __init__(self, tokenizer):
        self.tokenizer = tokenizer

    def __call__(self, batch):
        """Turn a list of ``{"ids", "mask"}`` samples into two long tensors."""
        id_rows = [item["ids"] for item in batch]
        mask_rows = [item["mask"] for item in batch]

        # Dynamic padding: only as wide as the longest sample in this batch.
        width = max(len(row) for row in id_rows)
        pad_on_right = self.tokenizer.padding_side == "right"

        def pad(row, fill):
            filler = (width - len(row)) * [fill]
            return row + filler if pad_on_right else filler + row

        padded_ids = [pad(row, self.tokenizer.pad_token_id) for row in id_rows]
        padded_masks = [pad(row, 0) for row in mask_rows]

        return {
            "ids": torch.tensor(padded_ids, dtype=torch.long),
            "mask": torch.tensor(padded_masks, dtype=torch.long),
        }

In [None]:
class FeedbackModel(tez.Model):
    """Transformer backbone plus a per-token linear classification head.

    The backbone is created from the configuration found at ``model_name``
    via ``AutoModel.from_config``; callers in this notebook load fine-tuned
    weights afterwards with ``model.load``.
    """

    def __init__(self, model_name, num_labels):
        super().__init__()
        self.model_name = model_name
        self.num_labels = num_labels

        backbone_config = AutoConfig.from_pretrained(model_name)
        backbone_config.update(
            {
                "output_hidden_states": True,
                "hidden_dropout_prob": 0.1,
                "layer_norm_eps": 1e-7,
                "add_pooling_layer": False,
            }
        )

        self.transformer = AutoModel.from_config(backbone_config)
        self.output = nn.Linear(backbone_config.hidden_size, self.num_labels)

    def forward(self, ids, mask):
        """Return ``(probabilities, 0, {})``.

        The first element is the softmax over the label dimension for every
        token; the constant ``0`` and the empty dict fill the remaining two
        slots of the returned triple.
        """
        hidden_states = self.transformer(ids, mask).last_hidden_state
        token_probs = torch.softmax(self.output(hidden_states), dim=-1)
        return token_probs, 0, {}

In [None]:
def _prepare_test_data_helper(args, tokenizer, ids):
    test_samples = []
    for idx in ids:
        filename = os.path.join(args.input_path, "test", idx + ".txt")
        with open(filename, "r") as f:
            text = f.read()

        encoded_text = tokenizer.encode_plus(
            text,
            add_special_tokens=False,
            return_offsets_mapping=True,
        )
        input_ids = encoded_text["input_ids"]
        offset_mapping = encoded_text["offset_mapping"]

        sample = {
            "id": idx,
            "input_ids": input_ids,
            "text": text,
            "offset_mapping": offset_mapping,
        }

        test_samples.append(sample)
    return test_samples


def prepare_test_data(df, tokenizer, args, num_jobs=4):
    """Tokenize every test essay referenced in ``df`` using worker processes.

    Args:
        df: DataFrame with an ``id`` column; each unique id is one essay.
        tokenizer: Tokenizer forwarded to ``_prepare_test_data_helper``.
        args: Settings object forwarded to the helper (provides ``input_path``).
        num_jobs: Number of worker processes and of id chunks. Defaults to 4,
            the value that used to be hard-coded, and mirrors the parameter of
            ``prepare_training_data``.

    Returns:
        A flat list of sample dicts, in the order of ``df["id"].unique()``.
    """
    ids = df["id"].unique()
    # One chunk per worker; array_split tolerates fewer ids than workers.
    ids_splits = np.array_split(ids, num_jobs)

    results = Parallel(n_jobs=num_jobs, backend="multiprocessing")(
        delayed(_prepare_test_data_helper)(args, tokenizer, chunk) for chunk in ids_splits
    )
    return [sample for chunk_samples in results for sample in chunk_samples]

In [None]:
import tqdm
import copy
def _prepare_training_data_helper(args, tokenizer,  train_ids, df):
    training_samples = []
    for idx in train_ids:
        filename = os.path.join(args.input_path, "train", idx + ".txt")
        with open(filename, "r") as f:
            text = f.read()

        encoded_text = tokenizer.encode_plus(
            text,
            add_special_tokens=False,
            return_offsets_mapping=True,
        )
        input_ids = encoded_text["input_ids"]
        input_labels = copy.deepcopy(input_ids)
        offset_mapping = encoded_text["offset_mapping"]

        for k in range(len(input_labels)):
            input_labels[k] = "O"

        sample = {
            "id": idx,
            "input_ids": input_ids,
            "text": text,
            "offset_mapping": offset_mapping,
        }

        temp_df = df[df["id"] == idx]
        for _, row in temp_df.iterrows():
            text_labels = [0] * len(text)
            discourse_start = int(row["discourse_start"])
            discourse_end = int(row["discourse_end"])
            prediction_label = row["discourse_type"]
            text_labels[discourse_start:discourse_end] = [1] * (discourse_end - discourse_start)
            target_idx = []
            for map_idx, (offset1, offset2) in enumerate(encoded_text["offset_mapping"]):
                if sum(text_labels[offset1:offset2]) > 0:
                    if len(text[offset1:offset2].split()) > 0:
                        target_idx.append(map_idx)

            targets_start = target_idx[0]
            targets_end = target_idx[-1]
            pred_start = "B-" + prediction_label
            pred_end = "I-" + prediction_label
            input_labels[targets_start] = pred_start
            input_labels[targets_start + 1 : targets_end + 1] = [pred_end] * (targets_end - targets_start)

        sample["input_ids"] = input_ids
        sample["input_labels"] = input_labels
        training_samples.append(sample)
    return training_samples


def prepare_training_data(df, tokenizer, args, num_jobs=4):
    """Tokenize and label every essay referenced in ``df`` using worker processes.

    The unique essay ids are split into ``num_jobs`` chunks and each chunk is
    handled by ``_prepare_training_data_helper`` in its own process (the full
    ``df`` is passed to every worker).

    Returns:
        A flat list of sample dicts, in the order of ``df["id"].unique()``.
    """
    id_chunks = np.array_split(df["id"].unique(), num_jobs)

    runner = Parallel(n_jobs=num_jobs, backend="multiprocessing")
    per_chunk = runner(
        delayed(_prepare_training_data_helper)(args, tokenizer,  chunk, df)
        for chunk in id_chunks
    )

    return [sample for chunk_samples in per_chunk for sample in chunk_samples]

In [None]:
class FeedbackDatasetValid:
    """Validation dataset: un-padded token ids and masks for each sample.

    Behaves like ``FeedbackDataset``: every item is CLS + tokens + SEP, capped
    at ``max_len`` tokens in total, returned as plain Python lists.
    """

    def __init__(self, samples, max_len, tokenizer):
        self.samples = samples
        self.max_len = max_len
        self.tokenizer = tokenizer
        self.length = len(samples)

    def __len__(self):
        return self.length

    def __getitem__(self, idx):
        """Return ``{"ids": [...], "mask": [...]}`` for sample ``idx``."""
        cls_id = self.tokenizer.cls_token_id
        sep_id = self.tokenizer.sep_token_id
        room_before_sep = self.max_len - 1

        sequence = [cls_id] + self.samples[idx]["input_ids"]
        if len(sequence) > room_before_sep:
            # Truncate so that appending SEP lands exactly on max_len.
            sequence = sequence[:room_before_sep]
        sequence = sequence + [sep_id]

        return {
            "ids": sequence,
            "mask": [1 for _ in sequence],
        }

In [None]:
def calc_overlap(row):
    """
    Calculates the overlap between prediction and
    ground truth and overlap percentages used for determining
    true positives.

    ``row`` must expose ``predictionstring_pred`` and ``predictionstring_gt``
    (space-separated word indices). Returns ``[overlap_1, overlap_2]``: the
    intersection size divided by the ground-truth size and by the prediction
    size respectively.
    """
    set_pred = set(row.predictionstring_pred.split(" "))
    set_gt = set(row.predictionstring_gt.split(" "))
    # Length of each and intersection
    inter = len(set_gt & set_pred)
    return [inter / len(set_gt), inter / len(set_pred)]


def score_feedback_comp_micro(pred_df, gt_df):
    """
    A function that scores for the kaggle
        Student Writing Competition

    Uses the steps in the evaluation page here:
        https://www.kaggle.com/c/feedback-prize-2021/overview/evaluation

    Args:
        pred_df: Predictions with columns ``id``, ``class``, ``predictionstring``.
        gt_df: Ground truth with columns ``id``, ``discourse_type``,
            ``predictionstring``.

    Returns:
        Micro F1, ``TP / (TP + 0.5 * (FP + FN))``.
    """
    gt_df = (
        gt_df[["id", "discourse_type", "predictionstring"]]
        .reset_index(drop=True)
        .copy()
    )
    pred_df = pred_df[["id", "class", "predictionstring"]].reset_index(drop=True).copy()
    pred_df["pred_id"] = pred_df.index
    gt_df["gt_id"] = gt_df.index
    # Step 1. all ground truths and predictions for a given class are compared.
    joined = pred_df.merge(
        gt_df,
        left_on=["id", "class"],
        right_on=["id", "discourse_type"],
        how="outer",
        suffixes=("_pred", "_gt"),
    )
    # Rows that exist on one side only get a blank string so that
    # calc_overlap yields zero overlap for them.
    joined["predictionstring_gt"] = joined["predictionstring_gt"].fillna(" ")
    joined["predictionstring_pred"] = joined["predictionstring_pred"].fillna(" ")

    joined["overlaps"] = joined.apply(calc_overlap, axis=1)

    # 2. If the overlap between the ground truth and prediction is >= 0.5,
    # and the overlap between the prediction and the ground truth >= 0.5,
    # the prediction is a match and considered a true positive.
    # If multiple matches exist, the match with the highest pair of overlaps is taken.
    joined["overlap1"] = joined["overlaps"].apply(lambda pair: pair[0])
    joined["overlap2"] = joined["overlaps"].apply(lambda pair: pair[1])

    joined["potential_TP"] = (joined["overlap1"] >= 0.5) & (joined["overlap2"] >= 0.5)
    joined["max_overlap"] = joined[["overlap1", "overlap2"]].max(axis=1)
    tp_pred_ids = (
        joined.query("potential_TP")
        .sort_values("max_overlap", ascending=False)
        .groupby(["id", "predictionstring_gt"])
        .first()["pred_id"]
        .values
    )

    # 3. Any unmatched ground truths are false negatives
    # and any unmatched predictions are false positives.
    # The outer join leaves NaN in pred_id (ground truth with no candidate
    # prediction) and in gt_id (prediction with no candidate ground truth).
    # Those NaNs are not real ids and must be dropped, otherwise each would be
    # counted as one extra FP / FN because NaN never compares equal.
    tp_pred_set = set(tp_pred_ids)
    fp_pred_ids = [
        p for p in joined["pred_id"].dropna().unique() if p not in tp_pred_set
    ]

    matched_gt_set = set(joined.query("potential_TP")["gt_id"].unique())
    unmatched_gt_ids = [
        c for c in joined["gt_id"].dropna().unique() if c not in matched_gt_set
    ]

    # Get numbers of each type
    TP = len(tp_pred_ids)
    FP = len(fp_pred_ids)
    FN = len(unmatched_gt_ids)
    # calc microf1
    my_f1_score = TP / (TP + 0.5 * (FP + FN))
    return my_f1_score


def score_feedback_comp(pred_df, gt_df, return_class_scores=False):
    """Macro F1: the mean of the per-class micro F1 scores.

    Classes are taken from ``gt_df["discourse_type"]``; each class is scored
    with ``score_feedback_comp_micro`` on the matching subset of predictions.

    Returns:
        The mean F1, or ``(mean_f1, {class: f1})`` when
        ``return_class_scores`` is true.
    """
    predictions = pred_df[["id", "class", "predictionstring"]].reset_index(drop=True).copy()

    class_scores = {
        discourse_type: score_feedback_comp_micro(
            predictions.loc[predictions["class"] == discourse_type]
            .reset_index(drop=True)
            .copy(),
            gt_subset,
        )
        for discourse_type, gt_subset in gt_df.groupby("discourse_type")
    }

    f1 = np.mean(list(class_scores.values()))
    return (f1, class_scores) if return_class_scores else f1

In [None]:
# Per-class confidence cut-off: a predicted phrase is kept only when the mean
# of its per-token scores is >= this value (see the phrase loops in
# get_fold_based_cv / get_score).
proba_thresh = {
    "Lead": 0.7,
    "Position": 0.55,
    "Evidence": 0.65,
    "Claim": 0.55,
    "Concluding Statement": 0.7,
    "Counterclaim": 0.5,
    "Rebuttal": 0.55,
}

# Per-class length values. Not referenced in the visible part of this
# notebook; `map_clip` (used by `threshold`) holds the same numbers.
min_thresh = {
    "Lead": 9,
    "Position": 5,
    "Evidence": 14,
    "Claim": 3,
    "Concluding Statement": 11,
    "Counterclaim": 6,
    "Rebuttal": 4,
}

In [None]:
def jn(pst, start, end):
    """Join ``pst[start:end]`` into a space-separated string.

    ``-1`` entries are span separators inserted by ``link_evidence`` and are
    skipped, so they never end up inside a prediction string.
    """
    return " ".join(str(x) for x in pst[start:end] if x != -1)


def link_evidence(oof, gap_thresh=1, span_thresh=26):
    """Merge neighbouring "Evidence" predictions of the same essay.

    Consecutive Evidence rows of an essay are fused into one prediction unless
    the next row starts more than ``gap_thresh`` words after the previous row
    ends, or more than ``span_thresh`` words after the start of the span
    currently being built.

    Args:
        oof: DataFrame with columns ``id``, ``class`` and ``predictionstring``.
        gap_thresh: Largest allowed jump between the last word index of one
            row and the first word index of the next (default 1).
        span_thresh: Largest allowed distance between the first word of the
            merged span and the first word of the next row (default 26).

    Returns:
        The merged Evidence rows outer-merged with all non-Evidence rows of
        ``oof``.
    """
    evidence = oof[oof["class"] == "Evidence"]
    others = oof[oof["class"] != "Evidence"]

    linked = []
    # sort=False keeps essays in order of first appearance.
    for essay_id, rows in evidence.groupby("id", sort=False):
        # Flatten all rows into one list, each row preceded by a -1 separator.
        pst = []
        for prediction_string in rows["predictionstring"]:
            pst += [-1] + [int(x) for x in prediction_string.split()]

        start = 1
        end = 1
        for i in range(2, len(pst)):
            end = i
            if pst[i] != -1:
                continue
            next_first = pst[i + 1]
            if next_first > pst[i - 1] + gap_thresh or next_first - pst[start] > span_thresh:
                # Too far apart: close the current span and start a new one.
                linked.append((essay_id, "Evidence", jn(pst, start, i)))
                start = i + 1
        linked.append((essay_id, "Evidence", jn(pst, start, end + 1)))

    roof = pd.DataFrame(linked, columns=["id", "class", "predictionstring"])
    return roof.merge(others, how="outer")

In [None]:
# Minimum number of words a prediction of each class must contain to be kept.
map_clip = {
    'Lead': 9,
    'Position': 5,
    'Evidence': 14,
    'Claim': 3,
    'Concluding Statement': 11,
    'Counterclaim': 6,
    'Rebuttal': 4,
}


def threshold(df):
    """Return a copy of ``df`` without predictions shorter than ``map_clip``.

    ``df`` needs a ``class`` column and a numeric ``len`` column (word count).
    Rows are removed by index label; the input frame is left untouched.
    """
    kept = df.copy()
    for class_name, min_words in map_clip.items():
        of_class = kept.loc[kept['class'] == class_name]
        too_short = of_class.index[of_class['len'] < min_words]
        kept = kept.drop(too_short)
    return kept

In [None]:
# Point this at your own folds CSV. The frame must contain `id` and `kfold`
# columns plus the discourse annotations used by prepare_training_data.
vdf=pd.read_csv("../input/creating-folds-properly-hopefully-p/train_folds.csv")
#vdf = pd.read_csv("../input/feedback-folds-generator/train_folds_10.csv")
print(f"Train  size {len(vdf)}")
# Module-level tokenizer/collate built from args1; get_fold_based_cv creates
# its own per-model instances inside the loop.
tokenizer = AutoTokenizer.from_pretrained(args1.model)
collate = Collate(tokenizer=tokenizer)

# Legacy seeded 90/10 split over essay ids. train_idx / valid_idx are not
# used by the fold-based CV defined below.
np.random.seed(42)
IDS = vdf.id.unique()
train_idx = np.random.choice(np.arange(len(IDS)),int(0.9*len(IDS)),replace=False)
valid_idx = np.setdiff1d(np.arange(len(IDS)),train_idx)

# Fold-based CV (validation) settings, read as globals by get_fold_based_cv:
# loop step i uses settings class args{i // FOLDS + 1} and fold i % FOLDS.
raw_preds = []
no_of_models = 10
FOLDS = 5

def get_fold_based_cv():
    """Score every (model, fold) pair on its held-out fold and print the F1.

    Reads the globals ``no_of_models``, ``FOLDS``, ``vdf`` and the settings
    classes ``args1``, ``args2``, ... Step ``i`` uses ``args{i // FOLDS + 1}``
    with fold ``i % FOLDS``.

    Raw per-batch predictions are cached on disk. The cache file name
    contains the model index because several settings classes share the same
    ``prefix``; without it a later model would silently load an earlier
    model's predictions. Returns nothing; results are printed.
    """
    for fold_ in range(no_of_models):
        model_idx = fold_ // FOLDS + 1
        modelArgs = globals()[f"args{model_idx}"]
        fold = fold_ % FOLDS
        preds_file = f"oof_m{model_idx}_{modelArgs.prefix}{fold}.pickle"

        print(f"Using {model_idx} Model {modelArgs.tez_model} fold {fold}" )
        tokenizer = AutoTokenizer.from_pretrained(modelArgs.model)
        collate = Collate(tokenizer=tokenizer)
        valid_df = vdf[vdf["kfold"] == fold].reset_index(drop=True)
        print(valid_df.shape)

        # Use the settings of the model being evaluated (not always args1).
        valid_samples = prepare_training_data(valid_df, tokenizer, modelArgs)
        valid_dataset = FeedbackDatasetValid(valid_samples, modelArgs.max_len, tokenizer)

        use_cache = exists(preds_file)
        model = None
        if use_cache:
            preds_iter = load_oof(preds_file)
        else:
            # Only pay for building/loading the model when it is needed.
            model = FeedbackModel(model_name=modelArgs.model, num_labels=len(target_id_map) - 1)
            model.load(os.path.join(modelArgs.tez_model, f"{modelArgs.prefix}{fold}.bin"), weights_only=modelArgs.weights_only)
            preds_iter = model.predict(valid_dataset, batch_size=modelArgs.batch_size, n_jobs=-1, collate_fn=collate)

        final_preds = []
        final_scores = []
        raw_preds_oof = []
        for preds in preds_iter:
            raw_preds_oof.append(preds)
            pred_class = np.argmax(preds, axis=2)
            pred_scrs = np.max(preds, axis=2)
            for pred, pred_scr in zip(pred_class, pred_scrs):
                final_preds.append(pred.tolist())
                final_scores.append(pred_scr.tolist())

        # Save raw preds for reuse
        if not use_cache:
            save_oof(preds_file, raw_preds_oof)

        for j in range(len(valid_samples)):
            # Position 0 is the CLS token added by the dataset; drop it so the
            # predictions line up with offset_mapping.
            valid_samples[j]["preds"] = [id_target_map[p] for p in final_preds[j][1:]]
            valid_samples[j]["pred_scores"] = final_scores[j][1:]

        submission = []

        for sample in valid_samples:
            preds = sample["preds"]
            offset_mapping = sample["offset_mapping"]
            sample_id = sample["id"]
            sample_text = sample["text"]
            sample_pred_scores = sample["pred_scores"]

            # pad preds to same length as offset_mapping
            if len(preds) < len(offset_mapping):
                preds = preds + ["O"] * (len(offset_mapping) - len(preds))
                sample_pred_scores = sample_pred_scores + [0] * (len(offset_mapping) - len(sample_pred_scores))

            idx = 0
            phrase_preds = []
            while idx < len(offset_mapping):
                # Initialise `end` from the first token of the phrase. Relying
                # on a leftover `end` (the old `"end" in locals()` check) gave
                # single-token phrases the end offset of a previous phrase or
                # even of a previous essay.
                start, end = offset_mapping[idx]
                if preds[idx] != "O":
                    label = preds[idx][2:]
                else:
                    label = "O"
                matching_label = "O" if label == "O" else f"I-{label}"
                phrase_scores = [sample_pred_scores[idx]]
                idx += 1
                while idx < len(offset_mapping) and preds[idx] == matching_label:
                    _, end = offset_mapping[idx]
                    phrase_scores.append(sample_pred_scores[idx])
                    idx += 1
                phrase = sample_text[start:end]
                phrase_preds.append((phrase, start, end, label, phrase_scores))

            temp_df = []
            for phrase, start, end, label, phrase_scores in phrase_preds:
                # Convert character offsets to whitespace-separated word indices.
                word_start = len(sample_text[:start].split())
                word_end = word_start + len(sample_text[start:end].split())
                word_end = min(word_end, len(sample_text.split()))
                ps = " ".join([str(x) for x in range(word_start, word_end)])
                if label != "O":
                    if sum(phrase_scores) / len(phrase_scores) >= proba_thresh[label]:
                        temp_df.append((sample_id, label, ps))

            temp_df = pd.DataFrame(temp_df, columns=["id", "class", "predictionstring"])

            submission.append(temp_df)

        submission = pd.concat(submission).reset_index(drop=True)
        submission["len"] = submission.predictionstring.apply(lambda x: len(x.split()))
        submission = threshold(submission)
        # drop len
        submission = submission.drop(columns=["len"])
        scr = score_feedback_comp(submission,valid_df, return_class_scores=True)
        print(f"Model {fold_} {scr}")

        # Release everything tied to this step before the next model is built.
        del model
        torch.cuda.empty_cache()
        del valid_df
        del valid_samples
        del valid_dataset
        gc.collect()

# Run the per-fold CV for every configured model; scores are printed.
get_fold_based_cv()


In [None]:
# Scoring Ensemble 

def _extract_phrase_spans(preds, scores, offset_mapping):
    """Group consecutive token labels into ``(start, end, label, scores)`` spans.

    A span starts at any token and extends over the following tokens whose
    label is ``"I-<label>"`` (or ``"O"`` for an outside span). ``start`` and
    ``end`` are character offsets taken from ``offset_mapping``.
    """
    spans = []
    idx = 0
    n_tokens = len(offset_mapping)
    while idx < n_tokens:
        # `end` starts at the first token's own end so that single-token
        # spans never inherit a stale offset from an earlier span or essay.
        start, end = offset_mapping[idx]
        label = preds[idx][2:] if preds[idx] != "O" else "O"
        matching_label = "O" if label == "O" else f"I-{label}"
        phrase_scores = [scores[idx]]
        idx += 1
        while idx < n_tokens and preds[idx] == matching_label:
            _, end = offset_mapping[idx]
            phrase_scores.append(scores[idx])
            idx += 1
        spans.append((start, end, label, phrase_scores))
    return spans


def get_score(raw_preds, valid_samples, gt_df=None):
    """Decode averaged predictions into discourse spans and print the F1.

    Args:
        raw_preds: Iterable of per-batch arrays indexed
            ``[sample, token, class]`` (argmax/max are taken over axis 2).
        valid_samples: Sample dicts from ``prepare_training_data``, in the
            same order as the rows of ``raw_preds``. They are mutated: the
            keys ``preds`` and ``pred_scores`` are added.
        gt_df: Ground-truth frame to score against. When omitted, the global
            ``valid_df`` is used, as before.

    Prints ``Model Ensemble <score>`` and returns nothing.
    """
    if gt_df is None:
        gt_df = valid_df  # backward-compatible fallback to the notebook global

    final_preds = []
    final_scores = []
    for preds in raw_preds:
        pred_class = np.argmax(preds, axis=2)
        pred_scrs = np.max(preds, axis=2)
        for pred, pred_scr in zip(pred_class, pred_scrs):
            final_preds.append(pred.tolist())
            final_scores.append(pred_scr.tolist())

    for j in range(len(valid_samples)):
        # Drop position 0 (the CLS token added by the dataset).
        valid_samples[j]["preds"] = [id_target_map[p] for p in final_preds[j][1:]]
        valid_samples[j]["pred_scores"] = final_scores[j][1:]

    rows = []
    for sample in valid_samples:
        preds = sample["preds"]
        offset_mapping = sample["offset_mapping"]
        sample_id = sample["id"]
        sample_text = sample["text"]
        sample_pred_scores = sample["pred_scores"]

        # pad preds to same length as offset_mapping
        if len(preds) < len(offset_mapping):
            preds = preds + ["O"] * (len(offset_mapping) - len(preds))
            sample_pred_scores = sample_pred_scores + [0] * (len(offset_mapping) - len(sample_pred_scores))

        total_words = len(sample_text.split())
        for start, end, label, phrase_scores in _extract_phrase_spans(
            preds, sample_pred_scores, offset_mapping
        ):
            if label == "O":
                continue
            if sum(phrase_scores) / len(phrase_scores) < proba_thresh[label]:
                continue
            # Convert character offsets to whitespace-separated word indices.
            word_start = len(sample_text[:start].split())
            word_end = word_start + len(sample_text[start:end].split())
            word_end = min(word_end, total_words)
            ps = " ".join([str(x) for x in range(word_start, word_end)])
            rows.append((sample_id, label, ps))

    submission = pd.DataFrame(rows, columns=["id", "class", "predictionstring"])
    submission["len"] = submission.predictionstring.apply(lambda x: len(x.split()))
    submission = threshold(submission)
    # drop len
    submission = submission.drop(columns=["len"])

    scr = score_feedback_comp(submission, gt_df, return_class_scores=True)
    print(f"Model Ensemble {scr}")
    torch.cuda.empty_cache()
    gc.collect()

In [None]:
def get_DebertaV3TokenizerFast(model_name):
    """Load a ``DebertaV2TokenizerFast`` from ``model_name`` and return it.

    The tokenizer class is imported inside the function, so the import only
    runs when this helper is actually called.
    """
    from transformers.models.deberta_v2.tokenization_deberta_v2_fast import (
        DebertaV2TokenizerFast as fast_tokenizer_cls,
    )
    return fast_tokenizer_cls.from_pretrained(model_name)

In [None]:

# Reload the folds file and reset the globals that `inference` reads.
vdf=pd.read_csv("../input/creating-folds-properly-hopefully-p/train_folds.csv")

# Currently mixing deberta v3 and large 
#tokenizer = get_DebertaV3TokenizerFast(args2.model)
print(args1.model)
#from transformers.models.deberta_v2.tokenization_deberta_v2_fast import DebertaV2TokenizerFast
#tokenizer = DebertaV2TokenizerFast.from_pretrained(args1.model)
tokenizer = AutoTokenizer.from_pretrained(args1.model)
collate = Collate(tokenizer=tokenizer)
# With FOLDS = 1, loop step i of `inference` uses args{i + 1} and fold 0,
# i.e. two different models are blended on the same fold.
no_of_models = 2
FOLDS = 1
# Intended to hold model 1's samples; see the assignment inside `inference`.
test_samples_m1 = None
# `inference` appends the evaluated fold's frame here (for model 1 only).
valid_data = []

#@_Try()
def inference(no_of_models):
    """Blend ``no_of_models`` models on fold 0 and print the running score.

    Step ``i`` uses settings class ``args{i // FOLDS + 1}`` and weight fold
    ``i % FOLDS``. Each model's probabilities are divided by
    ``no_of_models`` and summed batch by batch, so the final ``raw_preds`` is
    the plain average. After every model ``get_score`` is called on the
    partial sum (which is still scaled by ``1 / no_of_models``).

    Side effects: appends to the global ``valid_data`` and sets the globals
    ``valid_df`` (read by ``get_score``) and ``test_samples_m1``.

    Returns:
        ``(raw_preds, test_samples)`` where ``test_samples`` are the samples
        prepared for the last model.
    """
    # get_score reads `valid_df` from module scope, so the assignment below
    # must target the global; a local assignment has no effect on it.
    global valid_df, test_samples_m1

    raw_preds = []
    test_samples = None
    for fold_ in range(no_of_models):
        model_idx = fold_ // FOLDS + 1
        fold = fold_ % FOLDS
        modelArgs = globals()[f"args{model_idx}"]
        # The model index is part of the cache key: settings classes may share
        # the same prefix, and without it model 2 would load model 1's cache.
        preds_file = f"oof_m{model_idx}_{modelArgs.prefix}{fold}.pickle"

        print(f"Using [{model_idx}] Model [{modelArgs.tez_model}/{modelArgs.prefix}{fold}.bin] fold [{fold}]" )
        tokenizer = AutoTokenizer.from_pretrained(modelArgs.model)
        collate = Collate(tokenizer=tokenizer)
        df = vdf[vdf["kfold"] == 0].reset_index(drop=True)
        test_samples = prepare_training_data(df, tokenizer, modelArgs)
        if model_idx == 1:
            valid_data.append(df)
            test_samples_m1 = test_samples
        test_dataset = FeedbackDataset(test_samples, modelArgs.max_len, tokenizer)

        # If cached predictions exist use them, otherwise run the model.
        model = None
        if exists(preds_file):
            preds_iter = load_oof(preds_file)
        else:
            model = FeedbackModel(model_name=modelArgs.model, num_labels=len(target_id_map) - 1)
            model.load(os.path.join(modelArgs.tez_model, f"{modelArgs.prefix}{fold}.bin"), weights_only=modelArgs.weights_only)
            # Predict on the dataset built above (`valid_dataset` does not
            # exist in this scope).
            preds_iter = model.predict(test_dataset, batch_size=modelArgs.batch_size, n_jobs=-1, collate_fn=collate)

        for batch_idx, preds in enumerate(preds_iter):
            preds = preds.astype(np.float16)
            preds = preds / no_of_models
            if fold_ == 0:
                raw_preds.append(preds)
            else:
                # Assumes every model yields batches of identical shape
                # (same samples, same tokenization).
                raw_preds[batch_idx] += preds

        valid_df = df
        get_score(raw_preds,test_samples)

        torch.cuda.empty_cache()
        del test_dataset
        del model
        gc.collect()

    return raw_preds, test_samples

# Run the blend, then score the final averaged predictions against the
# frame(s) that `inference` collected in `valid_data`.
raw_preds, valid_samples = inference(no_of_models)
# Scoring
# get_score reads `valid_df` as a global, so it must be set before the call.
valid_df =  pd.concat(valid_data).reset_index(drop=True)
#valid_df = vdf[vdf["kfold"] == 1].reset_index(drop=True) 
#collate = Collate(tokenizer=tokenizer)
#valid_samples = prepare_training_data(valid_df, tokenizer, args1) 
get_score(raw_preds,valid_samples)


In [None]:
%%time
import optuna
# Only report errors from optuna's logger.
optuna.logging.set_verbosity(optuna.logging.ERROR)
# Initialize Constants
study_name = "VAL_BLEND"
# NOTE(review): MODELS is not defined anywhere in the visible part of this
# notebook; this line raises NameError unless it is defined elsewhere.
models = MODELS
# Seed for the TPE sampler created in optimize_weights.
rand_seed = 2021
# Number of optuna trials run by optimize_weights.
n_trials = 1000
# Not read anywhere in the visible part of this notebook.
run_submit_script = True

def objective(trial):
    """Optuna objective: value of a weighted blend of per-model scores.

    One weight per entry of ``models`` is sampled in [0.144, 1.0] and the
    weights are normalised to sum to 1. The blended ``score`` column is then
    evaluated with ``pack_and_validate``, whose second return value is the
    objective value.

    NOTE(review): relies on the globals ``models``, ``MODELS``, ``m1_df`` and
    ``pack_and_validate``, none of which are defined in the visible part of
    this notebook.
    """
    sampled = [
        trial.suggest_uniform(f"w{i}", 0.144, 1.0) for i in range(len(models))
    ]
    weight_total = sum(sampled)
    w = [value / weight_total for value in sampled]

    blended_df = m1_df.copy()
    score = np.zeros(m1_df['worker'].to_numpy().shape)
    for position, model_name in enumerate(MODELS):
        score += w[position] * m1_df[f'score_{model_name}'].to_numpy()
    blended_df['score'] = score

    _, loss = pack_and_validate(blended_df)
    return loss

def optimize_weights():
    """Search blend weights with optuna and return the best raw weights.

    Creates a maximising study (TPE sampler seeded with ``rand_seed``, median
    pruner), runs ``n_trials`` trials of ``objective`` with ``n_jobs=-1``,
    prints a summary of the best trial and returns its ``w0..w{n-1}``
    parameters as a list (not normalised), one per entry of ``MODELS``.
    """
    median_pruner = optuna.pruners.MedianPruner(
        n_startup_trials=5,
        n_warmup_steps=0,
        interval_steps=1,
    )
    tpe_sampler = optuna.samplers.TPESampler(seed=rand_seed)
    study = optuna.create_study(
        direction="maximize",
        pruner=median_pruner,
        sampler=tpe_sampler,
        study_name=study_name,
        load_if_exists=True,
    )

    study.optimize(
        objective,
        n_trials=n_trials,
        timeout=None,
        gc_after_trial=True,
        n_jobs=-1,
        show_progress_bar=True,
    )

    best = study.best_trial

    print("\n[Optuna]")
    print("Number of finished trials: {}".format(len(study.trials)))
    print("Best trial:")
    print("  Value: {}".format(best.value))

    print("  Params: ")
    for param_name, param_value in best.params.items():
        print("    {}: {}".format(param_name, param_value))

    return [best.params[f"w{i}"] for i in range(len(MODELS))]

#w = optimize_weights() 
#print(f"Weights to Use {w} \n Total {len(w)} \n Weight Sum {sum(w)}")

In [None]:
# Funnel Models
# Redefines args1 for the funnel ensemble (replaces the earlier args1).
# Fields: `model` is the tokenizer/config path, `tez_model` the weights
# folder (files are named "<prefix><fold>.bin"), `folds_use` the weight folds
# that inference_fold_mix loads for this model.
class args1:
    input_path = "../input/feedback-prize-2021/"
    model = "../input/funnel-transformer-large"
    tez_model= "../input/feedback-prize-submit/funnel-transformer-large-i/funnel-transformer-large-i"
    output = "."
    batch_size = 8
    max_len = 1600
    prefix = "model_funnel"
    weights_only = True
    folds_use = [0]
    
# Redefines args2 for the funnel ensemble (replaces the earlier args2).
# Same fields as args1; with this prefix and folds_use the weight file is
# "<tez_model>/model_medium2.bin".
class args2:
    input_path = "../input/feedback-prize-2021/"
    model = "../input/funneltransformermedium"
    tez_model= "../input/feedback-prize-submit/funnel-transformer-medium/funnel-transformer-medium"
    output = "."
    batch_size = 8
    max_len = 1600
    prefix = "model_medium"
    weights_only = True
    folds_use = [2]
    
# Third funnel model. Unlike args1/args2 it passes weights_only=False to
# model.load; the weight file is "<tez_model>/model_1.bin".
class args3:
    input_path = "../input/feedback-prize-2021/"
    model = "../input/funneltransformermedium"
    tez_model= "../input/feedback-fn-bs"
    output = "."
    batch_size = 8
    max_len = 1600
    prefix = "model_"
    weights_only = False
    folds_use = [1]

In [None]:
# Fold-mix inference: blend several models, each with its own weight folds.
#vdf=pd.read_csv("../input/creating-folds-properly-hopefully-p/train_folds.csv")
# NOTE(review): `vdf` is read twice; the second read replaces the first, so
# only train_folds_10.csv is actually used below.
vdf = pd.read_csv("../input/feedback-prize-submit/train_10-folds.csv/train_10-folds.csv")
vdf = pd.read_csv("../input/feedback-folds-generator/train_folds_10.csv")

# Number of settings classes (args1..args3) used by inference_fold_mix.
no_of_models = 3
FOLDS = 1
# Not read anywhere in the visible part of this notebook.
test_samples_arr = []
# inference_fold_mix appends the evaluated frame here.
valid_data = []

@_Try()
def inference_fold_mix(raw_preds,fold = 0):
    """Average the predictions of several models and weight folds on one data fold.

    For each settings class ``args1 .. args{no_of_models}`` the essays of data
    fold ``fold`` are tokenized once, then every weight file listed in that
    class's ``folds_use`` is loaded and run. Each run's probabilities are
    divided by the total number of runs and summed batch by batch.

    Args:
        raw_preds: Ignored; kept only so existing call sites keep working.
        fold: Value of ``vdf["kfold"]`` selecting the essays to predict on.

    Returns:
        ``(raw_preds, test_samples)``: the averaged per-batch predictions and
        the samples prepared for the last model. Because of the ``_Try``
        decorator, ``None`` is returned instead if anything raises.

    Side effects: appends the evaluated frame to the global ``valid_data``.
    """
    raw_preds = []
    test_samples = None
    total_models = sum(len(globals()[f"args{i+1}"].folds_use) for i in range(no_of_models))
    print(f"Total Models to infer {total_models}")

    # The data fold is fixed for the whole call, so select it once and
    # register it once for the later scoring step.
    df = vdf[vdf["kfold"] == fold].reset_index(drop=True)
    valid_data.append(df)

    runs_done = 0
    for model_idx in range(no_of_models):
        modelArgs = globals()[f"args{model_idx+1}"]

        print(f"Using [{model_idx}] Model [{modelArgs.tez_model}/{modelArgs.prefix}{fold}.bin] fold [{fold}]" )
        tokenizer = AutoTokenizer.from_pretrained(modelArgs.model)
        collate = Collate(tokenizer=tokenizer)
        test_samples = prepare_training_data(df, tokenizer, modelArgs)
        test_dataset = FeedbackDataset(test_samples, modelArgs.max_len, tokenizer)

        # A separate name for the weight fold keeps it from overwriting the
        # data fold selected by the `fold` argument.
        for weights_fold in modelArgs.folds_use:
            print(f"Inferring {model_idx} Model {modelArgs.tez_model} fold {weights_fold}" )
            model = FeedbackModel(model_name=modelArgs.model, num_labels=len(target_id_map) - 1)
            model.load(os.path.join(modelArgs.tez_model, f"{modelArgs.prefix}{weights_fold}.bin"), weights_only=modelArgs.weights_only)
            preds_iter = model.predict(test_dataset, batch_size=modelArgs.batch_size, n_jobs=-1, collate_fn=collate)
            for batch_idx, preds in enumerate(preds_iter):
                preds = preds.astype(np.float16)
                preds = preds / total_models
                if runs_done == 0:
                    # Very first run creates the accumulators ...
                    raw_preds.append(preds)
                else:
                    # ... every later run (any model, any weight fold) adds to
                    # them. Assumes all runs yield batches of identical shape.
                    raw_preds[batch_idx] += preds
            runs_done += 1
            torch.cuda.empty_cache()
            del model
            gc.collect()
        del test_dataset
        gc.collect()

    return raw_preds, test_samples

# The positional argument lands in the ``raw_preds`` parameter, which
# inference_fold_mix replaces with an empty list before use.
fold_mix_result = inference_fold_mix(no_of_models)
if fold_mix_result is None:
    # @_Try() prints the exception and returns None rather than re-raising, so
    # fail here with a clear message instead of an opaque unpacking TypeError.
    raise RuntimeError("inference_fold_mix failed; see the error printed above")
raw_preds, valid_samples = fold_mix_result

# Rebuild the validation samples with the args1 tokenizer; this replaces the
# ``valid_samples`` returned above and (re)defines the module-level
# ``tokenizer`` / ``collate`` that later cells read.
valid_df = pd.concat(valid_data).reset_index(drop=True)
tokenizer = AutoTokenizer.from_pretrained(args1.model)
collate = Collate(tokenizer=tokenizer)
valid_samples = prepare_training_data(valid_df, tokenizer, args1)

In [None]:

def _ensemble_test_predictions(test_samples, n_members=10, members_per_config=5):
    """Return per-batch predictions averaged over ``n_members`` checkpoints.

    The first ``members_per_config`` members are loaded from ``args1`` and the
    remaining ones from ``args2``. Weight files are named ``model_<k>.bin`` with
    ``k`` restarting at 0 for ``args2``.
    NOTE(review): the file name is hard-coded and ignores ``args*.prefix`` /
    ``args*.folds_use`` -- confirm it matches the checkpoint directories in use.

    Reads the module-level ``tokenizer`` and ``collate`` for both configs.
    """
    raw_preds = []
    # The dataset depends only on the samples and the args1 settings, so it is
    # built once instead of once per checkpoint.
    test_dataset = FeedbackDataset(test_samples, args1.max_len, tokenizer)

    for member in range(n_members):
        if member < members_per_config:
            cfg, weights_fold = args1, member
        else:
            cfg, weights_fold = args2, member - members_per_config

        model = FeedbackModel(model_name=cfg.model, num_labels=len(target_id_map) - 1)
        model.load(os.path.join(cfg.tez_model, f"model_{weights_fold}.bin"), weights_only=True)
        preds_iter = model.predict(
            test_dataset, batch_size=cfg.batch_size, n_jobs=-1, collate_fn=collate
        )

        for batch_idx, preds in enumerate(preds_iter):
            # Pre-scale so the running sum is already the ensemble mean.
            preds = preds.astype(np.float16) / n_members
            if member == 0:
                raw_preds.append(preds)
            else:
                raw_preds[batch_idx] += preds

        # Release this checkpoint before the next one is constructed; the cache
        # can only be emptied once nothing references the model any more.
        del model, preds_iter
        gc.collect()
        torch.cuda.empty_cache()

    return raw_preds


def _extract_phrases(preds, scores, offset_mapping, text):
    """Merge consecutive tokens with one label into character-level spans.

    A span starts at any token and extends over following tokens tagged
    ``I-<label>`` (or ``O`` when the span itself is ``O``).

    Returns:
        List of ``(phrase, start, end, label, token_scores)`` tuples.
    """
    phrases = []
    n_tokens = len(offset_mapping)
    idx = 0
    while idx < n_tokens:
        # Both bounds come from the first token, so a one-token span gets its
        # own end offset instead of one left over from an earlier span.
        start, end = offset_mapping[idx]
        tag = preds[idx]
        label = tag[2:] if tag != "O" else "O"
        continuation = "O" if label == "O" else f"I-{label}"
        token_scores = [scores[idx]]
        idx += 1
        while idx < n_tokens and preds[idx] == continuation:
            _, end = offset_mapping[idx]
            token_scores.append(scores[idx])
            idx += 1
        phrases.append((text[start:end], start, end, label, token_scores))
    return phrases


def _sample_prediction_rows(sample):
    """Turn one decoded sample into ``(id, class, predictionstring)`` rows.

    Spans labelled ``O`` are skipped; the rest must reach the per-class mean
    score in ``proba_thresh`` and the per-class word count in ``min_thresh``.
    """
    preds = sample["preds"]
    scores = sample["pred_scores"]
    offset_mapping = sample["offset_mapping"]
    text = sample["text"]
    sample_id = sample["id"]

    # Tokens without a prediction are treated as outside any span, score 0.
    if len(preds) < len(offset_mapping):
        preds = preds + ["O"] * (len(offset_mapping) - len(preds))
        scores = scores + [0] * (len(offset_mapping) - len(scores))

    total_words = len(text.split())
    rows = []
    for _, start, end, label, token_scores in _extract_phrases(preds, scores, offset_mapping, text):
        if label == "O":
            continue
        # Convert character offsets to whitespace-delimited word indices.
        word_start = len(text[:start].split())
        word_end = min(word_start + len(text[start:end].split()), total_words)
        word_ids = range(word_start, word_end)
        if sum(token_scores) / len(token_scores) < proba_thresh[label]:
            continue
        if len(word_ids) < min_thresh[label]:
            continue
        rows.append((sample_id, label, " ".join(str(w) for w in word_ids)))
    return rows


def _drop_short_predictions(df):
    """Return a copy of ``df`` without rows shorter than ``map_clip[class]`` words."""
    keep = np.ones(len(df), dtype=bool)
    for key, value in map_clip.items():
        # Positional mask: unaffected by duplicate index labels, unlike a
        # label-based drop.
        keep &= ~((df["class"] == key) & (df["len"] < value)).to_numpy()
    return df[keep].copy()


def test_inference():
    """Run the stored checkpoints on the test essays and write ``submission.csv``.

    Steps: tokenize the essays listed in ``sample_submission.csv``, average the
    checkpoint predictions, decode per-token labels, merge them into spans,
    apply the score / length thresholds, post-process with ``link_evidence``
    and the ``map_clip`` minimum lengths, then write the CSV.

    Relies on names defined in other cells: ``tokenizer``, ``collate``,
    ``args1``, ``args2``, ``prepare_test_data``, ``FeedbackDataset``,
    ``FeedbackModel``, ``proba_thresh``, ``min_thresh``, ``map_clip`` and
    ``link_evidence``. Each entry of the prepared samples gains ``"preds"`` and
    ``"pred_scores"`` keys as a side effect.
    """
    df = pd.read_csv(os.path.join("../input/feedback-prize-2021/", "sample_submission.csv"))
    test_samples = prepare_test_data(df, tokenizer, args1)

    raw_preds = _ensemble_test_predictions(test_samples)

    # Flatten the batches into one label-id list and one score list per sample.
    final_preds = []
    final_scores = []
    for rp in raw_preds:
        final_preds.extend(np.argmax(rp, axis=2).tolist())
        final_scores.extend(np.max(rp, axis=2).tolist())

    for j, sample in enumerate(test_samples):
        # Position 0 is dropped -- presumably the leading special token, which
        # has no entry in ``offset_mapping``; TODO confirm against the tokenizer.
        sample["preds"] = [id_target_map[p] for p in final_preds[j][1:]]
        sample["pred_scores"] = final_scores[j][1:]

    rows = []
    for sample in test_samples:
        rows.extend(_sample_prediction_rows(sample))
    submission = pd.DataFrame(rows, columns=["id", "class", "predictionstring"])

    submission = link_evidence(submission)

    submission['len'] = submission['predictionstring'].apply(lambda x: len(x.split()))
    submission = _drop_short_predictions(submission)

    submission[['id','class','predictionstring']].to_csv('submission.csv',index=False)