# Clean up CPU and GPU memory after loading a model onto the GPU
### The helper function is based on the following discussions:
- [How can I clear GPU memory in tensorflow 2?](https://github.com/tensorflow/tensorflow/issues/36465)
- [How do I retrieve output from Multiprocessing in Python?](https://stackoverflow.com/questions/35943822/how-do-i-retrieve-output-from-multiprocessing-in-python)

### Let me know if there are other alternatives, thanks.

In [None]:

import shutil
from pathlib import Path

# Overwrite tokenizer source files inside the installed transformers package
# with the versions shipped in the input dataset.
# NOTE(review): presumably this provides the DebertaV2TokenizerFast class that
# is imported further down, so it has to run before that import -- confirm.
transformers_path = Path("/opt/conda/lib/python3.7/site-packages/transformers")

input_dir = Path("../input/deberta-v2-3-fast-tokenizer")

convert_file = input_dir / "convert_slow_tokenizer.py"
conversion_path = transformers_path/convert_file.name

# Delete the packaged file first, then copy the replacement into the package directory.
if conversion_path.exists():
    conversion_path.unlink()

shutil.copy(convert_file, transformers_path)
deberta_v2_path = transformers_path / "models" / "deberta_v2"

# Same replace-in-place procedure for the slow and fast DeBERTa-v2 tokenizer modules.
for filename in ['tokenization_deberta_v2.py', 'tokenization_deberta_v2_fast.py']:
    filepath = deberta_v2_path/filename
    if filepath.exists():
        filepath.unlink()

    shutil.copy(input_dir/filename, filepath)


In [None]:
from tqdm.auto import tqdm
import os
import sys
import random
import numpy as np
import pandas as pd
import glob
import gc
#pd.set_option('display.max_columns', None)
gc.enable()
from joblib import Parallel, delayed
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from pytorch_lightning import LightningModule, LightningDataModule
from pytorch_lightning import Trainer
import multiprocessing

# transformer
from transformers import AutoTokenizer, AutoModel, AutoConfig,AutoModelForTokenClassification
from transformers.models.deberta_v2.tokenization_deberta_v2_fast import DebertaV2TokenizerFast
#import warnings
#warnings.filterwarnings("error")

def seed_everything(seed=42):
    """Seed every random number generator used by the notebook.

    Seeds Python's ``random``, NumPy and PyTorch (CPU and all CUDA devices) and
    switches cuDNN to deterministic mode.

    Args:
        seed: Integer seed applied to all generators.
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    # Also cover every visible GPU, not only the current device.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # The cuDNN auto-tuner may pick a different algorithm from run to run,
    # which defeats the deterministic flag above, so it must stay off.
    torch.backends.cudnn.benchmark = False
    # Set a fixed value for the hash seed. The running interpreter has already
    # read this variable at start-up; the value is inherited by child processes.
    os.environ['PYTHONHASHSEED'] = str(seed)

# Label name -> class index. 'O' marks words outside any discourse element;
# every discourse type has a 'B-' (begin) label immediately followed by its
# 'I-' (inside) label, so I-index == B-index + 1 (the decoding code relies on
# that). 'PAD' maps to -100.
# NOTE(review): -100 is presumably the ignore index used for padded positions
# during training -- confirm; no model output in this file can produce it.
target_id_map = {
    'O': 0,
    'B-Lead': 1,
    'I-Lead': 2,
    'B-Position': 3,
    'I-Position': 4,
    'B-Claim': 5,
    'I-Claim': 6,
    'B-Counterclaim': 7,
    'I-Counterclaim': 8,
    'B-Rebuttal': 9,
    'I-Rebuttal': 10,
    'B-Evidence': 11,
    'I-Evidence': 12,
    'B-Concluding Statement': 13,
    'I-Concluding Statement': 14,
    'PAD': -100,
}
'''
target_id_map2 = {
    "B-Lead": 0,
    "I-Lead": 1,
    "B-Position": 2,
    "I-Position": 3,
    "B-Evidence": 4,
    "I-Evidence": 5,
    "B-Claim": 6,
    "I-Claim": 7,
    "B-Concluding Statement": 8,
    "I-Concluding Statement": 9,
    "B-Counterclaim": 10,
    "I-Counterclaim": 11,
    "B-Rebuttal": 12,
    "I-Rebuttal": 13,
    "O": 14,
    "PAD": -100,
}
'''


# Minimum number of words a predicted span must contain, per discourse type.
# do_threshold() drops spans whose word count is strictly below this value.
length_threshold = {
    'Lead'                : 9,
    'Position'            : 5,
    'Claim'               : 3,
    'Counterclaim'        : 6,
    'Rebuttal'            : 4,
    'Evidence'            : 14,
    'Concluding Statement': 11,
}
#length_threshold = {
#    'Lead'                : 5,
#    'Position'            : 5,
#    'Claim'               : 3,
#    'Counterclaim'        : 5,
#    'Rebuttal'            : 4,
#    'Evidence'            : 5,
#    'Concluding Statement': 5,
#}
# Minimum mean per-word score a predicted span must reach, per discourse type.
# do_threshold() drops spans whose mean score is strictly below this value.
probability_threshold = {
    'Lead'                : 0.6,
    'Position'            : 0.6,
    'Claim'               : 0.6,
    'Counterclaim'        : 0.6,
    'Rebuttal'            : 0.6,
    'Evidence'            : 0.6,
    'Concluding Statement': 0.6,
}

# Inverse lookup: class index -> label name.
id_target_map = {v: k for k, v in target_id_map.items()}
#id_target_map2 = {v: k for k, v in target_id_map2.items()}
seed_everything(2022)
# NOTE(review): presumably disables the tokenizers library's own parallelism
# because this notebook forks worker processes -- confirm.
os.environ["TOKENIZERS_PARALLELISM"] = "false"



# Helper function

In [None]:
def process(func):
    """Run ``func()`` in a child process and return its result.

    Everything ``func`` allocates (including GPU memory) lives in the child
    process and is released when that process exits.

    Args:
        func: Zero-argument callable. Its return value must be picklable.

    Returns:
        Whatever ``func()`` returned.

    Raises:
        RuntimeError: If ``func`` raised (the child's traceback is included in
            the message) or if the child exited without delivering a result.
    """
    import queue
    import traceback

    def worker(func, q):
        # Always report back: a worker that dies silently would leave the
        # parent blocked on q.get() forever.
        try:
            q.put((True, func()))
        except Exception:
            q.put((False, traceback.format_exc()))

    # The nested worker (and notebook-defined functions) cannot be pickled, so
    # the child has to be forked. Fall back to the platform default only where
    # fork does not exist.
    try:
        ctx = multiprocessing.get_context("fork")
    except ValueError:
        ctx = multiprocessing.get_context()

    q = ctx.Queue()
    p = ctx.Process(target=worker, args=(func, q,))

    p.start()
    # Read the result BEFORE joining: a child holding a large queued payload
    # does not exit until the payload has been consumed.
    result = None
    while result is None:
        try:
            result = q.get(timeout=1.0)
        except queue.Empty:
            if not p.is_alive():
                # The child is gone; give already-flushed data one last chance.
                try:
                    result = q.get(timeout=1.0)
                except queue.Empty:
                    break
    p.join()

    if result is None:
        raise RuntimeError(
            "child process exited with code %s without returning a result" % p.exitcode
        )
    ok, payload = result
    if not ok:
        raise RuntimeError("function raised in child process:\n" + payload)
    return payload

# Test Data

In [None]:
DEBUG = True

# DEBUG runs on (at most the first 10000, by sorted id) training essays;
# otherwise the full test folder is used.
text_dir = '../input/feedback-prize-2021/train' if DEBUG else '../input/feedback-prize-2021/test'
valid_id = sorted(f.split('/')[-1][:-4] for f in glob.glob(text_dir+'/*.txt'))
if DEBUG:
    valid_id = valid_id[0:10000]
num_valid = len(valid_id)
print('len(valid_id)',len(valid_id))

# Reorder the ids so that the largest files come first (ties keep sorted-id order).
size = [os.path.getsize(text_dir+'/%s.txt'%id) for id in valid_id]
valid_id = [
    doc_id
    for doc_id, _ in sorted(zip(valid_id, size), key=lambda pair: pair[1], reverse=True)
]
del size
gc.collect()
print('len(valid_id)',len(valid_id))

In [None]:
def _prepare_test_data_helper(tokenizer, ids):
    """Read and tokenize the essays listed in ``ids``.

    The essay folder is chosen by the module-level ``DEBUG`` flag. Non-breaking
    spaces are replaced by plain spaces before tokenization. Tokenization adds
    no special tokens and truncates to 1600 tokens.

    Returns:
        A list with one dict per id holding ``id``, ``input_ids``, ``text``
        (after the replacement) and ``offset_mapping``.
    """
    path = (
        '../input/feedback-prize-2021/train'
        if DEBUG
        else '../input/feedback-prize-2021/test'
    )

    test_samples = []
    for idx in ids:
        with open(os.path.join(path, idx + ".txt"), "r") as f:
            text = f.read().replace(u'\xa0', u' ')

        encoded_text = tokenizer.encode_plus(
            text,
            add_special_tokens=False,
            return_offsets_mapping=True,
            truncation=True,
            max_length=1600,
        )
        test_samples.append(
            {
                "id": idx,
                "input_ids": encoded_text["input_ids"],
                "text": text,
                "offset_mapping": encoded_text["offset_mapping"],
            }
        )
    return test_samples


def prepare_test_data(ids, tokenizer):
    """Tokenize all essays in ``ids`` using two worker processes.

    The ids are split into two chunks; each chunk is handled by
    ``_prepare_test_data_helper`` in its own process and the results are
    concatenated in chunk order.
    """
    halves = np.array_split(ids, 2)
    jobs = (delayed(_prepare_test_data_helper)(tokenizer, half) for half in halves)
    per_half = Parallel(n_jobs=2, backend="multiprocessing")(jobs)

    test_samples = []
    for chunk in per_half:
        test_samples.extend(chunk)
    return test_samples

# Data Class

In [None]:

class FeedbackDataset(Dataset):
    """Fixed-length model inputs built from pre-tokenized samples.

    Each item is ``[CLS] + tokens + [SEP]`` (tokens are cut so that the total
    never exceeds ``max_len``), right-padded with the tokenizer's pad id.
    """

    def __init__(self, samples, tokenizer, max_len=1600):
        self.samples = samples
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.length = len(samples)

    def __len__(self):
        return self.length

    def __getitem__(self, idx):
        tok = self.tokenizer

        # [CLS] + tokens, trimmed to leave room for the closing [SEP].
        with_cls = [tok.cls_token_id] + self.samples[idx]["input_ids"]
        token_ids = with_cls[: self.max_len - 1] + [tok.sep_token_id]
        token_length = len(token_ids)

        # A non-positive count yields an empty list, i.e. no padding.
        n_pad = self.max_len - token_length
        padded_ids = token_ids + [tok.pad_token_id] * n_pad
        mask = [1] * token_length + [0] * n_pad

        return {
            "ids": torch.tensor(padded_ids, dtype=torch.long),
            "mask": torch.tensor(mask, dtype=torch.long),
            'token_length': token_length,
        }
       


class DataModule(LightningDataModule):
    """Lightning data module that serves the test samples for prediction."""

    def __init__(self, test_samples, tokenizer, cfg):
        super().__init__()
        self.cfg = cfg
        self.tokenizer = tokenizer
        self.test_df = test_samples

    def setup(self, stage):
        # Only the 'predict' stage needs a dataset; every other stage is a no-op.
        if stage != 'predict':
            return
        self.test_ds = FeedbackDataset(self.test_df, self.tokenizer, self.cfg.max_length)

    def predict_dataloader(self):
        loader = DataLoader(
            self.test_ds,
            batch_size=self.cfg.test_batch_size,
            shuffle=False,
            num_workers=2,
            pin_memory=False,
        )
        return loader

In [None]:

def get_word_proba(pred_proba,valid_samples): # pred_proba (N,seq,label) -> (N,word,label)
    """Convert token-level probabilities into word-level probabilities.

    For every sample the token probabilities (without the leading CLS position)
    are painted onto the characters each token covers, then averaged over the
    characters of each whitespace-separated word.

    Returns:
        A list with one float32 array of shape (n_words, 15) per sample; the
        number of words differs between samples.
    """
    word_proba = []
    for i, proba in enumerate(pred_proba):
        sample = valid_samples[i]
        text = sample["text"]
        token_proba = proba[1:]  # drop the CLS position -> (seq, label)

        words, word_spans = text_to_word(text)

        # Character-level probabilities; characters no token covers stay 0.
        char_proba = np.zeros((len(text), 15), np.float32)
        last_position = len(token_proba) - 1  # reserved for SEP, never painted
        for t, (start, end) in enumerate(sample["offset_mapping"]):
            if t == last_position:
                break
            char_proba[start:end] = token_proba[t]

        # Word probability = mean over the characters of the word.
        per_word = np.zeros((len(words), 15), np.float32)
        for w, (start, end) in enumerate(word_spans):
            per_word[w] = char_proba[start:end].mean(0)

        word_proba.append(per_word)

    return word_proba
    
def text_to_word(text):
    """Split ``text`` on whitespace and locate every word in the original text.

    Args:
        text: The raw text.

    Returns:
        A tuple ``(word, word_offset)``: the list of words from ``text.split()``
        and a parallel list of ``(start, end)`` character offsets such that
        ``text[start:end] == word``.

    Raises:
        NotImplementedError: If a word cannot be found after the previous one
            (not expected, since the words come from ``text`` itself).
    """
    word = text.split()
    word_offset = []

    cursor = 0
    for w in word:
        # Search from the end of the previous word without slicing the text;
        # slicing copied the remaining text for every word (quadratic).
        start = text.find(w, cursor)
        if start == -1:
            raise NotImplementedError('word %r not found from offset %d' % (w, cursor))
        cursor = start + len(w)
        word_offset.append((start, cursor))

    return word, word_offset

# DBxl

In [None]:
class DBXLModule(LightningModule):
    """Lightning wrapper around a 15-label DeBERTa-xlarge token classifier."""

    def __init__(self, cfg):
        super().__init__()
        self.cfg = cfg
        # Only the architecture is created here (from_config); no pretrained
        # weights are loaded by this constructor.
        model_config = AutoConfig.from_pretrained('../input/deberta-xlarge')
        model_config.num_labels = 15
        self.model = AutoModelForTokenClassification.from_config(config=model_config)

    def forward(self, input_ids, attention_mask):
        return self.model(input_ids=input_ids, attention_mask=attention_mask).logits

    def predict_step(self, batch, batch_idx):
        # Run the model only up to the longest real sequence in the batch ...
        longest = batch['token_length'].max().item()
        logits = self(batch['ids'][:, :longest], batch['mask'][:, :longest])  # (N,seq,label)
        proba = torch.softmax(logits, dim=2).detach().cpu().numpy()
        # ... then zero-pad back to max_length so that batches can be concatenated.
        pad_width = ((0, 0), (0, self.cfg.max_length - longest), (0, 0))
        return np.pad(proba, pad_width, 'constant', constant_values=0)
    
class CFG:
    """Inference settings for the DeBERTa-xlarge model."""

    def __init__(self):
        self.max_length = 1600  # 1536
        self.test_batch_size = 12


CFG_dbxl = CFG()

In [None]:
def dbxl_wp():
    """Run DeBERTa-xlarge inference and return word-level probabilities.

    Predicts with every listed checkpoint, sums the token probabilities over
    the checkpoints (no division is applied here) and converts the sum to word
    level.
    """
    tokenizer = AutoTokenizer.from_pretrained('../input/deberta-xlarge')
    test_samples = prepare_test_data(valid_id, tokenizer)
    dm = DataModule(test_samples, tokenizer, CFG_dbxl)

    checkpoint_paths = (
        '../input/feedback2022infer/dbxl-f0-2-ep5-bs1-lr1.2-val_f10.686.ckpt',
        '../input/feedback2022infer/dbxl-f1-2-ep5-bs1-lr1.2-val_f10.693.ckpt',
    )

    model = DBXLModule(CFG_dbxl)
    trainer = Trainer(gpus=1, precision=16, num_sanity_val_steps=0)

    summed_proba = None
    for checkpoint in checkpoint_paths:
        print(checkpoint)
        model.load_state_dict(torch.load(checkpoint)['state_dict'])
        fold_proba = np.concatenate(trainer.predict(model, datamodule=dm))  # (N,seq,label)
        if summed_proba is None:
            summed_proba = fold_proba
        else:
            summed_proba += fold_proba

    return get_word_proba(summed_proba, test_samples)

In [None]:

# Run the DeBERTa-xlarge inference in a child process (see process()) and keep
# only the returned word probabilities in this process.
wp_dbxl = process(dbxl_wp)


# DBv3l-f0-20/m-90mins

In [None]:
class DBv3LModule(LightningModule):
    """Lightning wrapper around a 15-label DeBERTa-v3-large token classifier."""

    def __init__(self, cfg):
        super().__init__()
        self.cfg = cfg
        # Only the architecture is created here (from_config); no pretrained
        # weights are loaded by this constructor.
        model_config = AutoConfig.from_pretrained('../input/deberta-v3-large/deberta-v3-large')
        model_config.num_labels = 15
        self.model = AutoModelForTokenClassification.from_config(config=model_config)

    def forward(self, input_ids, attention_mask):
        return self.model(input_ids=input_ids, attention_mask=attention_mask).logits

    def predict_step(self, batch, batch_idx):
        # Run the model only up to the longest real sequence in the batch ...
        longest = batch['token_length'].max().item()
        logits = self(batch['ids'][:, :longest], batch['mask'][:, :longest])  # (N,seq,label)
        proba = torch.softmax(logits, dim=2).detach().cpu().numpy()
        # ... then zero-pad back to max_length so that batches can be concatenated.
        pad_width = ((0, 0), (0, self.cfg.max_length - longest), (0, 0))
        return np.pad(proba, pad_width, 'constant', constant_values=0)

class CFG:
    """Inference settings for the DeBERTa-v3-large model."""

    def __init__(self):
        self.max_length = 1600
        self.test_batch_size = 17


CFG_dbv3l = CFG()

In [None]:

def dbv3l_wp():
    """Run DeBERTa-v3-large inference and return word-level probabilities.

    Predicts with every checkpoint matching the glob pattern, sums the token
    probabilities over the checkpoints (no division is applied here) and
    converts the sum to word level.
    """
    tokenizer = DebertaV2TokenizerFast.from_pretrained('../input/deberta-v3-large/deberta-v3-large')
    test_samples = prepare_test_data(valid_id, tokenizer)
    dm = DataModule(test_samples, tokenizer, CFG_dbv3l)
    trainer = Trainer(gpus=1, precision=16, num_sanity_val_steps=0)

    model = DBv3LModule(CFG_dbv3l)
    checkpoint_pattern = '../input/feedback2022infer/dbv3l/*.ckpt'

    summed_proba = None
    for checkpoint in glob.glob(checkpoint_pattern):
        print(checkpoint)
        model.load_state_dict(torch.load(checkpoint)['state_dict'])
        fold_proba = np.concatenate(trainer.predict(model, datamodule=dm))  # (N,seq,label)
        if summed_proba is None:
            summed_proba = fold_proba
        else:
            summed_proba += fold_proba

    return get_word_proba(summed_proba, test_samples)

In [None]:
# Run the DeBERTa-v3-large inference in a child process (see process()) and
# keep only the returned word probabilities in this process.
wp_dbv3l = process(dbv3l_wp)

# DBL

In [None]:

class DBLModule(LightningModule):
    """Lightning wrapper around a 15-label DeBERTa-large token classifier."""

    def __init__(self, cfg):
        super().__init__()
        self.cfg = cfg
        # Only the architecture is created here (from_config); no pretrained
        # weights are loaded by this constructor.
        model_config = AutoConfig.from_pretrained('../input/debertalarge')
        model_config.num_labels = 15
        self.model = AutoModelForTokenClassification.from_config(config=model_config)

    def forward(self, input_ids, attention_mask):
        return self.model(input_ids=input_ids, attention_mask=attention_mask).logits

    def predict_step(self, batch, batch_idx):
        # Run the model only up to the longest real sequence in the batch ...
        longest = batch['token_length'].max().item()
        logits = self(batch['ids'][:, :longest], batch['mask'][:, :longest])  # (N,seq,label)
        proba = torch.softmax(logits, dim=2).detach().cpu().numpy()
        # ... then zero-pad back to max_length so that batches can be concatenated.
        pad_width = ((0, 0), (0, self.cfg.max_length - longest), (0, 0))
        return np.pad(proba, pad_width, 'constant', constant_values=0)
    
class CFG:
    """Inference settings for the DeBERTa-large model."""

    def __init__(self):
        self.max_length = 1600  # 1536
        self.test_batch_size = 17


CFG_dbl1 = CFG()


In [None]:

def dbl_wp():
    """Run DeBERTa-large inference and return word-level probabilities.

    Predicts with every checkpoint matching the glob pattern, sums the token
    probabilities over the checkpoints (no division is applied here) and
    converts the sum to word level.
    """
    tokenizer = AutoTokenizer.from_pretrained('../input/debertalarge')
    test_samples = prepare_test_data(valid_id, tokenizer)
    dm = DataModule(test_samples, tokenizer, CFG_dbl1)

    trainer = Trainer(gpus=1, precision=16, num_sanity_val_steps=0)
    model = DBLModule(CFG_dbl1)

    checkpoint_pattern = '../input/feedback2022infer2/dbl-2/*.ckpt'

    summed_proba = None
    for checkpoint in glob.glob(checkpoint_pattern):
        print(checkpoint)
        model.load_state_dict(torch.load(checkpoint)['state_dict'])
        fold_proba = np.concatenate(trainer.predict(model, datamodule=dm))  # (N,seq,label)
        if summed_proba is None:
            summed_proba = fold_proba
        else:
            summed_proba += fold_proba

    return get_word_proba(summed_proba, test_samples)

In [None]:
# Run the DeBERTa-large inference in a child process (see process()) and keep
# only the returned word probabilities in this process.
wp_dbl1 = process(dbl_wp)

# Ensemble

In [None]:
def jn2(cur):
    """Return the items of ``cur`` as one space-separated string."""
    return " ".join(map(str, cur))

def link(oof):
    """Merge neighbouring predicted spans of the same class within each essay.

    Spans are processed per essay id and per class in row order. A span is
    merged into the previous one when the gap between the previous span's last
    word index and the next span's first word index is at most ``gap[class]``;
    merged spans concatenate their word indices and their per-word scores.
    'Claim' uses a gap of -1, which effectively disables linking for
    ascending, non-overlapping spans.

    Args:
        oof: DataFrame with columns ``id``, ``class``, ``predictionstring``
            (space-separated word indices) and ``score`` (list of per-word
            scores).

    Returns:
        DataFrame with columns ``id``, ``class``, ``predictionstring`` and
        ``score``, ordered by first appearance of the id and then by the fixed
        class order below.
    """
    gap = {
        'Lead'                : 3,
        'Position'            : 3,
        'Claim'               : -1,# not link
        'Counterclaim'        : 3,
        'Rebuttal'            : 3,
        'Evidence'            : 3,
        'Concluding Statement': 3,
    }
    linkcat = ['Lead', 'Position', 'Evidence', 'Claim','Concluding Statement','Counterclaim', 'Rebuttal']
    columns = ['id', 'class', 'predictionstring', 'score']

    if len(oof) == 0:
        return pd.DataFrame([], columns=columns)

    # Group the rows by (id, class) in a single pass instead of re-filtering
    # the whole frame for every id/class pair. Dicts keep insertion order, so
    # ids come out in order of first appearance.
    spans_by_id = {}
    for idv, c, predictionstring, score in zip(
        oof['id'], oof['class'], oof['predictionstring'], oof['score']
    ):
        by_class = spans_by_id.setdefault(idv, {})
        if c not in gap:
            continue  # classes outside linkcat never reach the output
        words = [int(x) for x in predictionstring.split()]
        by_class.setdefault(c, []).append((words, score))

    retval = []
    for idv, by_class in spans_by_id.items():
        for c in linkcat:
            spans = by_class.get(c)
            if not spans:
                continue

            cur, curs = spans[0]
            for nxt, nxts in spans[1:]:
                if nxt[0] - cur[-1] > gap[c]:
                    # Too far apart: emit the current span and start a new one.
                    retval.append((idv, c, jn2(cur), curs))
                    cur, curs = nxt, nxts
                else:
                    cur = cur + nxt
                    curs = curs + nxts

            retval.append((idv, c, jn2(cur), curs))

    roof = pd.DataFrame(retval, columns=columns)
    return roof

In [None]:
def word_probability_to_predict_df(text_to_word_probability, id, target_to_id, id_to_target):
    """Decode word-level class probabilities into discourse spans.

    Each word gets its arg-max label. A span starts at any word whose label is
    neither 'O' nor 'PAD' and extends over the following words that carry the
    matching 'I-' label. The last word of the text is never part of a span and
    never starts one (existing behaviour, kept unchanged).

    Args:
        text_to_word_probability: Array of shape (n_words, n_labels).
        id: Essay id written into every output row.
        target_to_id: Mapping label name -> class index; must contain 'O' and 'PAD'.
        id_to_target: Mapping class index -> label name ('B-...' / 'I-...').

    Returns:
        DataFrame with columns ``id``, ``class``, ``predictionstring``
        (space-separated word indices) and ``score`` (list of per-word maximum
        probabilities). Empty when no span is found or when the text has fewer
        than two words.

    Raises:
        NotImplementedError: If a span-starting label begins with neither 'B' nor 'I'.
    """
    columns = ['id', 'class', 'predictionstring', 'score']
    len_word = len(text_to_word_probability)
    # With 0 or 1 words the scan below would index past the end of the arrays;
    # since the last word can never belong to a span, the result is empty.
    if len_word < 2:
        return pd.DataFrame([], columns=columns)

    word_predict = text_to_word_probability.argmax(-1)
    word_score   = text_to_word_probability.max(-1)
    skip_labels = [target_to_id['O'], target_to_id['PAD']]
    predict_df = []

    t = 0
    while True:
        if word_predict[t] not in skip_labels:
            start = t
            b_marker_label = word_predict[t]
        else:
            t = t+1
            if t== len_word-1: break
            continue

        t = t+1
        if t== len_word-1: break

        # The label that continues this span: the 'I-' label follows its 'B-'
        # label directly in the id mapping.
        if   id_to_target[b_marker_label][0]=='B':
            i_marker_label = b_marker_label+1
        elif id_to_target[b_marker_label][0]=='I':
            i_marker_label = b_marker_label
        else:
            raise NotImplementedError

        while True:
            if (word_predict[t] != i_marker_label) or (t ==len_word-1):
                end = t  # exclusive
                prediction_string = ' '.join([str(i) for i in range(start,end)])
                discourse_type = id_to_target[b_marker_label][2:]
                discourse_score = word_score[start:end].tolist()
                predict_df.append((id, discourse_type, prediction_string, discourse_score))
                break
            else:
                t = t+1
                continue
        if t== len_word-1: break

    predict_df = pd.DataFrame(predict_df, columns=columns)
    return predict_df

def do_threshold(submit_df, use=('length', 'probability')):
    """Drop predicted spans that are too short or too uncertain.

    Args:
        submit_df: DataFrame with columns ``id``, ``class``,
            ``predictionstring`` and (for the probability filter) ``score``, a
            list of per-word scores. It is not modified.
        use: Which filters to apply: 'length' compares the number of words in
            ``predictionstring`` with ``length_threshold[class]``;
            'probability' compares the mean score with
            ``probability_threshold[class]``. Rows strictly below a threshold
            are removed.

    Returns:
        The surviving rows, restricted to ``id``, ``class`` and
        ``predictionstring``.
    """
    df = submit_df.copy()
    df = df.fillna('')

    # Rows are filtered with positional boolean masks rather than
    # drop(index=...): dropping by label would also remove unrelated rows
    # whenever the index contains duplicate labels.
    if 'length' in use:
        lengths = np.array([len(x.split()) for x in df['predictionstring']], dtype=np.int64)
        too_short = np.zeros(len(df), dtype=bool)
        for key, value in length_threshold.items():
            too_short |= (df['class'] == key).to_numpy() & (lengths < int(value))
        df = df[~too_short]

    if 'probability' in use:
        mean_score = np.array([np.mean(x) for x in df['score']], dtype=np.float64)
        too_uncertain = np.zeros(len(df), dtype=bool)
        for key, value in probability_threshold.items():
            too_uncertain |= (df['class'] == key).to_numpy() & (mean_score < float(value))
        df = df[~too_uncertain]

    df = df[['id', 'class', 'predictionstring']]
    return df

def get_prediction(word_proba, ids,target_to_id,id_to_target):
    """Turn word-level probabilities of all essays into the final span table.

    Every essay is decoded into spans, the spans of all essays are stacked,
    neighbouring spans are linked and finally the length and probability
    thresholds are applied.

    Args:
        word_proba: Sequence with one (n_words, n_labels) array per essay,
            aligned with ``ids``.
        ids: Essay ids.
        target_to_id: Mapping label name -> class index.
        id_to_target: Mapping class index -> label name.

    Returns:
        DataFrame with columns ``id``, ``class`` and ``predictionstring``.
    """
    # Collect the per-essay frames and concatenate once: appending to a
    # DataFrame inside the loop copies the growing frame on every iteration,
    # and DataFrame.append no longer exists in pandas >= 2.0.
    frames = [
        word_probability_to_predict_df(word_proba[i], sample_id, target_to_id, id_to_target)
        for i, sample_id in enumerate(ids)
    ]
    if frames:
        df_pred = pd.concat(frames)
    else:
        df_pred = pd.DataFrame(columns = ['id','class','predictionstring'])

    df_link = link(df_pred)
    df_lp   = do_threshold(df_link, use=['length','probability'])
    return df_lp

In [None]:
# NOTE(review): only the DeBERTa-large word probabilities (wp_dbl1) enter the
# final prediction; wp_dbxl and wp_dbv3l computed above are not used here.
# The division by 4 presumably turns the checkpoint sum from dbl_wp into an
# average -- confirm that the checkpoint folder really holds 4 files.
wproba_ens = [wp_dbl1[i]/4 for i in range(len(valid_id))]

In [None]:
# Decode, link and threshold the word probabilities into the submission table
# (the author reports a runtime of about 20 min), then show the number of
# essays with at least one span and the total number of spans.
sub = get_prediction(wproba_ens,valid_id,target_id_map,id_target_map)
sub.id.nunique(),len(sub)

In [None]:
# Write the predictions to submission.csv without the index column and preview them.
sub.to_csv("submission.csv", index=False)
sub.head()