In [None]:
import os
import gc
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt

import torch
import torch.nn as nn

from sklearn.model_selection import KFold
from transformers import AutoTokenizer, AutoModel, AutoConfig, LongformerTokenizer

In [None]:
# Prefer the GPU when torch can see one; otherwise run everything on the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [None]:
class config:
    # Plain class used as a namespace: settings are read as ``config.<name>``.
    # NOTE(review): FeedbackModel.__init__ reads config.model_name, which is not
    # defined here -- add it before constructing a FeedbackModel from scratch.
    batch_size = 4       # presumably the training batch size; the inference DataLoader sets its own
    acc_steps = 8        # presumably gradient-accumulation steps -- not read in the cells shown
    max_len = 2048       # FeedbackDataset truncates / pads every essay to this many tokens
    lr = 2e-5            # presumably the optimizer learning rate -- not read in the cells shown
    weight_decay = 1e-3  # presumably the optimizer weight decay -- not read in the cells shown

In [None]:
# Segment tags "B" / "I" / "O" (presumably begin / inside / outside) and their class indices.
segment2label = {tag: index for index, tag in enumerate("BIO")}
# Inverse lookup, derived from the forward map so the two can never disagree.
label2segment = {index: tag for tag, index in segment2label.items()}

# Discourse classes in class-index order; 'O' marks words outside any discourse element
# (postprocess drops words labelled 'O').
discourse2label = {
    name: index
    for index, name in enumerate((
        'Lead',
        'Position',
        'Evidence',
        'Claim',
        'Concluding Statement',
        'Counterclaim',
        'Rebuttal',
        'O',
    ))
}
# Inverse lookup used to turn predicted class indices back into names.
label2discourse = {index: name for name, index in discourse2label.items()}

# model

In [None]:
class FeedbackModel(nn.Module):
    """Pretrained transformer backbone with two per-token linear heads.

    ``fc_segment`` emits 3 scores per token and ``fc_discourse`` emits 8, which
    matches the sizes of ``segment2label`` and ``discourse2label``.
    """

    def __init__(self):
        """Load the backbone named by ``config.model_name`` and size the heads from its config."""
        super().__init__()
        backbone_config = AutoConfig.from_pretrained(config.model_name)
        hidden_size = backbone_config.hidden_size

        # Attribute names are part of the saved checkpoints' layout; keep them stable.
        self.backbone = AutoModel.from_pretrained(config.model_name)
        self.fc_segment = nn.Linear(hidden_size, 3)
        self.fc_discourse = nn.Linear(hidden_size, 8)

    def forward(self, input_ids, attn_mask):
        """Return ``(segment_scores, discourse_scores)`` for every token position.

        Both inputs are handed positionally to the backbone; the heads are applied
        to its ``last_hidden_state``. Scores are raw (no softmax applied here).
        """
        token_states = self.backbone(input_ids, attn_mask).last_hidden_state
        return self.fc_segment(token_states), self.fc_discourse(token_states)

# dataset

In [None]:
class FeedbackDataset( torch.utils.data.Dataset ):
    """Map-style dataset that turns pre-split essays into fixed-length model inputs.

    Args:
        df: frame with a ``content`` column; each entry is one essay given as a
            list of words (it is tokenized with ``is_split_into_words=True``).
        tokenizer: callable tokenizer whose result supports ``['input_ids']``,
            ``['attention_mask']`` and ``.word_ids()``, and which exposes
            ``pad_token_id``.
        max_len: optional fixed sequence length for this dataset. ``None``
            (the default) falls back to ``config.max_len`` at item-access time.
    """

    def __init__(self, df, tokenizer, max_len=None):
        self.tokenizer = tokenizer
        self.max_len = max_len
        # Work on a copy so the dataset does not share the caller's column buffer.
        self.content = df.copy().content.values

    def get_tokenized_inputs(self, essay):
        """Tokenize one list-of-words essay; return ``(encoding, word_ids)``."""
        tokenized_inputs = self.tokenizer(essay, is_split_into_words=True)
        word_ids = tokenized_inputs.word_ids()
        return (tokenized_inputs, word_ids)

    def __getitem__(self, idx):
        """Return the tensors for essay ``idx``, truncated / padded to the fixed length.

        Returns a dict with:
            input_ids:   long tensor, padded with ``tokenizer.pad_token_id``.
            attn_mask:   long tensor, padded with 0.
            word_ids:    long tensor mapping each token to its word index; -100
                         for tokens without a word and for padding.
            seq_len:     scalar long tensor, number of real (unpadded) tokens.
            rpercentile: float32 tensor, ``(position + 1) / seq_len - 0.5``.
        """
        max_len = config.max_len if self.max_len is None else self.max_len
        (tokenized_inputs, word_ids) = self.get_tokenized_inputs(self.content[idx])

        # word_ids() reports None for tokens that belong to no word (special
        # tokens). Map every one of them to -100 rather than only the first and
        # last entries, so a None elsewhere cannot break the tensor conversion.
        word_ids = [-100 if word_id is None else word_id for word_id in word_ids]

        input_ids = tokenized_inputs['input_ids'][:max_len]
        attn_mask = tokenized_inputs['attention_mask'][:max_len]
        word_ids = word_ids[:max_len]
        seq_len = len(input_ids)

        pad_count = max_len - seq_len
        if pad_count > 0:
            input_ids = input_ids + [self.tokenizer.pad_token_id] * pad_count
            attn_mask = attn_mask + [0] * pad_count
            word_ids = word_ids + [-100] * pad_count

        # Relative position of each slot within the real sequence, shifted by -0.5.
        rpercentile = ((1 + np.arange(0, max_len)) / seq_len) - 0.5

        return {
            'input_ids': torch.tensor(input_ids, dtype=torch.long),
            'attn_mask': torch.tensor(attn_mask, dtype=torch.long),
            'word_ids': torch.tensor(word_ids, dtype=torch.long),
            'seq_len': torch.tensor(seq_len, dtype=torch.long),
            'rpercentile': torch.tensor(rpercentile, dtype=torch.float32)
        }

    def __len__(self):
        """Number of essays in the dataset."""
        return len(self.content)

# load models

In [None]:
# Tokenizer files shipped as a dataset next to the checkpoints.
tokenizer = AutoTokenizer.from_pretrained(
    '../input/longformer-base-tokenizer/longformer_large_tokenizer'
)

# Ensemble members. Each checkpoint is loaded as a whole object that prediction()
# later calls and switches to eval mode, so the class it was saved with
# (presumably FeedbackModel) must already be defined.
# NOTE(review): torch.load unpickles the file -- only load checkpoints you trust.
models = [
    torch.load(checkpoint_path, map_location=device)
    for checkpoint_path in (
        '../input/longformer-multitask-baselinemodel/model0.pt',
        '../input/feedback-large-longformer-model1/model1.pt',
        '../input/feedback-large-longformer-model2/model2.pt',
    )
]

In [None]:
def read_essay(filename, essay_folder='../input/feedback-prize-2021/test'):
    """Read one essay file and return its whitespace-separated words.

    Args:
        filename: file name (including extension) inside ``essay_folder``.
        essay_folder: directory holding the essay files; defaults to the
            competition test folder used by this notebook.

    Returns:
        list[str]: the essay split with ``str.split()`` (any run of whitespace
        separates words; an empty file yields ``[]``).

    Raises:
        OSError: if the file cannot be opened.
        UnicodeDecodeError: if the file is not valid UTF-8.
    """
    filepath = os.path.join(essay_folder, filename)
    # Decode explicitly as UTF-8 so the result does not depend on the locale.
    with open(filepath, encoding='utf-8') as file:
        return file.read().split()

In [None]:
# One row per essay file: the id is the file name without ".txt", the content is
# the essay as a list of words (see read_essay).
test_files = os.listdir('../input/feedback-prize-2021/test')
test_df = pd.DataFrame.from_dict([
    {
        'id': filename.replace(".txt", ''),
        'content': read_essay(filename)
    }
    for filename in test_files
])

In [None]:
# Wrap the test essays for batched inference. Order is preserved (no shuffling,
# no dropped tail batch) so predictions line up with the rows of test_df.
val_dataset = FeedbackDataset(test_df, tokenizer)
val_dataloader = torch.utils.data.DataLoader(
    val_dataset,
    batch_size=2,
    shuffle=False,
    drop_last=False,
)

In [None]:
def postprocess(y, word_ids):
    """Collapse per-token class predictions into contiguous word-level spans.

    Args:
        y: sequence of predicted class indices, one per token position.
        word_ids: sequence giving the word index of each token position, with
            -100 marking positions to ignore. Must be at least as long as ``y``.

    Returns:
        list[dict]: one ``{'segment': class_name, 'word_ids': [word, ...]}`` per
        span, in order of appearance. A span is a run of words with the same
        class whose word indices increase by exactly 1. Words labelled 'O', or
        with a class index missing from ``label2discourse``, are left out.
    """
    # Step 1: one (class name, word index) pair per word, taken from the first
    # token of each word.
    labelled_words = []
    last_word = None
    for position, label in enumerate(y):
        word = word_ids[position]
        if (word == -100) or (last_word == word):
            continue
        last_word = word
        if label not in label2discourse:
            continue
        name = label2discourse[label]
        if name != 'O':
            labelled_words.append((name, word))

    # Step 2: grow the current span while class and adjacency both hold,
    # otherwise open a new one.
    spans = []
    for name, word in labelled_words:
        if spans:
            current = spans[-1]
            if (word == 1 + current['word_ids'][-1]) and (name == current['segment']):
                current['word_ids'].append(word)
                continue
        spans.append({
            'segment': name,
            'word_ids': [word]
        })
    return spans

In [None]:
def prediction(val_dataloader):
    """Run the model ensemble over a dataloader and return word-level spans per essay.

    Args:
        val_dataloader: iterable of batches shaped like FeedbackDataset items
            (keys ``input_ids``, ``attn_mask``, ``word_ids``, ``seq_len``), each
            stacked along a leading batch dimension.

    Returns:
        list[list[dict]]: for every essay, in dataloader order, the spans from
        ``postprocess`` with ``'word_ids'`` rewritten as a space-separated string
        of word indices.

    Uses the module-level ``models``, ``device`` and ``postprocess``.
    """
    # Switching to eval mode does not depend on the batch; do it once up front.
    for model in models:
        model.eval()

    predvalues = []
    with torch.no_grad():
        for inputs in val_dataloader:
            word_ids = inputs['word_ids']
            # Trim the fixed-length padding down to the longest essay in the batch.
            batch_max_seqlen = torch.max(inputs['seq_len']).item()
            input_ids = inputs['input_ids'][:, :batch_max_seqlen].to(device)
            attn_mask = inputs['attn_mask'][:, :batch_max_seqlen].to(device)

            bsize = attn_mask.shape[0]
            # Accumulate on the CPU; 8 is the width of the discourse head.
            yhat_discourse = torch.zeros((bsize, batch_max_seqlen, 8))
            for model in models:
                (_, ycur_discourse) = model(input_ids, attn_mask)
                yhat_discourse += ycur_discourse.softmax(dim=-1).cpu()
            # Average the class probabilities over the ensemble, then take the best class.
            yhat_discourse = (yhat_discourse / len(models)).argmax(dim=-1)

            for i in range(bsize):
                pred_tokens = postprocess(yhat_discourse[i].numpy(), word_ids[i].numpy())
                for token in pred_tokens:
                    token['word_ids'] = ' '.join(str(x) for x in token['word_ids'])
                predvalues.append(pred_tokens)
    return predvalues

In [None]:
# Run the ensemble once and keep the per-essay spans next to the essays they belong to.
test_df = test_df.assign(predvalues=prediction(val_dataloader))
test_df.head()

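The thresholds below were picked from the training data: for each discourse type, segments with so few words account for <1% of the training segments. Predicted segments with at most that many words are dropped.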

In [None]:
# Per-class cut-off on predicted segment length, in words. The submission cell
# skips any predicted segment whose word count is <= the value for its class.
min_number_threshold = {
    "Lead": 8,
    "Position": 5,
    "Evidence": 11,
    "Claim": 3,
    "Concluding Statement": 9,
    "Counterclaim": 5,
    "Rebuttal": 4,
}

In [None]:
def _essay_submission_rows(essay_id, predvalues, thresholds):
    """Turn one essay's predicted spans into submission rows.

    Args:
        essay_id: value written to the ``id`` field of every row.
        predvalues: spans for the essay, each a dict with ``'segment'`` (class
            name) and ``'word_ids'`` (space-separated word indices).
        thresholds: class name -> word count; spans with that many words or
            fewer are dropped.

    Returns:
        list[dict]: rows with keys ``id``, ``class`` and ``predictionstring``,
        in span order. Consecutive surviving 'Evidence' spans separated by
        exactly one word are merged into one row that also covers that word.
    """
    rows = []
    for pred in predvalues:
        segment = pred['segment']
        pred_string = pred['word_ids']

        # Too short to be a plausible segment of this class.
        if len(pred_string.split()) <= thresholds[segment]:
            continue

        if rows:
            previous = rows[-1]
            last_word = int(previous['predictionstring'].split()[-1])
            first_word = int(pred_string.split()[0])
            one_word_gap = (first_word - last_word - 1) == 1
            if previous['class'] == 'Evidence' and segment == 'Evidence' and one_word_gap:
                # Bridge the single missing word and extend the previous row;
                # a later Evidence span can chain onto the merged row.
                previous['predictionstring'] = (
                    previous['predictionstring'] + ' ' + str(last_word + 1) + ' ' + pred_string
                )
                continue

        rows.append({
            'id': essay_id,
            'class': segment,
            'predictionstring': pred_string
        })
    return rows


submission_data = []
for essay_id, predvalues in zip(test_df['id'], test_df['predvalues']):
    submission_data += _essay_submission_rows(essay_id, predvalues, min_number_threshold)

# Naming the columns keeps the frame well-formed even when no span survives;
# an empty list alone would give a frame with no columns at all.
submission_df = pd.DataFrame(submission_data, columns=['id', 'class', 'predictionstring'])
submission_df.head()

In [None]:
# Write the predictions to submission.csv in the working directory, without the index column.
submission_df.to_csv("submission.csv", index=False)