In [None]:
import os
import gc
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt

import torch
import torch.nn as nn

from sklearn.model_selection import KFold
from transformers import AutoTokenizer, AutoModel, AutoConfig

Version 6:

    1. Include the relative position (percentile) of each token within the essay as an extra input feature.
    2. The analysis in https://www.kaggle.com/narendra/eda-feedback-prize shows that this position gives a direct cue of where each class of element tends to appear.

In [None]:
class config:
    """Hyper-parameters and runtime settings read by the rest of the notebook."""

    # When True, only the first 100 rows of train.csv are used (quick smoke run).
    sample = False

    # Batches of `batch_size` essays; gradients are accumulated over `acc_steps`
    # batches before each optimizer step.
    batch_size = 2
    acc_steps = 8
    epochs = 5

    # Token sequences are truncated / padded to exactly this many positions.
    max_len = 1024

    # Learning rate (also the OneCycleLR peak) and AdamW weight decay.
    lr = 2e-5
    weight_decay = 1e-3

    model_name = 'allenai/longformer-base-4096'
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
# Tag name -> integer class id.
# Every discourse type owns two consecutive ids: the even one is its 'B-'
# (first word) tag and the odd one its 'I-' (continuation) tag, in the order
# listed below. 'O' marks words outside any discourse element and 'PAD' is the
# ignore index given to positions that must not contribute to the loss.
class2label = {
    prefix + segment: 2 * position + offset
    for position, segment in enumerate([
        'Lead',
        'Position',
        'Evidence',
        'Claim',
        'Concluding Statement',
        'Counterclaim',
        'Rebuttal',
    ])
    for offset, prefix in enumerate(['B-', 'I-'])
}
class2label.update({'O': 14, 'PAD': -100})

In [None]:
# Integer class id -> tag name (the inverse of the tag table, without 'PAD').
# Even ids are 'B-' tags, odd ids are the matching 'I-' tags, 14 is 'O'.
label2class = {
    2 * position + offset: prefix + segment
    for position, segment in enumerate([
        'Lead',
        'Position',
        'Evidence',
        'Claim',
        'Concluding Statement',
        'Counterclaim',
        'Rebuttal',
    ])
    for offset, prefix in enumerate(['B-', 'I-'])
}
label2class[14] = 'O'

In [None]:
# Integer class id -> discourse type, with the B-/I- distinction dropped.
# Ids 14 ('O') and -100 ('PAD') are deliberately absent: a label that is not a
# key of this table does not belong to any discourse element.
label2segment = {
    2 * position + offset: segment
    for position, segment in enumerate([
        'Lead',
        'Position',
        'Evidence',
        'Claim',
        'Concluding Statement',
        'Counterclaim',
        'Rebuttal',
    ])
    for offset in range(2)
}

# helper functions

In [None]:
def read_essay(filename, essay_folder='../input/feedback-prize-2021/train'):
    """Return the full text of one essay.

    Args:
        filename: essay id, i.e. the file name without the ``.txt`` extension.
        essay_folder: directory holding the essay files. Defaults to the
            competition's training folder.

    Returns:
        The file content as a single string.

    Raises:
        OSError: if the essay file cannot be opened.
    """
    filepath = os.path.join(essay_folder, filename + ".txt")
    # Decode explicitly as UTF-8 so the text (and therefore the whitespace split
    # the word indices are based on) does not depend on the platform's locale.
    with open(filepath, encoding='utf-8') as file:
        return file.read()


def get_labels(row):
    """Build one BIO tag per word of an essay.

    Args:
        row: object exposing ``discourse_type`` (sequence of discourse type
            names), ``predictionstring`` (sequence of space-separated word
            indices, one string per discourse element) and ``content`` (the
            essay split into words).

    Returns:
        A list with ``len(row.content)`` tags. The first word of every
        discourse element is tagged ``'B-<type>'``, its remaining words
        ``'I-<type>'`` and every other word ``'O'``.
    """
    segment_types = row.discourse_type
    spans = row.predictionstring
    word_tags = ['O'] * len(row.content)

    for segment_no, segment_type in enumerate(segment_types):
        word_indices = [int(piece) for piece in spans[segment_no].split()]
        if not word_indices:
            continue

        # The opening word gets the B- tag, everything after it the I- tag.
        word_tags[word_indices[0]] = 'B-' + segment_type
        for word_index in word_indices[1:]:
            word_tags[word_index] = 'I-' + segment_type

    return word_tags

In [None]:
# Load the annotation table; in sample mode keep only its first 100 rows.
train_df = pd.read_csv('../input/feedback-prize-2021/train.csv')
if config.sample:
    train_df = train_df.head(100)

# Collapse to one row per essay, gathering that essay's discourse types and
# prediction strings into parallel lists.
train_df = (
    train_df
    .groupby('id')[['discourse_type', 'predictionstring']]
    .agg(list)
    .reset_index()
)

# Essay text split on whitespace, then one BIO tag per resulting word.
train_df['content'] = train_df.id.apply(lambda essay_id: read_essay(essay_id).split())
train_df['labels'] = train_df.apply(get_labels, axis=1)

train_df.head()

# dataset

In [None]:
class FeedbackDataset( torch.utils.data.Dataset ):
    """Token-classification dataset over pre-split essays.

    Each item is one essay tokenized with ``tokenizer``, truncated / padded to
    exactly ``config.max_len`` positions, together with per-token labels, the
    word index of every token and a relative-position feature.

    Args:
        df: frame with a ``content`` column (list of words per essay) and a
            ``labels`` column (one tag name per word).
        tokenizer: tokenizer that accepts ``is_split_into_words=True`` and whose
            result exposes ``word_ids()``.
    """

    def __init__(self, df, tokenizer):
        self.tokenizer=tokenizer
        df=df.copy()
        self.content = df.content.values
        self.labels = df.labels.values

    @staticmethod
    def _mask_special_word_ids(word_ids):
        """Replace every ``None`` word id (a token that belongs to no word) by -100.

        Only the ``None`` entries are touched, so this stays correct whether the
        tokenizer adds special tokens at both ends, at one end only, or not at
        all, and it also accepts an empty list.
        """
        return [-100 if word_id is None else word_id for word_id in word_ids]

    def get_tokenized_inputs(self, essay, labels):
        """Tokenize ``essay`` and align the word-level ``labels`` with its tokens.

        Returns:
            ``(tokenized_inputs, labelids, word_ids)`` where ``labelids`` holds
            the class id of the word on the first token of each word and the
            'PAD' id on every other position (tokens without a word and the
            continuation tokens of a word). ``word_ids`` is returned exactly as
            produced by the tokenizer.
        """
        tokenized_inputs = self.tokenizer(essay, is_split_into_words=True)
        word_ids = tokenized_inputs.word_ids()
        labelids = []
        prv_word_idx=None
        for word_id in word_ids:
            if (word_id is None) or (prv_word_idx == word_id):
                labelids.append( class2label['PAD'] )
            else:
                labelids.append( class2label[ labels[word_id] ] )
            prv_word_idx = word_id
        return (tokenized_inputs, labelids, word_ids)


    def __getitem__(self, idx):
        """Return the tensors of essay ``idx``.

        Keys: ``input_ids``, ``attn_mask``, ``y`` (labels, -100 where ignored),
        ``word_ids`` (-100 where a token has no word), ``rpercentile`` (all of
        length ``config.max_len``) and the scalar ``seq_len`` (number of real
        tokens kept after truncation).
        """
        essay  = self.content[idx]
        labels = self.labels[idx]
        (tokenized_inputs, labelids, word_ids) = self.get_tokenized_inputs(essay, labels)
        # Tokens that map to no word must become -100 before the list can be
        # turned into a tensor; downstream code skips positions equal to -100.
        word_ids = self._mask_special_word_ids(word_ids)

        input_ids = tokenized_inputs['input_ids'][:config.max_len]
        attn_mask = tokenized_inputs['attention_mask'][:config.max_len]
        labelids  = labelids[:config.max_len]
        word_ids = word_ids[:config.max_len]
        seq_len = len(input_ids)

        if seq_len < config.max_len:
            len_diff = config.max_len - seq_len
            attn_mask += [0] * len_diff
            labelids  += [-100] * len_diff
            input_ids += [self.tokenizer.pad_token_id] * len_diff
            word_ids += [-100] * len_diff

        # Position of each token as a fraction of the kept sequence, shifted by
        # -0.5. NOTE(review): for essays longer than config.max_len the fraction
        # is relative to the truncated length, not the full essay - confirm this
        # is intended.
        rpercentile = ((1 + np.arange(0, config.max_len))/seq_len) - 0.5

        input_ids=torch.tensor(input_ids, dtype=torch.long)
        attn_mask = torch.tensor(attn_mask, dtype=torch.long)
        y = torch.tensor(labelids, dtype=torch.long)
        seq_len = torch.tensor(seq_len, dtype=torch.long)
        word_ids= torch.tensor(word_ids, dtype=torch.long)
        rpercentile = torch.tensor(rpercentile, dtype=torch.float32)

        return {
            'input_ids': input_ids,
            'attn_mask': attn_mask,
            'y': y,
            'word_ids': word_ids,
            'seq_len': seq_len,
            'rpercentile': rpercentile
        }

    def __len__(self):
        """Number of essays in the dataset."""
        return len(self.labels)

# model

In [None]:
class FeedbackModel(nn.Module):
    """Pretrained transformer backbone followed by a per-token linear classifier.

    The classifier sees the backbone's last hidden state of every token with
    one extra scalar appended: the token's relative-position feature.

    Args:
        num_labels: number of output classes per token.
    """

    def __init__(self, num_labels):
        super().__init__()
        hidden_size = AutoConfig.from_pretrained(config.model_name).hidden_size

        self.backbone = AutoModel.from_pretrained(config.model_name)
        # One additional input feature for the scalar concatenated in forward().
        self.output = nn.Linear(1 + hidden_size, num_labels)

    def forward(self, input_ids, attn_mask, rpercentile):
        """Return the class logits of every token.

        ``rpercentile`` must have the same leading dimensions as the backbone's
        last hidden state without its final (feature) dimension; it is appended
        as one more feature before the linear layer.
        """
        hidden_states = self.backbone(input_ids, attn_mask).last_hidden_state
        position_feature = rpercentile.unsqueeze(-1)
        features = torch.cat([hidden_states, position_feature], dim=-1)
        return self.output(features)

# evaluate

In [None]:
def postprocess(y, word_ids, segment_map=None, begin_labels=None):
    """Group token-level labels into word-level discourse spans.

    Only the first token of each word is considered; positions whose word id is
    -100 and labels that are not keys of ``segment_map`` are skipped. A word
    joins the current span when it directly follows the span's last word, has
    the same discourse type and does not carry a 'B-' label. In every other
    case it opens a new span, so two adjacent elements of the same type stay
    separate as long as the second one starts with its 'B-' tag.

    Args:
        y: sequence of integer class ids, one per token.
        word_ids: sequence of word indices, one per token, at least as long as
            ``y``; -100 marks tokens that belong to no word.
        segment_map: class id -> discourse type. Defaults to ``label2segment``.
        begin_labels: collection of class ids that start a new element.
            Defaults to the keys of ``segment_map`` whose ``label2class`` name
            starts with ``'B-'``.

    Returns:
        A list of ``{'segment': <type>, 'word_ids': [<word index>, ...]}``
        dicts in order of appearance; empty when nothing is labelled.
    """
    if segment_map is None:
        segment_map = label2segment
    if begin_labels is None:
        begin_labels = {
            label for label in segment_map
            if label2class.get(label, '').startswith('B-')
        }

    preds = []
    prv_word_id = None
    for i in range(len(y)):
        word_id = word_ids[i]
        if (word_id == -100) or (prv_word_id == word_id):
            continue
        prv_word_id = word_id

        label = int(y[i])
        if label not in segment_map:
            continue
        segment = segment_map[label]
        word_id = int(word_id)

        continues_span = False
        if preds and label not in begin_labels:
            current = preds[-1]
            continues_span = (
                current['segment'] == segment
                and current['word_ids'][-1] + 1 == word_id
            )

        if continues_span:
            preds[-1]['word_ids'].append(word_id)
        else:
            preds.append({'segment': segment, 'word_ids': [word_id]})
    return preds

In [None]:
def is_positive(true_token, pred_token):
    """Tell whether a predicted span matches a ground-truth span.

    Both arguments are dicts with a ``'segment'`` (discourse type) and a
    ``'word_ids'`` (word indices) entry. The spans match when they have the
    same type and the shared words make up at least half of the true span and
    at least half of the predicted span.

    Returns:
        ``True`` for a match, ``False`` otherwise.
    """
    true_words = set(true_token['word_ids'])
    pred_words = set(pred_token['word_ids'])

    if true_token['segment'] != pred_token['segment']:
        return False

    shared = len(true_words & pred_words)
    true_coverage = shared / len(true_words)
    pred_coverage = shared / len(pred_words)

    return true_coverage >= 0.5 and pred_coverage >= 0.5

In [None]:
def evaluate(val_dataloader, model):
    """Compute span-level precision, recall and F-score on a validation loader.

    For every essay the true labels and the arg-max predictions are grouped
    into spans with ``postprocess``. A true span counts as found when an
    as-yet-unused predicted span satisfies ``is_positive``; each predicted span
    can satisfy at most one true span, so the false-positive count can never
    become negative.

    Args:
        val_dataloader: iterable of batches with the keys ``input_ids``,
            ``attn_mask``, ``y``, ``rpercentile``, ``word_ids`` and ``seq_len``.
        model: callable ``model(input_ids, attn_mask, rpercentile)`` returning
            per-token class scores; it is switched to eval mode.

    Returns:
        ``(precision, recall, fscore)``. A ratio whose denominator is zero
        (no predicted spans, or no true spans) is reported as ``0.0`` instead
        of raising ``ZeroDivisionError``.
    """
    true_positives=0
    false_positives=0
    false_negatives=0

    model.eval()
    for inputs in val_dataloader:
        word_ids = inputs['word_ids']
        # Drop the padding columns that no sequence of this batch uses.
        batch_max_seqlen = torch.max(inputs['seq_len']).item()

        input_ids = inputs['input_ids'][:, :batch_max_seqlen].to(config.device)
        attn_mask = inputs['attn_mask'][:, :batch_max_seqlen].to(config.device)
        rpercentile = inputs['rpercentile'][:, :batch_max_seqlen].to(config.device)
        y = inputs['y'][:, :batch_max_seqlen]

        with torch.no_grad():
            # The arg-max of the raw scores is the arg-max of their softmax, so
            # no normalisation is needed to pick the class.
            yhat = model(input_ids, attn_mask, rpercentile).argmax(dim=-1).cpu()

        for i in range(y.shape[0]):
            word_ids_i = word_ids[i].numpy()
            true_tokens = postprocess(y[i].numpy(), word_ids_i)
            pred_tokens = postprocess(yhat[i].numpy(), word_ids_i)

            # Indices of predicted spans already paired with a true span.
            matched_preds = set()
            for true_token in true_tokens:
                for k, pred_token in enumerate(pred_tokens):
                    if k in matched_preds:
                        continue
                    if is_positive(true_token, pred_token):
                        matched_preds.add(k)
                        break

            pos = len(matched_preds)
            true_positives += pos
            false_positives += (len(pred_tokens) - pos)
            false_negatives += (len(true_tokens) - pos)

    num_predicted = true_positives + false_positives
    num_true = true_positives + false_negatives
    precision = true_positives/num_predicted if num_predicted > 0 else 0.0
    recall = true_positives/num_true if num_true > 0 else 0.0
    fscore = 0.0
    if precision!=0 and recall!=0:
        fscore = 2*precision*recall/(precision + recall)

    return precision, recall, fscore

# train epochs

In [None]:
def train_epoch(model, train_dataloader, optimizer, schedular, criterion):
    """Train ``model`` for one pass over ``train_dataloader``.

    Gradients are accumulated over ``config.acc_steps`` batches; the optimizer
    and the scheduler step once per accumulation group and once more on the
    final batch if the epoch does not end on a group boundary.

    Args:
        model: callable ``model(input_ids, attn_mask, rpercentile)`` returning
            per-token class scores.
        train_dataloader: sized iterable of batches with the keys
            ``input_ids``, ``attn_mask``, ``y``, ``seq_len`` and ``rpercentile``.
        optimizer: optimizer over the model parameters.
        schedular: learning-rate scheduler stepped together with the optimizer.
        criterion: loss taking ``(scores, targets)`` with the class dimension
            of ``scores`` in position 1.

    Returns:
        List with the loss of every batch. The values are the unscaled batch
        losses, i.e. NOT divided by ``config.acc_steps``, so their mean is the
        actual average training loss.
    """
    epoch_losses = []
    model.train()
    model.zero_grad(set_to_none=True)

    for i, inputs in enumerate(train_dataloader):
        input_ids = inputs['input_ids']
        attn_mask = inputs['attn_mask']
        y = inputs['y']
        seq_len = inputs['seq_len']
        rpercentile = inputs['rpercentile']
        # Drop the padding columns that no sequence of this batch uses.
        batch_max_seqlen = torch.max(seq_len).item()

        if i%100==0:
            print('iteratin:', i, batch_max_seqlen)
            gc.collect()

        input_ids = input_ids[:, :batch_max_seqlen].to(config.device)
        attn_mask = attn_mask[:, :batch_max_seqlen].to(config.device)
        rpercentile = rpercentile[:, :batch_max_seqlen].to(config.device)
        y = y[:, :batch_max_seqlen].to(config.device)

        yhat=model(input_ids, attn_mask, rpercentile)
        # Move the class dimension to position 1, as the criterion expects.
        batch_loss = criterion(yhat.transpose(1, 2), y)
        # Only the gradient is scaled for accumulation; the recorded loss below
        # keeps its true magnitude.
        (batch_loss/config.acc_steps).backward()

        if ((i+1)%config.acc_steps==0) or (i == len(train_dataloader)-1):
            optimizer.step()
            schedular.step()
            model.zero_grad(set_to_none=True)

        epoch_losses.append( batch_loss.item() )

        del yhat
        del input_ids
        del attn_mask
        del rpercentile
        del y

    return epoch_losses
        
def train_model(fold_train_df, fold_val_df):
    """Train a fresh model on one fold and keep the best checkpoint.

    After every epoch the model is scored on the validation fold; whenever the
    F-score improves, the whole model object is saved to ``model.pt``. A
    moving average (window of 8 batches) of the epoch's batch losses is
    plotted after each epoch.

    Args:
        fold_train_df: training rows (``content`` and ``labels`` columns).
        fold_val_df: validation rows with the same columns.
    """
    # NOTE(review): `truncate` and `padding` are forwarded to from_pretrained
    # unchanged; whether they have any effect there should be confirmed.
    tokenizer = AutoTokenizer.from_pretrained(
        config.model_name, add_prefix_space=True, truncate=True, padding=True
    )

    train_dataloader = torch.utils.data.DataLoader(
        FeedbackDataset(fold_train_df, tokenizer),
        batch_size=config.batch_size,
        shuffle=True,
        drop_last=True,
    )
    val_dataloader = torch.utils.data.DataLoader(
        FeedbackDataset(fold_val_df, tokenizer),
        batch_size=2*config.batch_size,
        shuffle=False,
    )

    # Every entry of class2label except 'PAD' is a real output class.
    model = FeedbackModel(len(class2label) - 1).to(config.device)
    optimizer = torch.optim.AdamW(
        model.parameters(), lr=config.lr, weight_decay=config.weight_decay
    )
    # The scheduler advances once per optimizer step; the +3 leaves head-room
    # above the number of accumulation groups per epoch.
    optimizer_steps = 3 + (len(train_dataloader) // config.acc_steps)
    schedular = torch.optim.lr_scheduler.OneCycleLR(
        optimizer,
        max_lr=config.lr,
        epochs=config.epochs,
        steps_per_epoch=optimizer_steps,
    )
    criterion = nn.CrossEntropyLoss(ignore_index=-100).to(config.device)

    print('batch size:', config.batch_size)
    print(len(train_dataloader), len(val_dataloader))

    best_fscore = None
    for e in range(config.epochs):
        print("started training of epoch:" ,e)
        batch_losses = train_epoch(model, train_dataloader, optimizer, schedular, criterion)
        print('epoch loss:{:.4f}'.format(np.mean(batch_losses)))

        print("evaluating at epoch:", e)
        precision, recall, fscore = evaluate(val_dataloader, model)
        improved = best_fscore is None or fscore > best_fscore
        if improved:
            torch.save(model, 'model.pt')
            best_fscore = fscore

        print('precision:{:.4f}| recall:{:.4f} | fscore:{:.4f}|best fscore:{:.4f}'.format(precision, recall, fscore, best_fscore))

        # Moving average over 8 consecutive batches via differences of the
        # running sum; empty when the epoch had 8 batches or fewer.
        running_sum = np.cumsum(np.array(batch_losses))
        smoothed = (running_sum[8:] - running_sum[:-8]) / 8
        if smoothed.shape[0] != 0:
            plt.title("epoch losses")
            plt.plot(smoothed)
            plt.show()

In [None]:
# 5-fold split over essays with a fixed seed so the folds are reproducible.
kfold = KFold(n_splits=5, shuffle=True, random_state=2040)

# Only the first fold is trained: the loop stops after one iteration. Remove
# the `break` to train on every fold.
for train_index, val_index in kfold.split(train_df.id):
    fold_train_df, fold_val_df = (
        train_df.iloc[rows].copy() for rows in (train_index, val_index)
    )

    print(fold_train_df.shape, fold_val_df.shape)
    train_model(fold_train_df, fold_val_df)
    break