In [None]:
!pip install transformers

In [2]:
from google.colab import drive
drive.mount('/gdrive')
root_dir = '/gdrive/My Drive/Feedback/clock/cluster/'

Mounted at /gdrive


In [3]:
import warnings
warnings.filterwarnings('ignore')

from torch import cuda
import numpy as np, os 
import pandas as pd, gc 
from tqdm import tqdm

from transformers import LongformerConfig, LongformerModel, LongformerTokenizerFast, AutoConfig, AutoModel, AutoTokenizer
from torch.utils.data import Dataset, DataLoader
from transformers import get_scheduler
import torch
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup
from torchsummary import summary
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from torch.cuda.amp import autocast, GradScaler
import torch.nn as nn
import string
import random
from ast import literal_eval

In [4]:
zconfig = {
    'model_name': 'allenai/longformer-base-4096',
    'cluster': 1,
    'output': root_dir+'one/',
    'max_len': 1600,
    'batch': 4,
    'epochs': 15,
    'device': 'cuda',
}

In [5]:
target_id_map = {
    "B-Lead": 0,
    "I-Lead": 1,
    "B-Position": 2,
    "I-Position": 3,
    "B-Evidence": 4,
    "I-Evidence": 5,
    "B-Claim": 6,
    "I-Claim": 7,
    "B-Concluding Statement": 8,
    "I-Concluding Statement": 9,
    "B-Counterclaim": 10,
    "I-Counterclaim": 11,
    "B-Rebuttal": 12,
    "I-Rebuttal": 13,
    "O": 14,
    "PAD": -100,
}
IDS_TO_LABELS = {v:k for k,v in target_id_map.items()}

MIN_THRESH = {
    "I-Lead": 11,
    "I-Position": 4,
    "I-Evidence": 20,
    "I-Claim": 3,
    "I-Concluding Statement": 17,
    "I-Counterclaim": 8,
    "I-Rebuttal": 7,
}

PROB_THRESH = {
    "I-Lead": 0.7,
    "I-Position": 0.3,
    "I-Evidence": 0.85,
    "I-Claim": 0.5,
    "I-Concluding Statement": 0.4,
    "I-Counterclaim": 0.45,
    "I-Rebuttal": 0.4,
}

In [6]:
def split_fold(df_train):
    ids = df_train['id'].unique()
    kf = KFold(n_splits=10, shuffle = True, random_state=42)
    for i_fold, (_, valid_index) in enumerate(kf.split(ids)):
        df_train.loc[valid_index,'fold'] = i_fold
    return df_train

In [7]:
train = pd.read_csv('/gdrive/My Drive/Feedback/clock/para_train.csv')
train.drop(columns = ['Unnamed: 0'], inplace = True)

comb_ids = pd.read_csv(root_dir+'cluster_config/all_ids_clust.csv')
comb_ids.drop(columns = ['Unnamed: 0'], inplace = True)
# comb_ids.loc[comb_ids['id'] == 'B1D7630EE532', 'cluster'] = 1
# comb_ids = comb_ids[comb_ids['cluster'] == zconfig['cluster']].copy()
comb_ids.reset_index(drop = True, inplace=True)
comb_ids = split_fold(comb_ids)

In [None]:
# df = pd.read_csv(root_dir+'upsampled_train.csv')
# df.drop(columns = ['Unnamed: 0'], inplace = True)
# df['paragraph'] = df['paragraph'].apply(literal_eval)

In [8]:
def ner(df_texts, df_train):
  all_entities = []
  all_text = []
  for _,  row in tqdm(df_texts.iterrows(), total=len(df_texts)):
    text = row['paragraph'].split()
    all_text.append(text)
    total = len(text)
    entities = ['O'] * total

    for _, row2 in df_train[df_train['id'] == row['id']].iterrows():
      discourse = row2['discourse_type']
      list_ix = [int(x) for x in row2['predictionstring'].split(' ')]
      entities[list_ix[0]] = f'B-{discourse}'
      for k in list_ix[1:]: entities[k] = f'I-{discourse}'
    all_entities.append(entities)

  df_texts['text_split'] = all_text
  df_texts['entities'] = all_entities
  return df_texts

In [None]:
tokenizer = AutoTokenizer.from_pretrained(zconfig['model_name'], add_prefix_space=True)

train_comb_ids = comb_ids[comb_ids['fold'] != float(1)].reset_index(drop = True).copy()
valid_comb_ids = comb_ids[comb_ids['fold'] == float(1)].reset_index(drop = True).copy()

train_comb_ids.drop(columns = ['cluster', 'fold', 'discourse_type'], inplace = True)
valid_comb_ids.drop(columns = ['cluster', 'fold', 'discourse_type'], inplace = True)

train_data = train[train['id'].isin(train_comb_ids['id'].values)].reset_index(drop = True)
valid_data = train[train['id'].isin(valid_comb_ids['id'].values)].reset_index(drop = True)
train_text = train_data.groupby(['id']).agg({'paragraph':'first'})
train_text.reset_index(inplace=True)
valid_text = valid_data.groupby(['id']).agg({'paragraph':'first'})
valid_text.reset_index(inplace=True)

train_text = ner(train_text, train_data)
valid_text = ner(valid_text, valid_data)

Downloading:   0%|          | 0.00/694 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

100%|██████████| 14034/14034 [02:49<00:00, 82.76it/s] 
100%|██████████| 1560/1560 [00:03<00:00, 474.68it/s]


In [15]:
class dataset(Dataset):
  def __init__(self, dataframe, tokenizer, max_len, get_wids = True):
        self.len = len(dataframe)
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.get_wids = get_wids
        self.LABEL_ALL_SUBTOKENS = True

  def __getitem__(self, index):
        # GET TEXT AND WORD LABELS 
        text = self.data.paragraph[index]        
        word_labels = self.data.entities[index]
        # text = paragraph_preprocessing(text)

        # TOKENIZE TEXT
        encoding = tokenizer(
            text.split(),
            is_split_into_words = True,
            max_length = self.max_len,
            truncation=True
        ) 
        
        word_ids = encoding.word_ids()
        if self.get_wids:
            prev_word_idx = None
            labels_ids = []
            for word_idx in word_ids:
                if word_idx is None:
                    labels_ids.append(-100)
                elif word_idx != prev_word_idx:
                    labels_ids.append(target_id_map[word_labels[word_idx]])
                else:
                    if self.LABEL_ALL_SUBTOKENS:
                        labels_ids.append(target_id_map[word_labels[word_idx]])
                    else:
                        labels_ids.append(-100)
                prev_word_idx = word_idx
        
            encoding['labels'] = labels_ids

        word_ids2 = [w if w is not None else -1 for w in word_ids]
        encoding['word_ids'] = word_ids2
        return encoding

  def __len__(self):
        return self.len

In [16]:
class Collate:
    def __init__(self, tokenizer):
        self.tokenizer = tokenizer

    def __call__(self, batch):
        output = dict()
        output["input_ids"] = [sample["input_ids"] for sample in batch]
        output["attention_mask"] = [sample["attention_mask"] for sample in batch]
        output["labels"] = [sample["labels"] for sample in batch]
        output["word_ids"] = [sample["word_ids"] for sample in batch]

        # calculate max token length of this batch
        batch_max = max([len(ids) for ids in output["input_ids"]])

        # add padding
        if self.tokenizer.padding_side == "right":
            output["input_ids"] = [s + (batch_max - len(s)) * [self.tokenizer.pad_token_id] for s in output["input_ids"]]
            output["attention_mask"] = [s + (batch_max - len(s)) * [0] for s in output["attention_mask"]]
            output["labels"] = [s + (batch_max - len(s)) * [-100] for s in output["labels"]]
            output["word_ids"] = [s + (batch_max - len(s)) * [-1] for s in output["word_ids"]]
        else:
            output["input_ids"] = [(batch_max - len(s)) * [self.tokenizer.pad_token_id] + s for s in output["input_ids"]]
            output["attention_mask"] = [(batch_max - len(s)) * [0] + s for s in output["attention_mask"]]
            output["labels"] = [(batch_max - len(s)) * [-100] + s for s in output["labels"]]
            output["word_ids"] = [(batch_max - len(s)) * [-1] + s for s in output["word_ids"]]

        # convert to tensors
        output["input_ids"] = torch.tensor(output["input_ids"])
        output["attention_mask"] = torch.tensor(output["attention_mask"])
        output["labels"] = torch.tensor(output["labels"])
        output["word_ids"] = torch.tensor(output["word_ids"])

        return output

In [17]:
def active_logits(raw_logits, word_ids):
    word_ids = word_ids.view(-1)
    active_mask = word_ids.unsqueeze(1).expand(word_ids.shape[0], len(target_id_map) - 1)
    active_mask = active_mask != -1
    active_logits = raw_logits.view(-1, len(target_id_map) - 1)
    active_logits = torch.masked_select(active_logits, active_mask)
    active_logits = active_logits.view(-1, len(target_id_map) - 1) 
    return active_logits

def active_labels(labels):
    active_mask = labels.view(-1) != -100
    active_labels = torch.masked_select(labels.view(-1), active_mask)
    return active_labels

def active_preds_prob(active_logits):
    active_preds = torch.argmax(active_logits, axis = 1)
    active_preds_prob, _ = torch.max(active_logits, axis = 1)
    return active_preds, active_preds_prob

In [18]:
class FeedbackModel(nn.Module):
    def __init__(self):
        super(FeedbackModel, self).__init__()
        model_config = AutoConfig.from_pretrained(zconfig['model_name'])
        self.backbone = AutoModel.from_pretrained(zconfig['model_name'], config=model_config)
        self.dropout1 = nn.Dropout(0.1)
        self.dropout2 = nn.Dropout(0.2)
        self.dropout3 = nn.Dropout(0.3)
        self.dropout4 = nn.Dropout(0.4)
        self.dropout5 = nn.Dropout(0.5)
        self.head = nn.Linear(model_config.hidden_size, len(target_id_map) - 1)
    
    def forward(self, input_ids, mask):
        x = self.backbone(input_ids, mask)
        logits1 = self.head(self.dropout1(x[0]))
        logits2 = self.head(self.dropout2(x[0]))
        logits3 = self.head(self.dropout3(x[0]))
        logits4 = self.head(self.dropout4(x[0]))
        logits5 = self.head(self.dropout5(x[0]))
        logits = (logits1 + logits2 + logits3 + logits4 + logits5) / 5
        return logits

In [None]:
def fetch_optimizer(model, type):
    # differential learning rate and weight decay
    param_optimizer = list(model.named_parameters())
    learning_rate = 2e-5
    no_decay = ['bias', 'gamma', 'beta']
    if type == 's':
        optimizer_parameters = filter(lambda x: x.requires_grad, model.parameters())
    elif type == 'i':
        optimizer_parameters = [
            {'params': [p for n, p in model.backbone.named_parameters() if not any(nd in n for nd in no_decay)],
             'weight_decay_rate': 0.01},
            {'params': [p for n, p in model.backbone.named_parameters() if any(nd in n for nd in no_decay)],
             'weight_decay_rate': 0.0},
            {'params': [p for n, p in model.named_parameters() if "backbone" not in n],
             'lr': 1e-5,
             'weight_decay_rate':0.01}
        ]
    elif type == 'a':
        group1=['layer.0.','layer.1.','layer.2.','layer.3.']
        group2=['layer.4.','layer.5.','layer.6.','layer.7.']    
        group3=['layer.8.','layer.9.','layer.10.','layer.11.']
        group_all=['layer.0.','layer.1.','layer.2.','layer.3.','layer.4.','layer.5.','layer.6.','layer.7.','layer.8.','layer.9.','layer.10.','layer.11.']
        optimizer_parameters = [
            {'params': [p for n, p in model.backbone.named_parameters() if not any(nd in n for nd in no_decay) and not any(nd in n for nd in group_all)],'weight_decay_rate': 0.001},
            {'params': [p for n, p in model.backbone.named_parameters() if not any(nd in n for nd in no_decay) and any(nd in n for nd in group1)],'weight_decay_rate': 0.01, 'lr': learning_rate/2.6},
            {'params': [p for n, p in model.backbone.named_parameters() if not any(nd in n for nd in no_decay) and any(nd in n for nd in group2)],'weight_decay_rate': 0.01, 'lr': learning_rate},
            {'params': [p for n, p in model.backbone.named_parameters() if not any(nd in n for nd in no_decay) and any(nd in n for nd in group3)],'weight_decay_rate': 0.01, 'lr': learning_rate*2.6},
            {'params': [p for n, p in model.backbone.named_parameters() if any(nd in n for nd in no_decay) and not any(nd in n for nd in group_all)],'weight_decay_rate': 0.0},
            {'params': [p for n, p in model.backbone.named_parameters() if any(nd in n for nd in no_decay) and any(nd in n for nd in group1)],'weight_decay_rate': 0.0, 'lr': learning_rate/2.6},
            {'params': [p for n, p in model.backbone.named_parameters() if any(nd in n for nd in no_decay) and any(nd in n for nd in group2)],'weight_decay_rate': 0.0, 'lr': learning_rate},
            {'params': [p for n, p in model.backbone.named_parameters() if any(nd in n for nd in no_decay) and any(nd in n for nd in group3)],'weight_decay_rate': 0.0, 'lr': learning_rate*2.6},
            {'params': [p for n, p in model.named_parameters() if "backbone" not in n], 'lr':1e-5, "momentum" : 0.99},
        ]
    return optimizer_parameters

In [None]:
def evaluate(epoch, criterion):
    model.eval()

    valid_loss = 0
    valid_accuracy = 0
    for batch in tqdm(testing_loader):

        ids = batch['input_ids'].to(zconfig['device'], dtype = torch.long)
        mask = batch['attention_mask'].to(zconfig['device'], dtype = torch.long)
        raw_labels = batch['labels'].to(zconfig['device'], dtype = torch.long)
        word_ids = batch['word_ids'].to(zconfig['device'], dtype = torch.long)

        with torch.no_grad():
            raw_logits = model(input_ids = ids, mask = mask)
        
        del ids, mask
        gc.collect()

        logits = active_logits(raw_logits, word_ids)
        labels = active_labels(raw_labels)
        preds, preds_prob = active_preds_prob(logits)

        valid_accuracy += metrics.f1_score(labels.cpu().numpy(), preds.cpu().numpy(), average='micro')
        loss = criterion(logits, labels)
        valid_loss += loss.item()

    epoch_loss = valid_loss / len(testing_loader)
    epoch_accuracy = valid_accuracy / len(testing_loader)
    return epoch_loss, epoch_accuracy

In [None]:
# def train_fn(fold, train_loader, model, criterion, optimizer, epoch, scheduler, device):
#     model.train()
#     scaler = torch.cuda.amp.GradScaler(enabled=CFG.apex)
#     losses = AverageMeter()
#     global_step = 0
#     for step, (inputs, labels) in enumerate(train_loader):
#         for k, v in inputs.items():
#             inputs[k] = v.to(device)
#         labels = labels.to(device)
#         batch_size = labels.size(0)
#         with torch.cuda.amp.autocast(enabled=CFG.apex):
#             y_preds = model(inputs)
#         loss = criterion(y_preds.view(-1, 1), labels.view(-1, 1))
#         loss = torch.masked_select(loss, labels.view(-1, 1) != -1).mean()
#         if CFG.gradient_accumulation_steps > 1:
#             loss = loss / CFG.gradient_accumulation_steps
#         losses.update(loss.item(), batch_size)
#         scaler.scale(loss).backward()
#         grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), CFG.max_grad_norm)
#         if (step + 1) % CFG.gradient_accumulation_steps == 0:
#             scaler.step(optimizer)
#             scaler.update()
#             optimizer.zero_grad()
#             global_step += 1
#             if CFG.batch_scheduler:
#                 scheduler.step()
#     return losses.avg


In [None]:
def model_train(epoch):
    model.train()
    scaler = GradScaler()
    
    idx = 0
    train_loss = 0
    train_accuracy = 0
    
    for batch in tqdm(training_loader):

        ids = batch['input_ids'].to(zconfig['device'], dtype = torch.long)
        mask = batch['attention_mask'].to(zconfig['device'], dtype = torch.long)
        raw_labels = batch['labels'].to(zconfig['device'], dtype = torch.long)
        word_ids = batch['word_ids'].to(zconfig['device'], dtype = torch.long)

        model.zero_grad()
        with autocast():
            raw_logits = model(input_ids = ids, mask = mask)

        logits = active_logits(raw_logits, word_ids)
        labels = active_labels(raw_labels)
        preds, preds_prob = active_preds_prob(logits)

        criterion = torch.nn.CrossEntropyLoss()
        loss = criterion(logits, labels)

        train_accuracy += metrics.f1_score(labels.cpu().numpy(), preds.cpu().numpy(), average='micro')
        train_loss += loss.item()

        scaler.scale(loss).backward() 
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0) 
        scaler.step(optimizer)
        scaler.update()
        scheduler.step()
        
        idx += 1
    epoch_loss = train_loss / len(training_loader)
    epoch_accuracy = train_accuracy / len(training_loader)
    return epoch_loss, epoch_accuracy, criterion

In [None]:
training_set = dataset(train_text, tokenizer, zconfig['max_len'], True)
testing_set = dataset(valid_text, tokenizer, zconfig['max_len'], True)

collate = Collate(tokenizer)

train_params = {'batch_size': zconfig['batch'],
                'shuffle': True,
                'collate_fn' : collate
                }

test_params = {'batch_size': zconfig['batch'],
                'shuffle': False,
            'collate_fn' : collate
                }
                
training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

model = FeedbackModel()
model.to(zconfig['device'])

# layer_idx = 0
# for param in model.parameters():
#     if layer_idx <= 224:
#         param.requires_grad = False
#     layer_idx += 1

num_steps = zconfig['epochs']*len(training_loader)

optimizer_params = fetch_optimizer(model, 'a')
optimizer = torch.optim.AdamW(optimizer_params, lr=2e-5)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps = int(num_steps*0.01),
                                            num_training_steps = num_steps)

# scheduler = get_cosine_schedule_with_warmup(
#         optimizer,
#         num_warmup_steps=int(0.1 * num_steps),
#         num_training_steps=num_steps,
#         num_cycles=1,
#         last_epoch=-1,
#     )

train_text[['id']].to_csv(zconfig['output']+'train_seq.csv')
valid_text[['id']].to_csv(zconfig['output']+'valid_seq.csv')

for epoch in range(zconfig['epochs']):
    train_loss, train_score, criterion = model_train(epoch)
    valid_loss, valid_score = evaluate(epoch, criterion)
    print(f"<----- Epoch: {epoch + 1} -- Train Loss: {train_loss} -- Acc: {train_score} -- Valid Loss: {valid_loss} -- Acc: {valid_score} ----->\n")

    checkpoint = {
            'epoch': epoch + 1,
            'train_loss_min': train_loss,
            'train_f1_score': train_score,
            'valid_loss_min': valid_loss,
            'valid_f1_score': valid_score,
            'state_dict': model.state_dict(),
            'optimizer': optimizer.state_dict(),
        }
        
    torch.save(checkpoint, zconfig['output']+'top_long_checkpoint_'+str(epoch + 1)+'.pt')
    torch.cuda.empty_cache()
    gc.collect()

Downloading:   0%|          | 0.00/570M [00:00<?, ?B/s]

Some weights of the model checkpoint at allenai/longformer-base-4096 were not used when initializing LongformerModel: ['lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.bias', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing LongformerModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing LongformerModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
100%|██████████| 3509/3509 [52:21<00:00,  1.12it/s]
100%|██████████| 390/390 [02:47<00:00,  2.32it/s]


<----- Epoch: 1 -- Train Loss: 0.8518134590294599 -- Acc: 0.7263349241439782 -- Valid Loss: 0.6739014469660245 -- Acc: 0.7779366462398822 ----->



100%|██████████| 3509/3509 [52:22<00:00,  1.12it/s]
100%|██████████| 390/390 [02:48<00:00,  2.32it/s]


<----- Epoch: 2 -- Train Loss: 0.6211806846568645 -- Acc: 0.7885201056025821 -- Valid Loss: 0.6238593370104447 -- Acc: 0.787890166828871 ----->



100%|██████████| 3509/3509 [52:21<00:00,  1.12it/s]
100%|██████████| 390/390 [02:48<00:00,  2.32it/s]


<----- Epoch: 3 -- Train Loss: 0.5583209442460458 -- Acc: 0.8064454969185828 -- Valid Loss: 0.6360332286510713 -- Acc: 0.7867473337819643 ----->



100%|██████████| 3509/3509 [52:35<00:00,  1.11it/s]
100%|██████████| 390/390 [02:48<00:00,  2.31it/s]


<----- Epoch: 4 -- Train Loss: 0.5286458739190741 -- Acc: 0.8162314366570053 -- Valid Loss: 0.6609075780098255 -- Acc: 0.7857436479702233 ----->



 42%|████▏     | 1475/3509 [21:56<30:47,  1.10it/s]

In [None]:
model = FeedbackModel()
model.to(zconfig['device'])
idtx = 0
for param in model.named_parameters():
    print(param[0], idtx)
    idtx += 1

In [None]:
checkpoint = torch.load(zconfig['output']+'checkpoints/checkpoint_20.pt')
model.load_state_dict(checkpoint['state_dict'])

In [9]:
def calc_overlap(row):
    """
    calculate the overlap between prediction and ground truth
    """
    set_pred = set(row.predictionstring_pred.split(' '))
    set_gt = set(row.predictionstring_gt.split(' '))
    # length of each end intersection
    len_pred = len(set_pred)
    len_gt = len(set_gt)
    intersection = len(set_gt.intersection(set_pred))
    overlap_1 = intersection / len_gt
    overlap_2 = intersection / len_pred
    return [overlap_1, overlap_2]

def score_feedback_comp(pred_df, gt_df):
    """
    A function that scores for the kaggle
        Student Writing Competition
        
    Uses the steps in the evaluation page here:
        https://www.kaggle.com/c/feedback-prize-2021/overview/evaluation
    """
    gt_df = gt_df[['id', 'discourse_type', 'predictionstring']].reset_index(drop = True).copy()
    pred_df = pred_df[['id', 'class', 'predictionstring']].reset_index(drop = True).copy()
    gt_df['gt_id'] = gt_df.index
    pred_df['pred_id'] = pred_df.index
    joined = pred_df.merge(
        gt_df,
        left_on = ['id', 'class'],
        right_on = ['id', 'discourse_type'],
        how = 'outer',
        suffixes = ['_pred', '_gt']
    )
    joined['predictionstring_gt'] =  joined['predictionstring_gt'].fillna(' ')
    joined['predictionstring_pred'] =  joined['predictionstring_pred'].fillna(' ')
    joined['overlaps'] = joined.apply(calc_overlap, axis = 1)
    # overlap over 0.5: true positive
    # If nultiple overlaps exists, the higher is taken.
    joined['overlap1'] = joined['overlaps'].apply(lambda x: eval(str(x))[0])
    joined['overlap2'] = joined['overlaps'].apply(lambda x: eval(str(x))[1])

    joined['potential_TP'] = (joined['overlap1'] >= 0.5) & (joined['overlap2'] >= 0.5)
    joined['max_overlap'] = joined[['overlap1', 'overlap2']].max(axis = 1)
    tp_pred_ids = joined.query('potential_TP').sort_values('max_overlap', ascending = False)\
                  .groupby(['id', 'predictionstring_gt']).first()['pred_id'].values
    
    fp_pred_ids = [p for p in joined['pred_id'].unique() if p not in tp_pred_ids]
    matched_gt_ids = joined.query('potential_TP')['gt_id'].unique()
    unmatched_gt_ids = [c for c in joined['gt_id'].unique() if c not in matched_gt_ids]

    TP = len(tp_pred_ids)
    FP = len(fp_pred_ids)
    FN = len(unmatched_gt_ids)
    macro_f1_score = TP / (TP + 1/2 * (FP + FN))
    return macro_f1_score

def oof_score(df_val, oof):
    f1score = []
    classes = ['Lead', 'Position','Claim', 'Counterclaim', 'Rebuttal','Evidence','Concluding Statement']
    res = {}
    for c in classes:
        pred_df = oof.loc[oof['class'] == c].copy()
        gt_df = df_val.loc[df_val['discourse_type'] == c].copy()
        f1 = score_feedback_comp(pred_df, gt_df)
        res[c] = round(f1, 4)
        # print(f'{c:<10}: {f1:4f}')
        f1score.append(f1)
    f1avg = np.mean(f1score)
    return f1avg, res

In [10]:
def link_evidence(oof):
  if not len(oof):
    return oof
  
  def jn(pst, start, end):
    return " ".join([str(x) for x in pst[start:end]])
  
  thresh = 1
  idu = oof['id'].unique()
  eoof = oof[oof['class'] == "Evidence"]
  neoof = oof[oof['class'] != "Evidence"]
  eoof.index = eoof[['id', 'class']]
  for thresh2 in range(26, 27, 1):
    retval = []
    for idv in tqdm(idu, desc='link_evidence', leave=False):
      for c in ['Evidence']:
        q = eoof[(eoof['id'] == idv)]
        if len(q) == 0:
          continue
        pst = []
        for r in q.itertuples():
          pst = [*pst, -1,  *[int(x) for x in r.predictionstring.split()]]
        start = 1
        end = 1
        for i in range(2, len(pst)):
          cur = pst[i]
          end = i
          if  ((cur == -1) and ((pst[i + 1] > pst[end - 1] + thresh) or (pst[i + 1] - pst[start] > thresh2))):
            retval.append((idv, c, jn(pst, start, end)))
            start = i + 1
        v = (idv, c, jn(pst, start, end + 1))
        retval.append(v)
    roof = pd.DataFrame(retval, columns=['id', 'class', 'predictionstring'])
    roof = roof.merge(neoof, how='outer')
    return roof

In [11]:
def post_process_pred(df, all_preds, all_preds_prob, MIN_THRESH_F, PROB_THRESH_F):
    final_preds = []
    for i in range(len(df)):
        idx = df.id.values[i]
        pred = all_preds[i]
        pred_prob = all_preds_prob[i]
        j = 0
        while j < len(pred):
            cls = pred[j]
            if cls == 'O': j += 1
            else: cls = cls.replace('B', 'I')
            end = j + 1
            while end < len(pred) and pred[end] == cls:
                end += 1
            if cls != 'O' and cls !='':
                avg_score = np.mean(pred_prob[j:end])
                if end - j > MIN_THRESH[cls] and avg_score > PROB_THRESH[cls]:
                    final_preds.append((idx, cls.replace('I-', ''), ' '.join(map(str, list(range(j, end))))))
            j = end
    df_pred = pd.DataFrame(final_preds)
    df_pred.columns = ['id', 'class', 'predictionstring']
    df_pred = link_evidence(df_pred)
    return df_pred

In [12]:
def preds_class_prob(all_logits, word_id):
    print("predict target class and its probabilty")
    final_predictions = []
    final_predictions_score = []

    for logits, word_ids in zip(all_logits, word_id):
        predictions =[]
        predictions_prob = []
        pred_class_id = np.argmax(logits, axis=1)
        pred_score = np.max(logits, axis=1)
        pred_class_labels = [IDS_TO_LABELS[i] for i in pred_class_id]
        prev_word_idx = -1
        for idx, word_idx in enumerate(word_ids):
            if word_idx == -1:
                pass
            elif word_idx != prev_word_idx:
                predictions.append(pred_class_labels[idx])
                predictions_prob.append(pred_score[idx])
                prev_word_idx = word_idx
        final_predictions.append(predictions)
        final_predictions_score.append(predictions_prob)
    return final_predictions, final_predictions_score

In [13]:
def inference(model, data_set):
    final_predictions = []
    final_predictions_prob = []
    model.eval()
    
    valid_loss = 0
    valid_accuracy = 0
    all_logits = []
    word_id = []
    batch_idx = 0
    criterion = torch.nn.CrossEntropyLoss()
    for batch in tqdm(data_set):
        ids = batch['input_ids'].to('cuda', dtype = torch.long)
        mask = batch['attention_mask'].to('cuda', dtype = torch.long)
        with torch.no_grad():
            raw_logits = model(input_ids=ids, mask = mask)
        del ids, mask
        
        word_ids = batch['word_ids'].to('cuda', dtype = torch.long)   
        raw_labels = batch['labels'].to('cuda', dtype = torch.long)
        logits = active_logits(raw_logits, word_ids)
        labels = active_labels(raw_labels)
        preds, preds_prob = active_preds_prob(logits)
        valid_accuracy += accuracy_score(labels.cpu().numpy(), preds.cpu().numpy())
        loss = criterion(logits, labels)
        valid_loss += loss.item()

        all_logits.extend(raw_logits.cpu().numpy())
        word_id.extend(word_ids.cpu().numpy())
        
        batch_idx += 1

    epoch_loss = valid_loss / len(data_set)
    epoch_accuracy = valid_accuracy / len(data_set)
    return all_logits, word_id, epoch_loss, epoch_accuracy

In [19]:
tokenizer = AutoTokenizer.from_pretrained(zconfig['model_name'], add_prefix_space=True)

train_comb_ids = pd.read_csv(zconfig['output']+'train_seq.csv')
valid_comb_ids = pd.read_csv(zconfig['output']+'valid_seq.csv')

train_comb_ids.drop(columns = ['Unnamed: 0'], inplace = True)
valid_comb_ids.drop(columns = ['Unnamed: 0'], inplace = True)

train_data = train[train['id'].isin(train_comb_ids['id'].values)].reset_index(drop = True)
valid_data = train[train['id'].isin(valid_comb_ids['id'].values)].reset_index(drop = True)
train_text = train_data.groupby(['id']).agg({'paragraph':'first'})
train_text.reset_index(inplace=True)
valid_text = valid_data.groupby(['id']).agg({'paragraph':'first'})
valid_text.reset_index(inplace=True)

train_text = ner(train_text, train_data)
valid_text = ner(valid_text, valid_data)
    
training_set = dataset(train_text, tokenizer, zconfig['max_len'], True)
testing_set = dataset(valid_text, tokenizer, zconfig['max_len'], True)

collate = Collate(tokenizer)

train_params = {'batch_size': zconfig['batch'],
                'shuffle': False,
                'collate_fn' : collate
                }

test_params = {'batch_size': zconfig['batch'],
                'shuffle': False,
            'collate_fn' : collate
                }
                
training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

model_filename = zconfig['output']+'top_long_checkpoint_2.pt'
model = FeedbackModel()
model = model.to('cuda')
checkpoint = torch.load(model_filename)
model.load_state_dict(checkpoint['state_dict'])
train_logits, train_word_id, train_loss, train_acc = inference(model, training_loader)
valid_logits, valid_word_id, valid_loss, valid_acc = inference(model, testing_loader)
train_preds, train_preds_prob = preds_class_prob(train_logits, train_word_id)
valid_preds, valid_preds_prob = preds_class_prob(valid_logits, valid_word_id)

100%|██████████| 14034/14034 [02:14<00:00, 104.29it/s]
100%|██████████| 1560/1560 [00:04<00:00, 351.96it/s]


Downloading:   0%|          | 0.00/570M [00:00<?, ?B/s]

Some weights of the model checkpoint at allenai/longformer-base-4096 were not used when initializing LongformerModel: ['lm_head.decoder.weight', 'lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing LongformerModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing LongformerModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
100%|██████████| 3509/3509 [45:24<00:00,  1.29it/s]
100%|██████████| 390/390 [05:00<00:00,  1.30it/s]


predict target class and its probabilty
predict target class and its probabilty


In [20]:
train_correct = []
for i in zip(train_preds):
    i = [str(k) for k in i]
    train_correct.append('#'.join(i))

valid_correct = []
for i in zip(valid_preds):
    i = [str(k) for k in i]
    valid_correct.append('#'.join(i))

pd.DataFrame(train_correct).to_csv(zconfig['output']+'train_pred.csv')
pd.DataFrame(valid_correct).to_csv(zconfig['output']+'valid_pred.csv')
train_text.to_csv(zconfig['output']+'train_text.csv')
valid_text.to_csv(zconfig['output']+'valid_text.csv')

In [None]:
thres_list = np.arange(0.10, 0.90, 0.05)
min_th = np.arange(3,21,1)
results_list = []
mx_f1 = 0.64
for _ in tqdm(range(1000)):
    
    PROB_THRESH = {
        "I-Lead": round(random.choice(thres_list), 2),
        "I-Position": round(random.choice(thres_list), 2),
        "I-Evidence": round(random.choice(thres_list), 2),
        "I-Claim": round(random.choice(thres_list), 2),
        "I-Concluding Statement": round(random.choice(thres_list), 2),
        "I-Counterclaim": round(random.choice(thres_list), 2),
        "I-Rebuttal": round(random.choice(thres_list), 2),
    }

    MIN_THRESH = {
    "I-Lead": random.choice(min_th),
    "I-Position": random.choice(min_th),
    "I-Evidence": random.choice(min_th),
    "I-Claim": random.choice(min_th),
    "I-Concluding Statement": random.choice(min_th),
    "I-Counterclaim": random.choice(min_th),
    "I-Rebuttal": random.choice(min_th),
    }

    # print(PROB_THRESH)
    print('')
    df_pred = post_process_pred(valid_text, all_preds, all_preds_prob, MIN_THRESH, PROB_THRESH)
    ovr_f1, res = oof_score(valid_data, df_pred)
    # print(f'Overall F1 Score {ovr_f1}')
    if ovr_f1 > mx_f1:
        mx_f1 = ovr_f1
        print(f'New Max F1: {mx_f1}')

    final_dict = {'m_claim':MIN_THRESH["I-Claim"], 'm_lead':MIN_THRESH["I-Lead"], 'm_evidence':MIN_THRESH["I-Evidence"], 'm_position':MIN_THRESH["I-Position"], 
                  'm_rebuttal':MIN_THRESH["I-Rebuttal"], 'm_counterclaim':MIN_THRESH["I-Counterclaim"], 'm_concluding_statement':MIN_THRESH["I-Concluding Statement"], 
                  'p_claim':PROB_THRESH["I-Claim"], 'p_lead':PROB_THRESH["I-Lead"], 'p_evidence':PROB_THRESH["I-Evidence"], 'p_position':PROB_THRESH["I-Position"], 
                  'p_rebuttal':PROB_THRESH["I-Rebuttal"], 'p_counterclaim':PROB_THRESH["I-Counterclaim"], 'p_concluding_statement':PROB_THRESH["I-Concluding Statement"],
                  's_claim':res["Claim"], 's_lead':res["Lead"], 's_evidence':res["Evidence"], 's_position':res["Position"], 
                  's_rebuttal':res["Rebuttal"], 's_counterclaim':res["Counterclaim"], 's_concluding_statement':res["Concluding Statement"],
                  'overall':ovr_f1}
    results_list.append(final_dict)
    res_df = pd.DataFrame.from_dict(results_list, orient='columns')
    res_df.to_csv(zconfig['output']+'ex_results.csv')

In [None]:
df_pred = post_process_pred(valid_text, all_preds, all_preds_prob, MIN_THRESH, PROB_THRESH)
ovr_f1, res = oof_score(valid_data, df_pred)



In [None]:
res

{'Claim': 0.5913,
 'Concluding Statement': 0.854,
 'Counterclaim': 0.5578,
 'Evidence': 0.7146,
 'Lead': 0.8529,
 'Position': 0.7223,
 'Rebuttal': 0.434}

In [None]:
ovr_f1

0.6752806461116914

In [None]:
res

{'Claim': 0.5913,
 'Concluding Statement': 0.854,
 'Counterclaim': 0.5578,
 'Evidence': 0.7168,
 'Lead': 0.8529,
 'Position': 0.7223,
 'Rebuttal': 0.434}

In [None]:
ovr_f1

0.675592081513087