# Libraries

In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

import tensorflow as tf

from transformers import *

In [None]:
# Load the competition's training annotations into a DataFrame.
# Later cells read its 'id', 'discourse_type', 'discourse_start',
# 'discourse_end' and 'predictionstring' columns.
train_df = pd.read_csv('../input/feedback-prize-2021/train.csv')

# Quick sanity check: report the frame's shape and show the first rows.
print(f'Shape of Train : {train_df.shape}')
train_df.head()

In [None]:
# Unique document ids, in order of first appearance; the row order of every
# per-document array built later follows this array.
IDS = train_df['id'].unique()
# Distinct discourse labels present in the annotations.
LABELS = train_df['discourse_type'].unique()

print(f'Count of IDs : {len(IDS)}')
print(f'Labels : {LABELS}')

# Tokenize Train

In [None]:
# Every document is padded / truncated to MAX_LEN tokens.
MAX_LEN = 1024
PRE_PATH = '../input/allenailongformerbase4096'

tokenizer = AutoTokenizer.from_pretrained(PRE_PATH)

# Token ids and attention masks, one row per document.
train_tokens = np.zeros((len(IDS), MAX_LEN), dtype='int32')
train_attention = np.zeros((len(IDS), MAX_LEN), dtype='int32')

# the 14 classes for NER: one "begin" (_b) and one "inside" (_i) mask per label.
# The masks only ever hold 0/1, so int8 is used instead of numpy's default
# float64 -- this cuts the memory of these 14 arrays by a factor of 8.
lead_b = np.zeros((len(IDS), MAX_LEN), dtype='int8')
lead_i = np.zeros((len(IDS), MAX_LEN), dtype='int8')

position_b = np.zeros((len(IDS), MAX_LEN), dtype='int8')
position_i = np.zeros((len(IDS), MAX_LEN), dtype='int8')

evidence_b = np.zeros((len(IDS), MAX_LEN), dtype='int8')
evidence_i = np.zeros((len(IDS), MAX_LEN), dtype='int8')

claim_b = np.zeros((len(IDS), MAX_LEN), dtype='int8')
claim_i = np.zeros((len(IDS), MAX_LEN), dtype='int8')

conclusion_b = np.zeros((len(IDS), MAX_LEN), dtype='int8')
conclusion_i = np.zeros((len(IDS), MAX_LEN), dtype='int8')

counterclaim_b = np.zeros((len(IDS), MAX_LEN), dtype='int8')
counterclaim_i = np.zeros((len(IDS), MAX_LEN), dtype='int8')

rebuttal_b = np.zeros((len(IDS), MAX_LEN), dtype='int8')
rebuttal_i = np.zeros((len(IDS), MAX_LEN), dtype='int8')

In [None]:
# Whitespace word count of every document (only used for the histogram below).
train_lens = []
# Per-label "begin" / "inside" masks, indexed by the integers in target_map.
targets_b = [lead_b, position_b, evidence_b, claim_b, conclusion_b, counterclaim_b, rebuttal_b]
targets_i = [lead_i, position_i, evidence_i, claim_i, conclusion_i, counterclaim_i, rebuttal_i]
target_map = {'Lead':0, 'Position':1, 'Evidence':2, 'Claim':3, 'Concluding Statement':4,
             'Counterclaim':5, 'Rebuttal':6}

# Group the annotations once instead of re-filtering the whole frame with a
# boolean mask for every document; rows keep their original order per group.
annotations_by_doc = train_df.groupby('id', sort=False)

# Create Token
for i in range(len(IDS)):

    doc_id = IDS[i]

    doc_file = f'../input/feedback-prize-2021/train/{doc_id}.txt'

    # Context manager so the file handle is closed right after reading.
    with open(doc_file, 'r') as doc_handle:
        doc_txt = doc_handle.read()

    train_lens.append(len(doc_txt.split()))

    tokens = tokenizer.encode_plus(doc_txt,
                                  max_length=MAX_LEN,
                                  padding='max_length',
                                  truncation=True,
                                  return_offsets_mapping=True)

    train_tokens[i, ] = tokens['input_ids']
    train_attention[i, ] = tokens['attention_mask']

    # find targets in text and save in target arrays
    # offset_mappings are maps from tokens to the original texts
    offsets = tokens['offset_mapping']
    # Single cursor over the tokens, shared by all annotations of this
    # document: it only moves forward, so the walk relies on the annotation
    # rows being in increasing character order.
    offset_index = 0

    doc_info_df = annotations_by_doc.get_group(doc_id)

    for index, row in doc_info_df.iterrows():

        # [a, b] is the annotated character span, [c, d] the current token's.
        a = row['discourse_start']
        b = row['discourse_end']

        if offset_index > len(offsets)-1 :
            break

        c = offsets[offset_index][0]
        d = offsets[offset_index][1]

        beginning = True

        # Consume tokens until one starts at or after the end of the span.
        while b > c :
            # Label only tokens lying entirely inside the annotated span.
            # NOTE(review): tokens whose offset is (0, 0) also pass this test
            # when a == 0 -- confirm that labelling them is intended.
            if c >= a and b >= d:
                target_num = target_map[row['discourse_type']]
                if beginning:
                    targets_b[target_num][i][offset_index] = 1
                    beginning = False
                else:
                    targets_i[target_num][i][offset_index] = 1

            offset_index += 1

            if offset_index > len(offsets)-1:
                break
            c = offsets[offset_index][0]
            d = offsets[offset_index][1]

In [None]:
# check tokens and attention
# Print both integer arrays filled by the tokenization loop as a quick
# visual sanity check.
print(f'train_tokens \n{train_tokens}')
print('\n')
print(f'train_attention \n{train_attention}')

In [None]:
# Distribution of whitespace-delimited word counts over the training essays.
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(12, 5))

ax.hist(train_lens, bins=100)
ax.set_title('Histogram of Train word counts')
ax.set_xlabel('Train word count')
plt.show()

In [None]:
# One-hot target tensor of shape (len(IDS), MAX_LEN, 15).
# Channels 2k / 2k+1 hold the "begin" / "inside" mask of label k (7 labels,
# 14 channels); channel 14 marks tokens that carry none of those 14 classes.
targets = np.zeros((len(IDS), MAX_LEN, 15), dtype='int32')

for k, (begin_mask, inside_mask) in enumerate(zip(targets_b, targets_i)):
    targets[..., 2 * k] = begin_mask
    targets[..., 2 * k + 1] = inside_mask

# Channel 14 is still all zeros here, so the max covers only the 14 label
# channels: the result is 1 exactly where no label is set.
targets[..., 14] = 1 - targets.max(axis=-1)

In [None]:
# Save
# Write the three arrays to the working directory; np.save appends the
# '.npy' extension, and MAX_LEN is embedded in each file name.
np.save(f'targets_{MAX_LEN}', targets)
np.save(f'tokens_{MAX_LEN}', train_tokens)
np.save(f'attention_{MAX_LEN}', train_attention)

# Model
We will use a Longformer backbone and add our own NER head: one hidden layer of size 256 followed by a final softmax layer.  
We use 15 classes because we have a B class and I class for each of 7 labels.  
We also have one additional class (the "nothing" class, stored at index 14 of the target array) for tokens that do not belong to any of the other 14 classes.

In [None]:
def build_model():
    """Build and compile the token-classification model.

    Two int32 inputs of length MAX_LEN ('tokens' and 'attention') are fed to
    the pretrained backbone loaded from PRE_PATH. The first backbone output
    goes through a Dense(256, relu) layer and a Dense(15, softmax) layer, so
    the model emits a 15-way distribution for every token position.

    Returns:
        A compiled ``tf.keras.Model`` (Adam at 1e-4, categorical
        cross-entropy loss, categorical accuracy metric).
    """

    tokens = tf.keras.layers.Input(shape=(MAX_LEN, ), name='tokens', dtype=tf.int32)
    attention = tf.keras.layers.Input(shape=(MAX_LEN, ), name='attention', dtype=tf.int32)

    config = AutoConfig.from_pretrained(PRE_PATH + '/config.json')
    backbone = TFAutoModel.from_pretrained(PRE_PATH + '/tf_model.h5', config=config)

    x = backbone(tokens, attention_mask=attention)     # LongFormer backbone
    # x[0] is the first backbone output -- presumably the per-token hidden
    # states (TODO confirm against the transformers version in use).
    x = tf.keras.layers.Dense(256, activation='relu')(x[0])     # NER head
    # The output layer is pinned to float32 regardless of any global dtype policy.
    x = tf.keras.layers.Dense(15, activation='softmax', dtype='float32')(x)     # final softmax layer

    model = tf.keras.Model(inputs=[tokens, attention], outputs=x)

    model.compile(
                # `learning_rate` is the supported argument name; the old `lr`
                # alias is deprecated and rejected by recent Keras releases.
                optimizer = tf.keras.optimizers.Adam(learning_rate=1e-4),
                loss = [tf.keras.losses.CategoricalCrossentropy()],
                metrics = [tf.keras.metrics.CategoricalAccuracy()]
                )

    return model

In [None]:
# Instantiate the compiled model; build_model() loads the pretrained backbone
# from PRE_PATH.
model = build_model()

In [None]:
# Print the Keras layer-by-layer summary of the model.
model.summary()

# Train Model
When training on Kaggle's 1xP100 GPU, we need to reduce the batch size to 4.  
And we reduce the learning rates to 0.25e-4 and 0.25e-5.  
Each training epoch on Kaggle takes 1 hour 8 minutes.

In [None]:
# learning rate schedule
EPOCHS = 1
BATCH_SIZE = 4
# One learning rate per epoch; epochs past the end reuse the last entry.
LRS = [0.25e-4, 0.25e-4, 0.25e-4, 0.25e-4, 0.25e-5]

def lrfn(epoch):
    """Return the learning rate for the given 0-based epoch index.

    The index is clamped to the last entry of ``LRS`` so that raising
    ``EPOCHS`` above ``len(LRS)`` keeps training at the final rate instead
    of raising ``IndexError``.
    """
    return LRS[min(epoch, len(LRS) - 1)]

# Keras callback that asks lrfn for the learning rate at the start of each epoch.
lr_callback = tf.keras.callbacks.LearningRateScheduler(schedule=lrfn, verbose=True)

# train valid split 90 : 10
# The global numpy RNG is seeded only around the draw so the split is
# reproducible, then re-seeded from the OS afterwards.
np.random.seed(42)
all_idx = np.arange(len(IDS))
train_idx = np.random.choice(all_idx, size=int(0.9 * len(IDS)), replace=False)
valid_idx = np.setdiff1d(all_idx, train_idx)
np.random.seed(None)
print(f'Train size {len(train_idx)}')
print(f'Valid size {len(valid_idx)}')

In [None]:
# train model (3h:43)
# Fit on the training rows and evaluate on the held-out rows after each
# epoch; the learning rate is set per epoch by lr_callback.
model.fit(x = [train_tokens[train_idx, ], train_attention[train_idx, ]], 
         y = targets[train_idx, ], 
         validation_data = ([train_tokens[valid_idx, ], train_attention[valid_idx, ]],
                            targets[valid_idx, ]), 
         callbacks = [lr_callback],
         epochs = EPOCHS,
         batch_size = BATCH_SIZE,
         verbose = 2)

In [None]:
# save model weights
# Only the weights are written (HDF5 file in the working directory); the
# architecture has to be rebuilt with build_model() before loading them.
model.save_weights('longformer_v1.h5')

# Validation Model - Infer OOF
We will now make predictions on the validation texts.  
Our model makes label predictions for each token, so we need to convert these into a list of word indices for each label.  
Note that the tokens and words are not the same.  
A single word may be broken into multiple tokens.  
Therefore we need to first create a map to change token indices to word indices.

In [None]:
# Per-token class probabilities for the validation documents.
p = model.predict([train_tokens[valid_idx], train_attention[valid_idx]],
                 batch_size=16, verbose=2)

print(f'OOF predictions shape : {p.shape}')
# Collapse the class axis: index of the most probable class for every token.
oof_preds = np.argmax(p, axis=-1)

In [None]:
# Maps (predicted class index // 2) back to the label name. The names must
# match train_df['discourse_type'] exactly, because predictions are joined to
# the ground truth on this string. Key 7 is the "no label" class (index 14).
target_map_rev = {0 : 'Lead', 1 : 'Position', 2 : 'Evidence', 3 : 'Claim',
                 4 : 'Concluding Statement', 5 : 'Counterclaim', 6 : 'Rebuttal', 7 : 'blank'}

In [None]:
def get_preds(dataset='train', verbose=True, text_ids=IDS[valid_idx], preds=oof_preds):
    """Convert per-token class predictions into word-index prediction strings.

    Each document is re-tokenized to recover the character offsets of its
    tokens, every token is mapped to the whitespace-delimited word it starts
    in, and each run made of one "begin" class followed by its matching
    "inside" class is collected into a span. Spans covering more than 4
    distinct words are kept.

    Args:
        dataset: Sub-folder of ``../input/feedback-prize-2021`` that holds
            the ``{id}.txt`` files (the notebook passes 'train' or 'test').
        verbose: Unused; kept so existing calls keep working.
        text_ids: Document ids, aligned with the rows of ``preds``.
        preds: Integer class indices indexed as ``preds[doc, token]``.

    Note:
        The defaults for ``text_ids`` and ``preds`` are evaluated once, when
        this function is defined; pass both explicitly for any other data.

    Returns:
        DataFrame with columns ``id``, ``class`` and ``predictionstring``
        (space-separated word indices). It is empty, but still has these
        columns, when no span qualifies.
    """

    all_predictions = []

    for id_num in range(len(preds)):

        # get id
        n = text_ids[id_num]

        # get token positions in chars
        name = f'../input/feedback-prize-2021/{dataset}/{n}.txt'
        # Context manager so the file handle is closed right after reading.
        with open(name, 'r') as txt_file:
            txt = txt_file.read()
        tokens = tokenizer.encode_plus(txt,
                                       max_length=MAX_LEN,
                                      padding='max_length',
                                      truncation=True,
                                      return_offsets_mapping=True)
        off = tokens['offset_mapping']

        # get word positions in chars
        # w[k] is the character index at which the k-th word starts.
        w = []
        blank = True
        for i in range(len(txt)):
            if txt[i] in (' ', '\n', '\xa0', '\x85'):
                blank = True
            elif blank:
                w.append(i)
                blank = False
        # Sentinel so that w[w_i+1] below is always a valid lookup.
        w.append(1e6)

        # mapping from tokens to words
        # -1 marks tokens with no word (zero-width offsets).
        word_map = -1 * np.ones(MAX_LEN, dtype='int32')
        w_i = 0
        for i in range(len(off)):
            if off[i][1]==0:     # zero end offset: not a text token
                continue
            while off[i][0] >= w[w_i+1]:
                w_i += 1
            word_map[i] = int(w_i)

        # convert token predictions into word labels
        # 0: LEAD_B, 1: LEAD_I
        # 2: POSITION_B, 3: POSITION_I
        # 4: EVIDENCE_B, 5: EVIDENCE_I
        # 6: CLAIM_B, 7: CLAIM_I
        # 8: CONCLUSION_B, 9: CONCLUSION_I
        # 10: COUNTERCLAIM_B, 11: COUNTERCLAIM_I
        # 12: REBUTTAL_B, 13: REBUTTAL_I
        # 14: NOTHING
        # note these values are divided by 2 in next code line
        # After halving, a "begin" class is a whole number k and its "inside"
        # class is k + 0.5.
        pred = preds[id_num, ] / 2.0

        i = 0
        while i < MAX_LEN:
            prediction = []
            start = pred[i]
            if start in [0,1,2,3,4,5,6,7]:
                prediction.append(word_map[i])
                i += 1
                if i >= MAX_LEN:
                    break
                # Extend the span while the matching "inside" class continues.
                while pred[i]==start+0.5:
                    if not word_map[i] in prediction:
                        prediction.append(word_map[i])
                    i += 1
                    if i >= MAX_LEN:
                        break
            else:
                i += 1

            # Drop tokens that have no word, then keep only spans of more
            # than 4 words.
            prediction = [x for x in prediction if x!=-1]
            if len(prediction)>4:
                all_predictions.append((n, target_map_rev[int(start)], ' '.join([str(x) for x in prediction])))

    # Passing the column names to the constructor also works for an empty
    # list, where assigning df.columns afterwards raises a length mismatch.
    df = pd.DataFrame(all_predictions, columns=['id', 'class', 'predictionstring'])

    return df
            

In [None]:
# Build word-level predictions for the validation documents. `preds` is not
# passed, so get_preds falls back to its default (oof_preds as it was when
# get_preds was defined).
oof = get_preds( dataset='train', verbose=True, text_ids=IDS[valid_idx] )
oof.head()

In [None]:
# CODE FROM : Rob Mulla @robikscube
# https://www.kaggle.com/robikscube/student-writing-competition-twitch
def calc_overlap(row):
    """
    Calculates the overlap between prediction and
    ground truth and overlap percentages used for determining
    true positives.

    `row` must expose `predictionstring_pred` and `predictionstring_gt`,
    both space-separated strings of word indices.

    Returns [overlap_1, overlap_2]: the number of shared word indices divided
    by the ground-truth set size and by the prediction set size.
    """
    set_pred = set(row.predictionstring_pred.split(' '))
    set_gt = set(row.predictionstring_gt.split(' '))
    # Length of each and intersection
    len_gt = len(set_gt)
    len_pred = len(set_pred)
    inter = len(set_gt.intersection(set_pred))
    overlap_1 = inter / len_gt
    overlap_2 = inter/ len_pred
    return [overlap_1, overlap_2]


def score_feedback_comp(pred_df, gt_df):
    """
    A function that scores for the kaggle
        Student Writing Competition

    Uses the steps in the evaluation page here:
        https://www.kaggle.com/c/feedback-prize-2021/overview/evaluation

    Args:
        pred_df: DataFrame with columns id, class, predictionstring.
        gt_df: DataFrame with columns id, discourse_type, predictionstring.

    Returns:
        TP / (TP + 0.5 * (FP + FN)) computed over all rows passed in.
    """
    gt_df = gt_df[['id','discourse_type','predictionstring']] \
        .reset_index(drop=True).copy()
    pred_df = pred_df[['id','class','predictionstring']] \
        .reset_index(drop=True).copy()
    pred_df['pred_id'] = pred_df.index
    gt_df['gt_id'] = gt_df.index
    # Step 1. all ground truths and predictions for a given class are compared.
    joined = pred_df.merge(gt_df,
                           left_on=['id','class'],
                           right_on=['id','discourse_type'],
                           how='outer',
                           suffixes=('_pred','_gt')
                          )
    # Rows that exist on only one side of the outer join get a placeholder
    # string, so calc_overlap yields zero overlap for them.
    joined['predictionstring_gt'] = joined['predictionstring_gt'].fillna(' ')
    joined['predictionstring_pred'] = joined['predictionstring_pred'].fillna(' ')

    joined['overlaps'] = joined.apply(calc_overlap, axis=1)

    # 2. If the overlap between the ground truth and prediction is >= 0.5,
    # and the overlap between the prediction and the ground truth >= 0.5,
    # the prediction is a match and considered a true positive.
    # If multiple matches exist, the match with the highest pair of overlaps is taken.
    # calc_overlap already returns a list, so index it directly rather than
    # round-tripping through eval(str(...)).
    joined['overlap1'] = joined['overlaps'].apply(lambda x: x[0])
    joined['overlap2'] = joined['overlaps'].apply(lambda x: x[1])


    joined['potential_TP'] = (joined['overlap1'] >= 0.5) & (joined['overlap2'] >= 0.5)
    joined['max_overlap'] = joined[['overlap1','overlap2']].max(axis=1)
    tp_pred_ids = joined.query('potential_TP') \
        .sort_values('max_overlap', ascending=False) \
        .groupby(['id','predictionstring_gt']).first()['pred_id'].values

    # 3. Any unmatched ground truths are false negatives
    # and any unmatched predictions are false positives.
    # The outer join leaves NaN in pred_id / gt_id on one-sided rows; those
    # NaNs are not real predictions or ground truths, so drop them before
    # counting. Sets make the membership tests O(1).
    tp_pred_id_set = set(tp_pred_ids)
    fp_pred_ids = [p for p in joined['pred_id'].dropna().unique()
                   if p not in tp_pred_id_set]

    matched_gt_ids = set(joined.query('potential_TP')['gt_id'].unique())
    unmatched_gt_ids = [c for c in joined['gt_id'].dropna().unique()
                        if c not in matched_gt_ids]

    # Get numbers of each type
    TP = len(tp_pred_ids)
    FP = len(fp_pred_ids)
    FN = len(unmatched_gt_ids)
    #calc microf1
    my_f1_score = TP / (TP + 0.5*(FP+FN))
    return my_f1_score

In [None]:
# VALID DATAFRAME
# Ground-truth annotation rows of the documents held out for validation.
valid = train_df.loc[train_df['id'].isin(IDS[valid_idx])]

In [None]:
f1s = []
# Iterate over the classes present in the ground truth, not the ones the
# model happened to predict: a class with no predictions at all must enter
# the macro average with F1 = 0 instead of silently dropping out of it.
CLASSES = valid['discourse_type'].unique()
for c in CLASSES:
    pred_df = oof.loc[oof['class']==c].copy()
    gt_df = valid.loc[valid['discourse_type']==c].copy()
    f1 = score_feedback_comp(pred_df, gt_df)
    print(c,f1)
    f1s.append(f1)
print()
# Macro F1: unweighted mean of the per-class scores.
print('Overall',np.mean(f1s))

# Inference

In [None]:
test_files = os.listdir('../input/feedback-prize-2021/test')
# Keep only files whose extension is .txt and strip just that extension
# (a substring test / replace would also hit 'txt' elsewhere in a name).
test_ids = [os.path.splitext(f)[0] for f in test_files if f.endswith('.txt')]

# Token ids and attention masks, one row per test document.
test_tokens = np.zeros((len(test_ids), MAX_LEN), dtype='int32')
test_attention = np.zeros((len(test_ids), MAX_LEN), dtype='int32')

for i in range(len(test_ids)):

    doc_id = test_ids[i]

    doc_file = f'../input/feedback-prize-2021/test/{doc_id}.txt'

    # Context manager so the file handle is closed right after reading.
    with open(doc_file, 'r') as doc_handle:
        doc_txt = doc_handle.read()

    # Same tokenizer settings as for the training documents.
    tokens = tokenizer.encode_plus(doc_txt,
                                  max_length=MAX_LEN,
                                  padding='max_length',
                                  truncation=True,
                                  return_offsets_mapping=True)

    test_tokens[i, ] = tokens['input_ids']
    test_attention[i, ] = tokens['attention_mask']

In [None]:
# Per-token class probabilities for the test documents.
pred = model.predict([test_tokens, test_attention], 
                    batch_size=16, verbose=2)

# Index of the most probable class for every token.
test_preds = np.argmax(pred, axis=-1)

In [None]:
# submit
# Convert the test token predictions into word-index spans; text_ids and
# preds are passed explicitly so get_preds' validation defaults are not used.
sub = get_preds(dataset='test', verbose=False, text_ids = test_ids, preds=test_preds)
sub.head()

In [None]:
# Write the submission file without the DataFrame index column.
sub.to_csv('submission.csv', index=False)