In [None]:
import gc
gc.enable()

import os
# TF_CPP_MIN_LOG_LEVEL only silences TensorFlow's native start-up logging when
# it is already set at the time tensorflow is imported, so set it first.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

import pandas as pd, numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense
from transformers import AutoTokenizer, AutoConfig, TFAutoModel
print('TF version',tf.__version__)


In [None]:
# Which GPUs TensorFlow is allowed to see. Kaggle offers a single GPU ("0");
# offline, list several comma-separated (e.g. "0,1,2,3"). The strategy cell
# below picks MirroredStrategy whenever this value contains a comma.
os.environ["CUDA_VISIBLE_DEVICES"]="0" #0,1,2,3 for four gpu

# Version tag embedded in the weights file name (long_v{VER}.h5).
VER=14 
# If None, the notebook tokenizes the essays and builds the targets itself;
# otherwise the pre-computed .npy arrays are loaded from this directory.
LOAD_TOKENS_FROM = '../input/tf-longformer-v12'
# Directory holding the HuggingFace files (config.json, tf_model.h5) that
# build_model() reads. The stated intent is that None means "download from the
# hub" -- NOTE(review): confirm that the None case is actually handled.
DOWNLOADED_MODEL_PATH = '../input/tf-longformer-v12'
# If None, a new model is trained; otherwise the weights are loaded from
# {LOAD_MODEL_FROM}/long_v{VER}.h5.
LOAD_MODEL_FROM = '../input/tflongformerv14'
# HuggingFace hub id of the base model:
# https://huggingface.co/allenai/longformer-base-4096
MODEL_NAME = 'allenai/longformer-base-4096'

In [None]:
class config:
    """Sequence length and training hyper-parameters shared by the notebook."""
    # Number of tokens every essay is padded / truncated to.
    MAX_LEN = 1024
    # Essays per training batch.
    BATCH_SIZE = 4
    # Learning rate per epoch: four epochs at 2.5e-5, then one at 2.5e-6.
    LRS = [2.5e-5] * 4 + [2.5e-6]
    # Number of training epochs.
    EPOCHS = 5
    

In [None]:
# USE MULTIPLE GPUS
# A comma in CUDA_VISIBLE_DEVICES means more than one GPU was requested:
# mirror the model across them. Otherwise keep TensorFlow's default strategy.
if ',' in os.environ["CUDA_VISIBLE_DEVICES"]:
    strategy = tf.distribute.MirroredStrategy()
    print('multiple strategy')
else:
    strategy = tf.distribute.get_strategy()
    print('single strategy')

In [None]:
# Turn on the graph optimizer's "auto_mixed_precision" option.
tf.config.optimizer.set_experimental_options({"auto_mixed_precision": True})
print('Mixed precision enabled')

# Training Dataset

**id** - ID code for essay response

**discourse_id** - ID code for discourse element

**discourse_start** - character position where discourse element begins in the essay response

**discourse_end** - character position where discourse element ends in the essay response

**discourse_text** - text of discourse element

**discourse_type** - classification of discourse element

**discourse_type_num** - enumerated class label of discourse element

**predictionstring** - the word indices of the training sample, as required for predictions

In [None]:
# Load the training annotations (columns are described in the section above).
train_df = pd.read_csv('../input/feedback-prize-2021/train.csv')
train_df.head()

In [None]:
# List the distinct discourse types present in the annotations.
print('The train labels are:')
train_df.discourse_type.unique()

In [None]:
# One entry per essay; an id's position in IDS is its row in the arrays below.
IDS = train_df['id'].unique()
ids_len = len(IDS)
print(f"The number of Train Text : {ids_len}")

In [None]:
# The tokens and attention arrays
tokenizer = AutoTokenizer.from_pretrained(LOAD_TOKENS_FROM)
train_tokens = np.zeros((ids_len, config.MAX_LEN), dtype='int32')
train_attentions = np.zeros((ids_len, config.MAX_LEN), dtype='int32')

In [None]:
# The 14 classes for NER with Begin and Inside Tags
lead_b = np.zeros((ids_len, config.MAX_LEN))
lead_i = np.zeros((ids_len, config.MAX_LEN))

position_b = np.zeros((ids_len, config.MAX_LEN))
position_i = np.zeros((ids_len, config.MAX_LEN))

evidence_b = np.zeros((ids_len, config.MAX_LEN))
evidence_i = np.zeros((ids_len, config.MAX_LEN))

claim_b = np.zeros((ids_len, config.MAX_LEN))
claim_i = np.zeros((ids_len, config.MAX_LEN))

conclusion_b = np.zeros((ids_len, config.MAX_LEN))
conclusion_i = np.zeros((ids_len, config.MAX_LEN))

counterclaim_b = np.zeros((ids_len, config.MAX_LEN))
counterclaim_i = np.zeros((ids_len, config.MAX_LEN))

rebuttal_b = np.zeros((ids_len, config.MAX_LEN))
rebuttal_i = np.zeros((ids_len, config.MAX_LEN))

In [None]:
# Helper variables
# Group the per-class arrays so they can be indexed by class number:
# targets_b[k] / targets_i[k] are the Begin / Inside arrays of the class whose
# discourse_type maps to k in target_map.
targets_b = [lead_b, position_b, evidence_b, claim_b, conclusion_b, counterclaim_b, rebuttal_b]
targets_i = [lead_i, position_i, evidence_i, claim_i, conclusion_i, counterclaim_i, rebuttal_i]
target_map = {'Lead':0, 'Position':1, 'Evidence':2, 'Claim':3, 'Concluding Statement':4,
             'Counterclaim':5, 'Rebuttal':6}

# Remove the individual names; the arrays remain referenced by the two lists.
del lead_b, position_b, evidence_b, claim_b, conclusion_b, counterclaim_b, rebuttal_b
del lead_i, position_i, evidence_i, claim_i, conclusion_i, counterclaim_i, rebuttal_i

In [None]:
# Sanity check: within every essay the discourse_start values must be strictly
# increasing (no consecutive difference may be zero or negative).
assert not (train_df.groupby('id')['discourse_start'].diff() <= 0).any()

In [None]:
def target_arrays(ind, name):  
    """Fill the Begin/Inside target arrays for one essay.

    Args:
        ind: Row index of the essay in the targets_b / targets_i arrays.
        name: Essay id; the rows of train_df with this id supply the spans.

    Returns:
        None. targets_b[k][ind] and targets_i[k][ind] are modified in place.

    NOTE(review): `tokens` is not a parameter. It is read from module scope,
    so the caller must have assigned the tokenizer output for this same essay
    to a global named `tokens` before calling.
    """
    # Find Targets in text and save in Target Arrays
    offsets = tokens['offset_mapping']
    # Shared across all spans of the essay: each span resumes scanning at the
    # token where the previous span stopped (the scan never moves backwards).
    offset_index=0
    df = train_df.loc[train_df.id==name]
    for index,row in df.iterrows():
        # Index of Offset need be less than length of offsets 
        if offset_index>=config.MAX_LEN: #MAX_LEN = len(offsets)
            break
        a = row.discourse_start
        b = row.discourse_end
        c = offsets[offset_index][0] # char_start
        d = offsets[offset_index][1] # char_end

        # The first token that falls inside the span gets the Begin label,
        # every later one the Inside label.
        beginning=True
        # Walk forward while the current token starts before the span ends.
        while b > c:
            if(c>=a)&(b>=d): # word in offset inside discourse start/end
                k = target_map[row.discourse_type]
                if beginning:
                    targets_b[k][ind][offset_index] = 1
                    beginning=False
                else:
                    targets_i[k][ind][offset_index] = 1
            offset_index += 1
            if offset_index>=config.MAX_LEN: # MAX_LEN = len(offsets)
                break
            c = offsets[offset_index][0]
            d = offsets[offset_index][1]

In [None]:
def get_tokens(data_name, file_name):
    """Read one essay from disk and tokenize it.

    Args:
        data_name: Sub-directory of the competition data ('train' or 'test').
        file_name: Essay id, i.e. the file name without the '.txt' suffix.

    Returns:
        The tokenizer's encode_plus output, padded / truncated to
        config.MAX_LEN and including 'offset_mapping' (the character start and
        end of every token).
    """
    file_path = f"../input/feedback-prize-2021/{data_name}/{file_name}.txt"
    # Context manager so the handle is closed even if reading fails.
    with open(file_path, 'r') as handle:
        txt = handle.read()
    # Return the encoding directly (it used to be wrapped in a 1-tuple by a
    # stray trailing comma and then unwrapped again with [0]).
    return tokenizer.encode_plus(txt,
                                 max_length=config.MAX_LEN,
                                 padding='max_length',
                                 truncation=True,
                                 return_offsets_mapping=True)

In [None]:
# Only runs when no pre-computed arrays are configured.
if LOAD_TOKENS_FROM is None:
    # FULL Training text
    for id_num in range(ids_len):
        file_name = IDS[id_num]
        # TOKEN ARRAYS
        # NOTE: `tokens` is a module-level name here, and target_arrays()
        # reads it from module scope rather than receiving it as an argument.
        tokens = get_tokens(data_name='train', 
                            file_name=file_name)
        train_tokens[id_num,] = tokens['input_ids']
        train_attentions[id_num,] = tokens['attention_mask']
        # FILLING TARGET ARRAYS
        target_arrays(id_num, file_name)
    # FILLING targets    
    # Pack everything into one (essays, MAX_LEN, 15) array: channel 2k is the
    # Begin label of class k, channel 2k+1 its Inside label.
    targets = np.zeros((len(IDS),config.MAX_LEN,15), dtype='int32')
    for k in range(7):
        targets[:,:,2*k] = targets_b[k]
        targets[:,:,2*k+1] = targets_i[k]
    # Channel 14 is 1 exactly where none of the 14 label channels is set.
    targets[:,:,14] = 1-np.max(targets,axis=-1)
    del targets_b, targets_i

In [None]:
# Persist freshly computed arrays, or load the pre-computed ones.
# np.save appends '.npy', which is why the load paths carry that suffix.
if LOAD_TOKENS_FROM is None:
    np.save(f'targets_{config.MAX_LEN}', targets)
    np.save(f'tokens_{config.MAX_LEN}', train_tokens)
    # Was `train_attention` (undefined name -> NameError); the array is
    # called `train_attentions` everywhere else.
    np.save(f'attention_{config.MAX_LEN}', train_attentions)
    print('Saved NER tokens')
else:
    targets = np.load(f'{LOAD_TOKENS_FROM}/targets_{config.MAX_LEN}.npy')
    train_tokens = np.load(f'{LOAD_TOKENS_FROM}/tokens_{config.MAX_LEN}.npy')
    train_attentions = np.load(f'{LOAD_TOKENS_FROM}/attention_{config.MAX_LEN}.npy')
    print('Loaded NER tokens')

# Build Model

In [None]:
def build_model():
    """Build and compile the token-classification model.

    Two int32 inputs of length config.MAX_LEN (token ids and attention mask)
    are fed to the pretrained transformer; its first output goes through a
    256-unit ReLU layer and a 15-way softmax computed in float32.

    The pretrained files are read from DOWNLOADED_MODEL_PATH (config.json and
    tf_model.h5). When DOWNLOADED_MODEL_PATH is None, config and weights are
    fetched by MODEL_NAME instead of failing on `None + '/config.json'`.

    Returns:
        The compiled tf.keras.Model. Its summary is printed as a side effect.
    """
    tokens = Input(shape=(config.MAX_LEN,), name='Tokens',dtype=tf.int32)
    attention = Input(shape=(config.MAX_LEN,), name='Attentions', dtype=tf.int32)

    if DOWNLOADED_MODEL_PATH is None:
        # No local copy configured: resolve both by hub id.
        configModel = AutoConfig.from_pretrained(MODEL_NAME)
        transformer = TFAutoModel.from_pretrained(MODEL_NAME, config=configModel)
    else:
        configModel = AutoConfig.from_pretrained(DOWNLOADED_MODEL_PATH+'/config.json')
        transformer = TFAutoModel.from_pretrained(DOWNLOADED_MODEL_PATH+'/tf_model.h5', config=configModel)

    x = transformer(tokens, attention_mask=attention)
    x = Dense(256, activation='relu')(x[0]) # final hidden activations
    # Keep the output layer in float32 regardless of mixed precision.
    x = Dense(15, activation='softmax', dtype='float32')(x)

    model = tf.keras.Model(inputs=[tokens, attention], outputs=x)
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=1e-4),
                  loss=[tf.keras.losses.CategoricalCrossentropy()],
                  metrics=[tf.keras.metrics.CategoricalAccuracy()])

    model.summary()

    return model

In [None]:
# Create the model inside the distribution strategy's scope, then draw its
# layer graph.
with strategy.scope():
    model = build_model()
tf.keras.utils.plot_model(model)

# Training Model

In [None]:
# Learning Rate function
def lrfn(epoch):
    """Return the learning rate for a zero-based epoch index.

    Rates come from config.LRS. An epoch index past the end of that list
    reuses the last listed rate, so training with more epochs than listed
    rates no longer stops with an IndexError.
    """
    last_index = len(config.LRS) - 1
    return config.LRS[min(epoch, last_index)]
lr_callback = tf.keras.callbacks.LearningRateScheduler(lrfn, verbose = True)

In [None]:
# Reproducible 90/10 split over essay row indices: a fixed seed, 90% of the
# indices sampled without replacement for training, the rest for validation.
np.random.seed(319)
train_ids = np.random.choice(np.arange(ids_len), int(0.9*ids_len), replace=False)
val_ids = np.setdiff1d(np.arange(ids_len), train_ids)
print('Train Size:', len(train_ids), ',Valid Size:', len(val_ids))

In [None]:
# LOAD MODEL
# A truthy LOAD_MODEL_FROM means: skip training and load saved weights.
if LOAD_MODEL_FROM:
    model.load_weights(f'{LOAD_MODEL_FROM}/long_v{VER}.h5')
    
# OR TRAIN MODEL
else:# Training Model
    # Fit on the training rows, validate on the held-out rows, with the
    # per-epoch learning rates supplied by lr_callback.
    model.fit(x=[train_tokens[train_ids,], train_attentions[train_ids,]],
              y=targets[train_ids,],
              validation_data=([train_tokens[val_ids,], train_attentions[val_ids,]],targets[val_ids,]),
              callbacks=[lr_callback],
              epochs=config.EPOCHS,
              batch_size=config.BATCH_SIZE,
              verbose=2)

    # SAVE MODEL WEIGHTS
    model.save_weights(f'long_v{VER}.h5')

# out-of-fold (OOF) predictions

In [None]:
# Predict on the held-out validation rows and keep, for every token, the
# index of the highest-scoring output channel.
p = model.predict([train_tokens[val_ids,], train_attentions[val_ids,]],
                  batch_size=16,
                  verbose=2)
print('OOF predictions shape:',p.shape)
oof_preds = np.argmax(p,axis=-1)

In [None]:
# Class number -> discourse type name; 7 stands for "no discourse element".
target_map_rev = dict(enumerate([
    'Lead',
    'Position',
    'Evidence',
    'Claim',
    'Concluding Statement',
    'Counterclaim',
    'Rebuttal',
    'blank',
]))

In [None]:
# GET ID function
def get_id(txt):
    """Tokenize an essay and locate the start of each of its words.

    Args:
        txt: Full essay text.

    Returns:
        A pair (w, off): w lists the character index at which every word
        starts, followed by the sentinel 1e6; off is the tokenizer's
        'offset_mapping' for the text padded / truncated to config.MAX_LEN.
    """
    encoded = tokenizer.encode_plus(txt,
                                    max_length=config.MAX_LEN,
                                    padding='max_length',
                                    truncation=True,
                                    return_offsets_mapping=True)

    # A word starts at the first non-separator character after a separator
    # (or at the very beginning of the text).
    separators = (' ', '\n', '\xa0', '\x85')
    word_starts = []
    after_separator = True
    for position, character in enumerate(txt):
        if character in separators:
            after_separator = True
        elif after_separator:
            word_starts.append(position)
            after_separator = False
    # Sentinel larger than any character offset, so look-ups of "the next
    # word start" never run off the end of the list.
    word_starts.append(1e6)
    return word_starts, encoded['offset_mapping']

In [None]:
def get_preds(data_name, preds, text_ids):
    """Convert per-token class predictions into word-index prediction strings.

    Args:
        data_name: Sub-directory of the competition data ('train' or 'test').
        preds: Integer array with one row of config.MAX_LEN channel indices
            (0-14) per essay.
        text_ids: Essay ids, aligned with the rows of preds.

    Returns:
        DataFrame with columns 'id', 'class' and 'predictionstring', one row
        per predicted span of more than 4 words. The frame is empty, but still
        has these three columns, when no span qualifies.
    """
    all_predictions = []
    for ind in range(len(preds)):
        # GET ID
        name = text_ids[ind]
        # READ THE ESSAY (context manager closes the handle)
        name_path = f'../input/feedback-prize-2021/{data_name}/{name}.txt'
        with open(name_path,'r') as handle:
            text = handle.read()
        # GET WORD STARTS AND TOKEN POSITIONS IN CHARS
        ws, offs = get_id(text)
        # CONVERT TOKEN PREDICTIONS INTO WORD LABELS
        ### KEY: ###
        # 0: LEAD_B, 1: LEAD_I
        # 2: POSITION_B, 3: POSITION_I
        # 4: EVIDENCE_B, 5: EVIDENCE_I
        # 6: CLAIM_B, 7: CLAIM_I
        # 8: CONCLUSION_B, 9: CONCLUSION_I
        # 10: COUNTERCLAIM_B, 11: COUNTERCLAIM_I
        # 12: REBUTTAL_B, 13: REBUTTAL_I
        # 14: NOTHING i.e. O
        ### NOTE THESE VALUES ARE DIVIDED BY 2 IN NEXT CODE LINE
        # After halving, a Begin token of class k is k and its Inside tokens
        # are k + 0.5.
        pred = preds[ind,]/2.0
        # MAPPING FROM TOKENS TO WORDS (-1 = token belongs to no word)
        word_map = -1 * np.ones(config.MAX_LEN,dtype='int32')
        w_i = 0
        for i in range(len(offs)):
            # Tokens whose end offset is 0 carry no text; leave them at -1.
            if offs[i][1]==0: continue
            while offs[i][0]>=ws[w_i+1]: w_i += 1
            word_map[i] = int(w_i)
        i = 0
        while i<config.MAX_LEN:
            prediction = []
            start = pred[i]
            if start in [0,1,2,3,4,5,6,7]:
                prediction.append(word_map[i])
                i += 1
                if i>=config.MAX_LEN: break
                # Extend the span over the Inside tokens of the same class.
                while pred[i]==start+0.5:
                    if not word_map[i] in prediction:
                        prediction.append(word_map[i])
                    i += 1
                    if i>=config.MAX_LEN: break
            else:
                i += 1
            prediction = [x for x in prediction if x!=-1]
            # Spans of 4 words or fewer are discarded.
            if len(prediction)>4:
                res = (name, target_map_rev[int(start)], ' '.join([str(x) for x in prediction]) )
                all_predictions.append(res)
    # MAKE DATAFRAME
    # Passing the column names to the constructor also works for an empty
    # list; assigning three names to an empty frame afterwards raises.
    return pd.DataFrame(all_predictions, columns=['id','class','predictionstring'])

In [None]:
# MAKE DATAFRAME
# Turn the validation token predictions into span rows; text_ids are the
# essay ids of the validation rows, in the same order as oof_preds.
oof = get_preds(data_name ='train',
                preds=oof_preds, 
                text_ids=IDS[val_ids])

In [None]:
# Preview the first predicted spans.
oof.head()

# Compute Validation Metric

In [None]:
# CODE FROM : Rob Mulla @robikscube
# https://www.kaggle.com/robikscube/student-writing-competition-twitch
def calc_overlap(row):
    """
    Compute the two overlap ratios between a prediction and a ground truth.

    `row` must expose `predictionstring_pred` and `predictionstring_gt`, each
    a string of tokens separated by single spaces. Returns a two-element list:
    the shared-token count divided by the number of distinct ground-truth
    tokens, and the same count divided by the number of distinct predicted
    tokens.
    """
    pred_words = set(row.predictionstring_pred.split(' '))
    gt_words = set(row.predictionstring_gt.split(' '))
    shared = len(gt_words & pred_words)
    return [shared / len(gt_words), shared / len(pred_words)]


def score_feedback_comp(pred_df, gt_df):
    """
    A function that scores for the kaggle
        Student Writing Competition

    Uses the steps in the evaluation page here:
        https://www.kaggle.com/c/feedback-prize-2021/overview/evaluation

    Args:
        pred_df: DataFrame with columns 'id', 'class', 'predictionstring'.
        gt_df: DataFrame with columns 'id', 'discourse_type',
            'predictionstring'.

    Returns:
        TP / (TP + 0.5 * (FP + FN)) as a float, or 0.0 when there is nothing
        to score (no predictions and no ground truths).
    """
    gt_df = gt_df[['id','discourse_type','predictionstring']] \
        .reset_index(drop=True).copy()
    pred_df = pred_df[['id','class','predictionstring']] \
        .reset_index(drop=True).copy()
    pred_df['pred_id'] = pred_df.index
    gt_df['gt_id'] = gt_df.index
    # Step 1. all ground truths and predictions for a given class are compared.
    joined = pred_df.merge(gt_df,
                           left_on=['id','class'],
                           right_on=['id','discourse_type'],
                           how='outer',
                           suffixes=('_pred','_gt')
                          )
    if joined.empty:
        return 0.0
    # Rows that exist on one side only get a blank string, which overlaps
    # with nothing.
    joined['predictionstring_gt'] = joined['predictionstring_gt'].fillna(' ')
    joined['predictionstring_pred'] = joined['predictionstring_pred'].fillna(' ')

    joined['overlaps'] = joined.apply(calc_overlap, axis=1)

    # 2. If the overlap between the ground truth and prediction is >= 0.5,
    # and the overlap between the prediction and the ground truth >= 0.5,
    # the prediction is a match and considered a true positive.
    # If multiple matches exist, the match with the highest pair of overlaps is taken.
    # The overlaps are already lists; index them directly.
    joined['overlap1'] = joined['overlaps'].apply(lambda pair: pair[0])
    joined['overlap2'] = joined['overlaps'].apply(lambda pair: pair[1])

    joined['potential_TP'] = (joined['overlap1'] >= 0.5) & (joined['overlap2'] >= 0.5)
    joined['max_overlap'] = joined[['overlap1','overlap2']].max(axis=1)
    candidates = joined.loc[joined['potential_TP']]
    tp_pred_ids = candidates \
        .sort_values('max_overlap', ascending=False) \
        .groupby(['id','predictionstring_gt'])['pred_id'].first().values

    # 3. Any unmatched ground truths are false negatives
    # and any unmatched predictions are false positives.
    # The outer join leaves NaN in pred_id / gt_id for rows present on one
    # side only. NaN never compares equal to anything, so without dropna()
    # it was counted as one extra false positive / false negative.
    tp_lookup = set(tp_pred_ids)
    fp_pred_ids = [p for p in joined['pred_id'].dropna().unique() if p not in tp_lookup]

    matched_gt_ids = set(candidates['gt_id'].unique())
    unmatched_gt_ids = [c for c in joined['gt_id'].dropna().unique() if c not in matched_gt_ids]

    # Get numbers of each type
    TP = len(tp_pred_ids)
    FP = len(fp_pred_ids)
    FN = len(unmatched_gt_ids)
    #calc microf1
    denominator = TP + 0.5*(FP+FN)
    if denominator == 0:
        return 0.0
    my_f1_score = TP / denominator
    return my_f1_score

In [None]:
# VALID DATAFRAME
# Ground-truth annotations of the validation essays only.
valid = train_df.loc[train_df['id'].isin(IDS[val_ids])]

In [None]:
# Score each class separately, then average the per-class F1 values.
# NOTE(review): CLASSES comes from the predictions, so a class that was never
# predicted is left out of the average instead of contributing 0 -- confirm
# this is intended.
f1s = []
CLASSES = oof['class'].unique()
for c in CLASSES:
    pred_df = oof.loc[oof['class']==c].copy()
    gt_df = valid.loc[valid['discourse_type']==c].copy()
    f1 = score_feedback_comp(pred_df, gt_df)
    print(c,f1)
    f1s.append(f1)
print()
print('Overall',np.mean(f1s))

# Infer Test Data

In [None]:
# GET TEST TEXT IDS
files = os.listdir('../input/feedback-prize-2021/test')
# Keep only names that end in '.txt' and strip just that suffix. The previous
# substring test / replace also matched 'txt' in the middle of a file name.
TEST_IDS = [os.path.splitext(f)[0] for f in files if f.endswith('.txt')]
test_len = len(TEST_IDS)
print('There are',test_len,'test texts.')

In [None]:
# CONVERT TEST TEXT TO TOKENS
# Token ids and attention masks for the test essays, one row per essay.
test_tokens = np.zeros((test_len, config.MAX_LEN), dtype='int32')
test_attentions = np.zeros((test_len, config.MAX_LEN), dtype='int32')

for id_num in range(test_len):
    # READ TEST TEXT, TOKENIZE AND SAVE IN TOKEN ARRAYS
    name = TEST_IDS[id_num]
    tokens = get_tokens(data_name='test',
                             file_name=name)
    test_tokens[id_num,]= tokens['input_ids']
    test_attentions[id_num,] = tokens['attention_mask']

In [None]:
# INFER TEST TEXTS
# Predict on the test essays and keep, for every token, the index of the
# highest-scoring output channel. Note that this rebinds `p`.
p = model.predict([test_tokens, test_attentions], batch_size=16, verbose=2)
print('Test predictions shape:',p.shape)
test_preds = np.argmax(p,axis=-1)

# THRESHOLD
Drop predicted spans whose word count falls below the 2nd percentile of the training-span lengths for their discourse type.

In [None]:
# Add a word-count column to the predictions and to the annotations.
# Both frames are modified in place.
oof['len'] =oof['predictionstring'].apply(lambda x: len(x.split()))
train_df['len'] = train_df['predictionstring'].apply(lambda x: len(x.split()))
# Describe in Percentile
# (per discourse type; the 2% row is the basis for the thresholds used below)
train_df.groupby('discourse_type')['len'].describe(percentiles=[0.02])

In [None]:
def threshold_2percent(df):
    """Drop predicted spans that are shorter than their class minimum.

    Args:
        df: DataFrame with a 'class' column (discourse type name) and a 'len'
            column (number of words in the span).

    Returns:
        A new DataFrame containing the rows of `df` whose 'len' is at least
        the minimum for their class. The original index labels are kept and
        `df` itself is not modified.
    """
    # Minimum word count per class (per the notebook, the 2% length cut-off).
    map_clip = {'Lead':9, 'Position':5, 'Evidence':14, 'Claim':3, 'Concluding Statement':11,
             'Counterclaim':6, 'Rebuttal':4}
    # Filter the frame that was passed in; the argument used to be ignored in
    # favour of the global `oof`.
    min_len = df['class'].map(map_clip)
    # A class missing from map_clip maps to NaN; `len < NaN` is False, so
    # such rows are kept, exactly as before.
    too_short = df['len'] < min_len
    return df.loc[~too_short].copy()

In [None]:
# Re-score the validation predictions after removing too-short spans.
# NOTE(review): as in the earlier scoring cell, only classes that still have
# predictions enter the average.
oof2 = threshold_2percent(oof)
f1s = []
CLASSES = oof2['class'].unique()
for c in CLASSES:
    pred_df = oof2.loc[oof2['class']==c].copy()
    gt_df = valid.loc[valid['discourse_type']==c].copy()
    f1 = score_feedback_comp(pred_df, gt_df)
    print(c,f1)
    f1s.append(f1)
print()
print('Overall',np.mean(f1s))

# Submission

In [None]:
# Build the submission from the test predictions and write it to disk.
# NOTE(review): the length threshold evaluated above is not applied here.
sub = get_preds(data_name='test', 
                preds=test_preds, 
                text_ids=TEST_IDS)
sub.to_csv('submission.csv', index=False)
sub.head()

# Reference

https://www.kaggle.com/phanttan/bert-distilbert-fine-tune

https://www.kaggle.com/phanttan/student-writing-competition-twitch-stream

https://www.kaggle.com/vuxxxx/tensorflow-longformer-ner-postprocessing