# Feedback prize - Inference
This notebook is used to make the Kaggle submission. The final test result is the score on the Kaggle private leaderboard.

In [None]:
# Import packages
import numpy as np
import pandas as pd 
import os
import tensorflow as tf
from transformers import *

In [None]:
# Read train.csv into a data frame
data_df = pd.read_csv('../input/feedback-prize-2021/train.csv')

# Array of the distinct values found in the `id` column
all_id = data_df['id'].unique()

In [None]:
# Folder holding the pretrained model; the token files live in the same folder
downloaded_model_path = '../input/longformerbase4096'
load_tokens_from = '../input/longformerbase4096'

# Folder holding the NER target files
NER_target = '../input/ner-target-for-feedback-prize-competition'

# Number of tokens every essay is padded / truncated to before it is fed to the model
max_len = 1024

# Tokenizer loaded from the pretrained model folder
tokenizer = AutoTokenizer.from_pretrained(downloaded_model_path)

In [None]:
def build_model():
    """
    Assemble and compile the token-classification network.

    The network takes two int32 inputs of length `max_len` ('tokens' and
    'attention'), runs them through the pretrained backbone found in
    `downloaded_model_path`, and puts a Dense(512, relu) -> Dropout(0.2) ->
    Dense(15, softmax) head on the backbone's first output.

    Return:
    model -- compiled tf.keras.Model producing 15 class probabilities per token
    """
    layers = tf.keras.layers

    # Model inputs: token ids and the matching attention mask
    input_ids = layers.Input(shape=(max_len,), name='tokens', dtype=tf.int32)
    input_mask = layers.Input(shape=(max_len,), name='attention', dtype=tf.int32)

    # Pretrained backbone (configuration + TensorFlow weights)
    backbone_config = AutoConfig.from_pretrained(downloaded_model_path + '/config.json')
    backbone = TFAutoModel.from_pretrained(downloaded_model_path + '/tf_model.h5',
                                           config=backbone_config)

    # Classification head on top of the backbone's first output
    sequence_output = backbone(input_ids, attention_mask=input_mask)[0]
    hidden = layers.Dense(512, activation='relu')(sequence_output)
    hidden = layers.Dropout(0.2)(hidden)
    # The output layer is kept in float32
    class_probs = layers.Dense(15, activation='softmax', dtype='float32')(hidden)

    model = tf.keras.Model(inputs=[input_ids, input_mask], outputs=class_probs)
    model.compile(
        optimizer='adam',
        loss=['categorical_crossentropy'],
        metrics=['categorical_accuracy'],
    )

    return model

In [None]:
# Convert test text to tokens
files = os.listdir('../input/feedback-prize-2021/test')
# Keep only real '.txt' files and strip just the extension. A substring test
# ('txt' in name) would also accept names such as 'txt_notes.csv', and
# str.replace would remove '.txt' from the middle of a name.
test_ids = [os.path.splitext(f)[0] for f in files if f.endswith('.txt')]

# One row per essay, padded / truncated to max_len tokens
test_tokens = np.zeros((len(test_ids), max_len), dtype = 'int32')
test_attention = np.zeros((len(test_ids), max_len), dtype = 'int32')

for id_num, n in enumerate(test_ids):
    name = f'../input/feedback-prize-2021/test/{n}.txt'
    # Context manager so the file handle is closed right after reading
    with open(name, 'r') as essay_file:
        txt = essay_file.read()
    tokens = tokenizer.encode_plus(txt, max_length = max_len, padding = 'max_length',
                                   truncation = True, return_offsets_mapping = True)
    test_tokens[id_num] = tokens['input_ids']
    test_attention[id_num] = tokens['attention_mask']

In [None]:
# Set to True to predict with a single model; set to False to use the ensemble.
single = False

In [None]:
# Folder holding the trained fold checkpoints
weights_dir = '../input/trained-model-for-feedback-prize-competition'

if single:
    # Single-model inference: per-token class probabilities from one checkpoint
    model = build_model()
    model.load_weights(f'{weights_dir}/final_fold5_0.636.h5')
    test_preds = model.predict([test_tokens, test_attention], batch_size = 16, verbose = 1)
else:
    # Ensemble inference: one model per fold checkpoint
    fold_weights = ['final_fold1_0.634.h5',
                    'final_fold2_0.631.h5',
                    'final_fold3_0.621.h5',
                    'final_fold4_0.619.h5',
                    'final_fold5_0.636.h5']

    model_list = []
    for weight_file in fold_weights:
        md = build_model()
        md.load_weights(f'{weights_dir}/{weight_file}')
        model_list.append(md)

    # Keep the individual names available for any cell that refers to them
    model1, model2, model3, model4, model5 = model_list

    # Average the class probabilities over all models; dividing by the actual
    # number of models keeps the average correct if the checkpoint list changes
    test_preds = np.zeros((len(test_ids), max_len, 15))
    for md in model_list:
        test_preds += md.predict([test_tokens, test_attention], batch_size = 16, verbose = 1)/len(model_list)

# Most probable class id for every token
test_preds = np.argmax(test_preds, axis = -1)

In [None]:
# Optimum threshold per discourse type: get_preds keeps a predicted span only
# when it contains more words than the value listed here
thredshold = {
    'Lead': 6,
    'Position': 4,
    'Evidence': 9,
    'Claim': 1,
    'Concluding Statement': 8,
    'Counterclaim': 5,
    'Rebuttal': 4,
}

In [None]:
def _read_essay(dataset, text_id, augmented):
    """Return the essay text, falling back to the augmented folder when one is given."""
    try:
        with open(f'../input/feedback-prize-2021/{dataset}/{text_id}.txt', 'r') as essay_file:
            return essay_file.read()
    except FileNotFoundError:
        # Without an augmented folder there is nowhere else to look: report the
        # real missing path instead of trying '.../None/<id>.txt'
        if augmented is None:
            raise
        with open(f'../input/data-augmented/{augmented}/{text_id}.txt', 'r') as essay_file:
            return essay_file.read()


def _word_start_positions(txt):
    """Return the character index where each whitespace-separated word starts, plus an end sentinel."""
    word_pos = []
    in_gap = True  # True while we are before the text or inside whitespace
    for i, ch in enumerate(txt):
        if ch.isspace():
            in_gap = True
        elif in_gap:
            # First non-space character after a gap: a new word starts here
            word_pos.append(i)
            in_gap = False
    # Sentinel so that "start of the next word" always exists
    word_pos.append(float('inf'))
    return word_pos


def _map_tokens_to_words(offsets, word_pos, n_tokens):
    """Return an int32 array giving the word index of every token (-1 for tokens with no text)."""
    word_map = -1*np.ones(n_tokens, dtype = 'int32')
    w_i = 0
    for i, (tok_start, tok_end) in enumerate(offsets):
        if tok_end == 0:  # Token whose offset ends at 0: it covers no text
            continue
        # Advance to the last word that starts at or before this token
        while tok_start >= word_pos[w_i + 1]:
            w_i += 1
        word_map[i] = w_i
    return word_map


def _is_begin(label):
    """True for a halved 'B' label (0, 1, ..., 6)."""
    return 0 <= label < 7 and float(label).is_integer()


def _is_inside(label):
    """True for a halved 'I' label (0.5, 1.5, ..., 6.5)."""
    return 0 <= label < 7 and label % 1 == 0.5


def _smooth_labels(pred):
    """Repair single-token label glitches in the halved label array, in place."""
    last = len(pred) - 1

    # If we see tokens I-X, ?, I-X -> change ? to I-X.
    # The test is `% 1`: with `% 2` only 0.5, 2.5, 4.5 and 6.5 qualified, so
    # Position, Claim and Counterclaim were silently skipped.
    for j in range(1, last):
        left = pred[j - 1]
        if left == pred[j + 1] and _is_inside(left) and pred[j] != left:
            pred[j] = left

    # B-X, ? (not B), I-X -> change ? to I-X
    for j in range(1, last):
        left, right = pred[j - 1], pred[j + 1]
        if _is_begin(left) and right == left + 0.5 \
        and pred[j] != right and not _is_begin(pred[j]):
            pred[j] = right

    # If we see tokens I-X, O, I-X, change the center token to I-X
    for j in range(1, last):
        left = pred[j - 1]
        if _is_inside(left) and left == pred[j + 1] and pred[j] == 7:
            pred[j] = left


def _extract_spans(pred, word_map):
    """Return a list of (begin_label, word_indices) for every B-X followed by its run of I-X tokens."""
    spans = []
    n_tokens = min(len(pred), len(word_map))
    i = 0
    while i < n_tokens:
        start = pred[i]
        # A span can only start on a 'B' token
        if not _is_begin(start):
            i += 1
            continue

        words = [word_map[i]]
        i += 1
        # Consume the 'I' tokens of the same discourse type
        while i < n_tokens and pred[i] == start + 0.5:
            if word_map[i] not in words:
                words.append(word_map[i])
            i += 1

        # Drop tokens that do not belong to any word
        spans.append((start, [w for w in words if w != -1]))
    return spans


def get_preds(dataset, text_ids, preds, thredshold, augmented = None):

    """
    Turn token-level class predictions into word-level discourse spans.

    For every essay the text is re-tokenized with the module-level `tokenizer`
    (padded / truncated to `max_len`) to recover character offsets, tokens are
    mapped to whitespace-separated word indices, isolated label glitches are
    smoothed, and every B-X / I-X run is converted into a space-separated
    string of word indices.

    Arguments:
    dataset -- name of folder contain text files: 'train' or 'test'
    text_ids -- array or list of essay id, aligned with `preds`
    preds -- the array contain predicted class for each token (0..14, see the
             label table inside the function), one row per essay
    thredshold -- dict mapping discourse type name to a word count; a span is
                  kept only if it has strictly more words than this value
    augmented -- name of augmented text folder: 'augmented_back', 'augmented_roberta', 'augmented_syn'.
                 Used only when the essay is not found in `dataset`

    Return:
    df -- result data frame with columns 'id', 'class', 'predictionstring'
          (empty, but with these columns, when no span is accepted)

    Raises:
    FileNotFoundError -- if an essay file is found neither in `dataset` nor in
                         the `augmented` folder
    """
    all_predictions = []
    target_map_rev = {0: 'Lead', 1: 'Position', 2: 'Evidence', 3: 'Claim', 4: 'Concluding Statement',
                      5: 'Counterclaim', 6: 'Rebuttal', 7: 'blank'}

    for id_num, token_classes in enumerate(preds):

        n = text_ids[id_num] # Text id

        # Tokenize the text again to get the character offsets of every token
        txt = _read_essay(dataset, n, augmented)
        tokens = tokenizer.encode_plus(txt, max_length = max_len, padding = 'max_length',
                                   truncation = True, return_offsets_mapping = True)

        # Mapping from tokens to words
        word_map = _map_tokens_to_words(tokens['offset_mapping'],
                                        _word_start_positions(txt), max_len)

        # Convert token predictions into word labels
        # 0: lead_b, 1: lead_i
        # 2: position_b, 3: position_i
        # 4: evidence_b, 5: evidence_i
        # 6: claim_b, 7: claim_i
        # 8: conclusion_b, 9: conclusion_i
        # 10: counterclaim_b, 11: counterclaim_i
        # 12: rebuttal_b, 13: rebuttal_i
        # 14: nothing (o)
        # After halving: X.0 is B-X, X.5 is I-X and 7.0 is "nothing".
        # The division creates a new array, so `preds` itself is not modified.
        pred = np.asarray(token_classes)/2
        _smooth_labels(pred)

        for start, prediction in _extract_spans(pred, word_map):
            # Only accept if length of discourse larger than a threshold
            discourse_type = target_map_rev[int(start)]
            if len(prediction) > thredshold[discourse_type]:
                all_predictions.append((n, discourse_type, ' '.join([str(x) for x in prediction])))

    # Make dataframe; passing the column names to the constructor keeps this
    # working when no span was accepted (assigning three names to an empty
    # frame afterwards raises ValueError)
    df = pd.DataFrame(all_predictions, columns = ['id', 'class', 'predictionstring'])

    return df

In [None]:
# Build the submission frame from the test predictions and show its first rows
sub = get_preds(
    dataset='test',
    text_ids=test_ids,
    preds=test_preds,
    thredshold=thredshold,
)
sub.head()

In [None]:
# Write the submission file; the data frame index is not written
sub.to_csv('submission.csv', index = False)