In [None]:
import tensorflow as tf
print(tf.__version__)
import transformers
import numpy as np
import pandas as pd
import os
import re
import glob
from tqdm import tqdm
import math
import timeit

# Report whether TensorFlow can see a GPU. Exactly one of the two messages is
# printed; previously the "Found GPU" line was printed even when no GPU
# device name was returned.
device_name = tf.test.gpu_device_name()
if "GPU" in device_name:
    print('Found GPU at: {}'.format(device_name))
else:
    print("GPU device not found")

In [None]:
# Load the discourse annotations and cast the id / character-offset columns
# to integers.
train = pd.read_csv(os.path.join("../input/feedback-prize-2021/", "train.csv"))
train[['discourse_id', 'discourse_start', 'discourse_end']] = (
    train[['discourse_id', 'discourse_start', 'discourse_end']].astype(int)
)

# Whitespace-separated word counts of the discourse text and of its
# predictionstring.
train["discourse_len"] = [len(text.split()) for text in train["discourse_text"]]
train["pred_len"] = [len(pred.split()) for pred in train["predictionstring"]]

train_txt = glob.glob('../input/feedback-prize-2021/train/*.txt') 

cols_to_display = ['discourse_id', 'discourse_text', 'discourse_type','predictionstring', 'discourse_len', 'pred_len']
# Not the last expression of its cell, so the notebook does not render it.
train[cols_to_display].head()

# this code chunk is copied from Rob Mulla
# Map every essay id (file name without ".txt") to its stripped character
# length and to its whitespace-separated word count.
len_dict = {}
word_dict = {}
for txt_path in tqdm(train_txt):
    essay_id = txt_path.split("/")[-1].replace(".txt", "")
    with open(txt_path, "r") as txt_file:
        essay_text = txt_file.read()
    len_dict[essay_id] = len(essay_text.strip())
    word_dict[essay_id] = len(essay_text.split())

train["essay_len"] = train["id"].map(len_dict)
train["essay_words"] = train["id"].map(word_dict)

# Essay ids in order of first appearance in train.csv.
data_ids = train['id'].unique()

In [None]:
# Compute, for every discourse row, the number of unannotated characters that
# precede it ("gap_length"), vectorised instead of a row-by-row .loc loop.
#
# Compare each row with the row directly above it.
prev_id = train["id"].shift()
prev_end = train["discourse_end"].shift()
same_essay = train["id"].eq(prev_id)  # False for the very first row
distance = train["discourse_start"] - prev_end

# gap inside an essay: previous end and current start are more than 1 apart
# (minus 2 as the previous end is always -1 and the previous start always +1)
inner_gap = same_essay & distance.gt(1)
# gap at the start of an essay: its first discourse does not start at 0.
# This also covers the first row of the frame, which used to be hard-coded
# to 7 instead of being derived from its own discourse_start.
leading_gap = ~same_essay & train["discourse_start"].ne(0)

train["gap_length"] = np.select(
    [inner_gap, leading_gap],
    [distance - 2, train["discourse_start"] - 1],
    default=np.nan,
)

# is there any text after the last discourse of an essay?
# Drop a gap_end_length left over from a previous run of this cell so that
# the merge below does not produce _x/_y suffixed duplicates.
train = train.drop(columns=["gap_end_length"], errors="ignore")
# .copy() so that the new column is written to an independent frame rather
# than to a slice of `train` (avoids SettingWithCopyWarning).
last_ones = train.drop_duplicates(subset="id", keep="last").copy()
last_ones["gap_end_length"] = np.where(
    last_ones["discourse_end"] < last_ones["essay_len"],
    last_ones["essay_len"] - last_ones["discourse_end"],
    np.nan,
)

cols_to_merge = ['id', 'discourse_id', 'gap_end_length']
train = train.merge(last_ones[cols_to_merge], on=["id", "discourse_id"], how="left")

#display an example
cols_to_display = ['id', 'discourse_start', 'discourse_end', 'discourse_type', 'essay_len', 'gap_length', 'gap_end_length']
train[cols_to_display].query('id == "AFEC37C2D43F"')

In [None]:
def prepare_text_data(file_name):
    """Read one training essay and POS-tag it paragraph by paragraph.

    Parameters
    ----------
    file_name : str
        Essay id, i.e. the file name without the ".txt" extension inside
        ../input/feedback-prize-2021/train/.

    Returns
    -------
    list
        One entry per paragraph (the text is split on a blank line, "\\n\\n");
        each entry is the result of nltk.pos_tag(nltk.word_tokenize(paragraph)).
    """
    # nltk is not imported by the notebook's import cell, so calling this
    # function used to raise NameError; import it where it is needed.
    import nltk

    with open(f"../input/feedback-prize-2021/train/{file_name}.txt") as f:
        txt = f.read()
    return [nltk.pos_tag(nltk.word_tokenize(parag)) for parag in re.split("\n\n", txt)]

def add_gap_rows(essay):
    """Return the discourse rows of one essay with "Nothing" rows for the gaps.

    Reads the module-level ``train`` frame, keeps the rows whose ``id`` equals
    ``essay`` and inserts an extra row of type "Nothing" for

    * every row whose ``gap_length`` is > 0 (text between the previous
      discourse, or the start of the essay, and this discourse), and
    * the text after the last discourse when its ``gap_end_length`` is > 0.

    Parameters
    ----------
    essay : str
        Essay id to look up in ``train["id"]``.

    Returns
    -------
    pandas.DataFrame
        Columns discourse_start, discourse_end, discourse_type, gap_length and
        gap_end_length. Rows are sorted by discourse_start, with the optional
        trailing gap row appended last. Inserted rows have NaN in both gap
        columns. An essay without any rows yields an empty frame.
    """
    cols_to_keep = ['discourse_start', 'discourse_end', 'discourse_type', 'gap_length', 'gap_end_length']
    df_essay = train.query('id == @essay')[cols_to_keep].reset_index(drop=True)
    if df_essay.empty:
        # nothing to pad; also avoids looking up row -1 below
        return df_essay

    # label of the next row to append
    insert_row = len(df_essay)

    # Start at row 0: the loop used to start at 1, which made the first-row
    # case unreachable and silently dropped the gap before the first discourse.
    for i in range(len(df_essay)):
        if df_essay.loc[i, "gap_length"] > 0:
            # there is no previous row for the first discourse, so its gap
            # starts at the beginning of the essay
            start = 0 if i == 0 else df_essay.loc[i - 1, "discourse_end"] + 1
            end = df_essay.loc[i, "discourse_start"] - 1
            df_essay.loc[insert_row] = [start, end, "Nothing", np.nan, np.nan]
            insert_row += 1
    df_essay = df_essay.sort_values(by="discourse_start").reset_index(drop=True)

    #add gap at end
    last = len(df_essay) - 1
    if df_essay.loc[last, "gap_end_length"] > 0:
        start = df_essay.loc[last, "discourse_end"] + 1
        end = start + df_essay.loc[last, "gap_end_length"]
        # insert_row equals len(df_essay) here, so this appends a new row
        df_essay.loc[insert_row] = [start, end, "Nothing", np.nan, np.nan]

    return df_essay

def prepare_train_data(essay):
    """Cut one training essay into [text, discourse_type] pairs.

    The spans come from add_gap_rows(essay); each span is sliced out of the
    essay file as text[discourse_start:discourse_end].

    Parameters
    ----------
    essay : str
        Essay id (file name without ".txt").

    Returns
    -------
    list of [str, str]
        One [text, discourse_type] pair per row of the gap-filled frame.
    """
    spans = add_gap_rows(essay)
    #code from https://www.kaggle.com/odins0n/feedback-prize-eda, but adjusted to df_essay
    with open("../input/feedback-prize-2021/train/" + essay + ".txt", 'r') as handle:
        text = handle.read()

    return [
        [text[int(first):int(last)], kind]
        for first, last, kind in zip(
            spans['discourse_start'], spans['discourse_end'], spans['discourse_type']
        )
    ]

In [None]:
from sklearn.metrics import accuracy_score

# Initialize the tokenizer from the local bert-base-uncased directory.
# model_name is a module-level path that build_model() reuses to load the
# matching TFBertModel.
model_name = "../input/huggingface-bert-variants/bert-base-uncased/bert-base-uncased"
bert_tokenizer = transformers.BertTokenizer.from_pretrained(model_name)

# Discourse type -> integer class id. The position in the list is the id;
# "Nothing" (id 7) labels the gap rows added between annotated discourses.
dtype_to_num = {
    name: index
    for index, name in enumerate([
        "Lead",
        "Position",
        "Claim",
        "Counterclaim",
        "Rebuttal",
        "Evidence",
        "Concluding Statement",
        "Nothing",
    ])
}

# Inverse lookup: integer class id -> discourse type.
num_to_dtype = dict(zip(dtype_to_num.values(), dtype_to_num.keys()))

# Number of output classes of the classifier.
num_class_type = len(dtype_to_num)

In [None]:
def collect_train_sets(max_data_size=14000, max_length=50, max_piece_length=100):
    """Build the (text, label) training lines from the first essays in data_ids.

    Every [text, discourse_type] pair returned by prepare_train_data() is
    handled as follows (sizes are counted with bert_tokenizer.tokenize):

    * at most ``max_length`` tokens: kept as one line;
    * otherwise split on ".", "?" and newline; every piece with at most
      ``max_piece_length`` tokens is kept as one line;
    * longer pieces are split on "," and every resulting part is kept,
      whatever its size.

    All lines derived from one text get that text's label. Empty pieces
    produced by the splits are kept, as before.

    Parameters
    ----------
    max_data_size : int, default 14000
        Upper bound on the number of essays to read. It is clamped to
        len(data_ids), so a smaller data set no longer raises IndexError.
    max_length : int, default 50
        Token count up to which a text is kept unsplit.
    max_piece_length : int, default 100
        Token count up to which a sentence-level piece is kept unsplit.

    Returns
    -------
    list
        [train_lines, train_labels, max_length] where the returned max_length
        is the largest token count among ALL collected lines. Previously texts
        kept whole were not counted, so the value could be smaller than a
        collected line.
    """
    # NOTE(review): sizes come from tokenize() alone; if encode_plus adds
    # special tokens on top of these, the longest line would still be
    # truncated at this length - confirm whether a margin is wanted.
    train_lines = []
    train_labels = []
    max_sentence_size = 0

    n_essays = min(max_data_size, len(data_ids))
    for i in tqdm(range(n_essays)):
        for text, dtype in prepare_train_data(data_ids[i]):
            label = dtype_to_num[dtype]
            n_tokens = len(bert_tokenizer.tokenize(text))

            if n_tokens <= max_length:
                pieces = [(text, n_tokens)]
            else:
                pieces = []
                for sentence in re.split("[.?\n]", text):
                    n_sentence = len(bert_tokenizer.tokenize(sentence))
                    if n_sentence <= max_piece_length:
                        pieces.append((sentence, n_sentence))
                    else:
                        # still too long: fall back to comma-separated clauses
                        for clause in re.split("[,]", sentence):
                            pieces.append((clause, len(bert_tokenizer.tokenize(clause))))

            for piece, size in pieces:
                train_lines.append(piece)
                train_labels.append(label)
                max_sentence_size = max(max_sentence_size, size)

    print(max_sentence_size)
    return [train_lines, train_labels, max_sentence_size]

In [None]:
import sys
import pandas as pd

# Debugging aid: print a table of every name returned by dir() in this scope
# together with sys.getsizeof() of the object that name evaluates to, sorted
# from largest to smallest.
# NOTE(review): sys.getsizeof presumably reports only an object's own
# footprint, not the data it references, so container sizes may be
# understated - confirm before relying on these numbers.
print(pd.DataFrame([[val for val in dir()], [sys.getsizeof(eval(val)) for val in dir()]],
                   index=['name','size']).T.sort_values('size', ascending=False).reset_index(drop=True))

In [None]:
def make_train_data(datas):
    """Encode a sequence of texts into fixed-width BERT input arrays.

    Each text is encoded with bert_tokenizer.encode_plus, truncated and padded
    to the module-level ``max_length``.

    Parameters
    ----------
    datas : sequence of str
        Texts to encode.

    Returns
    -------
    list of numpy.ndarray
        [input_ids, attention_mask, token_type_ids], each an int32 array of
        shape (len(datas), max_length).
    """
    shape = (len(datas), max_length)

    input_ids = np.zeros(shape, dtype="int32")
    attention_mask = np.zeros(shape, dtype="int32")
    token_type_ids = np.zeros(shape, dtype="int32")

    for row, text in enumerate(datas):
        # padding="max_length" is the replacement for the deprecated
        # pad_to_max_length=True flag.
        encoded = bert_tokenizer.encode_plus(text,
                                             max_length=max_length,
                                             padding="max_length",
                                             truncation=True)
        input_ids[row] = encoded["input_ids"]
        attention_mask[row] = encoded["attention_mask"]
        token_type_ids[row] = encoded["token_type_ids"]

    return [input_ids, attention_mask, token_type_ids]

def build_model():
    """Build and compile the BERT-based discourse classifier.

    Three int32 inputs of width ``max_length`` (module-level global) -
    input ids, attention mask and token type ids - are fed to a TFBertModel
    loaded from ``model_name``. The model's pooler_output goes through a
    softmax Dense layer with ``num_class_type`` units.

    Returns
    -------
    tf.keras.Model
        Compiled with Adam (learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0),
        "categorical_crossentropy" loss and the "acc" metric.
    """
    input_shape = (max_length,)

    input_ids = tf.keras.layers.Input(input_shape, dtype=tf.int32)
    attention_mask = tf.keras.layers.Input(input_shape, dtype=tf.int32)
    token_type_ids = tf.keras.layers.Input(input_shape, dtype=tf.int32)

    bert_model = transformers.TFBertModel.from_pretrained(model_name)

    bert_output = bert_model(
        input_ids,
        attention_mask=attention_mask,
        token_type_ids=token_type_ids
    )

    # Only the pooled output is used for classification; the unused
    # last_hidden_state local was removed.
    pooler_output = bert_output.pooler_output

    output = tf.keras.layers.Dense(num_class_type, activation="softmax")(pooler_output)
    model = tf.keras.Model(inputs=[input_ids, attention_mask, token_type_ids], outputs=[output])
    optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0)
    # NOTE(review): this loss presumably expects one-hot targets, which is
    # what model_train() builds with to_categorical - confirm.
    model.compile(optimizer=optimizer, loss="categorical_crossentropy", metrics=["acc"])
    return model

In [None]:
def model_train(train_lines, train_labels, batch_size=256, epochs=15, test_size=15):
    """Encode the training lines, fit a fresh model and save it.

    The first ``test_size`` lines are held out and the rest is used for
    fitting. The held-out part used to be encoded and then ignored; it is
    now passed to ``fit`` as ``validation_data``.

    Parameters
    ----------
    train_lines : sequence of str
        Texts to train on.
    train_labels : sequence of int
        Integer class id per text, one-hot encoded here with
        ``num_class_type`` classes.
    batch_size : int, default 256
        Batch size passed to ``model.fit``.
    epochs : int, default 15
        Number of epochs passed to ``model.fit``.
    test_size : int, default 15
        Number of leading lines held out for validation. With 0 no
        validation data is passed.

    Returns
    -------
    The fitted model returned by build_model(); it is also saved to
    "./bert_trained_model.h5".
    """
    X_train = make_train_data(train_lines[test_size:])
    Y_train = tf.keras.utils.to_categorical(train_labels[test_size:], num_classes=num_class_type)

    validation_data = None
    if test_size > 0:
        X_test = make_train_data(train_lines[:test_size])
        Y_test = tf.keras.utils.to_categorical(train_labels[:test_size], num_classes=num_class_type)
        validation_data = (X_test, Y_test)

    model = build_model()

    model.summary()

    model.fit(
        X_train,
        Y_train,
        batch_size=batch_size,
        epochs=epochs,
        validation_data=validation_data)
    model.save("./bert_trained_model.h5")
    return model

In [None]:
# Build the training set first: collect_train_sets() also returns max_length,
# which make_train_data() and build_model() read as a module-level global, so
# this assignment has to run before model_train() is called.
train_lines, train_labels, max_length = collect_train_sets()
# Build and fit the model inside the '/gpu:0' device scope; `model` is reused
# by the inference cell.
with tf.device('/gpu:0'):
    model = model_train(train_lines, train_labels)

In [None]:
# Predict a discourse class for every piece of every test essay and write the
# resulting (id, class, predictionstring) rows to submission.csv.
test_txt = glob.glob('../input/feedback-prize-2021/test/*.txt')
sub = []

for t in test_txt:
    myid = os.path.basename(t).replace(".txt", "")
    # Read the essay and close the file before running the model.
    with open(t, "r") as txt_file:
        datas = re.split("[.?\n]", txt_file.read())

    input_X = make_train_data(datas)
    with tf.device('/gpu:0'):
        X_predict = model.predict(input_X)
    X_predict_num = np.argmax(X_predict, axis=1)

    # p is the running position used to number the entries of predictionstring.
    # NOTE(review): p advances by the WordPiece token count of each piece (and
    # by 1 for every empty piece), not by whitespace-separated words - confirm
    # that this matches the index convention expected in predictionstring.
    p = 0
    last_found_dtype = -1
    last_word_list = []
    for i, data in enumerate(datas):
        if len(data) == 0:
            p += 1
            continue

        word_count = len(bert_tokenizer.tokenize(data))
        # Pieces classified as "Nothing" only advance the position.
        if X_predict_num[i] != dtype_to_num["Nothing"]:
            word_list = [str(x) for x in range(p, p + word_count)]
            # NOTE(review): a row is written only when the previous
            # non-"Nothing" piece had the same class. The first piece of a run
            # is held back and merged into that row; it is dropped if the
            # class changes or the essay ends first, and later pieces of the
            # same run are written on their own. Confirm this is intended.
            if last_found_dtype == X_predict_num[i]:
                last_word_list = last_word_list + word_list
                sub.append((myid, num_to_dtype[X_predict_num[i]], ' '.join(last_word_list)))
                last_word_list = []
            else:
                last_found_dtype = X_predict_num[i]
                last_word_list = word_list
        p += word_count

# Name the columns at construction time: assigning df.columns afterwards
# raised a length-mismatch ValueError whenever no prediction was collected.
df = pd.DataFrame(sub, columns=['id', 'class', 'predictionstring'])
df.to_csv('submission.csv', index=False)
df