In [None]:
import numpy as np
import pandas as pd
import torch 
import transformers
import os
from tqdm import tqdm
import re

In [None]:
# Load the competition's training table from train.csv into `training`.
training = pd.read_csv("../input/feedback-prize-2021/train.csv")

In [None]:
# Display the loaded training table.
training

In [None]:
# Scratch check of str.replace: every "2" is replaced by ",", so `a` is "1,,3".
a = "1223".replace("2", ",")
a

In [None]:
def reindex(text:str, idx:str, beginner=0) -> tuple:
    """
      Func:
        to reindex the text from character index to word index

      Args:
        text: the raw text; it is passed through cleaner() before counting
        idx: a string of integer indices separated by whitespace; it is only
             inspected for the presence of index 0
        beginner: the word index to start numbering from; ignored (treated
                  as 0) when idx contains 0

      Returns:
        a tuple of
        1. the clean text
        2. the new index (consecutive integers joined by single spaces;
           an empty string when the clean text has no words)
        3. the next index's beginning index
    """
    text = cleaner(text)
    old_idx = {int(i) for i in idx.split()}
    # an index of 0 marks the first paragraph of an article, so numbering restarts
    start = 0 if 0 in old_idx else beginner
    n_words = count_words(text)
    new_idx = " ".join(str(i) for i in range(start, start + n_words))
    # computed arithmetically rather than from new_idx[-1], so a text
    # without any word does not raise IndexError
    return text, new_idx, start + n_words


def cleaner(text:str):
    """
      Func:
        Replace each listed punctuation mark, tab and newline with a space,
        then squeeze every run of spaces down to a single space.

      Returns:
        the cleaned text (a leading or trailing space is kept if one remains)
    """
    notation = """:!~.,?;'\t\n"""
    # one pass: map every listed character to a space
    to_space = str.maketrans(notation, " " * len(notation))
    spaced = text.translate(to_space)
    # collapse runs of two or more spaces
    return re.sub(" {2,}", " ", spaced)


def count_words(text):
    """
      Func:
        Count the whitespace-separated words in text.

      Args:
        text: the string to count words in

      Returns:
        return the word count (0 for an empty or all-whitespace string)
    """
    # split() without an argument ignores leading/trailing whitespace and
    # never yields empty strings, unlike split(" ") which counted them as words
    return len(text.split())




In [None]:
# Try reindex on the second row of `training`, starting the numbering at 45.
# NOTE(review): columns 4 and 7 are addressed by position - presumably the
# discourse text and its index string; confirm against the train.csv header.
reindex(training.iloc[1, 4], training.iloc[1, 7], 45)

In [None]:
# preprocessing test data
# Read every file of the test directory into a DataFrame with one row per
# file: "id" is the file name up to ".txt", "text" is the file content.
test_id, test_context = [], []
# os.listdir returns entries in arbitrary order; sort for a reproducible row order
for f in sorted(os.listdir("../input/feedback-prize-2021/test/")):
    test_id.append(f.split(".txt")[0])
    # explicit encoding: open() otherwise falls back to the platform locale
    # (assumes the essay files are UTF-8)
    with open("../input/feedback-prize-2021/test/"+f, encoding="utf-8") as file:
        test_context.append(file.read())
test = pd.DataFrame({"id":test_id,"text":test_context})
test

In [None]:
# preprocessing training dataset
# Read every file of the train directory into a DataFrame with one row per
# file: "id" is the file name up to ".txt", "text" is the file content.
train_id, train_context = [], []
# os.listdir returns entries in arbitrary order; sort for a reproducible row order
for f in tqdm(sorted(os.listdir("../input/feedback-prize-2021/train"))):
    train_id.append(f.split(".txt")[0])
    # explicit encoding: open() otherwise falls back to the platform locale
    # (assumes the essay files are UTF-8)
    with open("../input/feedback-prize-2021/train/" + f, encoding="utf-8") as file:
        train_context.append(file.read())
train_context_df = pd.DataFrame({"id":train_id,"text":train_context})
train_context_df.head()

In [None]:
# Collapse `training` (one row per discourse) into one row per essay id,
# gathering each per-discourse column into a list, then join the essay text.
_by_id = training.groupby("id")
# Pair every discourse's start and end as [start, end]. Each column is
# converted to a list once per group (the previous form re-converted it for
# every element), and only the two needed columns are handed to apply so the
# grouping column is not part of the applied frame.
start_end = (
    _by_id[["discourse_start", "discourse_end"]]
    .apply(lambda x: [[s, e] for s, e in zip(x["discourse_start"].to_list(), x["discourse_end"].to_list())])
    .reset_index(name = "start_end")
)
class_type = _by_id["discourse_type"].apply(list).reset_index(name = "class_type")
pred_string = _by_id["predictionstring"].apply(list).reset_index(name = "string")
disc_text = _by_id["discourse_text"].apply(list).reset_index(name = "discourse_text")
# Resulting columns: id, class_type, start_end, string, text, discourse_text
df = pd.merge(class_type, start_end, how = "inner", on = "id")
df = pd.merge(df, pred_string, how = "inner", on = "id")
df = pd.merge(df, train_context_df, how = "inner", on = "id")
df = pd.merge(df, disc_text, how = "inner", on = "id")

In [None]:
# The task is treated as multi-class classification: every distinct
# discourse_type is one class, numbered in order of first appearance.
labels = training['discourse_type'].unique().tolist()
# index -> label, and the inverse label -> index
idx2labs = dict(enumerate(labels))
labs2idx = {label: ids for ids, label in idx2labs.items()}

In [None]:
# Preview the first rows of `df`.
df.head()

### Preprocessing is done; check that the text sliced by start_end matches the given discourse_text

In [None]:
def get_start_end_text(input_):
    '''
    Slice every discourse span out of the full text, so the slices can be
    compared with the given discourse_text.

    Args:
        input_: a mapping (for example a DataFrame row) with the keys
            "start_end": list of [start, end] character offsets,
            "discourse_text": list of the given discourse strings
                              (only its length is used here),
            "text": the full text the offsets refer to.

    Returns:
        a list with text[start:end] for every span, in the same order.

    Raises:
        AssertionError: if "start_end" and "discourse_text" differ in length.
    '''
    spans = input_["start_end"]
    dis_text = input_["discourse_text"]
    text = input_["text"]
    assert len(spans) == len(dis_text)
    # int() lets offsets that are not of int type (e.g. floats) be used as slice bounds
    return [text[int(span[0]):int(span[1])] for span in spans]
# Slice the span texts for every row, store them as a new column and preview.
labels_text = df.apply(get_start_end_text, axis=1)
df["labels_text"] = labels_text
df.head()

### Check whether the strings are the same

In [None]:
def check_diff(input_):
    '''
    Count the positions at which the sliced labels text differs from the
    given ground-truth discourse text.

    Reads input_["discourse_text"] and input_["labels_text"], which must
    have the same length (AssertionError otherwise), and returns the number
    of unequal pairs.
    '''
    given = input_["discourse_text"]
    sliced = input_["labels_text"]
    assert len(given) == len(sliced)
    return sum(1 for expected, actual in zip(given, sliced) if expected != actual)
# Total number of mismatching pairs over all rows of df.
df.apply(check_diff, axis=1).sum()

### Now change each class type in the whole set to its corresponding index

In [None]:
# Translate every row's list of class names into the matching list of label indices.
df["class_ids"] = df["class_type"].map(lambda names: [labs2idx[name] for name in names])

### Check the text lengths: Longformer has max_length = 4096

In [None]:
# Measure every text as its number of alphanumeric runs, then list the row
# labels whose count reaches 4096 together with how many there are.
a = df["text"].map(lambda essay: len(re.findall('[a-zA-Z0-9]+', essay)))
a_gt = a.loc[a >= 4096].index.values
(len(a_gt), a_gt)

### Use `re` to split each text into tokens, and check that the number of prediction strings equals the number of start-end pairs

In [None]:
# Replace each text by the list of its alphanumeric runs. Entries that are
# not strings are left as they are, so running this cell a second time (when
# the column already holds lists) is harmless instead of raising TypeError.
df["text"] = df["text"].apply(lambda x: re.findall('[a-zA-Z0-9]+',x) if isinstance(x, str) else x)

In [None]:
# check_ is True for every row whose number of start/end pairs equals its
# number of prediction strings
check_ = df.apply(lambda x: len(x["start_end"]) == len(x["string"]),axis = 1)
# Count the rows that do NOT match (0 means they are all equal). Summing the
# selected False values themselves would always give 0, so count the negation.
(~check_.astype(bool)).sum()

### create torch dataset

### Use the Longformer tokenizer to convert each text into its vocabulary indices

In [None]:
#some basic parameter
