In [1]:
import json
from itertools import chain

from transformers import AutoTokenizer
import numpy as np
from pathlib import Path
from torch.utils.data import Dataset

# Seed NumPy's global RNG so code that draws from `np.random` gives the same
# results on every Restart & Run All.
np.random.seed(999)

# NOTE(review): these three constants are not used in the visible cells;
# presumably they configure tokenization/training further down — confirm.
TRAINING_MODEL_PATH = "microsoft/deberta-v3-base"
TRAINING_MAX_LENGTH = 1024
STRIDE = 384


# Read the training essays. The context manager closes the file handle as soon
# as parsing finishes, and the explicit encoding keeps non-ASCII characters in
# the essays intact regardless of the platform's default encoding.
with Path('/kaggle/input/pii-detection-removal-from-educational-data/train.json').open("r", encoding="utf-8") as train_file:
    data = json.load(train_file)

## Paragraph Augmentation

We developed a straightforward method called "paragraph augmentation" to improve our final score. The method splits each essay into paragraphs, where a paragraph boundary is a `"\n\n"` token that directly follows a `"."` token. If an essay contains fewer than three paragraphs, it remains unchanged. If it has three or more paragraphs, we always keep the first and last paragraphs and drop each middle paragraph independently with a probability of 0.5. Because every middle paragraph can be either kept or dropped, an essay with $k$ paragraphs can yield up to $2^{k-2}$ distinct variants, so the number of augmented essays we can generate grows exponentially with the number of paragraphs in the original data.

In [2]:
# Build the label vocabulary: every distinct tag that occurs in any essay,
# in sorted order so the id assignment is stable between runs.
all_labels = sorted({label for doc in data for label in doc["labels"]})

# Two-way lookup between a label string and its integer id.
label2id = {label: idx for idx, label in enumerate(all_labels)}
id2label = dict(enumerate(all_labels))


def convert_tokens_to_text(tokens, whitespaces):
    """Rebuild a text string from tokens and their trailing-whitespace flags.

    Args:
        tokens: Sequence of token strings.
        whitespaces: Sequence of truthy/falsy flags, one per token; a truthy
            flag means a single space follows that token.

    Returns:
        The tokens concatenated in order, with one space after each token
        whose flag is truthy. Pairs are formed with ``zip``, so extra items in
        the longer sequence are ignored.
    """
    return "".join(
        piece + (" " if has_space else "")
        for piece, has_space in zip(tokens, whitespaces)
    )

def partial_essay(essay, keep_prob=0.5, rng=None):
    """Return a copy of ``essay`` with some of its middle paragraphs removed.

    The essay is split into paragraphs at every ``"\\n\\n"`` token that directly
    follows a ``"."`` token (the ``"\\n\\n"`` token opens the next paragraph).
    The first and last paragraphs are always kept; each middle paragraph is
    kept independently with probability ``keep_prob``.

    Args:
        essay: Dict with the keys ``"tokens"``, ``"trailing_whitespace"``,
            ``"labels"`` and ``"document"``. The three sequences are sliced
            with the same indices.
        keep_prob: Probability of keeping each middle paragraph. The default
            of 0.5 matches the original hard-coded behavior.
        rng: Object exposing a ``random()`` method that returns a float in
            ``[0, 1)``, e.g. ``np.random.default_rng(seed)``. When ``None``,
            NumPy's global RNG (``np.random``) is used, so ``np.random.seed``
            controls the result.

    Returns:
        ``essay`` itself (same object) if it contains the key ``'augmented'``
        or has fewer than three paragraphs. Otherwise a new dict with the keys
        ``"full_text"``, ``"document"``, ``"tokens"``, ``"trailing_whitespace"``
        and ``"labels"`` built from the kept paragraphs; any other keys of
        ``essay`` are not carried over.

    Raises:
        ValueError: If ``keep_prob`` is outside ``[0, 1]``.
    """
    if not 0.0 <= keep_prob <= 1.0:
        raise ValueError(f"keep_prob must be in [0, 1], got {keep_prob!r}")

    # Essays carrying an 'augmented' key are passed through untouched.
    if 'augmented' in essay:
        return essay

    tokens = essay["tokens"]
    whitespaces = essay["trailing_whitespace"]
    labels = essay["labels"]

    # Index of every "\n\n" token that immediately follows a "." token.
    boundaries = [
        idx + 1
        for idx, pair in enumerate(zip(tokens, tokens[1:]))
        if pair == (".", "\n\n")
    ]
    # (start, end) token span of each paragraph.
    spans = list(zip([0] + boundaries, boundaries + [len(tokens)]))

    if len(spans) < 3:
        return essay

    if rng is None:
        rng = np.random

    # Exactly one random draw per middle paragraph, in document order, so the
    # default call consumes the global RNG the same way on every run.
    kept_middle = [span for span in spans[1:-1] if rng.random() < keep_prob]
    kept_spans = [spans[0]] + kept_middle + [spans[-1]]

    # Text is rebuilt per paragraph, and only for the paragraphs that survive.
    return {
        "full_text": "".join(
            convert_tokens_to_text(tokens[start:end], whitespaces[start:end])
            for start, end in kept_spans
        ),
        "document": essay["document"],
        "tokens": list(chain.from_iterable(tokens[start:end] for start, end in kept_spans)),
        "trailing_whitespace": list(
            chain.from_iterable(whitespaces[start:end] for start, end in kept_spans)
        ),
        "labels": list(chain.from_iterable(labels[start:end] for start, end in kept_spans)),
    }

## Example

In [3]:
# Pick a single essay (index 5761 of the loaded training data) and print its
# original text, to compare against the augmented version printed afterwards.
essay = data[5761]
print(essay["full_text"])

Storytelling is best tool from above

Challenge – I am asked to tell a story in a competition.

Selection - I am so nervous but to create a new story I came forward with my own story. I created  my own story as cartoon story and said in an innovative way.

Application – storytelling is an art to recreate a beautiful story with our imagination. It helps to think  us in different ways and make us to create new ideas from the imagination.

Insight – I gained confidence and became a fearless person that made me feel so confident that I   can face anything in my life

Describe – I feel our life is best story than that of anyothers else.




In [4]:
# Print one augmented variant of the same essay. The paragraphs that get
# dropped depend on the state of NumPy's global RNG, so re-running this cell
# alone can print a different variant.
print(partial_essay(essay)["full_text"])

Storytelling is best tool from above

Challenge – I am asked to tell a story in a competition.

Insight – I gained confidence and became a fearless person that made me feel so confident that I   can face anything in my life

Describe – I feel our life is best story than that of anyothers else.


