# This creates data in the BIO format for training an NER model

B = beginning  
I = inside  
O = outside   

Some call it IOB - same thing. https://en.wikipedia.org/wiki/Inside%E2%80%93outside%E2%80%93beginning_(tagging)

This format can be used with the Hugging Face [example scripts for token classification](https://github.com/huggingface/transformers/tree/master/examples/pytorch/token-classification)

NER has been shown to get around ~~0.5~~ 0.6 on public LB already :)  https://www.kaggle.com/zzy990106/pytorch-ner-infer

If you want just the dataset, use this: https://www.kaggle.com/nbroad/feedbackprize-bio-ner-train-data

### I include 2 different ways of tokenizing. The first tokenizes after breaking at whitespace, the second includes whitespace. 

I'm running some experiments to see which method is better. I have a feeling that the whitespace is valuable information that will get lost when using the first method. 

`split_at_whitespace.json` first splits the text at whitespace and then assigns a label to each token. This does not pass through a Tokenizer object.  
`include_whitespace.json` does not split at whitespace. The labels are assigned to the tokens produced by a Tokenizer (here `google/bigbird-roberta-base`), so this output is tokenizer-specific: make sure you train with the same tokenizer :)


### Update: Dec 23 - I came up with a way of correcting the misaligned labels, so I will run my code twice: once on the original file and once on the corrected file

Here is my notebook that corrects the data: https://www.kaggle.com/nbroad/corrected-train-csv-feedback-prize

Folder `original` uses the data given by the hosts.  
Folder `corrected` uses the data from my corrected notebook.

In [None]:
import pandas as pd

# Load the competition's training annotations. Later cells read the "id",
# "discourse_type", "predictionstring", "discourse_start" and "discourse_end"
# columns of this frame (it is used as a notebook-level global).
train_df = pd.read_csv("../input/feedback-prize-2021/train.csv")

# Option 1: Splitting at whitespace

In [None]:
# First step assigns a discourse type to each word
def add_discourse_type(example):
    """Label every whitespace-separated word of one essay with its discourse type.

    Args:
        example: Mapping holding the essay identifier under ``"id"``.

    Returns:
        A dict with the essay ``"id"``, its ``"words"`` (the essay text split
        at whitespace) and ``"labels"``, a parallel list holding the discourse
        type of each word, or ``"O"`` for words no annotation row covers.
    """
    essay_id = example["id"]

    # All annotation rows for this essay (train_df is a notebook-level global).
    essay_rows = train_df[train_df["id"] == essay_id]

    with open(f"../input/feedback-prize-2021/train/{essay_id}.txt") as essay_file:
        words = essay_file.read().split()

    labels = ["O" for _ in words]

    # predictionstring is a space-separated list of indices into `words`.
    for discourse, predictions in essay_rows[["discourse_type", "predictionstring"]].values:
        for position in predictions.split():
            labels[int(position)] = discourse

    return {"id": essay_id, "labels": labels, "words": words}

In [None]:
%%time

from datasets import Dataset

# One row per unique essay id; add_discourse_type then loads each essay and
# labels its words. num_proc=4 asks `map` to use 4 worker processes.
ds = Dataset.from_dict({"id": train_df["id"].unique()}) 

tagged_ds = ds.map(add_discourse_type, num_proc=4)

In [None]:
# Second step adds B or I as prefix to the label
def add_bi(example):
    """Convert per-word discourse types into BIO tags.

    A word gets ``"B-<type>"`` when its type differs from the previous word's
    type (or it is the first word), ``"I-<type>"`` when it continues the same
    type, and ``"O"`` when it is unlabelled.

    Note: the input carries only a type per word, so two directly adjacent
    segments of the same type cannot be told apart here; they come out as a
    single B-/I- run.

    Args:
        example: Mapping with a ``"labels"`` list of discourse types / ``"O"``.
            An empty list is accepted and yields an empty ``"bio"`` list.

    Returns:
        The same ``example`` object with a new ``"bio"`` key added.
    """
    bio_tags = []

    # Treat the position before the first word as "O" so the first labelled
    # word always opens a segment; this also makes empty input a no-op.
    previous = "O"
    for label in example["labels"]:
        if label == "O":
            bio_tags.append("O")
        elif label != previous:
            bio_tags.append(f"B-{label}")
        else:
            bio_tags.append(f"I-{label}")
        previous = label

    example["bio"] = bio_tags
    return example

In [None]:
%%time

# Add the "bio" column and drop the intermediate per-word "labels" column.
bio_dataset = tagged_ds.map(add_bi, remove_columns=["labels"], num_proc=4)
# Last expression of the cell: shows the dataset summary in the notebook.
bio_dataset

In [None]:
# Let's look at some: the first 10 items of every column for the essay at
# index 11 (for a string column such as "id" this slices the first 10 characters).
{key: vals[:10] for key, vals in bio_dataset[11].items()}

In [None]:
# Let's check what tags were added
from itertools import chain
from collections import Counter

# Flatten the per-essay tag lists into one list, then count each tag and keep
# the 20 most frequent (tag, count) pairs for the comparison further down.
all_tags_no_whitespace = list(chain(*bio_dataset["bio"]))
most_common_no_whitespace = Counter(all_tags_no_whitespace).most_common(20)
most_common_no_whitespace

In [None]:
# Save the Option 1 result (id, words, bio) in the working directory.
bio_dataset.to_json("split_at_whitespace.json")

# Option 2: Keeping whitespace

In [None]:
# Build the full tag set: a B- and an I- tag for every discourse type in the
# training data, plus the single "O" tag.
# NOTE(review): this list is not referenced again in the visible cells —
# presumably kept for reference / downstream training code.
labels = train_df["discourse_type"].unique()
labels = [f"B-{label}" for label in labels] + [f"I-{label}" for label in labels]
labels.append("O")

# Displayed as the cell output: number of tags and the tags themselves.
len(labels), labels

In [None]:
from transformers import AutoTokenizer

# This uses a BigBird tokenizer, but the approach is not specific to it.
# NOTE(review): add_labels requests return_offsets_mapping=True, which the
# transformers docs describe as available only for "fast" tokenizers — confirm
# before swapping in a different checkpoint.
tokenizer = AutoTokenizer.from_pretrained("google/bigbird-roberta-base")

In [None]:
import string

# ASCII whitespace characters plus the non-breaking space.
WHITESPACE = set(string.whitespace + "\xa0")


def _majority_label(preds):
    """Return the most frequent item of ``preds``; ties go to the earliest one."""
    return max(preds, key=preds.count)


def chars_to_word_ids(text, char_label_preds):
    """Collapse per-character predictions into one prediction per word.

    A word is a maximal run of characters that are not in ``WHITESPACE``.
    Each word receives the prediction that occurs most often among its
    characters (the earliest such prediction on a tie).

    Args:
        text: The essay text.
        char_label_preds: One prediction per character of ``text``. If the two
            differ in length, the extra items of the longer one are ignored.

    Returns:
        A list with one prediction per word, in reading order. Empty input
        yields an empty list.
    """
    all_word_preds = []
    current_word_preds = []

    for char, pred in zip(text, char_label_preds):
        if char not in WHITESPACE:
            # Inside a word: collect its character-level predictions.
            current_word_preds.append(pred)
        elif current_word_preds:
            # First whitespace after a word: settle on that word's label.
            all_word_preds.append(_majority_label(current_word_preds))
            current_word_preds = []
        # Otherwise: whitespace following whitespace, nothing to do.

    # The text may end without trailing whitespace; flush the final word.
    if current_word_preds:
        all_word_preds.append(_majority_label(current_word_preds))

    return all_word_preds

def add_labels(example):
    """Tokenize one essay and give every token a BIO discourse label.

    The essay is labelled character by character from the annotation spans in
    the global ``train_df`` and those labels are then projected onto the
    tokens through the tokenizer's offset mapping. The essay is truncated or
    padded to 1024 tokens.

    Args:
        example: Mapping holding the essay identifier under ``"id"``.

    Returns:
        A dict with ``"input_ids"``, ``"attention_mask"``, ``"offset_mapping"``
        and ``"labels"`` (one BIO tag per token).
    """
    id_ = example["id"]

    # Use a context manager so the file handle is closed right away; this
    # function runs once per essay inside worker processes.
    with open(f"../input/feedback-prize-2021/train/{id_}.txt") as fp:
        text = fp.read()

    tokenized = tokenizer(text, truncation=True, padding="max_length", max_length=1024, return_offsets_mapping=True)

    discourse_data = train_df[train_df["id"] == id_]

    num_chars = len(text)
    char_labels = ["O"] * num_chars

    for start, end, label in discourse_data[["discourse_start", "discourse_end", "discourse_type"]].values:
        start = int(start)
        # Annotation spans can run past the end of the text; clamp them.
        for idx in range(start, min(int(end), num_chars)):
            char_labels[idx] = f"I-{label}"
        # The same bound applies to the segment start, otherwise a span that
        # begins past the end of the text raises IndexError.
        if start < num_chars:
            char_labels[start] = f"B-{label}"

    token_labels = ["O"] * len(tokenized["input_ids"])
    token_offsets = tokenized["offset_mapping"]
    for idx, (start_offset, end_offset) in enumerate(token_offsets):
        # Zero-width offsets at position 0 (presumably special and padding
        # tokens) keep the default "O".
        if start_offset == end_offset and start_offset == 0:
            continue
        # A token takes the label of the last character it covers, unless one
        # of its characters opens a segment, in which case the B- label wins.
        for char_label in char_labels[start_offset:end_offset]:
            token_labels[idx] = char_label
            if char_label.startswith("B-"):
                break

    return {
        "input_ids": tokenized["input_ids"],
        "attention_mask": tokenized["attention_mask"],
        "offset_mapping": token_offsets,
        "labels": token_labels,
    }

In [None]:
%%time

from datasets import Dataset

# One row per unique essay id; add_labels tokenizes each essay and attaches a
# BIO label to every token. num_proc=4 asks `map` to use 4 worker processes.
ds = Dataset.from_dict({"id": list(train_df["id"].unique())})

tokenized_ds = ds.map(add_labels, num_proc=4)

In [None]:
# Let's look at some: re-run add_labels on a single essay (index 101) and show
# the first 100 (token, character offsets, label) triples.

id_ = ds["id"][101]

sample_labels = add_labels({"id":id_})

list(zip(
    tokenizer.convert_ids_to_tokens(sample_labels["input_ids"]), 
    sample_labels["offset_mapping"], 
    sample_labels["labels"]
))[:100]

In [None]:
# Let's check what tags were added: flatten the per-essay token labels and
# count each tag. `chain` and `Counter` are imported in an earlier cell.

all_tags_with_whitespace = list(chain(*tokenized_ds["labels"]))
most_common_with_whitespace = Counter(all_tags_with_whitespace).most_common(20)
most_common_with_whitespace

In [None]:
# Save the Option 2 result (token ids, attention mask, offsets, labels and the
# essay id) in the working directory.
tokenized_ds.to_json("include_whitespace.json")

## Comparison of tags

There are huge differences in the "O" and "I-" counts: with Option 2, words get broken into multiple tokens, every essay is padded or truncated to 1024 tokens, and the padding tokens are labelled "O". There should still be roughly the same number of "B-" tags, although truncating long essays can drop some. The values are close enough for me, but if you can think of a better way of labeling it, let me know! While I was doing this, I did notice some issues, which I listed here: https://www.kaggle.com/c/feedback-prize-2021/discussion/296524

In [None]:
# Pair the two rankings position by position. NOTE: both lists are ordered by
# frequency, so the two entries of a pair are not necessarily the same tag.
list(zip(most_common_no_whitespace, most_common_with_whitespace))

In [None]:
# Move both JSON files produced so far into the `original` folder, so the
# corrected-data run below can write files with the same names.
%mkdir original
%mv *.json original/

# Now with the corrected data

In [None]:
# Load the corrected annotations produced by the separate correction notebook.
new_train_df = pd.read_csv("../input/feedback-prize-corrected-train-csv/corrected_train.csv")

# Compare the shapes of the original and corrected frames.
print(train_df.shape, new_train_df.shape)

# Rebind the global train_df: drop the original span columns and rename the
# corrected ones to the original names, so add_discourse_type and add_labels
# (which read the global train_df) work unchanged.
train_df = new_train_df.drop(columns=["discourse_start", "discourse_end", "predictionstring"])
del new_train_df

train_df = train_df.rename(columns={"new_start": "discourse_start", "new_end": "discourse_end", "new_predictionstring": "predictionstring"})

# --- Option 1 again: split at whitespace ---
ds = Dataset.from_dict({"id": train_df["id"].unique()}) 

tagged_ds = ds.map(add_discourse_type, num_proc=4)
bio_dataset = tagged_ds.map(add_bi, remove_columns=["labels"], num_proc=4)

# IPython expands $corrected_dir to the value of the Python variable.
corrected_dir = "corrected"
%mkdir $corrected_dir

bio_dataset.to_json(f"./{corrected_dir}/split_at_whitespace.json")

all_tags_no_whitespace = list(chain(*bio_dataset["bio"]))
most_common_no_whitespace = Counter(all_tags_no_whitespace).most_common(20)

# --- Option 2 again: keep whitespace (tokenizer output) ---
ds = Dataset.from_dict({"id": list(train_df["id"].unique())})
tokenized_ds = ds.map(add_labels, num_proc=4)

all_tags_with_whitespace = list(chain(*tokenized_ds["labels"]))
most_common_with_whitespace = Counter(all_tags_with_whitespace).most_common(20)

tokenized_ds.to_json(f"./{corrected_dir}/include_whitespace.json")

# Same position-by-position comparison as for the original data.
list(zip(most_common_no_whitespace, most_common_with_whitespace))

# Checking if there are any I- labels that don't have a B- before them

In [None]:
# Report token sequences in which an "I-<type>" label is not preceded by a
# "B-<type>" or "I-<type>" label of the same type. Stops after 11 reports.
counter = 0
for example in tokenized_ds:
    if counter > 10:
        break

    # Treat the position before the first token as "O", so an "I-" label at
    # the very start of a sequence is reported as well.
    prev = "O"
    for label in example["labels"]:
        # Strip the two-character "B-"/"I-" prefix to compare discourse types;
        # for "O" this leaves an empty string, which matches no type.
        if label.startswith("I-") and prev[2:] != label[2:]:
            print(prev, label, example)
            counter += 1
            # Enforce the cap inside an example too, not only between examples.
            if counter > 10:
                break
        prev = label