# Because there are many different types of tokenizers, this is a simple EDA looking at:
### 1. tokenized text length
### 2. UNK tokens 

This can help guide how long to make your sequence length and show which tokens your tokenizer cannot handle. At the end there is also a brief look at the context around some of these UNK tokens.

In [None]:
from transformers import AutoTokenizer

from datasets import load_dataset

# Short model name -> Hugging Face checkpoint to load the tokenizer from.
# longformer and roberta share one tokenizer; bert and electra share another,
# so only one representative of each family is listed.
checkpoints = {
    "longformer": "allenai/longformer-base-4096",
    "bigbird": "google/bigbird-roberta-base",
    "albert": "albert-base-v2",
    "xlnet": "xlnet-base-cased",
    "electra": "google/electra-small-discriminator",
    "deberta": "microsoft/deberta-base",
}

# Load every tokenizer once, keyed by its short model name.
tokenizers = {
    model_name: AutoTokenizer.from_pretrained(checkpoint)
    for model_name, checkpoint in checkpoints.items()
}

In [None]:
from pathlib import Path
from tqdm.notebook import tqdm

# Raw essay texts and the matching file stems (used as ids), in the same order.
texts, ids = [], []

# (sub-folder, expected number of files); the count only sizes the progress bar.
for split, expected_files in (("train", 15594), ("test", 5)):
    folder = Path("../input/feedback-prize-2021") / split
    for file in tqdm(folder.glob("*.txt"), total=expected_files, desc=f"Loading text files from {split} folder"):
        # Decode explicitly as UTF-8 so the non-ASCII characters inspected
        # later do not depend on the platform's default encoding.
        texts.append(file.read_text(encoding="utf-8"))
        ids.append(file.stem)

# Tokenized lengths

In [None]:
from functools import partial
from datasets import Dataset

def tokenize(examples, tokenizer, name):
    """Tokenize a batch of texts without truncation and record token counts.

    Args:
        examples: Batch mapping with a "text" entry holding the texts.
        tokenizer: Callable invoked as `tokenizer(text, truncation=False)`
            whose result exposes an "input_ids" entry.
        name: Short tokenizer name used as suffix for the output columns.

    Returns:
        A dict with f"input_ids_{name}" (one id sequence per text),
        f"lengths_{name}" (the length of each sequence) and the untouched
        "text" column.
    """
    encoded = []
    for text in examples["text"]:
        # No truncation: the full tokenized length is what is being measured.
        encoded.append(tokenizer(text, truncation=False)["input_ids"])

    token_counts = [len(sequence) for sequence in encoded]

    return {
        f"input_ids_{name}": encoded,
        f"lengths_{name}": token_counts,
        "text": examples["text"],
    }

# One row per essay: the raw text and the id taken from its file name.
base_dataset = Dataset.from_dict({"text": texts, "ids": ids})

datasets = {}

# Add input_ids_<name> / lengths_<name> columns for every tokenizer in turn.
for name, tokenizer in tokenizers.items():
    tokenize_with = partial(tokenize, tokenizer=tokenizer, name=name)
    base_dataset = base_dataset.map(tokenize_with, batched=True, num_proc=4)

In [None]:
# Convert the tokenized dataset to a pandas DataFrame for the analysis below.
length_df = base_dataset.to_pandas()
# Preview the first rows.
length_df.head()

In [None]:
import pandas as pd
import plotly.express as px
import plotly.offline as pyo
pyo.init_notebook_mode()

# Reshape the per-tokenizer "lengths_<name>" columns into long format:
# one row per (ids, name) pair with a single "lengths" column.
long_df = pd.wide_to_long(
    length_df,
    stubnames="lengths",
    i="ids",
    j="name",
    sep="_",
    suffix=".*",
)
long_df = long_df[["lengths"]].reset_index()

# Overlaid histograms of tokenized length, one colour per tokenizer.
px.histogram(long_df, x="lengths", color="name")

## Truncate to ignore long tail

In [None]:
# Keep only rows with fewer than 2000 tokens so the long tail does not
# dominate the plot.
is_below_cutoff = long_df["lengths"] < 2000
truncated = long_df[is_below_cutoff]

# One histogram row per tokenizer.
px.histogram(truncated, x="lengths", facet_row="name", height=1000)

In [None]:
import numpy as np
import pandas as pd

# Quantile levels 0.00, 0.01, ..., 1.00; identical for every tokenizer, so
# they are computed once outside the loop.
percs = np.linspace(0, 1, 101)

# One frame per tokenizer, combined at the end. DataFrame.append was removed
# in pandas 2.0, and growing a frame inside a loop copies it on every step.
percentile_frames = []

for name in tokenizers.keys():
    column = f"lengths_{name}"
    lengths = length_df[column].values
    quantile_lengths = np.quantile(lengths, percs)
    percentile_frames.append(
        pd.DataFrame({"length": quantile_lengths, "percentile": percs, "model": [name] * len(percs)})
    )

# pd.concat raises on an empty list, so fall back to an empty frame with the
# expected columns when there are no tokenizers.
if percentile_frames:
    length_percentiles = pd.concat(percentile_frames)
else:
    length_percentiles = pd.DataFrame(columns=["length", "percentile", "model"])

In [None]:
# Cumulative view: for each tokenizer, the fraction of texts whose tokenized
# length is at or below a given length.
axis_labels = {"percentile": "percent texts with tokenized length below length"}

fig = px.line(
    length_percentiles,
    x="length",
    y="percentile",
    color="model",
    title="raw texts",
    labels=axis_labels,
    height=600,
)

# Restrict the x axis to 0-2000 tokens to leave out the long tail.
fig.update_xaxes(range=[0, 2000])

# UNK tokens

In [None]:
def add_unk_tokens(example, tokenizer, name):
    """Record which tokens of one example were mapped to the tokenizer's UNK id.

    Args:
        example: A single row holding "text" and the f"input_ids_{name}"
            sequence previously computed for this tokenizer.
        tokenizer: Object exposing `unk_token_id` and
            `tokenize(text, add_special_tokens=True)`.
        name: Short tokenizer name used to build the column names.

    Returns:
        The same `example` mapping, with f"{name}_unk_tokens" (token strings
        at the positions whose id equals the UNK id) and
        f"{name}_num_unk_toks" (how many there are) added.
    """
    unk_id = tokenizer.unk_token_id
    input_ids = example[f"input_ids_{name}"]

    # Only re-tokenize the text when at least one UNK id is present.
    if any(id_ == unk_id for id_ in input_ids):
        tokens = tokenizer.tokenize(example["text"], add_special_tokens=True)
        # Walk tokens and ids in lockstep instead of testing each index
        # against a list of UNK positions (which was quadratic).
        unk_tokens = [token for token, id_ in zip(tokens, input_ids) if id_ == unk_id]
    else:
        unk_tokens = []

    example[f"{name}_unk_tokens"] = unk_tokens
    example[f"{name}_num_unk_toks"] = len(unk_tokens)
    return example

# Add <name>_unk_tokens / <name>_num_unk_toks columns for every tokenizer.
for name in tokenizers:
    find_unk_tokens = partial(add_unk_tokens, tokenizer=tokenizers[name], name=name)
    base_dataset = base_dataset.map(find_unk_tokens, num_proc=4)

In [None]:
# Summary statistics of the per-text UNK counts for each tokenizer.
for name in tokenizers:
    unk_counts = base_dataset[f"{name}_num_unk_toks"]
    print(f'Total number of unk tokens ({name}): {sum(unk_counts)}')
    print(f'Average number of unk tokens per text ({name}): {np.mean(unk_counts)}')
    print(f'Median number of unk tokens per text ({name}): {np.median(unk_counts)}', "\n")

In [None]:
from collections import Counter

# Per tokenizer: the distinct characters found in UNK tokens, and how often
# each of those characters occurs.
unk_tokens = {}
unk_counters = {}

for name in tokenizers:
    # Flatten the per-text token lists into one list of UNK token strings.
    tkns = [token for tokens in base_dataset[f"{name}_unk_tokens"] for token in tokens]
    # Joining the tokens means the set/Counter below work on single
    # characters, not on whole tokens.
    token_string = "".join(tkns)
    unk_tokens[name] = set(token_string)
    unk_counters[name] = Counter(token_string)

In [None]:
# Show, per tokenizer, the distinct UNK characters and their counts.
for name, counter in unk_counters.items():
    print(f"All unique unk tokens for {name}", unk_tokens[name])
    print(f"Unk token counts for {name}", counter, "\n")

# Looking at weird characters

In [None]:
# Every distinct character that appears anywhere in the texts.
all_chars = list(set("".join(texts)))

# Characters that are neither letters nor digits.
non_alpha = [ch for ch in all_chars if not ch.isalnum()]
# Of those, the ones that are also not printable.
unprintable = [ch for ch in non_alpha if not ch.isprintable()]
# Whitespace characters of any kind.
whitespace = [ch for ch in all_chars if ch.isspace()]

In [None]:
# Print each group of distinct characters, separated by blank lines.
for char_group in (non_alpha, unprintable, whitespace):
    print(char_group, "\n\n")

# Counting weird characters

In [None]:
# All texts concatenated, so every occurrence of a character is kept.
all_chars = "".join(texts)

# Every occurrence of a character that is neither a letter nor a digit.
non_alpha = [ch for ch in all_chars if not ch.isalnum()]
# Of those occurrences, the ones that are also not printable.
unprintable = [ch for ch in non_alpha if not ch.isprintable()]
# Every occurrence of a whitespace character.
whitespace = [ch for ch in all_chars if ch.isspace()]

In [None]:
# Show the 20 most frequent characters of each group.
for char_group in (non_alpha, unprintable, whitespace):
    print(Counter(char_group).most_common(20), "\n\n")

# Looking at a weird character in context

In [None]:
# Texts that contain the "\x82" control character.
x82 = [text for text in texts if "\x82" in text]

In [None]:
import random

# Pick one of the texts containing "\x82" at random to read it in context.
random.sample(x82, 1)[0]

### It looks like `Ã\x82Â´` should be cleaned to an apostrophe (`'`)

Making this change will likely help the model slightly.