#### What can we gain from resolving encoding errors?

Discussion here
> https://www.kaggle.com/code/brandonhu0215/feedback-deberta-large-lb0-619/comments

@vad13irt
> I checked it too, but on the validation performance, and results were worse. It is weird.

@ivanaerlic
> Yeah, I got worse results with resolve_encodings_and_normalize too.

# 1. Import & Def & Set & Load

In [None]:
import pandas as pd

import codecs
from text_unidecode import unidecode
from typing import Tuple

from transformers import AutoModel, AutoTokenizer

In [None]:
def replace_encoding_with_utf8(error: UnicodeError) -> Tuple[bytes, int]:
    """Codec error handler: emit the failing characters as UTF-8 bytes.

    Args:
        error: The encoding error raised by the codec. Its ``object``,
            ``start`` and ``end`` attributes locate the slice that could
            not be encoded.

    Returns:
        A ``(replacement, resume_position)`` pair: the UTF-8 encoding of the
        offending slice and the index at which encoding should continue.
    """
    offending = error.object[error.start : error.end]
    resume_at = error.end
    return offending.encode("utf-8"), resume_at


def replace_decoding_with_cp1252(error: UnicodeError) -> Tuple[str, int]:
    """Codec error handler: decode the failing bytes as cp1252 instead.

    Python's cp1252 codec leaves five byte values undefined (0x81, 0x8D,
    0x8F, 0x90, 0x9D). Decoding one of them raises ``UnicodeDecodeError``
    from inside the handler, which would abort the surrounding ``decode``
    call. Those bytes are therefore mapped to the code point with the same
    value (Latin-1), so the handler never raises on its own input.

    Args:
        error: The decoding error raised by the codec. Its ``object``,
            ``start`` and ``end`` attributes locate the undecodable bytes.

    Returns:
        A ``(replacement, resume_position)`` pair: the decoded text for the
        offending bytes and the index at which decoding should continue.
    """
    raw = error.object[error.start : error.end]
    try:
        return raw.decode("cp1252"), error.end
    except UnicodeDecodeError:
        # At least one byte is undefined in cp1252: decode byte by byte and
        # fall back to Latin-1 only for the undefined ones.
        chars = []
        for value in raw:
            single = bytes([value])
            try:
                chars.append(single.decode("cp1252"))
            except UnicodeDecodeError:
                chars.append(single.decode("latin-1"))
        return "".join(chars), error.end


# Make both handlers available by name to str.encode / bytes.decode via
# their ``errors=`` argument.
for _handler_name, _handler in (
    ("replace_encoding_with_utf8", replace_encoding_with_utf8),
    ("replace_decoding_with_cp1252", replace_decoding_with_cp1252),
):
    codecs.register_error(_handler_name, _handler)


def resolve_encodings_and_normalize(text: str) -> str:
    """Try to repair mixed UTF-8 / cp1252 text, then run it through unidecode.

    The text is round-tripped through bytes four times, using the error
    handlers registered under the names ``replace_decoding_with_cp1252`` and
    ``replace_encoding_with_utf8`` whenever a step hits data it cannot
    convert. The result is finally passed to ``unidecode``.

    NOTE(review): per the Python codecs documentation, ``raw_unicode_escape``
    writes code points above U+00FF as literal ``\\uXXXX`` escape text, and
    nothing later in this chain decodes those escapes again - confirm that
    this is the intended treatment for such characters.

    Args:
        text: The string to repair.

    Returns:
        The repaired string after ``unidecode`` has been applied.
    """
    # Step 1: str -> bytes.
    raw = text.encode("raw_unicode_escape")
    # Step 2: read the bytes as UTF-8, falling back to cp1252 where invalid.
    decoded = raw.decode("utf-8", errors="replace_decoding_with_cp1252")
    # Step 3: back to bytes as cp1252, falling back to UTF-8 where impossible.
    reencoded = decoded.encode("cp1252", errors="replace_encoding_with_utf8")
    # Step 4: final UTF-8 read with the same cp1252 fallback as step 2.
    repaired = reencoded.decode("utf-8", errors="replace_decoding_with_cp1252")

    return unidecode(repaired)

In [None]:
# Length threshold (in characters) for the discourse texts that get
# inspected; also used as pandas' maximum displayed column width.
MAX_LEN = 150
# Number of rows sampled for the tokenizer comparison.
CHECK_ROW = 15
# Seed passed to the row sampling so the selection is reproducible.
RANDOM_STATE = 42

pd.set_option("display.max_colwidth", MAX_LEN)

In [None]:
# Only the essay id and the discourse text are read from the training CSV.
cols_list = ["essay_id", "discourse_text"]
data_path = "../input/feedback-prize-effectiveness/train.csv"

df_origin = pd.read_csv(data_path, usecols=cols_list)

# Notebook preview of the first rows.
df_origin.head()

# 2. Update text & Select rows

In [None]:
# Work on a copy so the loaded frame stays untouched.
df = df_origin.copy()

# Strip surrounding whitespace, then store the repaired text alongside it.
df['discourse_text'] = df['discourse_text'].str.strip()
df['discourse_text_UPD'] = df['discourse_text'].apply(resolve_encodings_and_normalize)

# Rows whose text was changed by the repair step.
diff_mask = df['discourse_text'].ne(df['discourse_text_UPD'])
# Rows whose (stripped) original text is shorter than MAX_LEN characters.
len_mask = df['discourse_text'].str.len().lt(MAX_LEN)

# Keep only rows that are both changed and short.
df = df[diff_mask & len_mask]
df.head()

In [None]:
# Rows where df['discourse_text'] != df['discourse_text_UPD'].
n_changed = diff_mask.sum()
# Rows remaining after applying diff_mask & len_mask.
n_selected = df.shape[0]

print(n_changed)
print(n_selected)

# 3. Load tokenizers

In [None]:
# (short name, path handed to AutoTokenizer.from_pretrained) for every
# tokenizer that is compared.
tokenizers_info = [
    ("deberta", "../input/feedback-deberta-large-051/tokenizer"),
    ("roberta", "../input/roberta-base"),
]

In [None]:
# Map each short name to its loaded tokenizer.
tokenizers_dict = {}

for name, path in tokenizers_info:
    tokenizers_dict[name] = AutoTokenizer.from_pretrained(path)

print(tokenizers_dict.keys())

# 4. Check tokenizers

In [None]:
# Reproducible sample of the selected rows, shown in original index order.
samples = df.sample(n=CHECK_ROW, random_state=RANDOM_STATE).sort_index()

for indx, data in samples.iterrows():
    # The original and the repaired text are printed the same way, so they
    # are handled as (label, text) pairs.
    variants = (
        ('Origin text: ', data.discourse_text),
        ('Updated text:', data.discourse_text_UPD),
    )

    print(f'\n\tindex: {indx}')

    for x, tokenizer in tokenizers_dict.items():
        print(f'\n\t=== === tokenizer: {x} === ===')
        for label, text in variants:
            print()
            print(label, repr(text))
            print('tokens:      ', tokenizer.tokenize(text))
            print('input_ids:   ', tokenizer(text)['input_ids'])
