In [None]:
%%capture
!pip install contractions
!pip install textacy

In [None]:
import numpy as np
import pandas as pd

import re
import html
import unicodedata
import contractions
from textacy.similarity.edits import hamming
from tqdm import tqdm

import warnings
warnings.filterwarnings('ignore')

np.random.seed(0)

In [None]:
# Load the training set. Later cells read its `id`, `excerpt`, `target`
# and `standard_error` columns.
train_df = pd.read_csv('../input/commonlitreadabilityprize/train.csv')

# Searching for Similar Texts

---

For this task, we will compute the similarity between two strings using the Hamming distance, which counts the characters that differ at corresponding string indices, including characters in the longer string that have no counterpart in the shorter one. In my case, the choice is based mainly on execution speed: it calculates the pairwise similarity between all excerpts in the training dataset in 30 seconds, while the other [document similarity metrics (Levenshtein, Jaro, character ngrams)](https://textacy.readthedocs.io/en/0.11.0/api_reference/similarity.html) from the same library that I have tested take from 1.5 to 12 hours for the same task. If you have any other solutions that helped you find more duplicates, I would appreciate it if you shared them in the comments. Also, if you would like more information about the train dataset, visit [CommonLit: In-Depth EDA + Baseline](https://www.kaggle.com/oleksandrsirenko/commonlit-in-depth-eda-baseline/notebook).

In [None]:
def compute_similarity(df, metric):
    """Compute the pairwise similarity of every excerpt against every other.

    Args:
        df: DataFrame with an ``id`` column and an ``excerpt`` column.
        metric: callable ``metric(target_str, compare_str)`` returning the
            similarity score of two strings.

    Returns:
        A new DataFrame holding the columns of ``df`` followed by one column
        per row of ``df``. The column named after a row's ``id`` holds the
        score of that row's excerpt against the excerpt of each row.
        ``df`` itself is left unmodified.
    """
    # Read both columns once instead of re-materialising a row on every
    # iteration; the row would get wider with each column added.
    ids = df['id'].tolist()
    excerpts = df['excerpt'].tolist()

    columns = {}
    for doc_id, target_str in tqdm(zip(ids, excerpts), total=len(ids)):
        columns[doc_id] = [metric(target_str, compare_str) for compare_str in excerpts]

    # Attach all similarity columns in a single concat rather than inserting
    # them one by one into the caller's frame.
    similarities = pd.DataFrame(columns, index=df.index)
    return pd.concat([df, similarities], axis=1)

In [None]:
# Score every excerpt against every other one with the Hamming-based metric;
# the result carries one similarity column per excerpt id.
sim_df = compute_similarity(train_df, hamming)

In [None]:
def get_duplicates(df, similarity=0.3):
    """Collect, for each excerpt id, the other rows that look like duplicates.

    Args:
        df: DataFrame with an ``id`` column plus one score column named after
            each value found in ``id``.
        similarity: minimum score (inclusive) for a row to be reported.

    Returns:
        Dict mapping an id to a list of ``[other_id, score]`` pairs. Ids with
        no match at or above the threshold are left out.
    """
    matches = {}
    for doc_id in df['id'].values.tolist():
        close_enough = df[doc_id] >= similarity
        not_itself = df['id'] != doc_id  # skip the row's score against itself
        pairs = df.loc[close_enough & not_itself, ['id', doc_id]].values.tolist()
        if pairs:
            matches[doc_id] = pairs
    return matches

In [None]:
# For each excerpt id, list the other excerpts scoring at or above the
# default threshold (0.3).
get_duplicates(sim_df)

In [None]:
# Show id, target and full text of four specific excerpts so they can be
# compared by eye.
train_df.loc[train_df.id.isin(['da2dbbc70', 'dab96a9ab', 'd2556a097', '0684bb254'])][['id', 'target', 'excerpt']].values.tolist()

# Drop Duplicates

---

As we can see, the first pair of excerpts (`da2dbbc70`, `dab96a9ab`) is almost identical and has close target values (**-0.811519925, -0.855847764**); the two differ only in a short quoted monologue at the end of the second passage. I think we can drop either of these texts here. The other pair of excerpts (`d2556a097`, `0684bb254`) is exactly the same text but has significantly different targets (**0.810874254, 0.10280278**). Here I prefer to drop the second one (`0684bb254`), because the text is subjectively simple, which corresponds to a more positive target value.

In [None]:
# One excerpt from each of the two duplicate pairs inspected above.
drop_ids = ['da2dbbc70', '0684bb254']
# Remove those rows, keep only the four listed columns (any other column
# present on train_df is discarded) and renumber the index from 0.
train_df = train_df.loc[~train_df.id.isin(drop_ids)][['id', 'excerpt', 'target', 'standard_error']].reset_index(drop=True)
train_df.head()

# Text Cleaning

---

In this part, we will try to clean the text without further processing such as tokenization, lemmatization, stemming, etc. The goal is to purify the text by handling artifacts, normalizing it, expanding contractions (clitics), and returning it in the same format. The refined text can then be used for further processing according to the chosen model, tokenizer, or preparation techniques.

In [None]:
# Pre-compiled patterns shared by the cleaning step and the impurity score.

# One or more consecutive line breaks (\r\n, \n or vertical tab).
RE_LINEBREAK = re.compile(r"(\r\n|[\n\v])+")
RE_HYPHEN = re.compile(r'-(?!\w)|(?<!\w)-') # all hyphens except ones between words
# Typographic single-quote variants (plus the backtick).
RE_SINGLE_QUOTE = re.compile(r'[\u2039\u203a\u2018\u2019\u201a\u201b\u275b\u275c\u275f\u02bc\u0060]')
# Typographic double-quote variants. The last three code points are the
# consecutive U+301D, U+301E and U+301F; U+301D used to be listed twice,
# which left U+301E unmatched.
RE_DOUBLE_QUOTE = re.compile(r'[\u00ab\u00bb\u201c\u201d\u201e\u201f\u2e42\u301d\u301e\u301f]')
RE_CURRENCY_SYMBOL = re.compile(r"[$¢£¤¥ƒ֏؋৲৳૱௹฿៛ℳ元円圆圓﷼\u20A0-\u20C0]")
RE_NUMBER = re.compile(r'\d+')
RE_SPACES = re.compile(r'\s{2,}') # 2+ spaces
# RE_WORD and RE_SENTENCE are not referenced by the cells shown in this notebook.
RE_WORD = re.compile(r"([\w']+)") # words including contractions
RE_SENTENCE = re.compile(r'(?<=[.])\s')
# word + apostrophe + word, e.g. "don't"
RE_CONTRACTIONS = re.compile(r"([\w]+['][\w]+)")
RE_PUNCT = re.compile(r'[^\w\s.,!?\-]') # all punctuation except .,!?-

In [None]:
# {'NAME': (PATTERN, REPL)}
# clean() walks this dict in insertion order, so the order of the entries
# below is the order in which the substitutions are applied.
PATTERNS = {
    'RE_LINEBREAK': (RE_LINEBREAK, ' '),
    # Typographic quotes are mapped to their ASCII forms (' and ").
    'RE_SINGLE_QUOTE': (RE_SINGLE_QUOTE, '\u0027'),
    'RE_DOUBLE_QUOTE': (RE_DOUBLE_QUOTE, '\u0022'),
    # Currency symbols and digit runs are replaced by placeholder tokens.
    'RE_CURRENCY_SYMBOL': (RE_CURRENCY_SYMBOL,'_CUR_ '),
    'RE_NUMBER': (RE_NUMBER, ' _NUM_'),
    'RE_HYPHEN': (RE_HYPHEN, ' '),
    # clean() special-cases this key: it expands contractions at this step
    # before removing punctuation.
    'RE_PUNCT': (RE_PUNCT, ''),
    # Last, so that whitespace runs left by the earlier steps are collapsed.
    'RE_SPACES': (RE_SPACES, ' ')
}

In [None]:
def impurity(text, patterns):
    """Score how much of ``text`` the cleaning patterns would touch.

    The score is the number of matches of every pattern in ``patterns`` plus
    the total character length of all contractions found by
    ``RE_CONTRACTIONS``, divided by ``len(text)``.

    Args:
        text: string to score.
        patterns: mapping of name -> tuple whose first item is a compiled
            pattern; only that first item is used here.

    Returns:
        The score as a float. An empty string scores 0.0, since there is
        nothing to clean and the ratio would otherwise divide by zero.
    """
    if len(text) == 0:
        return 0.0
    n_matches = sum(len(entry[0].findall(text)) for entry in patterns.values())
    contraction_chars = sum(len(found) for found in RE_CONTRACTIONS.findall(text))
    return (n_matches + contraction_chars) / len(text)

In [None]:
# Check impurity before the cleaning
# Scores the raw `excerpt` text, stores the result in the `impurity` column,
# then shows the ten highest-scoring excerpts.
train_df['impurity'] = train_df['excerpt'].apply(lambda x: impurity(x, PATTERNS))
train_df[['excerpt', 'impurity']].sort_values(by='impurity', ascending=False).head(10)

In [None]:
def impurity_summary(df):
    """Print how many rows have a non-zero impurity and the mean impurity.

    Args:
        df: DataFrame with a numeric ``impurity`` column.

    Prints two lines (count with percentage of impure rows, then the mean
    impurity as a percentage rounded to two decimals) and returns None.
    """
    total = df.shape[0]
    n_impure = df.loc[df['impurity'] > 0].shape[0]
    impure_percent = np.round(n_impure * 100 / total, 2)
    mean_impurity = np.round(df['impurity'].mean() * 100, 2)
    report = [
        f'Total number of impure excerpts in train dataset is: {n_impure} or {impure_percent}%',
        f'Mean impurity: {mean_impurity}%',
    ]
    for line in report:
        print(line)
    
# Summarise the `impurity` column currently stored on train_df.
impurity_summary(train_df)

In [None]:
def remove_accents(text):
    """Return ``text`` with combining marks (accents) stripped.

    The text is first decomposed with Unicode NFKD normalisation, then every
    character that ``unicodedata.combining`` reports as a combining character
    is dropped.
    """
    decomposed = unicodedata.normalize("NFKD", text)
    kept = [ch for ch in decomposed if unicodedata.combining(ch) == 0]
    return "".join(kept)

In [None]:
def clean(text, patterns):
    """Clean ``text`` and return it as a plain string.

    Steps: unescape HTML entities, strip accents with ``remove_accents``, then
    apply every ``(pattern, repl)`` substitution from ``patterns`` in the
    dict's iteration order.

    Args:
        text: string to clean.
        patterns: mapping of name -> (compiled pattern, replacement). The
            entry named ``'RE_PUNCT'`` is special: contractions are expanded
            with ``contractions.fix`` right before its substitution runs.

    Returns:
        The cleaned string.
    """
    text = html.unescape(text)
    text = remove_accents(text)
    for name, (pattern, repl) in patterns.items():
        if name == 'RE_PUNCT':
            # Expand contractions before punctuation is removed: the default
            # RE_PUNCT pattern also strips apostrophes.
            text = contractions.fix(text)
        # Always use the pattern and replacement supplied by the caller,
        # including for the RE_PUNCT entry.
        text = pattern.sub(repl, text)
    return text

In [None]:
# Clean every excerpt with the PATTERNS pipeline; the raw text stays in
# `excerpt` and the result goes to a new `clean_text` column.
train_df['clean_text'] = train_df['excerpt'].apply(lambda x: clean(x, PATTERNS))

In [None]:
# Check impurity after the cleaning
# Re-scores `clean_text` and overwrites the `impurity` column that was
# computed on the raw excerpts, then shows the highest-scoring rows.
train_df['impurity'] = train_df['clean_text'].apply(lambda x: impurity(x, PATTERNS))
train_df[['clean_text', 'impurity']].sort_values(by='impurity', ascending=False).head()

In [None]:
# Same summary as before; `impurity` now holds the scores of the cleaned text.
impurity_summary(train_df)

# Summary

---

In general, we can say that the text is initially quite clean. According to the current scoring function, the mean impurity across the training dataset was only 1.59%. But even this small amount of noise can influence model performance, so cleaning it up may help you get a better LB score. Feel free to fork, modify, and improve the solution to suit your needs, and don't forget to upvote if you like it))