![](https://i.ibb.co/gTQNCML/Screenshot-from-2021-03-27-17-26-18.png)

# 1. Read data

In [None]:
import json
import glob
import re
import os
import numpy as np
import pandas as pd
from os.path import join
from spacy import displacy
from datetime import datetime
from typing import Callable, Optional
from fuzzywuzzy import fuzz

In [None]:
# --- Input locations -------------------------------------------------------
DATA_DIR = "../input/coleridgeinitiative-show-us-the-data"
train_file = join(DATA_DIR, "train.csv")
train_files = glob.glob(join(DATA_DIR, "train/*.json"))
test_files = glob.glob(join(DATA_DIR, "test/*.json"))

# --- spacy.displacy rendering: a single entity type, highlighted in red -----
LABEL_DT = "DT"  # entity label used for dataset_title mentions
COLORS = {LABEL_DT: "#FF0000"}
OPTIONS = {"ents": [LABEL_DT], "colors": COLORS}

# Article id -> file path; the id is the file name without its 5-character
# ".json" suffix.
id_to_path = dict((os.path.basename(path)[:-5], path) for path in train_files)

# Sanity checks: ids are unique across files and the label table exists.
assert len(id_to_path) == len(train_files)
assert os.path.isfile(train_file)

print("Train Atricles found", len(train_files))
print("Test Articles found", len(test_files))

def r_json(path):
    """Read the JSON file at ``path`` and return the parsed document.

    The file is decoded explicitly as UTF-8 so the result does not depend on
    the platform's default text encoding.
    """
    with open(path, encoding="utf-8") as fr:
        return json.load(fr)

def clean_text(txt):
    """Normalize ``txt`` for matching.

    The value is converted to ``str`` and lowercased, every run of characters
    other than ASCII letters and digits is replaced by a single space, and
    leading/trailing whitespace is removed.
    """
    lowered = str(txt).lower()
    collapsed = re.sub('[^A-Za-z0-9]+', ' ', lowered)
    return collapsed.strip()

In [None]:
# Load the training table from train.csv (path built in the config cell).
df_train = pd.read_csv(train_file)
# Last expression of the cell: displays summary statistics of the table.
df_train.describe()

In [None]:
# Preview the first rows of the training table.
df_train.head()

# 2. Dataset Label instances
Let's gain some intuition about the difference between `dataset_title` and `dataset_label`

In [None]:
# Distinct (dataset_title, dataset_label) pairs where the two values differ;
# only the first ten are shown.
(
    df_train
    .query("dataset_title != dataset_label")
    .loc[:, ["dataset_title", "dataset_label"]]
    .drop_duplicates()
    .head(10)
)

In [None]:
def find_matches_for_instance(instance: pd.Series, clean_function: Optional[Callable] = None):
    """
    Locate every occurrence of an instance's dataset label in its article.

    1. Find corresponding article
    2. Localize Dataset Name Instances (dataset_label)
    3. Prepare data for spacy.displacy

    Parameters
    ----------
    instance : pd.Series
        Row providing "Id" (used to look up the article file) and
        "dataset_label" (the text to search for).
    clean_function : Callable, optional
        When given, applied to both the label and each section text before
        matching, so both sides are normalized the same way.

    Returns
    -------
    list of dict
        One entry per article section containing at least one match, with
        keys "text" (the possibly cleaned section text), "ents" (character
        spans of the matches, labelled ``LABEL_DT``) and "title" (the section
        title). Sections without a match are omitted.
    """
    article = r_json(id_to_path[instance["Id"]])
    label = instance["dataset_label"]
    if clean_function:
        label = clean_function(label)
    if isinstance(label, str) and not label:
        # An empty label would "match" at every position of every section.
        return []
    # The label is literal text, not a regular expression. Escape it so that
    # characters such as "(", ")", "." or "+" are matched verbatim instead of
    # being interpreted as regex syntax (which silently changes what is
    # matched, or raises re.error on unbalanced parentheses).
    pattern = re.compile(re.escape(label))

    docs = []
    for part in article:
        title = part["section_title"]
        txt = part["text"]
        if clean_function:
            txt = clean_function(txt)
        matches = list(pattern.finditer(txt))
        if matches:
            docs.append({
                "text": txt,
                "ents": [{"start": m.start(), "end": m.end(), "label": LABEL_DT} for m in matches],
                "title": title,
            })
    return docs

In [None]:
#### CHANGE THIS INDEX (positional row of df_train) TO BROWSE THE DATASET ####
row_idx = 0

#### To use clean_text as the cleaning policy, comment out the first call
#### below and uncomment the second one.
docs = find_matches_for_instance(df_train.iloc[row_idx])
# docs = find_matches_for_instance(df_train.iloc[row_idx], clean_function=clean_text)

# Render the matching sections with the dataset label occurrences highlighted.
displacy.render(docs, style="ent", manual=True, options=OPTIONS, page=True)

# 3. Investigate cleaning policy

Unfortunately, the `dataset_label` value does not always match the text in the article exactly. We will investigate the main differences between them.

In [None]:
def summarize_cleaning_policy(clean_function=None):
    """
    Report how many training rows have their dataset label located in the
    corresponding article under a given cleaning policy.

    Every row of ``df_train`` is passed to ``find_matches_for_instance``; rows
    with at least one match count as matched, the others as missing. For up
    to five missing rows, the raw label is then compared against sliding
    windows of the raw article text with ``fuzz.partial_ratio`` and the
    near-miss fragments are printed (``clean_function`` is not applied in
    this diagnostic step).

    Parameters
    ----------
    clean_function : callable, optional
        Forwarded to ``find_matches_for_instance``.

    Returns
    -------
    tuple of (list of int, list of int)
        Positional row indices of the matched rows and of the missing rows.
    """
    start = datetime.now()
    matched_ids = []
    missing_ids = []
    for idx in range(len(df_train)):
        docs = find_matches_for_instance(df_train.iloc[idx], clean_function=clean_function)
        if docs:
            matched_ids.append(idx)
        else:
            missing_ids.append(idx)
    print(f"Processing time {datetime.now() - start}")
    print(f"Matched ids: {len(matched_ids)}")
    print(f"Missing ids: {len(missing_ids)}")

    ### error analysis
    if not missing_ids:
        print("Every dataset instance matched! GREAT!")
        return matched_ids, missing_ids

    stride = 5
    fuzz_threshold = 90  # range from 0-100, 100 means match
    for missing_idx in missing_ids[:5]:
        instance = df_train.iloc[missing_idx]
        pattern = instance["dataset_label"]
        article = r_json(id_to_path[instance["Id"]])
        found = []
        pat_len = len(pattern)
        for part in article:
            txt = part["text"]
            for start_idx in range(0, len(txt), stride):
                chunk = txt[start_idx:start_idx + pat_len + stride]
                if len(chunk) < pat_len:
                    # Tail of the section: the window is now shorter than the
                    # label, and partial_ratio would score such a fragment
                    # against a substring of the label (e.g. " " vs
                    # "aaa bbb" gives 100). Later windows are shorter still.
                    break
                if fuzz.partial_ratio(pattern, chunk) > fuzz_threshold:
                    found.append(chunk)
        print(f"Dataset label = {pattern}")
        print(f"Article Instances = {found}")
        print()
    return matched_ids, missing_ids

In [None]:
# Baseline: match the label against the article text without any cleaning.
found_ids, missing_ids = summarize_cleaning_policy()

In [None]:
# Case-insensitive policy: lowercase both the label and the text before matching.
found_ids, missing_ids = summarize_cleaning_policy(lambda x: x.lower())

In [None]:
# clean_text policy: lowercase and replace each run of non-alphanumeric
# characters with a single space before matching.
found_ids, missing_ids = summarize_cleaning_policy(clean_text)

# 4. Conclusions
As we can see, there are at least two common reasons why a `dataset_label` does not match its occurrence in the article text.

1. The occurrence in the paper contains additional characters, such as parentheses `(` `)`.
```
Pattern = SLOSH model
Instances = ['s (SLOSH) model ']
```
2. Lowercase/uppercase convention is not consistent.
```
Pattern = National Education Longitudinal Study
Instances = [' using National Education Longitudinal stu', 'g National Education Longitudinal study of']
```
