# About

Brief data exploration. A rule-based spaCy model (pattern matching) serves as a baseline.


In [None]:
import pandas as pd
import numpy as np
import os
import re

import json
from pathlib import Path
from collections import OrderedDict

from typing import List, Dict, Set
from thinc.api import Model

import spacy
from spacy.tokens import Doc
from spacy.matcher import Matcher
from spacy import displacy

from nltk import tokenize
from tqdm import tqdm
import gc

# Data

Publications are provided in JSON format, broken up into sections with section titles. The goal in [this competition](https://www.kaggle.com/c/coleridgeinitiative-show-us-the-data/overview) is not just to match known dataset strings but to generalize to datasets that have never been seen before using NLP and statistical techniques. A percentage of the public test set publications are drawn from the training set - not all datasets have been identified in train, so these unidentified datasets have been used as a portion of the public test labels. These should serve as guides for the difficult task of labeling the private test set. [Details](https://www.kaggle.com/c/coleridgeinitiative-show-us-the-data/data)

In [None]:
# Directories containing one `<Id>.json` file per publication; they are passed
# to read_json()/get_preds() together with a publication Id.
TRAIN_PATH = '../input/coleridgeinitiative-show-us-the-data/train/'
TEST_PATH = '../input/coleridgeinitiative-show-us-the-data/test/'

In [None]:
df = pd.read_csv('../input/coleridgeinitiative-show-us-the-data/train.csv')
df.info()

In [None]:
df.head()

In [None]:
df.select_dtypes(include=[object]).describe().T

# Id & Unique Labels

In [None]:
# Get all labels for each Id 
def get_labels(df: pd.DataFrame) -> Dict[str, List[str]]:
    """Collect the distinct cleaned labels of every publication.

    Args:
        df: Frame with at least the columns ``Id`` and ``cleaned_label``.

    Returns:
        Mapping from publication Id to the list of its distinct
        ``cleaned_label`` values. Ids and labels both keep the order in
        which they first appear in ``df``.
    """
    # One groupby pass instead of a full-frame boolean scan per Id, which was
    # quadratic in the number of rows. sort=False keeps first-appearance order.
    grouped = df.groupby('Id', sort=False)['cleaned_label'].unique()
    return {pub_id: labels.tolist() for pub_id, labels in grouped.items()}

In [None]:
%%time
id_labels = get_labels(df)

In [None]:
len(id_labels)

In [None]:
id_labels['d0fa7568-7d8e-4db9-870f-f9c6f668c17b']

# Patterns

```python
pattern_sample = [
    {'LOWER': 'national'},
    {'LOWER': 'education'},
    {'LOWER': 'longitudinal'},
    {'LOWER': 'study'}
]
```

In [None]:
# Type alias for one Matcher pattern: a list of per-token attribute dicts,
# e.g. [{'LOWER': 'national'}, {'LOWER': 'education'}].
BasePattern = List[Dict[str, str]]

In [None]:
# cleaned label to label abbreviation 
def get_abbreviation(words: List[str]) -> str:
    """Return the acronym made of the upper-cased first character of each word.

    Every word must be non-empty; an empty list yields an empty string.
    """
    acronym = ''
    for word in words:
        acronym += word[0].upper()
    return acronym

In [None]:
def fetch_pattern(label: str) -> Dict[str, BasePattern]:
    """Turn a label into a case-insensitive token pattern.

    The label is split on whitespace and each word becomes a
    ``{'LOWER': <lower-cased word>}`` token spec.

    Returns:
        A single-entry dict mapping the unchanged label to its pattern.
    """
    token_specs = [{'LOWER': token.lower()} for token in label.split()]
    return {label: token_specs}

In [None]:
fetch_pattern('Hello World')

In [None]:
def create_patterns(labels: List[str]) -> List[Dict[str, BasePattern]]:
    """Build one ``{label: pattern}`` entry per label, in input order."""
    return [fetch_pattern(label) for label in labels]

In [None]:
labels = df.cleaned_label.unique().tolist()
len(labels)

In [None]:
%%time
PATTERNS = create_patterns(labels)

In [None]:
PATTERNS[:5]

# Matcher

In [None]:
# The patterns built above only use the LOWER token attribute, so the
# tagger, parser and NER components are disabled when loading the pipeline.
model_name = "en_core_web_sm"
nlp = spacy.load(model_name, disable=["tagger", "parser", "ner"])

In [None]:
# Add created patterns to Matcher
def add_patterns(model: Model) -> Matcher:
    """Create a Matcher on the model's vocab and register every known pattern.

    Reads the notebook-level ``PATTERNS`` list; each ``{label: pattern}``
    entry is added as a rule named after its label.

    NOTE(review): the parameter is annotated as a thinc ``Model``, but the
    notebook passes the object returned by ``spacy.load`` and only its
    ``.vocab`` attribute is used here - confirm the intended type.
    """
    rule_matcher = Matcher(model.vocab, validate=True)
    for entry in PATTERNS:
        for rule_name in entry:
            rule_matcher.add(rule_name, [entry[rule_name]], on_match=None)
    return rule_matcher

In [None]:
matcher = add_patterns(nlp)

# Metric - Micro FBeta Score

In [None]:
# https://www.kaggle.com/c/coleridgeinitiative-show-us-the-data/discussion/230091
# https://www.kaggle.com/c/coleridgeinitiative-show-us-the-data/overview/evaluation
def compute_fbeta(y_true: List[List[str]],
                  y_pred: List[List[str]],
                  beta: float = 0.5) -> float:
    """Compute the Jaccard-based micro FBeta score.

    For each publication, ground-truth strings are visited in sorted order
    and greedily matched to the remaining prediction with the highest
    word-level Jaccard similarity. A match needs a similarity of at least
    0.5 and consumes that prediction. Unmatched ground truths count as
    false negatives, leftover predictions as false positives.

    Args:
        y_true: Ground-truth label strings, one list per publication.
        y_pred: Predicted label strings, one list per publication, aligned
            with ``y_true``. Pairs are formed with ``zip``, so surplus
            entries of the longer list are ignored.
        beta: Weight of recall relative to precision.

    Returns:
        The micro-averaged FBeta score, or 0.0 when there is nothing to
        score (no true positives, false positives or false negatives).
    """

    def _jaccard_similarity(str1: str, str2: str) -> float:
        """Word-set Jaccard similarity; 0.0 if neither string has words."""
        a = set(str1.split())
        b = set(str2.split())
        union_size = len(a | b)
        if union_size == 0:
            # Two empty/whitespace-only strings: avoid dividing by zero.
            return 0.0
        return len(a & b) / union_size

    tp = 0  # true positive
    fp = 0  # false positive
    fn = 0  # false negative
    for ground_truth_list, predicted_string_list in zip(y_true, y_pred):
        # Work on a sorted copy so matched predictions can be removed.
        remaining = sorted(predicted_string_list)
        for ground_truth in sorted(ground_truth_list):
            if not remaining:
                fn += 1
                continue
            similarity_scores = [
                _jaccard_similarity(ground_truth, predicted_string)
                for predicted_string in remaining
            ]
            matched_idx = int(np.argmax(similarity_scores))
            if similarity_scores[matched_idx] >= 0.5:
                remaining.pop(matched_idx)
                tp += 1
            else:
                fn += 1
        fp += len(remaining)

    weighted_tp = tp * (1 + beta ** 2)
    weighted_fn = fn * beta ** 2
    denominator = weighted_tp + fp + weighted_fn
    if denominator == 0:
        # Empty input: define the score as 0.0 instead of raising.
        return 0.0
    return weighted_tp / denominator

In [None]:
# Check metric sanity
compute_fbeta(
    [['national education longitudinal study','slosh model']],
    [['education', 'slosh model']]
)

# Predictions

In [None]:
def read_json(path: str, pub_id: str) -> List[Dict[str, str]]:
    """Load the JSON file of one publication.

    Args:
        path: Directory that contains the publication files. A trailing
            slash is accepted but no longer required.
        pub_id: Publication Id; the file read is ``<path>/<pub_id>.json``.

    Returns:
        The parsed JSON document, with every JSON object decoded as an
        ``OrderedDict``. Presumably a list of sections that each carry a
        ``'text'`` entry (that is how get_preds consumes it) - verify
        against the data files.

    Raises:
        FileNotFoundError: If the file does not exist.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    file_path = Path(path) / f'{pub_id}.json'
    # JSON is UTF-8 by specification; do not depend on the platform default.
    with open(file_path, 'rt', encoding='utf-8') as json_file:
        return json.load(json_file, object_hook=OrderedDict)

In [None]:
def clean_text(txt):
    """Lower-case ``str(txt)`` and collapse every run of characters other
    than ASCII letters and digits into a single space."""
    lowered = str(txt).lower()
    return re.sub('[^A-Za-z0-9]+', ' ', lowered)

In [None]:
def find_patterns(docs: List[Doc]) -> List[str]:
    """Return the labels whose pattern matches in at least one document.

    Uses the notebook-level ``matcher``; each match id is resolved back to
    its rule name (the label) through the document's vocab string store.

    Args:
        docs: The documents to scan (get_preds passes one per sentence).

    Returns:
        The distinct matched labels in sorted order, so the output is the
        same on every run. If nothing matched, a list holding a single
        empty string is returned.
    """
    labels = {
        doc.vocab.strings[match_id]
        for doc in docs
        for match_id, _start, _end in matcher(doc)
    }
    # An empty prediction is represented as [''] rather than [].
    return sorted(labels) if labels else ['']

In [None]:
def get_preds(ids: List[str], path: str, model: Model) -> Dict[str, List[str]]:
    """Predict dataset labels for each publication by rule-based matching.

    For every Id the publication JSON is read, the ``'text'`` of each entry
    is split into sentences, every sentence is cleaned with clean_text and
    run through ``model``, and the resulting docs are handed to
    find_patterns.

    Returns:
        Mapping from publication Id to its list of predicted labels.
    """
    predictions = {}
    for pub_id in tqdm(ids):
        sections = read_json(path, pub_id)

        # One flat list of cleaned sentences across all sections.
        cleaned_sentences = [
            clean_text(sentence)
            for section in sections
            for sentence in tokenize.sent_tokenize(section['text'])
        ]

        # Matching operates on sentence-level docs.
        sentence_docs = [model(sentence) for sentence in cleaned_sentences]
        predictions[pub_id] = find_patterns(sentence_docs)

    gc.collect()
    return predictions

In [None]:
ids = df.Id.unique().tolist()
len(ids)

In [None]:
# Check the output format of the model
get_preds(ids[:2], TRAIN_PATH, nlp)

In [None]:
# Predictions from pattern matching model
preds = get_preds(ids[:1000], TRAIN_PATH, nlp)

# Evaluation 

In [None]:
def filter_ids(id_labels: Dict[str, List[str]], ids: List[str]) -> Dict[str, List[str]]:
    """Return the entries of ``id_labels`` for the given Ids, in ``ids`` order.

    Raises:
        KeyError: If an Id is not present in ``id_labels``.
    """
    selected = {}
    for pub_id in ids:
        selected[pub_id] = id_labels[pub_id]
    return selected

In [None]:
# Check the filtering output
filter_ids(id_labels, ids[:2])

In [None]:
# References for the evaluation
refs = filter_ids(id_labels, ids[:1000])
len(preds), len(refs)

In [None]:
# Pair references and predictions by publication Id rather than relying on
# both dicts happening to share the same insertion order; a missing
# prediction now fails loudly with a KeyError instead of misaligning rows.
eval_ids = list(refs)
ref_list = [refs[pub_id] for pub_id in eval_ids]
pred_list = [preds[pub_id] for pub_id in eval_ids]
fbeta = compute_fbeta(ref_list, pred_list)
print(f'FBeta Score for sample of {len(preds)} publications: {fbeta:.4f}')

# Test

In [None]:
# The publication Ids to predict are taken from the sample submission file;
# their JSON files are read from TEST_PATH.
test = pd.read_csv('../input/coleridgeinitiative-show-us-the-data/sample_submission.csv')
test_ids = test.Id.to_list()
test_preds = get_preds(test_ids, TEST_PATH, nlp)

In [None]:
# Build the submission: one row per publication, with its predicted labels
# joined by '|'. Comprehensions are used so the notebook-level `labels` list
# is not rebound to a string, as the previous loop variable did.
data = {
    'Id': list(test_preds),
    'PredictionString': [
        '|'.join(pred_labels) for pred_labels in test_preds.values()
    ],
}

sub = pd.DataFrame.from_dict(data)
sub.to_csv('submission.csv', index=False)

In [None]:
pd.read_csv('./submission.csv')

# Notes

## Research

- [ ] Study the inconsistencies to expand the pattern types.
- [ ] The publications in the training dataset are poorly labeled; the test set seems to have many more labels than the train set. Find external public data to extract more labels, so that the rule-based and statistical models generalize better to unseen datasets.
- [ ] Review possible ML approaches for automatic data labeling.

## Code

- [x] Basic Patterns
- [x] Spacy Matcher
- [x] Improve Metric
- [x] Review annotations
- [ ] Add docs