# Coleridge Initiative - String Literals

This is a fairly naive approach to solving this problem.

- loop over `train.csv`
    - perform basic string cleaning (lowercase + remove non-alphanumeric)
    - create a lookup table for each possible description string (`pub_title`, `dataset_title`, `dataset_label`)
    - map it back to the expected `cleaned_label` string
- brute force search the test dataset for any string literals found in `train.csv` 
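    - if multiple matches are found, then return all of them joined with `|` (multi-label); picking only the single most frequent match is the commented-out alternative in `extract_label`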
    
    
![](https://i.imgflip.com/536dod.jpg)

In [None]:
import numpy as np 
import pandas as pd
import simplejson
import re
import pydash
import sys
import os
from collections import defaultdict
from typing import *
from joblib import Parallel, delayed
from glob import glob

In [None]:
# Load the competition's training metadata; later cells read its Id, pub_title,
# dataset_title, dataset_label and cleaned_label columns.
train_df = pd.read_csv('../input/coleridgeinitiative-show-us-the-data/train.csv')
train_df

# Extract String Literals

Let's create a lookup table of all the possible strings used to describe each dataset.

In [None]:
def clean_text(text: str) -> str:
    """Normalise ``text`` to lowercase ASCII alphanumerics separated by single spaces.

    The input is coerced with ``str()`` first, so non-string values are cleaned
    via their string representation. Every run of characters outside
    ``A-Za-z0-9`` becomes one space, and leading/trailing spaces are removed.
    """
    lowered = str(text).lower()
    spaced = re.sub('[^A-Za-z0-9]+', ' ', lowered)
    return spaced.strip()
def clean_texts(texts: List[str]) -> List[str]:
    """Apply ``clean_text`` to each item of ``texts`` and return the results as a list, in order."""
    cleaned = []
    for raw in texts:
        cleaned.append(clean_text(raw))
    return cleaned

In [None]:
def generate_lookup(df):
    """Build a lookup of cleaned dataset titles to the cleaned strings that describe them.

    For every row of ``df`` the key is the cleaned ``dataset_title`` (an earlier
    revision keyed on ``cleaned_label``). The value is a set accumulating the
    cleaned ``dataset_label``, ``dataset_title`` and ``pub_title`` of every row
    sharing that key.

    Returns a ``defaultdict(set)``.
    """
    source_columns = ('dataset_label', 'dataset_title', 'pub_title')
    lookup = defaultdict(set)
    for _, record in df.iterrows():
        key = clean_text(record['dataset_title'])
        descriptions = clean_texts([record[column] for column in source_columns])
        lookup[key].update(descriptions)
    return lookup

# Preview a single (key, values) entry of the lookup built from the training data.
next(iter(generate_lookup(train_df).items()))

# Train Dataset Validation

This validates that this algorithm works on the training dataset, and produces a 100% score

In [None]:
def read_json(index: str, test_train="test") -> Dict:
    """Load and parse one publication's JSON file from the competition input folder.

    Args:
        index: Publication id; the file read is ``{test_train}/{index}.json``.
        test_train: Sub-folder to read from (``"test"`` or ``"train"``).

    Returns:
        The parsed JSON document.
        NOTE(review): ``json2text`` iterates the result as a sequence of section
        dicts, so the ``Dict`` annotation looks inaccurate -- confirm against
        the data files.

    Raises:
        OSError: if the file cannot be opened.
    """
    filename = f"../input/coleridgeinitiative-show-us-the-data/{test_train}/{index}.json"
    # Decode explicitly as UTF-8 (the JSON interchange encoding) rather than
    # relying on the platform's locale-dependent default.
    with open(filename, encoding="utf-8") as f:
        document = simplejson.load(f)
    return document
        
def json2text(index: str, test_train="test") -> str:
    """Return one publication's full text as a single cleaned string.

    Each section of the JSON document contributes its ``section_title`` and
    ``text`` joined by a space. Every section is cleaned, and the cleaned
    sections are joined with single spaces.
    """
    sections = read_json(index, test_train)
    raw_sections = []
    for section in sections:
        raw_sections.append(section["section_title"] + " " + section["text"])
    return " ".join(clean_texts(raw_sections))


def extract_label(text: str, lookup: Dict[str, Set[str]]) -> str:
    """Return every known string literal found in ``text`` as one ``|``-joined label.

    Args:
        text: Document text to search; matching is a plain substring test.
        lookup: Mapping whose values are sets of candidate strings. Only the
            values are searched; the keys are not used.

    Returns:
        The distinct cleaned matches joined with ``|`` (multi-label), or ``""``
        when nothing matches. Matches are sorted so the result does not depend
        on set iteration order, which varies with string hash randomization
        between interpreter and worker processes.
    """
    matches = {
        clean_text(value)
        for values in lookup.values()
        for value in values
        # An empty candidate is a substring of every text; skip it so it cannot
        # inject a spurious empty label.
        if value and value in text
    }
    # A candidate made only of punctuation cleans down to "", which is not a label.
    matches.discard("")
    return "|".join(sorted(matches))

In [None]:
%%time
def train_accuracy(df, limit=sys.maxsize) -> float:
    """Score the string-literal matcher against the first ``limit`` rows of ``df``.

    The lookup is built from all of ``df``. Each row's ``Id`` is read from the
    ``"train"`` folder and labelled with ``extract_label``. A row scores
    ``|expected & predicted| / |predicted|`` over the ``|``-separated label
    sets, and the mean row score is returned.

    When the ``KAGGLE_KERNEL_RUN_TYPE`` environment variable equals
    ``'Interactive'`` the limit is forced to 100 regardless of the argument.
    """
    if os.environ.get('KAGGLE_KERNEL_RUN_TYPE', 'Localhost') == 'Interactive':
        limit = 100
    lookup = generate_lookup(df)
    jobs = (
        delayed(extract_label)(json2text(doc_id, "train"), lookup)
        for doc_id in df['Id'][:limit]
    )
    predictions = Parallel(-1)(jobs)

    expecteds = df['cleaned_label'][:limit]
    score = 0
    # Explicit accumulation keeps plain left-to-right float addition.
    for predicted, expected in zip(predictions, expecteds):
        predicted_set = set(predicted.split("|"))
        wanted_set = set(expected.split("|"))
        score += len(wanted_set & predicted_set) / len(predicted_set)

    return score / len(expecteds)

# Score the matcher on the first 100 training rows.
train_accuracy(train_df, 100)

# Submission

In [None]:
def generate_submission():
    """Predict a label string for every row of the sample submission.

    Reads ``sample_submission.csv`` with its first column as the index and
    builds the lookup from the module-level ``train_df``. Each index is read
    from the ``"test"`` folder and labelled with ``extract_label``, and the
    results are stored in the ``PredictionString`` column.

    Returns the filled-in submission DataFrame.
    """
    sample_path = '../input/coleridgeinitiative-show-us-the-data/sample_submission.csv'
    submission_df = pd.read_csv(sample_path, index_col=0)
    lookup = generate_lookup(train_df)
    jobs = (
        delayed(extract_label)(json2text(doc_id, "test"), lookup)
        for doc_id in submission_df.index
    )
    submission_df['PredictionString'] = Parallel(-1)(jobs)
    return submission_df

# Build the predictions, write them to submission.csv, then preview the file
# (via the shell) and the DataFrame itself.
submission_df = generate_submission()
submission_df.to_csv('submission.csv')
!head submission.csv
submission_df

# Further Reading

If you learnt something from this notebook, or would like to fork, then please leave an upvote! Thank you.


#### Coleridge - Huggingface Question Answering

This is not exactly what the competition metric is asking for, but is an interesting experiment nonetheless.

I've taken the Huggingface Question Answering pre-trained model, and asked it to predict which dataset is referenced (as opposed to the text mentioning it).

- https://www.kaggle.com/jamesmcguigan/coleridge-huggingface-question-answering
