This notebook shows how to compute the score locally.
The demo uses simple literal matching.

Any comments/suggestions are welcome!

In [None]:
import json
import numpy as np
import pandas as pd

# Load data

In [None]:
# train
train_path = '../input/coleridge-cv-data/cv_1_train.csv'
paper_train_folder = '../input/coleridgeinitiative-show-us-the-data/train'
train = pd.read_csv(train_path)

# test
# NOTE(review): the test papers are read from the same folder as the train
# papers -- presumably because the CV test split is carved out of the
# original training set; confirm if the split source changes.
test_path = '../input/coleridge-cv-data/cv_1_test.csv'
paper_test_folder = '../input/coleridgeinitiative-show-us-the-data/train'
test = pd.read_csv(test_path)

# paper
# Map every publication Id (train first, then test) to its parsed JSON content.
papers = {}
for frame, folder in ((train, paper_train_folder), (test, paper_test_folder)):
    for paper_id in frame['Id'].unique():
        # JSON is UTF-8 by specification; pin the encoding so decoding does
        # not depend on the platform's default locale.
        with open(f'{folder}/{paper_id}.json', 'r', encoding='utf-8') as f:
            papers[paper_id] = json.load(f)

## Literal matching

### Prepare literals

In [None]:
# Lookup from a literal surface form (dataset title or dataset label) to the
# cleaned label that should be predicted when that literal is found.
text_to_pred = {}
# Every literal surface form to search for in the papers.
dataset_texts = set()

# Each row is unpacked positionally into five fields; only the last three
# (pipe-separated titles, labels and cleaned labels) are used here.
for row_id, row_pub_title, titles, labels, cleaned_labels in train.itertuples(index=False):
    aligned = zip(titles.split('|'), labels.split('|'), cleaned_labels.split('|'))
    for title, label, target in aligned:
        # Register the title first, then the label, both mapping to the same target.
        for surface in (title, label):
            text_to_pred[surface] = target
            dataset_texts.add(surface)

print(f'text_to_pred size: {len(text_to_pred)}')
print(f'dataset_texts size: {len(dataset_texts)}')

### Match and predict

In [None]:
preds = []
for paper_id in test['Id']:
    # Matching runs against the string form of the whole parsed JSON object
    # (str() of the loaded structure), not against individual text fields.
    paper_text = str(papers[paper_id])

    # Collect the cleaned label of every known literal that occurs in the paper.
    pred = {text_to_pred[dataset_text]
            for dataset_text in dataset_texts
            if dataset_text in paper_text}

    # Sort before joining: set iteration order is arbitrary, so an unsorted
    # join would make PredictionString differ from one kernel run to the next.
    preds.append('|'.join(sorted(pred)))

test['PredictionString'] = preds

display(test.head())

## Compute score

In [None]:
def jaccard_similarity(s1, s2):
    """Return the Jaccard similarity between the token sets of two strings.

    Both strings are split on single spaces and compared as sets of tokens,
    so repeated tokens count once.

    Args:
        s1: First string.
        s2: Second string.

    Returns:
        float: |tokens(s1) & tokens(s2)| / |tokens(s1) | tokens(s2)|,
        a value in [0.0, 1.0].
    """
    tokens1 = set(s1.split(" "))
    tokens2 = set(s2.split(" "))
    intersection = len(tokens1 & tokens2)
    # Use set sizes (not raw token counts) so duplicated tokens do not
    # inflate the union and deflate the score.
    union = len(tokens1) + len(tokens2) - intersection
    return float(intersection) / union

def compute_score(y_true, y_pred, beta=0.5):
    """Compute a micro-averaged F-beta score for pipe-separated label strings.

    For each publication, every ground-truth label is greedily matched with
    the remaining prediction that has the highest Jaccard similarity. TP, FP
    and FN are accumulated over all publications and a single F-beta value is
    computed from the totals.

    Args:
        y_true: Iterable of ground-truth strings, one per publication, with
            labels separated by '|'.
        y_pred: Iterable of prediction strings aligned with ``y_true``, with
            labels separated by '|'. An empty string means "no prediction".
        beta: Weight of recall relative to precision in the F-beta formula.

    Returns:
        float: The F-beta score, or 0.0 when there are no true positives.
    """
    TP, FP, FN = 0, 0, 0

    for truth, pred in zip(y_true, y_pred):
        # ''.split('|') yields [''], so drop empty entries; otherwise an empty
        # string would be scored as one real label.
        true_datasets = [t for t in truth.split('|') if t]
        # Predicted strings for each publication are sorted alphabetically
        # and processed in that order.
        pred_datasets = sorted(p for p in pred.split('|') if p)

        for true_dataset in true_datasets:
            if len(pred_datasets):
                match_scores = [jaccard_similarity(true_dataset, pred_dataset)
                                for pred_dataset in pred_datasets]
                # The prediction with the highest score for a given ground truth
                # is matched with that ground truth.
                match_index = np.argmax(match_scores)

                if match_scores[match_index] >= 0.5:
                    # Any matched predictions where the Jaccard score meets or
                    # exceeds the threshold of 0.5 are counted as true positives (TP),
                    TP += 1
                else:
                    # the remainder as false positives (FP).
                    FP += 1

                # A prediction can be matched with at most one ground truth.
                del pred_datasets[match_index]
            else:
                # Any ground truths with no nearest predictions are counted as
                # false negatives (FN).
                FN += 1
        # Any unmatched predictions are counted as false positives (FP).
        FP += len(pred_datasets)

    # With no true positives, precision and/or recall are 0 or undefined (0/0);
    # report a score of 0 instead of raising ZeroDivisionError.
    if TP == 0:
        return 0.0

    precision = TP / (TP + FP)
    recall = TP / (TP + FN)
    f_score = (1 + beta**2)*(precision*recall)/((beta**2)*precision + recall)

    return f_score

In [None]:
# Score the literal-matching predictions against the ground-truth labels
# (beta is left at its default of 0.5).
compute_score(test['cleaned_label'], test['PredictionString'])