In [None]:
import os
import re
import json
import numpy as np
import pandas as pd
import spacy

In [None]:
def clean_text(txt):
    """Normalise arbitrary input to lower-case alphanumeric text.

    The argument is converted with ``str()`` and lower-cased, then every run
    of characters other than ASCII letters and digits is replaced by a single
    space. Leading and trailing spaces are NOT stripped.
    """
    lowered = str(txt).lower()
    return re.sub('[^A-Za-z0-9]+', ' ', lowered)

def jaccard(str1, str2):
    """Return the Jaccard similarity of the whitespace-separated tokens of two strings.

    Both strings are lower-cased and split on whitespace; the score is
    ``|A & B| / |A | B|`` over the resulting token sets.

    Returns:
        A float in ``[0.0, 1.0]``. When neither string contains any token the
        union is empty and the ratio is undefined; ``0.0`` is returned in that
        case so that two empty strings are never reported as a match.
    """
    tokens_a = set(str1.lower().split())
    tokens_b = set(str2.lower().split())
    union_size = len(tokens_a | tokens_b)
    if union_size == 0:
        # Avoid 0/0 when both inputs are empty or whitespace-only.
        return 0.0
    return float(len(tokens_a & tokens_b)) / union_size

# Load the NER pipeline from disk; the tagger and parser components are
# disabled because only the recognised entities are used below.
nlp = spacy.load(
    "../input/coleridge-ner-chain-v02-c04/coleridge_ner",
    disable=["tagger", "parser"],
)
# Upper bound on the length of a text the pipeline will accept.
nlp.max_length = 2e6

# Entity labels that are compared against the vocabulary (in addition to the
# "DATASET" label, which is handled separately).
keep_ents = [
    "EVENT",
    "FAC",
    "GPE",
    "LAW",
    "LOC",
    "NORP",
    "ORG",
    "PRODUCT",
    "WORK_OF_ART",
]

# Base vocabulary: the distinct values of the "title" column.
# NOTE(review): titles are used as-is and later searched for inside cleaned
# text, so they are presumably already lower-case alphanumeric - confirm.
df = pd.read_csv("../input/coleridgeextradsets/data_set_800.csv")
vocab = df["title"].unique().tolist()

In [None]:
# Extend the vocabulary with the repository names listed in the auxiliary
# "SciData*" CSV files.
aux_dir = "../input/coleridgeextradsets"
aux_frames = [
    pd.read_csv(os.path.join(aux_dir, f))[["Repository Name", "Abbreviation"]]
    for f in os.listdir(aux_dir)
    if f.startswith("SciData")
]
aux_df = pd.concat(aux_frames, ignore_index=True)
aux_df["Repository Name"] = aux_df["Repository Name"].apply(clean_text).str.strip()
aux_vocab = aux_df["Repository Name"].dropna().unique().tolist()

# Names that must not be used as search terms. "nan" is what clean_text
# produces for a missing value (it stringifies its argument); the others are
# presumably too generic to be safe substring matches - TODO confirm.
aux_stoplist = {"figshare", "intact", "massive", "pride", "nan"}
# Also drop empty names: a name made only of punctuation cleans to "", and the
# empty string is a substring of every text, so it would match everywhere.
aux_vocab = [v for v in aux_vocab if v and v not in aux_stoplist]

# Merge into the main vocabulary, de-duplicated and sorted.
vocab = list(np.unique(vocab + aux_vocab))
# Only longer terms are used for fuzzy matching against recognised entities.
ent_vocab = [v for v in vocab if len(v) > 10]

In [None]:
# Scan every publication in the test directory and collect candidate dataset
# mentions as (Id, PredictionString, Type) tuples, where Type is one of:
#   "NLP"   - text of an entity the model labelled "DATASET"
#   "match" - vocabulary term that fuzzily matches another kept entity
#   "naive" - vocabulary term found as a substring of the cleaned section text
test_dir = "../input/coleridgeinitiative-show-us-the-data/test"
sub_tups = []
id_list = []

for f in os.listdir(test_dir):

    # Publications are stored as "<Id>.json"; skip anything else instead of
    # blindly chopping five characters off the name.
    Id, ext = os.path.splitext(f)
    if ext.lower() != ".json":
        continue
    id_list.append(Id)
    with open(os.path.join(test_dir, f), encoding="utf-8") as rf:
        txt = json.load(rf)

    for sec in txt:
        sec_text = sec["text"]

        # Sections at or above the pipeline's length limit are not run through
        # the model; they still get the substring search below.
        if len(sec_text) < nlp.max_length:
            doc = nlp(sec_text)
            for e in doc.ents:
                if e.label_ == "DATASET":
                    sub_tups.append((Id, clean_text(e.text), "NLP"))
                if e.label_ in keep_ents:
                    e_txt = clean_text(e.text)
                    # Record the vocabulary term that actually matched. An
                    # explicit loop is required: a comprehension's loop
                    # variable is not visible outside the comprehension.
                    for term in ent_vocab:
                        if jaccard(e_txt, term) > 0.95:
                            sub_tups.append((Id, term, "match"))

        txt_cln = clean_text(sec_text)
        for v in vocab:
            if v in txt_cln:
                sub_tups.append((Id, v, "naive"))

In [None]:
# Stage 1: all "naive" predictions plus the "NLP" predictions that the naive
# search did not already produce for the same document.
sub_0 = pd.DataFrame(sub_tups, columns=["Id", "PredictionString", "Type"])
sub_nlp = sub_0[sub_0["Type"] == "NLP"]
sub_naive = sub_0[sub_0["Type"] == "naive"]
# Left anti-join: after the merge, "Type_y" is NaN exactly for the NLP rows
# that have no naive row with the same (Id, PredictionString).
sub_disj = sub_nlp.merge(sub_naive, on=["Id", "PredictionString"], how="left")
sub_disj = sub_disj[sub_disj["Type_y"].isna()][["Id", "PredictionString"]]
# `columns=` is used because DataFrame.drop no longer accepts `axis` as a
# positional argument in pandas 2.x.
sub_1 = pd.concat([sub_naive.drop(columns="Type"), sub_disj], ignore_index=True)

In [None]:
# Stage 2: add the fuzzy "match" predictions that stage 1 does not already
# contain for the same document.
sub_match = sub_0[sub_0["Type"] == "match"]
# Marker column so the merge below yields a "Type_y" to test for absence.
sub_1.loc[:, "Type"] = "stage_1"
# Left anti-join: keep the match rows with no stage-1 counterpart.
sub_disj = sub_match.merge(sub_1, on=["Id", "PredictionString"], how="left")
sub_disj = sub_disj[sub_disj["Type_y"].isna()][["Id", "PredictionString"]]
# `columns=` is used because DataFrame.drop no longer accepts `axis` as a
# positional argument in pandas 2.x.
sub_df = pd.concat([sub_1.drop(columns="Type"), sub_disj], ignore_index=True)

In [None]:
# Build the submission: one row per document Id with its predictions joined
# by "|".
#
# NOTE: despite its name, `min_mentions` is applied with np.minimum and so
# acts as a CAP on how many times one prediction is repeated per document.
min_mentions = 1

# Number of times each (Id, PredictionString) pair was collected.
pair_counts = (
    sub_df.groupby(["Id", "PredictionString"])["Id"]
    .count()
    .rename("N")
    .reset_index()
)
pair_counts["N"] = np.minimum(min_mentions, pair_counts["N"].values)

# Repeat each prediction N times. A plain comprehension is used rather than a
# row-wise DataFrame.apply, which returns a DataFrame (not a Series) when the
# frame is empty.
pair_counts["PredictionString"] = [
    "|".join([pred] * n)
    for pred, n in zip(pair_counts["PredictionString"], pair_counts["N"])
]

per_doc = (
    pair_counts.groupby("Id")["PredictionString"]
    .apply(lambda preds: "|".join(preds))
    .reset_index()
)

# Left-merge onto the full Id list so documents without any prediction still
# get a row, with an empty PredictionString.
id_df = pd.DataFrame(id_list, columns=["Id"])
sub_df = id_df.merge(per_doc, on="Id", how="left").fillna("")
sub_df.to_csv("submission.csv", index=False)