# Custom classifier and NER model with parallel loading
- Tokenize sentences
- Filter sentences with RF classifier
- Add predictions from data labels

## Libraries
Set spaCy dependencies

In [None]:
%%time
!pip install ../input/pandarallel151whl/pandarallel-1.5.1-py3-none-any.whl
!pip uninstall fastai en-core-web-sm en-core-web-lg spacy -y -q
!pip install ../input/spacy3/catalogue-2.0.3-py3-none-any.whl ../input/spacy3/typer-0.3.2-py3-none-any.whl ../input/spacy3/srsly-2.4.1-cp37-cp37m-manylinux2014_x86_64.whl ../input/spacy3/pathy-0.5.2-py3-none-any.whl ../input/spacy3/smart_open-3.0.0-py3-none-any.whl ../input/spacy3/pydantic-1.7.3-cp37-cp37m-manylinux2014_x86_64.whl ../input/spacy3/thinc-8.0.3-cp37-cp37m-manylinux2014_x86_64.whl ../input/spacy3/spacy-3.0.6-cp37-cp37m-manylinux2014_x86_64.whl ../input/spacy3/spacy_legacy-3.0.5-py2.py3-none-any.whl -q
!pip install ../input/spacy3/en_core_web_lg-3.0.0-py3-none-any.whl ../input/spacy3/en_core_web_md-3.0.0-py3-none-any.whl ../input/spacy3/en_core_web_sm-3.0.0-py3-none-any.whl -q
!pip install ../input/spacy3/spacy_alignments-0.8.3-cp37-cp37m-manylinux2014_x86_64.whl ../input/spacy3/spacy_transformers-1.0.2-py2.py3-none-any.whl ../input/spacy3/en_core_web_trf-3.0.0-py3-none-any.whl -q

from pandarallel import pandarallel
# Initialise pandarallel (with progress bars) so that `parallel_apply` can be
# called on pandas objects in the cells below.
pandarallel.initialize(progress_bar=True)

import spacy
# Fail fast if the wheel installation above did not leave exactly spaCy 3.0.6 importable.
assert spacy.__version__ == '3.0.6'
from spacy import displacy

import glob
import json
import re
import pickle
import pandas as pd
import numpy as np

## Models
RF and NER components (trained separately, imported as datasets)

### RF model

In [None]:
%%time
# Location of the pickled classifier that is later used via `predict` / `predict_proba`.
pkl_filename = '../input/balanced-rf/RF_balanced_model.pkl'

# NOTE: pickle.load can execute arbitrary code contained in the file;
# only load pickles that come from a trusted source.
with open(pkl_filename, 'rb') as file:
    best_model = pickle.load(file)

# Bare expression: shows the loaded model's repr as the cell output.
best_model

### NER model

In [None]:
%%time
# Load the spaCy pipeline stored at this path; its `doc.ents` are read in the Solution cell.
custom_ner_model = spacy.load("../input/original-model/spaCy/output/model-best")

## Functions

In [None]:
def jaccard(str1, str2):
    """
    Jaccard similarity of two strings (metric defined by the competition).

    Both strings are lower-cased and split on whitespace; the score is the
    number of shared tokens divided by the number of distinct tokens overall.

    Raises ZeroDivisionError when neither string contains any token.
    """
    tokens_1 = set(str1.lower().split())
    tokens_2 = set(str2.lower().split())
    shared = len(tokens_1 & tokens_2)
    # |A ∪ B| = |A| + |B| - |A ∩ B|
    return float(shared) / (len(tokens_1) + len(tokens_2) - shared)

def clean_text(txt):
    """
    Text normalisation defined by the competition.

    Converts `txt` to a lower-cased string and replaces every run of
    characters other than ASCII letters and digits with a single space.
    """
    lowered = str(txt).lower()
    return re.sub('[^A-Za-z0-9]+', ' ', lowered)

def scrub_text(txt):
    """
    Stricter variant of the competition cleaning.

    Applies the same lower-casing and non-alphanumeric replacement as
    `clean_text`, then drops every digit character from the result
    (the spaces that surrounded the digits are kept).
    """
    normalized = re.sub('[^A-Za-z0-9]+', ' ', str(txt).lower())
    kept_chars = filter(lambda ch: not ch.isdigit(), normalized)
    return ''.join(kept_chars)

def upper_text(txt):
    """
    Replaces each run of non-alphanumeric characters with one space,
    leaving the original letter casing untouched.
    """
    as_string = str(txt)
    return re.sub('[^A-Za-z0-9]+', ' ', as_string)

def find_acronyms(txt):
    """
    Binary acronym flag.

    Returns 1 when `txt` contains at least one word-bounded run of two or
    more capital letters/dots (optionally followed by a lowercase "s"),
    otherwise 0.
    """
    hits = re.findall(r"\b[A-Z\.]{2,}s?\b", txt)
    return int(len(hits) > 0)

def extract_acronyms(txt):
    """
    Collects the acronym-like tokens found in `txt`.

    Returns a list (possibly empty) of every word-bounded run of two or more
    capital letters/dots, optionally followed by a lowercase "s", in order
    of appearance.
    """
    return [hit for hit in re.findall(r"\b[A-Z\.]{2,}s?\b", txt)]
    
def count_acronyms(txt):
    """
    Number of acronym-like tokens in `txt`, i.e. word-bounded runs of two or
    more capital letters/dots with an optional trailing lowercase "s".
    Returns 0 when there are none.
    """
    return len(re.findall(r"\b[A-Z\.]{2,}s?\b", txt))

def flatten_list(object):
    """
    Recursively flattens nested lists, tuples and sets into one flat list.

    Any element that is not a list, tuple or set (strings included) is kept
    as a single item. The input is iterated once and not modified.
    """
    flat = []
    for element in object:
        is_nested = isinstance(element, (list, tuple, set))
        # Nested containers contribute their flattened contents, leaves themselves.
        flat += flatten_list(element) if is_nested else [element]
    return flat

def filter_set(main_set, condition):
    """
    Removes, in place, every element of `main_set` for which
    `condition(element)` is truthy. Returns None.
    """
    # Iterate over a snapshot so the set can be mutated during the loop.
    for member in tuple(main_set):
        if not condition(member):
            continue
        main_set.discard(member)

## Target
Predict if a sentence contains a dataset reference

In [None]:
%%time

# Submission template and training labels read from the competition input folder.
submission_df = pd.read_csv('../input/coleridgeinitiative-show-us-the-data/sample_submission.csv')

df_train = pd.read_csv('../input/coleridgeinitiative-show-us-the-data/train.csv')

test_files = glob.glob('../input/coleridgeinitiative-show-us-the-data/test/*.json')

# Read each publication once, tag its rows with the publication Id (the file
# name without extension) and concatenate a single time. Concatenating inside
# the loop re-copies the accumulated frame on every iteration.
test_frames = []
for test_file in test_files:
    file_data = pd.read_json(test_file)
    file_data.insert(0, 'Id', test_file.split('/')[-1].split('.')[0])
    test_frames.append(file_data)

# pd.concat raises on an empty list, so fall back to an empty frame as before.
df_test_pubs = pd.concat(test_frames) if test_frames else pd.DataFrame()

# Two cleaned copies of the text, used when matching known labels in the Solution cell.
df_test_pubs['clean_text'] = df_test_pubs['text'].parallel_apply(clean_text)
df_test_pubs['scrub_text'] = df_test_pubs['text'].parallel_apply(scrub_text)

In [None]:
# Build one (Id, section title, fragment) record per "."-delimited fragment.
# Positions follow itertuples(): 0 is the index, 1 the Id column, then the
# next two columns of df_test_pubs.
test_sentences = [
    (pub[1], pub[2], fragment)
    for pub in df_test_pubs.itertuples()
    for fragment in pub[3].split(".")
]

df_test_sent = pd.DataFrame(test_sentences, columns=['Id', 'section_title', 'sent'])

# Cleaned copies are what the keyword/section features are computed from.
for raw_col, clean_col in (('sent', 'sent_clean'), ('section_title', 'section_clean')):
    df_test_sent[clean_col] = df_test_sent[raw_col].parallel_apply(clean_text)

# Force plain strings on the raw columns.
for raw_col in ('sent', 'section_title'):
    df_test_sent[raw_col] = df_test_sent[raw_col].astype(str)

## Labels
Create labels and merge into a set to check if a sentence contains a known data label
* Training data titles
* Training data acronyms
* Data.gov data titles
* ICPSR study names (currently disabled: the ICPSR code in the cell below is commented out)

In [None]:
# Builds `all_labels`: the set of known dataset names/acronyms that the
# Solution cell searches for as substrings of each publication.

# --- ICPSR studies and series: disabled (kept for reference, not part of all_labels) ---
# icpsr_studies = pd.read_csv('../input/icpsr-study-names/icpsr_studies.csv')
# icpsr_studies_raw = icpsr_studies['NAME'].unique()
# icpsr_studies_clean = icpsr_studies['NAME'].parallel_apply(clean_text).unique()
# icpsr_studies_scrub = icpsr_studies['NAME'].parallel_apply(scrub_text).unique()

# icpsr_labels_1 = set(icpsr_studies_raw);
# icpsr_labels_2 = set(icpsr_studies_clean);
# icpsr_labels_3 = set(icpsr_studies_scrub);

# icpsr_studies_set = set.union(icpsr_labels_1, icpsr_labels_2, icpsr_labels_3);

# icspr_series = pd.read_csv('../input/icpsr-study-names/icpsr_series.csv').dropna()
# icspr_series_raw = icspr_series['TITLE'].unique()
# icspr_series_clean = icspr_series['TITLE'].parallel_apply(clean_text).unique()
# icspr_series_scrub = icspr_series['TITLE'].parallel_apply(scrub_text).unique()

# icpsr_series_1 = set(icspr_series_raw);
# icpsr_series_2 = set(icspr_series_clean);
# icpsr_series_3 = set(icspr_series_scrub);

# icpsr_series_set = set.union(icpsr_series_1, icpsr_series_2, icpsr_series_3);

# --- Data.gov titles: raw, cleaned and scrubbed variants of every title ---
datagov_labels = pd.read_csv('../input/bigger-govt-dataset-list/data_set_800.csv')
# NOTE(review): astype(str) turns a missing title into the literal string 'nan',
# which would then become a label; confirm the CSV has no missing titles.
datagov_labels['title'] = datagov_labels['title'].astype(str)
datagov_labels_raw = datagov_labels['title'].unique()
datagov_labels_clean = datagov_labels['title'].parallel_apply(clean_text).unique()
datagov_labels_scrub = datagov_labels['title'].parallel_apply(scrub_text).unique()

datagov_labels_1 = set(datagov_labels_raw);
datagov_labels_2 = set(datagov_labels_clean);
datagov_labels_3 = set(datagov_labels_scrub);

datagov_labels_set = set.union(datagov_labels_1, datagov_labels_2, datagov_labels_3)

# --- Training data: distinct dataset titles and labels ---
df_train['dataset_title'] = df_train['dataset_title'].astype(str)
df_train['dataset_label'] = df_train['dataset_label'].astype(str)

train_title = df_train['dataset_title'].unique()
train_title = set(train_title);

train_label = df_train['dataset_label'].unique()
train_label = set(train_label);

# --- Training data: acronyms found in titles and labels ---
# The two names below are swapped relative to their source column
# (`acronyms_label` comes from dataset_title and vice versa); the result is
# unaffected because both sets are merged into `acronym_upper`.
# extract_acronyms always returns a list, so dropna() removes nothing here.
acronyms_label = df_train['dataset_title'].parallel_apply(extract_acronyms).dropna()
ac_label = set(flatten_list(acronyms_label))

acronyms_title = df_train['dataset_label'].parallel_apply(extract_acronyms).dropna()
ac_title = set(flatten_list(acronyms_title))

acronym_upper = set.union(ac_label, ac_title);

# Union of every label source that is currently enabled.
all_labels = set.union(train_title,
                       train_label,
                       acronym_upper,
#                        icpsr_studies_set, 
#                        icpsr_series_set, 
                       datagov_labels_set);

# Cell output: number of distinct labels.
len(all_labels)

## Features

* has indicator terms (binary, count)
* in section (binary)
* has acronym (binary, count)
* has title (binary)

In [None]:
%%time

# Sentence-level features for the classifier. Column creation ORDER matters:
# the feature matrix is later taken from these columns in the order they are
# added, so the lists below keep the original ordering.

# (column suffix, substring searched in the cleaned sentence)
SENTENCE_TERMS = [
    ('Data', 'data'),
    ('Edu', 'edu'),
    ('Sample', 'sample'),
    ('National', 'national'),
    ('Survey', 'survey'),
    ('Public', 'public'),
    ('Avail', 'avail'),
    ('NSF', 'nsf'),
    ('Gov', 'gov'),
    ('Access', 'access'),
]

# (column suffix, substring searched in the cleaned section title)
SECTION_TERMS = [
    ('Intro', 'intro'),
    ('Disc', 'discus'),
    ('Abst', 'abstr'),
    ('Result', 'resul'),
    ('Concl', 'conclu'),
    ('Method', 'meth'),
    ('Back', 'back'),
    ('Data', 'data'),
    ('Summ', 'summ'),
    ('Ackno', 'acknowl'),
]

# freq*: number of occurrences of each indicator term in the sentence.
for suffix, term in SENTENCE_TERMS:
    df_test_sent['freq' + suffix] = df_test_sent['sent_clean'].str.count(term)

# has*: 1 if the indicator term occurs in the sentence, else 0.
# Fix: hasAvail previously tested 'survey' (copy-paste slip) and so duplicated
# hasSurvey; it now tests 'avail', matching freqAvail.
# NOTE(review): if the pickled model was trained on features built with the
# same slip, confirm whether it needs retraining.
for suffix, term in SENTENCE_TERMS:
    df_test_sent['has' + suffix] = np.where(df_test_sent['sent_clean'].str.contains(term), 1, 0)

# in*: 1 if the section title contains the given stem, else 0.
for suffix, term in SECTION_TERMS:
    df_test_sent['in' + suffix] = np.where(df_test_sent['section_clean'].str.contains(term), 1, 0)

# Acronym features are computed on the raw (case-preserving) sentence.
df_test_sent['hasAcronym'] = df_test_sent['sent'].parallel_apply(find_acronyms)
df_test_sent['freqAcronym'] = df_test_sent['sent'].parallel_apply(count_acronyms)

# Known-title features: 1 if any cleaned, digit-free title is a substring of
# the cleaned sentence.
# - r'\d+' with regex=True makes the digit removal explicit; the default of
#   `regex` in Series.str.replace differs between pandas versions.
# - The generator lets any() stop at the first matching title.
# - astype(int) maps False/True to 0/1 even when only one of the two values
#   occurs; category codes would encode an all-True column as 0.
icpsr = pd.read_csv('../input/icpsr-study-names/icpsr_studies.csv')
icpsr_labels = icpsr['NAME'].apply(clean_text).str.replace(r'\d+', '', regex=True)
df_test_sent['hasICPSRTitle'] = df_test_sent['sent_clean'].apply(lambda x: any(k in x for k in icpsr_labels))
df_test_sent['hasICPSRTitle'] = df_test_sent['hasICPSRTitle'].astype(int)

datagov = pd.read_csv('../input/bigger-govt-dataset-list/data_set_800.csv')
datagov_labels = datagov['title'].apply(clean_text).str.replace(r'\d+', '', regex=True)
df_test_sent['hasDATAGOVTitle'] = df_test_sent['sent_clean'].apply(lambda x: any(k in x for k in datagov_labels))
df_test_sent['hasDATAGOVTitle'] = df_test_sent['hasDATAGOVTitle'].astype(int)

df_test_sent.info()

In [None]:
# Remove the helper text columns once the features are built.
# errors='ignore' keeps this cell re-runnable: without it a second execution
# raises KeyError because the columns are already gone.
df_test_sent = df_test_sent.drop(columns=['section_title',
                                          'sent_clean',
                                          'section_clean'],
                                 errors='ignore')

df_test_sent.head()

In [None]:
# Feature matrix = every column except the identifier, the raw sentence and
# the probability columns that a later cell adds to df_test_sent.
# On a fresh top-to-bottom run this equals df_test_sent.iloc[:, 2:]; selecting
# by name keeps it correct if the cell is re-run after prob_0/prob_1 exist.
NON_FEATURE_COLUMNS = ['Id', 'sent', 'prob_0', 'prob_1']
X_new = df_test_sent.drop(columns=NON_FEATURE_COLUMNS, errors='ignore')

print(df_test_sent.shape)
print(X_new.shape)

## Classifier
Filter the tokenized test sentences, keeping only those to which the classifier assigns a high `prob_1` (the predicted probability that the sentence contains a dataset reference).

In [None]:
# Seed NumPy's global random generator before calling the model.
np.random.seed(1)

# Hard class predictions for every sentence, timed with the %time line magic.
# `predict` is not referenced again in the cells shown in this notebook.
%time predict = best_model.predict(X_new)

In [None]:
# Per-sentence class probabilities from the classifier.
prob = best_model.predict_proba(X_new)

# Store column 0 and column 1 of the probability matrix as prob_0 / prob_1.
for class_idx in (0, 1):
    df_test_sent[f'prob_{class_idx}'] = prob[:, class_idx]

In [None]:
# Keep only the sentences whose prob_1 is at least 0.95.
high_confidence = df_test_sent['prob_1'] >= 0.95
df_candidates = df_test_sent[high_confidence]

print("Percent of all sentences to pass to NER model: \n")
candidate_share = len(df_candidates) / len(df_test_sent)
print(candidate_share*100)

## Solution
* Apply custom NER model to candidate sentences only and extract predicted entities
* Add labels to predictions

In [None]:
# Preview the publication-level frame whose text columns are searched for known labels below.
df_test_pubs.head()

In [None]:
%%time

# One prediction string per submission Id, built from two sources:
#   1. known labels (`all_labels`) found as substrings of the publication text;
#   2. entities the NER model finds in the candidate sentences.
result = []

for index in submission_df.Id:
    # Filter the publication's rows once instead of once per text column.
    pub_rows = df_test_pubs[df_test_pubs['Id'] == index]

    # Local names deliberately differ from the clean_text / scrub_text
    # functions: rebinding those names to strings made the later
    # clean_text(...) call fail with "'str' object is not callable" and broke
    # re-running earlier cells.
    raw_pub_text = pub_rows['text'].str.cat(sep='\n')
    clean_pub_text = pub_rows['clean_text'].str.cat(sep='\n')
    scrub_pub_text = pub_rows['scrub_text'].str.cat(sep='\n')
    all_text = raw_pub_text + " " + clean_pub_text + " " + scrub_pub_text

    label = set()

    # 1. Known labels: punctuation is removed (not replaced by spaces) and the
    #    text lower-cased, as before. Surrounding whitespace is stripped so
    #    that variants differing only in padding collapse to one label, and
    #    blank results are skipped.
    # NOTE(review): this normalisation differs from clean_text(); confirm
    # which form the scoring expects.
    for mention in all_labels:
        if mention in all_text:
            mention_label = re.sub(r'[^\w\s]','',mention).lower().strip()
            if mention_label:
                label.add(mention_label)

    # 2. NER entities: clean each entity's own text. Passing the whole
    #    `doc.ents` tuple to clean_text would stringify the tuple and merge
    #    all entities of a sentence into a single padded label.
    publication_text = df_candidates[df_candidates['Id'] == index].sent
    for candidate in publication_text:
        doc = custom_ner_model(upper_text(candidate))
        for ent in doc.ents:
            ent_label = clean_text(ent.text).strip()
            if ent_label:
                label.add(ent_label)

    label_list = sorted(label)
    result.append('|'.join(label_list))

for hit in result:
    print(hit, "\n")

In [None]:
# `result` holds one '|'-joined prediction string per row of submission_df, in the same order.
submission_df['PredictionString'] = result
# Write the submission file to the working directory, without the index column.
submission_df.to_csv("./submission.csv",index=False)
# Cell output: display the final submission table.
submission_df