# Refined matching V2
A second attempt solution for smarter string matching with more labels ([Score](https://www.kaggle.com/c/coleridgeinitiative-show-us-the-data/leaderboard): 0.48)

In [1]:
import json
import re
from itertools import chain

import pandas as pd
import numpy as np

import nltk
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords

## Data

Defined by the competition: `clean_text` and evaluate the token-based `jaccard` similarity evaluation function given by the competition:
- it compares the intersection of a set of strings against a "groundtruth" set of strings
- solutions must be sorted alphabetically

In [9]:
DATA_DIR = '/nfs/turbo/hrg/coleridge/'

In [2]:
def clean_text(txt):
    """
    defined by the competition
    """
    return re.sub('[^A-Za-z0-9]+', ' ', txt.lower())

def remove_stops(txt):
    return [ re.sub(' +', ' ', re.sub('[^A-Za-z0-9\[\]]+', ' ', str(r).lower()).strip()) for r in txt if not r.lower() in stop_words  ]

def jaccard(str1, str2): 
    """
    defined by the competition
    """
    a = set(str1.lower().split()) 
    b = set(str2.lower().split())
    c = a.intersection(b)
    return float(len(c)) / (len(a) + len(b) - len(c))

def extract_acronyms(txt):
    """
    finds and returns a sequence of capital letters
    for use on dataset_titles, dataset_labels, or full text
    """
    matches = re.findall(r"\b[A-Z\.]{2,}s?\b", txt)
    if matches:
        return matches
    else:
        return None
    
def flatten_list(object):
    """
    unnests labels
    """
    gather = []
    for item in object:
        if isinstance(item, (list, tuple, set)):
            gather.extend(flatten_list(item))            
        else:
            gather.append(item)
    return gather

def filter_set(main_set, condition):
    """
    used to remove items from label set based on a condition
    """
    for elem in list(main_set):
        if condition(elem):
            main_set.discard(elem)

# def search_window(section, phrase, window_size):
#     """
#     defines a section to search, a search phrase, and a text window size to return
#     for use on full text as a preprocessing step
#     """
#     section = section.split()
#     phrase = phrase.split()
#     words = len(phrase)

#     for i, word in enumerate(section):
#         if word == phrase[0] and section[i:i+words] == phrase:
#             start = max(0, i-window_size)
#             return ' '.join(section[start:i+words+window_size])

In [3]:
stop_words = stopwords.words('english')
stop_words.extend(['take', 'january', 'february', 'march', 'april', 'may', 'june', 'july', 'august', 'september', 'october', 'november', 'december'])

In [4]:
df_train = pd.read_csv('../data/train.csv')

In [5]:
submission_df = pd.read_csv('../data/sample_submission.csv')

In [6]:
train_files = '../data/train/'

papers = {}

for paper_id in df_train['Id']:
    with open(f'{train_files}/{paper_id}.json', 'r') as f:
        paper = json.load(f)
        papers[paper_id] = paper

In [7]:
test_files = '../data/test/'

for paper_id in submission_df['Id'].unique():
    with open(f'{test_files}/{paper_id}.json', 'r') as f:
        paper = json.load(f)
        papers[paper_id] = paper

## Matching

Load ICPSR study titles and series titles

In [13]:
all_icpsr_studies = pd.read_csv(DATA_DIR + 'labels/icpsr_studies.csv')
all_icpsr_studies['clean'] = all_icpsr_studies['NAME'].apply(clean_text).str.replace('\d+', '')
all_icpsr_studies = all_icpsr_studies[all_icpsr_studies['clean'].str.contains(' ')]
# all_icpsr_studies.head()

all_icspr_series = pd.read_csv(DATA_DIR + 'labels/icpsr_series.csv')
all_icspr_series['TITLE'] = all_icspr_series['TITLE'].astype(str)
all_icspr_series['clean'] = all_icspr_series['TITLE'].apply(clean_text).str.replace('\d+', '')
all_icspr_series = all_icspr_series[all_icspr_series['clean'].str.contains(' ')]
all_icspr_series.head()

Unnamed: 0,ID,TITLE,clean
0,1,ABC News/Washington Post Poll Series,abc news washington post poll series
1,2,American Housing Survey Series,american housing survey series
2,3,American National Election Study (ANES) Series,american national election study anes series
3,4,American Public Opinion and United States Fore...,american public opinion and united states fore...
4,5,Annual Survey of Governments Series,annual survey of governments series


Load data titles (n=299,743) harvested from Data.gov CKAN (on 6/4/21)


In [14]:
all_datagov_titles = pd.read_csv(DATA_DIR + 'labels/ckan_data_gov_names.csv')
all_datagov_titles['clean'] = all_datagov_titles['index'].apply(clean_text).str.replace('\d+', '')
all_datagov_titles = all_datagov_titles[all_datagov_titles['clean'].str.contains(' ')]
all_datagov_titles.head()

Unnamed: 0.1,Unnamed: 0,index,name,clean
0,0,0-2-second-spectral-response-acceleration-5-of...,1,second spectral response acceleration of cr...
1,1,0-25-degree-gfs-for-aoos-region,1,degree gfs for aoos region
2,2,0-25-degree-gfs-for-aoos-region1,1,degree gfs for aoos region
3,3,0-25-degree-gfs-for-aoos-region2,1,degree gfs for aoos region
4,4,0-25-degree-gfs-for-aoos-region3,1,degree gfs for aoos region


Other data posted to Kaggle (unknown origin, probably from Data.gov pull)

In [15]:
datagov_labels = pd.read_csv(DATA_DIR + 'labels/kaggle_data_800.csv')
datagov_labels['clean'] = datagov_labels['title'].apply(clean_text).str.replace('\d+', '')
datagov_labels = datagov_labels[datagov_labels['clean'].str.contains(' ')]
datagov_labels.head()

Unnamed: 0,title,clean
3,noaa c cap,noaa c cap
4,noaa c-cap,noaa c cap
5,slosh model,slosh model
6,noaa tide gauge,noaa tide gauge
7,noaa tide station,noaa tide station


Extract acronyms from data labels

In [18]:
df_train['extract_acronyms_title'] = df_train['dataset_title'].apply(extract_acronyms)
df_train['extract_acronyms_label'] = df_train['dataset_label'].apply(extract_acronyms)
df_train

Unnamed: 0,Id,pub_title,dataset_title,dataset_label,cleaned_label,extract_acronyms_title,extract_acronyms_label
0,d0fa7568-7d8e-4db9-870f-f9c6f668c17b,The Impact of Dual Enrollment on College Degre...,National Education Longitudinal Study,National Education Longitudinal Study,national education longitudinal study,,
1,2f26f645-3dec-485d-b68d-f013c9e05e60,Educational Attainment of High School Dropouts...,National Education Longitudinal Study,National Education Longitudinal Study,national education longitudinal study,,
2,c5d5cd2c-59de-4f29-bbb1-6a88c7b52f29,Differences in Outcomes for Female and Male St...,National Education Longitudinal Study,National Education Longitudinal Study,national education longitudinal study,,
3,5c9a3bc9-41ba-4574-ad71-e25c1442c8af,Stepping Stone and Option Value in a Model of ...,National Education Longitudinal Study,National Education Longitudinal Study,national education longitudinal study,,
4,c754dec7-c5a3-4337-9892-c02158475064,"Parental Effort, School Resources, and Student...",National Education Longitudinal Study,National Education Longitudinal Study,national education longitudinal study,,
...,...,...,...,...,...,...,...
19656,b3498176-8832-4033-aea6-b5ea85ea04c4,RSNA International Trends: A Global Perspectiv...,RSNA International COVID-19 Open Radiology Dat...,RSNA International COVID Open Radiology Database,rsna international covid open radiology database,"[RSNA, COVID, RICORD]","[RSNA, COVID]"
19657,f77eb51f-c3ac-420b-9586-cb187849c321,MCCS: a novel recognition pattern-based method...,CAS COVID-19 antiviral candidate compounds dat...,CAS COVID-19 antiviral candidate compounds dat...,cas covid 19 antiviral candidate compounds dat...,"[CAS, COVID]","[CAS, COVID]"
19658,ab59bcdd-7b7c-4107-93f5-0ccaf749236c,Quantitative Structure–Activity Relationship M...,CAS COVID-19 antiviral candidate compounds dat...,CAS COVID-19 antiviral candidate compounds dat...,cas covid 19 antiviral candidate compounds dat...,"[CAS, COVID]","[CAS, COVID]"
19659,fd23e7e0-a5d2-4f98-992d-9209c85153bb,A ligand-based computational drug repurposing ...,CAS COVID-19 antiviral candidate compounds dat...,CAS COVID-19 antiviral candidate compounds dat...,cas covid 19 antiviral candidate compounds dat...,"[CAS, COVID]","[CAS, COVID]"


## Extra labels
- Filter extra labels by length to reduce noise

In [16]:
extra_labels = set()
    
for label_1, label_2 in datagov_labels[['title', 'clean']].itertuples(index=False):
    extra_labels.add("".join(str(label_1)))
    extra_labels.add("".join(str(label_2)))

for label_3, label_4 in all_datagov_titles[['index', 'clean']].itertuples(index=False):
    extra_labels.add("".join(str(label_3)))
    extra_labels.add("".join(str(label_4)))

for label_5, label_6 in all_icpsr_studies[['NAME', 'clean']].itertuples(index=False):
    extra_labels.add("".join(str(label_5)))
    extra_labels.add("".join(str(label_6)))

for label_7, label_8 in all_icspr_series[['TITLE', 'clean']].itertuples(index=False):
    extra_labels.add("".join(str(label_7)))
    extra_labels.add("".join(str(label_8)))

filter_set(extra_labels, lambda x : len(x) < 4)
len(extra_labels)

490456

## Training data labels

In [19]:
train_labels = set()
for label_5, label_6, label_7, label_8, label_9 in df_train[['dataset_title', 
                                                               'dataset_label', 
                                                               'cleaned_label',
                                                               'extract_acronyms_title', 
                                                               'extract_acronyms_label']].itertuples(index=False):
    train_labels.add("".join(str(label_5)))
    train_labels.add("".join(str(label_5)).lower())
    train_labels.add("".join(str(label_6)))
    train_labels.add("".join(str(label_6)).lower())
    train_labels.add("".join(str(label_7)))
    train_labels.add("".join(str(label_8)))
    train_labels.add("".join(str(label_8)).lower())
    train_labels.add("".join(str(label_9)))
    train_labels.add("".join(str(label_9)).lower())
    
len(train_labels)

379

## Merge labels

In [20]:
all_labels = set.union(extra_labels, train_labels) 
len(all_labels)

490662

Clean labels

In [21]:
flat_labels = flatten_list(all_labels)
flat_labels = [w.replace("['", "") for w in flat_labels]
flat_labels = [w.replace("']", "") for w in flat_labels]
flat_labels = [w.replace('"', "'") for w in flat_labels]

flat_labels.remove('None')
flat_labels.remove('none')

len(flat_labels)

490660

## Predictions

In [22]:
literal_preds = []

for paper_id in submission_df['Id']:
    paper = papers[paper_id]
    text_1 = '. '.join(section['text'] for section in paper) #raw
    text_2 = text_1.lower() #lowercase
    text_3 = clean_text(text_1) #cleaned
#     label = []
    label = set() #is a set first to prevent capture of duplicates, then converted to a list to sort
    for mention in flat_labels:
        if mention in text_1 or mention in text_2 or mention in text_3:
            label.add(clean_text(mention.strip())) #add if a set, append if a list
    label_list = sorted(remove_stops(list(label)))
    literal_preds.append('|'.join(label_list))

for prediction in literal_preds:
    print(prediction)

adni|alzheimer s disease neuroimaging initiative adni|alzheimer s disease neuroimaging initiative adni|california|cap|data a|database|figure|gene b|independence|nces|ng t
addresses|administrative data|cap|colleges and universities|commerce|common core of data|current population|current population survey|data a|data e|data quality|database|earth|figure|integrated postsecondary education data system|international data base|international data base idb|international data base idb|line p|n leads|nces|nces common core of data|ng t|nsf|percent change|private schools|program for international student assessment|progress in international reading literacy study|salary schedule|school survey|schools and staffing survey|schools d|total population|trends in international mathematics and science study
aerial imagery|avon|base flood elevation|beach nourishment|beach nourishment projects|building data|building footprints|cap|coastal area|code of federal regulations|data a|data quality|data set|databas

In [23]:
submission_df['PredictionString'] = literal_preds
submission_df

Unnamed: 0,Id,PredictionString
0,2100032a-7c33-4bff-97ef-690822c43466,adni|alzheimer s disease neuroimaging initiati...
1,2f392438-e215-4169-bebf-21ac4ff253e1,addresses|administrative data|cap|colleges and...
2,3f316b38-1a24-45a9-8d8c-4e05a42257c6,aerial imagery|avon|base flood elevation|beach...
3,8e6996b4-ca08-4c0b-bed2-aaf07a4c6a60,bureau of labor statistics|cap|data a|data e|d...


In [None]:
# submission_df.to_csv("../results/submission.csv",index=False)