# String matching V2
A notebook for matching strings of dataset titles to text in publications using more labels from external datasets.

Improvements
- clean df text
- clean and unique labels


In [None]:
!pip install ../input/pandarallel151whl/pandarallel-1.5.1-py3-none-any.whl

In [None]:
import glob
import re
import pandas as pd
from pandarallel import pandarallel

Initialize parallel processing

In [None]:
# Initialise pandarallel (progress bar on) before the cells below call `.parallel_apply`.
pandarallel.initialize(progress_bar=True)

`clean_text` function
- lowercasing all text
- removing all punctuation

`scrub_text` function 
- applying the same cleaning as `clean_text`
- removing all digits

In [None]:
def clean_text(txt):
    """
    Normalise text the way the competition defines it.

    The text is lowercased, every run of characters other than ASCII letters
    and digits is replaced by a single space, and leading/trailing whitespace
    is stripped.
    """
    lowered = txt.lower()
    spaced = re.sub('[^A-Za-z0-9]+', ' ', lowered)
    return spaced.strip()

def scrub_text(txt):
    """
    Clean the text like `clean_text` does, then drop every digit.

    Only the digit characters themselves are removed; the spaces that
    surrounded them are left in place, so the result may contain doubled or
    trailing spaces.
    """
    cleaned = re.sub('[^A-Za-z0-9]+', ' ', txt.lower()).strip()
    # `cleaned` is already lowercase ASCII, so only the digit filter remains.
    return ''.join(ch for ch in cleaned if not ch.isdigit())

def jaccard(str1, str2):
    """
    Jaccard similarity between the word sets of two strings
    (metric defined by the competition).

    Both strings are lowercased and split on whitespace; the score is
    |intersection| / |union| of the resulting token sets.

    Returns a float in [0.0, 1.0]. When neither string contains any token
    (both empty or whitespace-only) the union is empty and 0.0 is returned
    instead of raising ZeroDivisionError.
    """
    a = set(str1.lower().split())
    b = set(str2.lower().split())
    union_size = len(a | b)
    if union_size == 0:
        # No tokens on either side: the ratio is undefined, report no overlap.
        return 0.0
    return float(len(a & b)) / union_size

def extract_acronyms(txt):
    """
    Return every acronym-like token found in `txt`, in order of appearance.

    A token is a word-bounded run of two or more capital letters and/or dots,
    optionally followed by a lowercase "s". Intended for dataset titles,
    dataset labels, or full text. An empty list is returned when nothing
    matches.
    """
    return re.findall(r"\b[A-Z\.]{2,}s?\b", txt)
    
def flatten_list(object):
    """
    Recursively flatten an iterable into a single flat list.

    Items that are lists, tuples or sets are expanded in place (to any depth);
    every other item, including strings, is kept as is.
    """
    flat = []
    for element in object:
        is_nested = isinstance(element, (list, tuple, set))
        flat += flatten_list(element) if is_nested else [element]
    return flat

def filter_set(main_set, condition):
    """
    Remove, in place, every element of `main_set` for which `condition`
    returns a truthy value. Used to prune the label set. Returns None.
    """
    # Iterate over a snapshot so the set can shrink while we walk it.
    snapshot = tuple(main_set)
    for rejected in filter(condition, snapshot):
        main_set.discard(rejected)

Load data

In [None]:
submission_df = pd.read_csv('../input/coleridgeinitiative-show-us-the-data/sample_submission.csv')

df_train = pd.read_csv('../input/coleridgeinitiative-show-us-the-data/train.csv')

test_files = glob.glob('../input/coleridgeinitiative-show-us-the-data/test/*.json')

# Read each publication into its own frame and concatenate once at the end;
# concatenating inside the loop re-copies all previously loaded rows on every
# iteration.
test_pub_frames = []
for test_file in test_files:
    file_data = pd.read_json(test_file)
    # The publication Id is the file name without its extension.
    file_data.insert(0,'Id', test_file.split('/')[-1].split('.')[0])
    test_pub_frames.append(file_data)
df_test_pubs = pd.concat(test_pub_frames)

# Keep the raw text and add the two normalised variants next to it.
df_test_pubs['clean_text'] = df_test_pubs['text'].parallel_apply(clean_text)
df_test_pubs['scrub_text'] = df_test_pubs['text'].parallel_apply(scrub_text)
df_test_pubs.head()

## Labels

SCRUBBED
* Load study titles (n=7,442) and series (n=293) harvested from ICPSR
* Load data titles (n=299,743) harvested from Data.gov CKAN (on 6/4/21)
    * remove digits
    * drop scrubbed titles shorter than 15 characters
* Extract acronyms from training data titles and labels

In [None]:
def _title_variants(titles):
    """
    Return the unique values of a title column in three forms, in this order:
    raw, passed through `clean_text`, passed through `scrub_text`.
    """
    raw = titles.unique()
    cleaned = titles.parallel_apply(clean_text).unique()
    scrubbed = titles.parallel_apply(scrub_text).unique()
    return raw, cleaned, scrubbed


# ICPSR study names: raw + clean + scrubbed variants.
icpsr_studies = pd.read_csv('../input/icpsr-study-names/icpsr_studies.csv')
icpsr_studies_raw, icpsr_studies_clean, icpsr_studies_scrub = _title_variants(icpsr_studies['NAME'])

icpsr_labels_1 = set(icpsr_studies_raw)
icpsr_labels_2 = set(icpsr_studies_clean)
icpsr_labels_3 = set(icpsr_studies_scrub)

icpsr_studies_set = icpsr_labels_1 | icpsr_labels_2 | icpsr_labels_3

# ICPSR series titles (rows with missing values dropped): raw + clean + scrubbed variants.
icspr_series = pd.read_csv('../input/icpsr-study-names/icpsr_series.csv').dropna()
icspr_series_raw, icspr_series_clean, icspr_series_scrub = _title_variants(icspr_series['TITLE'])

icpsr_series_1 = set(icspr_series_raw)
icpsr_series_2 = set(icspr_series_clean)
icpsr_series_3 = set(icspr_series_scrub)

icpsr_series_set = icpsr_series_1 | icpsr_series_2 | icpsr_series_3

# Data.gov CKAN titles: scrubbed variant only, then drop entries shorter than
# 15 characters.
datagov_titles = pd.read_csv('../input/ckan-datagov-titles/ckan_data_gov_names.csv').dropna()
datagov_titles_unique = datagov_titles['index'].parallel_apply(scrub_text).unique()
datagov_titles_set = set(datagov_titles_unique)
filter_set(datagov_titles_set, lambda title: len(title) < 15)

# Additional government dataset list: raw + clean + scrubbed variants.
datagov_labels = pd.read_csv('../input/bigger-govt-dataset-list/data_set_800.csv')
datagov_labels_raw, datagov_labels_clean, datagov_labels_scrub = _title_variants(datagov_labels['title'])

datagov_labels_1 = set(datagov_labels_raw)
datagov_labels_2 = set(datagov_labels_clean)
datagov_labels_3 = set(datagov_labels_scrub)

datagov_labels_set = datagov_labels_1 | datagov_labels_2 | datagov_labels_3

Training labels, titles, acronyms

In [None]:
# Unique dataset titles and labels exactly as they appear in the training data.
train_title = set(df_train['dataset_title'].unique())
train_label = set(df_train['dataset_label'].unique())

# Acronyms found in the titles and in the labels. Each variable is named after
# the column it is derived from. `extract_acronyms` always returns a list, so
# there are no missing values to drop before flattening.
acronyms_title = df_train['dataset_title'].parallel_apply(extract_acronyms)
ac_title = set(flatten_list(acronyms_title))

acronyms_label = df_train['dataset_label'].parallel_apply(extract_acronyms)
ac_label = set(flatten_list(acronyms_label))

# Every acronym seen in either column.
acronym_upper = ac_title | ac_label

Merge all labels into a set and tidy

In [None]:
# Every label source that feeds the matcher, merged into one de-duplicated set.
label_sources = (
    train_title,
    train_label,
    acronym_upper,
    icpsr_studies_set,
    icpsr_series_set,
    datagov_titles_set,
    datagov_labels_set,
)
all_labels = set().union(*label_sources)

# all_labels.remove('none')

len(all_labels)

## Predictions

In [None]:
# Normalise every label once up front (strip punctuation, lowercase, trim);
# the normalised form does not depend on the publication being scanned.
normalised_labels = {
    mention: re.sub(r'[^\w\s]','',mention).lower().strip()
    for mention in all_labels
}

preds = []
for pub_id in submission_df['Id']:
    # Select this publication's rows once and reuse them for all three variants.
    pub_rows = df_test_pubs[df_test_pubs['Id'] == pub_id]
    # Local names deliberately differ from the `clean_text` / `scrub_text`
    # helper functions: rebinding those names here would replace the functions
    # with strings and break any later re-run of the cells that call them.
    raw_pub_text = pub_rows['text'].str.cat(sep='\n')
    clean_pub_text = pub_rows['clean_text'].str.cat(sep='\n')
    scrub_pub_text = pub_rows['scrub_text'].str.cat(sep='\n')
    all_text = raw_pub_text + " " + clean_pub_text + " " + scrub_pub_text
    # A label is predicted when it occurs verbatim in any text variant. Labels
    # whose normalised form is empty are skipped so they cannot add an empty
    # entry to the '|'-joined prediction string.
    found = {
        normalised
        for mention, normalised in normalised_labels.items()
        if normalised and mention in all_text
    }
    preds.append('|'.join(sorted(found)))

for prediction in preds:
    print(prediction)

In [None]:
# Attach one prediction string per submission row, then show the result.
submission_df = submission_df.assign(PredictionString=preds)
submission_df

In [None]:
# Write the submission file without the DataFrame index column.
submission_df.to_csv(path_or_buf='submission.csv', index=False)