### Some dataset names in the data are written in the form:

### dataset_name (abbreviation)

    
In this notebook, I tried to find dataset names using regex.

I made the submission only for results that are not in the training set.

Although it does not have a very high score, it may help you get better results.

---

**How does it work?**

1. Find possible results using the regex: `\(([A-Z]{2,}-?[A-Z]{1,}?[a-z]?)[\);]`

        (Gets abbreviations inside parentheses)
        

2. Find tokens before abbreviations

3. Determine if they can be dataset names

4. Remove some results if they contain specified strings

5. Submit

---

**Observations**

1. Some institution names are written in the same format, and they are difficult to differentiate. Some models might be submitting institution names.

    Therefore, removing them from your submissions may improve your results.

2. When creating training data for models, removing punctuation also removes useful information.

3. Since the parentheses come after the dataset name, LSTMs that read the text left to right will miss this information when predicting the dataset name.

    Using bidirectional models or transformers might be a better idea. I will also try reversing the text.


In [None]:
import json
import pandas as pd
import numpy as np
import glob
import os
import re
from tqdm import tqdm
from fuzzywuzzy import fuzz

# Directory holding the test publications; each document id is the file name
# up to its first '.'.
test_data_dir = r'/kaggle/input/coleridgeinitiative-show-us-the-data/test'
test_example_names = [
    file_name.split('.')[0]
    for file_name in os.listdir(test_data_dir)
]

sample_sub = pd.read_csv('../input/coleridgeinitiative-show-us-the-data/sample_submission.csv')
metadata = pd.read_csv('../input/coleridgeinitiative-show-us-the-data/train.csv')

# Unique cleaned labels from the training metadata, ordered longest first and
# then stripped of surrounding whitespace.
labels = [
    label.strip()
    for label in sorted(metadata.cleaned_label.unique(), key=len, reverse=True)
]

print(f'labels: {len(labels)}')
sample_sub

In [None]:
from nltk.corpus import stopwords
# English stop words, minus a few that are allowed to follow an abbreviation.
stop_words = stopwords.words('english')
for allowed_word in ['and', 'was', 'in']:
    stop_words.remove(allowed_word)

# Abbreviations that are rejected outright (compared against the text found
# inside the parentheses).
banned_kw = [
    'STEM', 'FDA', 'SSH', 'FSIZE', 'PET', 'NCATE', 'TESOL', 'AVHRR-OI',
    'ICT', 'AAEA', 'BMI', 'ADGC', 'CDRSUM', 'NASS', 'MMSE', 'CDR',
    'SPSS', 'LCRP', 'DML', 'ITU', 'DRI', 'CIPSEA', 'IEP', 'NCES',
    'BCG', 'HLM', 'MLLW', 'FDG', 'MRMC', 'MEOW',
]

# Substrings that disqualify a candidate name. They are lower-cased here so
# they can be compared against the lower-cased candidate text.
banned_values = [
    value.lower()
    for value in [
        'laboratory',
        'body mass index',
        'admission test',
        'neural networks',
        'accuracy of',
        'chain reaction',
        'adversarial network',
        'state exam',
        'reform act',
        'least',
        'labeling',
        'principal components analysis',
        'independent components analysis',
        'markov chain',
        'monte carlo',
        'bayesian information',
        'family wise error',
        'posterior anterior',
        'Bidirectional Encoder',
        'Morphometry',
        'Integral',
        'T2*weighted',
        'T2-weighted',
        'T2weighted',
        'T1*weighted',
        'T1-weighted',
        'T1weighted',
        'EMCI',
        'Learning Test',
        'Gradepoint average',
        'doctor of',
        'masters of',
        'Expected Family Contribution',
        'life in',
        'Long Short Term',
        'Long ShortTerm',
        'LSTM',
        'lipoprotein',
        'Support Vector Machine',
        'User Interface',
        'National Institute of',
        'glucose',
        'Research Division',
        '%',
        'Heating Weeks',
        'Public Management',
        'Theory',
        'Middle East respiratory',
        'Discriminant Analysis',
        'boltzmann',
        'Disease Control and Prevention',
        'polymorphism',
        'positron emission tomography',
        'dorsolateral',
        'Data Analysis System',
        'Analysis Kit',
        'Google',
        'Principal Analysis',
        'Cognitive Impairment',
        'Analysis of Variance',
    ]
]

# Tokens that must not directly follow an abbreviation (same list object as
# stop_words).
banned_after_tokens = stop_words

In [None]:
import re

# Pre-compiled pattern matching any run of whitespace characters.
_RE_COMBINE_WHITESPACE = re.compile(r"\s+")


def make_single_whitespace(text):
    """Collapse every whitespace run in ``text`` to one space and trim the ends."""
    collapsed = _RE_COMBINE_WHITESPACE.sub(" ", text)
    return collapsed.strip()

def remove_punc(txt):
    """Replace each run of non-alphanumeric characters in ``str(txt)`` with one space.

    Case is preserved; leading/trailing separators become a single space each.
    """
    as_text = str(txt)
    return re.sub('[^A-Za-z0-9]+', ' ', as_text)

def load_test_example_by_name(name):
    """Load the parsed JSON content of one test document.

    Args:
        name: Document id; the file read is ``<test_data_dir>/<name>.json``.

    Returns:
        Whatever ``json.load`` produces for that file.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    doc_path = os.path.join(test_data_dir, name + '.json')
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's locale default encoding.
    with open(doc_path, encoding='utf-8') as f:
        return json.load(f)

def get_doc_id(doc_path):
    """Return the document id for ``doc_path``.

    The id is the final path component up to (not including) its first '.'.

    Args:
        doc_path: Path to a document file, e.g. ``'/some/dir/abc123.json'``.

    Returns:
        The id string, e.g. ``'abc123'``.
    """
    # Use the argument itself; the previous version ignored it and read an
    # undefined module-level list instead.
    file_name = os.path.split(doc_path)[-1]
    return file_name.split('.')[0]

def clean_text(txt):
    """Lower-case ``str(txt)`` and replace each non-alphanumeric run with one space."""
    lowered = str(txt).lower()
    return re.sub('[^A-Za-z0-9]+', ' ', lowered)

def jaccard(str1, str2):
    """Jaccard similarity between the lower-cased word sets of two strings.

    Args:
        str1: First string; split on whitespace after lower-casing.
        str2: Second string; split on whitespace after lower-casing.

    Returns:
        ``|A ∩ B| / |A ∪ B|`` as a float in ``[0.0, 1.0]``. When neither string
        contains a word the union is empty and ``0.0`` is returned instead of
        dividing by zero.
    """
    a = set(str1.lower().split())
    b = set(str2.lower().split())
    union_size = len(a | b)
    if union_size == 0:
        # Two empty word sets share nothing; avoid ZeroDivisionError.
        return 0.0
    return float(len(a & b)) / union_size

In [None]:
def _tokenize_context(text):
    """Split a context window into non-empty, space-separated tokens.

    Four-digit numbers starting with 19 or 20 are blanked out, '(' ')' ';' and
    newlines become spaces, and hyphens are deleted (so 'AVHRR-OI' becomes the
    single token 'AVHRROI').
    """
    # Remove dates
    text = re.sub(r'(19|20)[0-9][0-9]', ' ', text)
    # Remove parentheses and the other separators
    text = text.replace('(', ' ').replace(')', ' ').replace(';', ' ').replace('-', '').replace('\n', ' ')
    return [t for t in text.split(' ') if len(t) > 0]


def get_words_from_abbr_in_parantheses(match, doc_text):
    """Find the words written just before a parenthesised abbreviation.

    The first occurrence of ``(match)``, else ``(match;``, else ``(match`` is
    located in ``doc_text``. A window of up to 200 characters before it and
    about 40 after it is tokenized, and one token per character of the
    abbreviation is taken as the candidate name, optionally extended to the
    left by up to two tokens when a capitalised token is found there.

    Args:
        match: Abbreviation text as it appears inside the parentheses.
        doc_text: Full document text.

    Returns:
        A 3-tuple ``(before_tokens, word_tokens, after_token)``:
        up to three tokens preceding the candidate name, the candidate name
        tokens, and the token following the abbreviation ('' if none).
        ``([], [], "")`` is returned when there are not enough tokens before
        the abbreviation or the candidate becomes empty while trimming.

    Raises:
        ValueError: If the abbreviation cannot be located in ``doc_text`` or
            in the tokenized window.
    """
    # Prefer a properly closed '(ABBR)' / '(ABBR;' occurrence; fall back to a
    # bare '(ABBR' prefix, letting str.index raise ValueError when absent.
    for closer in (')', ';'):
        match_i = doc_text.find(f'({match}{closer}')
        if match_i != -1:
            break
    else:
        match_i = doc_text.index(f'({match}')

    # Abbreviation with every non-alphanumeric character removed; its length
    # is the number of words expected in the spelled-out name.
    match_nopunc = re.sub('[^A-Za-z0-9]+', '', str(match))
    n_tokens = len(match_nopunc)

    slice_start_i = max(match_i - 200, 0)
    slice_end_i = min(match_i + len(match) + 40, len(doc_text))
    tokens = _tokenize_context(doc_text[slice_start_i:slice_end_i])

    # The opening parenthesis always ends a token, so the number of tokens in
    # the text before it is the index of the parenthesised abbreviation. This
    # avoids latching onto an earlier bare mention of the same abbreviation.
    match_token_i = len(_tokenize_context(doc_text[slice_start_i:match_i]))
    if match_token_i >= len(tokens) or tokens[match_token_i] != match_nopunc:
        # Unusual input (e.g. trailing characters glued to the abbreviation):
        # fall back to searching the window, raising ValueError if absent.
        match_token_i = tokens.index(match_nopunc)

    if match_token_i - n_tokens <= 0:
        return [], [], ""

    start_i = match_token_i - n_tokens
    end_i = match_token_i

    try:
        # If a previous token started with uppercase, use it
        if start_i > 1:
            if tokens[start_i - 2][0].isupper():
                start_i -= 2
            elif tokens[start_i - 1][0].isupper():
                start_i -= 1

        word_tokens = tokens[start_i:end_i]

        # Drop number token if it is coming first.
        # NOTE(review): start_i is not advanced here, unlike the lowercase
        # trimming below, so before_tokens excludes the dropped number.
        if word_tokens[0].isdigit():
            word_tokens = word_tokens[1:]

        # Remove up to 2 lowercase tokens from the start
        for _ in range(2):
            if word_tokens[0][0].islower():
                word_tokens = word_tokens[1:]
                start_i += 1

    except IndexError:
        # The candidate ran out of tokens while being trimmed.
        print(f'IndexError for {match}')
        return [], [], ""

    after_token = ""
    if len(tokens) > match_token_i + 1:
        after_token = tokens[match_token_i + 1]

    before_tokens = tokens[max(start_i - 3, 0): start_i]

    return before_tokens, word_tokens, after_token


def tokens_are_dataset_name(tokens):
    """Heuristically decide whether ``tokens`` look like a dataset name.

    Only tokens longer than three characters are considered. The name is
    accepted when at least one of them starts with an uppercase letter and
    fewer than four start with a lowercase letter. An empty list is rejected.
    """
    if len(tokens) < 1:
        return False

    n_lower = 0
    n_upper = 0
    for tok in tokens:
        if len(tok) <= 3:
            continue
        first_char = tok[0]
        if first_char.islower():
            n_lower += 1
        if first_char.isupper():
            n_upper += 1

    return n_upper > 0 and n_lower < 4

def after_token_ok(after_token):
    """Decide whether the token following an abbreviation is acceptable.

    Returns True for an empty token. Returns False when the lower-cased token
    is in ``banned_after_tokens``, when it ends in 's' (other than a few exact
    exceptions), when it looks like a link or a bracketed reference, or when
    it contains 'cell'. Otherwise returns True.
    """
    # Nothing follows the abbreviation, so there is nothing to object to.
    if after_token == "":
        return True

    if after_token.lower() in banned_after_tokens:
        return False

    # Probably plural
    ends_with_s = after_token[-1].lower() == 's'
    if ends_with_s and after_token not in ['was', 'has', 'is', 'this']:
        return False

    # Probably a link
    if 'http' in after_token or '/' in after_token:
        return False

    # A reference. Datasets don't get referenced like that
    if '[' in after_token and ']' in after_token:
        return False

    return 'cell' not in after_token

def before_tokens_ok(before_tokens):
    """Decide whether the tokens preceding a candidate name are acceptable.

    An empty list is accepted; otherwise the tokens are rejected when they
    contain 'by' or 'adjusted'.
    """
    if len(before_tokens) == 0:
        return True

    for blocked_word in ('by', 'adjusted'):
        if blocked_word in before_tokens:
            return False
    return True


#get_words_from_abbr_in_parantheses('BDNF', doc_text)

In [None]:
def get_doc_results(doc_id):
    """Extract candidate dataset names from one test document.

    The 'text' fields of the document's entries are joined with spaces,
    parenthesised abbreviations are collected with a regex, and for each one
    the preceding words are checked with the token heuristics. Candidates are
    then dropped when the abbreviation is in ``banned_kw``, when the name
    contains any ``banned_values`` substring, or when its last word is
    'institute'.

    Returns:
        Dict mapping abbreviation -> space-joined candidate name.
    """
    doc_json = load_test_example_by_name(doc_id)
    doc_text = ' '.join(section['text'] for section in doc_json)

    abbreviations = set(re.findall(r'\(([A-Z]{2,}-?[A-Z]{1,}?[a-z]?)[\);]', doc_text))

    candidates = {}
    for abbr in abbreviations:
        try:
            before_tokens, found_tokens, after_token = get_words_from_abbr_in_parantheses(abbr, doc_text)
        except Exception:
            # Report which abbreviation failed, then let the error propagate.
            print(f'Exception for {abbr}')
            raise

        checks = [
            tokens_are_dataset_name(found_tokens),
            after_token_ok(after_token),
            before_tokens_ok(before_tokens),
        ]

        # Debug aid (disabled): report which check rejected the candidate.
        # if not checks[0]:
        #     print(f'{abbr} : Tokens {found_tokens} do not make a dataset name.')
        # if not checks[1]:
        #     print(f'{abbr} : Aftertoken {after_token} was in banlist.')
        # if not checks[2]:
        #     print(f'{abbr} : Beforetokens {before_tokens} were in banlist.')

        if all(checks):
            candidates[abbr] = ' '.join(found_tokens)

    def _is_allowed(abbr, name):
        # Drop by keyword
        if abbr in banned_kw:
            return False
        # Drop by text
        lowered_name = name.lower()
        if any([b for b in banned_values if b in lowered_name]):
            return False
        # Drop if last word is 'institute'
        return name.split(' ')[-1].lower() != 'institute'

    return {abbr: name for abbr, name in candidates.items() if _is_allowed(abbr, name)}

In [None]:
# Build one '|'-joined prediction string per submission row. A document whose
# extraction raises gets an empty prediction and the error is printed.
test_preds = []
ids = []
for test_id in sample_sub['Id']:
    try:
        doc_results = get_doc_results(test_id)
        pred_string = '|'.join(clean_text(name) for name in doc_results.values())
    except Exception as e:
        pred_string = ""
        print(e)

    test_preds.append(pred_string)
    ids.append(test_id)

sub_df = pd.DataFrame(columns = ['Id', 'PredictionString'])
sub_df['Id'] = ids
sub_df['PredictionString'] = test_preds
sub_df.to_csv('submission.csv', index = False)

In [None]:
# Widen the column display limit before previewing the submission frame.
pd.set_option('display.max_colwidth', 1000)
sub_df.head()

In [None]:
# Disabled cell: the code below is wrapped in a bare string literal, so none of
# it runs. When enabled it would build `existing_labels`, the lower-cased union
# of the unique dataset_label, dataset_title and cleaned_label values in
# `metadata`.
"""temp_1 = [x.lower() for x in metadata['dataset_label'].unique()]
temp_2 = [x.lower() for x in metadata['dataset_title'].unique()]
temp_3 = [x.lower() for x in metadata['cleaned_label'].unique()]
existing_labels = set(temp_1 + temp_2 + temp_3)"""

In [None]:
# Disabled cell: the code below is wrapped in a bare string literal, so none of
# it runs. When enabled it would drop every prediction whose jaccard()
# similarity to any entry of `existing_labels` is >= 0.5 and rewrite
# submission.csv with the remaining predictions.
# NOTE(review): the notebook introduction says the submission contains only
# results that are not in the training set, but with this cell disabled the
# submission.csv written earlier is unfiltered - confirm which is intended.
"""test_preds_only_unseen = []
for preds in test_preds:
    tokens = preds.split('|')
    kept_tokens = []
    
    for token in tokens:
        has_similar = False
        for l in existing_labels:
            if jaccard(token, l) >= 0.5:
                has_similar = True
                break
                
        if not has_similar:
            kept_tokens.append(token)
            
    test_preds_only_unseen.append('|'.join(kept_tokens))

sub_df = pd.DataFrame(columns = ['Id', 'PredictionString'])
sub_df['Id'] = ids
sub_df['PredictionString'] = test_preds_only_unseen
sub_df.to_csv('submission.csv', index = False)

sub_df"""