# Coleridge Initiative - Show US the Data

I tried two approaches (neither relies on string matching against the known dataset labels):

* Regular expression - 0.5/0.56
* BERT NER - 0.37/0.52

Ensembling, searching among known datasets, and other BERT-based approaches did not improve the results.

## Regular expressions
* Looking for uppercase letters
* Looking for nearby words with these letters
* Just looking for words beginning with uppercase letters
* I remember every label found and search for it in the other documents
## BERT NER
* I divide the document into sentences of 200-400 characters
* Select potential candidates
* I take 90% of sentences with tags and 10% without tags
* Use 3 classes - no class, first word and last word of dataset name
* Predict and select good candidates

# Modules

In [None]:
import warnings
warnings.filterwarnings('ignore', category=DeprecationWarning)
warnings.filterwarnings('ignore', category=FutureWarning)
import sys
sys.path.insert(0, "../input/transformers/transformers-master/")

from collections import Counter
import os, re, json, torch

from functools import partial

import pandas as pd
import numpy as np

import Levenshtein

from transformers import BertTokenizer,BertTokenizerFast, BertForSequenceClassification, BertConfig, BertForTokenClassification,AutoModelForTokenClassification

# Read data

In [None]:
# Load the competition CSV tables and record the directories that hold the
# per-publication JSON files. All paths are relative to the notebook's
# working directory.
# NOTE(review): train_df is only referenced by a commented-out line in the
# visible code — confirm whether it is still needed.
train_df = pd.read_csv('../input/coleridgeinitiative-show-us-the-data/train.csv')
sample_sub = pd.read_csv('../input/coleridgeinitiative-show-us-the-data/sample_submission.csv')
train_files_path = '../input/coleridgeinitiative-show-us-the-data/train'
test_files_path = '../input/coleridgeinitiative-show-us-the-data/test'

In [None]:
def read_append_return(filename, train_files_path=train_files_path, output='text'):
    """Read one publication's JSON file and return its text as a single string.

    The JSON file is expected to hold a list of section objects, each with a
    ``section_title`` and a ``text`` entry.

    Args:
        filename: Publication id, i.e. the JSON file name without ``.json``.
        train_files_path: Directory that contains the JSON file. Despite the
            name, any directory (e.g. the test directory) may be passed.
        output: ``'text'`` returns the section bodies joined by spaces,
            ``'head'`` returns the section titles joined by spaces, and any
            other value returns titles and bodies interleaved, joined by
            ``'. '``.

    Returns:
        The requested text as one ``str``.
    """
    json_path = os.path.join(train_files_path, filename + '.json')
    with open(json_path, 'r', encoding='utf-8') as f:
        sections = json.load(f)

    # A section may lack a key or carry a null value; substitute an empty
    # string so that str.join never receives None.
    headings = [section.get('section_title') or '' for section in sections]
    contents = [section.get('text') or '' for section in sections]

    if output == 'text':
        return ' '.join(contents)
    if output == 'head':
        return ' '.join(headings)

    # title, body, title, body, ... in document order
    combined = [part for pair in zip(headings, contents) for part in pair]
    return '. '.join(combined)

In [None]:
def clean_text(txt):
    """Lower-case *txt* and squeeze every non-alphanumeric run into one space.

    The argument is passed through ``str()`` first, and leading/trailing
    spaces are removed from the result.
    """
    lowered = str(txt).lower()
    squeezed = re.sub('[^A-Za-z0-9]+', ' ', lowered)
    return squeezed.strip()

In [None]:
def jaccard_similarity(str1: str, str2: str) -> float:
    """Return the Jaccard similarity of the whitespace-token sets of two strings.

    The score is ``|A & B| / |A | B|`` where ``A`` and ``B`` are the sets of
    whitespace-separated tokens. When neither string contains a token the
    union is empty; ``0.0`` is returned instead of dividing by zero.
    """
    a = set(str1.split())
    b = set(str2.split())
    union_size = len(a | b)
    if union_size == 0:
        return 0.0
    return len(a & b) / union_size

In [None]:
def text_cleaning(text):
    """Reduce *text* to ASCII letters, digits and basic punctuation.

    Every run of characters outside ``A-Za-z0-9 . : , ! ? ; &`` (this includes
    whitespace, brackets, quotes, emojis and any other non-ASCII character) is
    replaced by a single space, and the result is stripped. Letter case is
    preserved.

    Args:
        text: Value to clean; it is converted with ``str()`` first.

    Returns:
        The cleaned string.
    """
    # One substitution is sufficient: afterwards only the whitelisted
    # characters and single spaces remain, so separate emoji or whitespace
    # passes would have nothing left to change.
    return re.sub(r'[^A-Za-z0-9.:,!?;&]+', ' ', str(text)).strip()

In [None]:
%%time
# Attach the body text of every test publication to the submission frame.
# Only the test files are read here; nothing is loaded for train_df.
load_test_text = partial(read_append_return, train_files_path=test_files_path)
sample_sub['text'] = sample_sub['Id'].map(load_test_text)

# re.search

In [None]:
# Shared state filled by the loops below.
sub = {}  # publication Id -> '|'-joined predicted labels
new_existing_labels = []  # every accepted label; duplicates are kept so they can be counted later

# The expansion of an acronym is accepted only if its LAST word is one of these.
g2 = ['survey', "study", "database", "data", "dataset", "initiative", 'research']
# A capitalised phrase is accepted only if it CONTAINS one of these words.
g = ['survey', "study", "database", "dataset"]
# Any of these words disqualifies a candidate.
# ('user' and 'appendix' are two separate entries; a missing comma used to
# fuse them into the single, never-matching string 'userappendix'.)
bad_w = ['consortium', 'organization', 'bureau', 'development', 'center', 'table', 'department', 'university', 'bank', 'class', 'user',
         'appendix', 'supplementary', 'supplement', 'major', 'association', 'journal', 'commission', 'associates', 'board', 'agency',
         'administration', 'federation', 'ministry', 'form', 'score', 'management', 'accounts', 'account', 'feasibility']
# Acronyms that are never accepted (compared after upper-casing).
bad1 = ['USGS', 'GWAS', 'ECLS', 'aDAS', 'NCDC', 'NDBC', 'UDS', 'GTD', 'ISC', 'DGP', 'EDC', 'FDA', 'TSE', 'DEA', 'CDA', 'IDB', 'NGDC', 'JODC', 'EDM', 'FADN', 'LRD', 'DBDM', 'DMC', 'WSC']

# Regex pass over every publication in sample_sub. For each document:
#   1. find acronyms (e.g. "ADNI") whose spelled-out form appears in the 100
#      characters before them and ends with a word from g2;
#   2. find capitalised multi-word phrases that contain a word from g.
# Accepted labels are written to sub[Id] as a '|'-joined string, and the
# multi-word ones are also appended to new_existing_labels.
_acronym_re = re.compile(r'[a-z]*[A-Z]{2,5}[a-z]*[A-Z]{1,5}')
_phrase_re = re.compile(r'(?<=[^\.] )[A-Z][a-z]{3,20} (?:(?:[A-Z][a-z]{2,20}|of|up|to|and|the|in|on|COVID-19|s|for)[- \.,]){0,10}(?:[A-Z][a-z]{3,20})(?: data| survey| sample| study)*')

for _, row in sample_sub.iterrows():
    doc_id = row['Id']
    sub[doc_id] = ''
    sample_text = text_cleaning(row['text'])

    expansions = []  # cleaned spelled-out names
    acronyms = []    # cleaned acronyms, parallel to `expansions`
    for cl in _acronym_re.finditer(sample_text):
        if cl[0].upper() in bad1:
            continue
        # Build a pattern in which every letter of the acronym starts (part
        # of) a word, e.g. "ABC" -> A[a-z ]+B[a-z ]+C[a-z]+, optionally
        # followed by data/survey/sample/study.
        x1 = "[a-z ]+".join(cl[0].upper()) + '[a-z]+(?: [Dd]ata| [Ss]urvey| [Ss]ample| [Ss]tudy)*'

        # Look only at the 100 characters before the acronym. The start is
        # clamped at 0: a negative start would index from the END of the
        # text and yield a wrong (usually empty) window.
        window = sample_text[max(0, cl.start() - 100):cl.start()]
        ans = re.search(x1, window)
        if not ans:
            continue
        s = clean_text(ans[0])
        if not s:
            continue
        words = s.split()
        if any(w in bad_w for w in words):
            continue
        if words[-1] in g2:
            expansions.append(s)
            acronyms.append(clean_text(cl[0]))

    # Acronyms first. Note that `s not in sub[doc_id]` is a substring test
    # on the '|'-joined string.
    for s in acronyms:
        if not sub[doc_id]:
            sub[doc_id] = s
        elif s not in sub[doc_id]:
            sub[doc_id] += '|' + s

    # Then the expansions, unless an already stored label is contained in them.
    for s in expansions:
        if not sub[doc_id]:
            sub[doc_id] = s
            new_existing_labels.append(s)
        elif s not in sub[doc_id]:
            if not any(p in s for p in sub[doc_id].split("|")):
                sub[doc_id] += '|' + s
                new_existing_labels.append(s)

    # Capitalised phrases with more than two upper-case letters, processed
    # from the most to the least frequent.
    a = _phrase_re.findall(sample_text)
    a = [s for s in a if len(re.findall(r'[A-Z]', s)) > 2]
    for phrase, _count in Counter(a).most_common():
        s = clean_text(phrase)
        words = s.split()
        # Skip phrases whose initials spell a rejected acronym.
        if "".join(w[0] for w in words).upper() in bad1:
            continue
        if not any(w in g for w in words):
            continue
        if any(w in bad_w for w in words):
            continue

        if not sub[doc_id]:
            sub[doc_id] = s
            new_existing_labels.append(s)
        elif s not in sub[doc_id]:
            # Reject near-duplicates of labels that are already stored.
            is_duplicate = any(
                p in s or Levenshtein.distance(p, s) / len(s) < 0.2
                for p in sub[doc_id].split("|")
            )
            if not is_duplicate:
                sub[doc_id] += '|' + s
                new_existing_labels.append(s)


In [None]:

# Second pass: look for labels that were recorded more than once during the
# regex pass in every document, so that a label discovered in one publication
# is also credited to others that mention it.
label_counts = Counter(new_existing_labels).most_common()
new_existing_labels = [label for label, count in label_counts if count > 1]

# Longest labels first. The reversal of an ascending sort is kept on purpose:
# it orders equal-length labels differently from sorted(..., reverse=True).
new_existing_labels = sorted(new_existing_labels, key=len)[::-1]

for _, row in sample_sub.iterrows():
    doc_id = row['Id']
    sub.setdefault(doc_id, '')
    # Lower-case the document once instead of once per label.
    lowered_text = text_cleaning(row['text']).lower()
    for s in new_existing_labels:
        # Only multi-word labels are searched for.
        if ' ' not in s or s not in lowered_text:
            continue
        if not sub[doc_id]:
            sub[doc_id] = s
        elif s not in sub[doc_id]:
            # Reject near-duplicates of labels that are already stored.
            is_duplicate = any(
                p in s or Levenshtein.distance(p, s) / len(s) < 0.2
                for p in sub[doc_id].split("|")
            )
            if not is_duplicate:
                sub[doc_id] += '|' + s


# BertForTokenClassification

In [None]:
class VTBDataset(torch.utils.data.Dataset):
    """Map-style dataset over tokenizer encodings with optional labels.

    Args:
        encodings: Mapping from field name (e.g. ``input_ids``) to a sequence
            indexable by sample position.
        size_s: Number of samples reported by ``len()``.
        labels: Optional sequence of per-sample labels (list, array or
            tensor). When ``None`` or empty, items carry no ``labels`` entry.
    """

    def __init__(self, encodings, size_s, labels=None):
        self.encodings = encodings
        self.labels = labels
        self.size_s = size_s

    def __getitem__(self, idx):
        """Return sample *idx* as a dict of tensors."""
        # Built as a local rather than stored on the instance, so fetching
        # one sample never mutates the dataset.
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        # Explicit None/length check: plain truthiness raises for arrays and
        # tensors with more than one element.
        if self.labels is not None and len(self.labels) > 0:
            item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return self.size_s

In [None]:
def encode_tags(labels, encodings):
    """Spread word-level labels over sub-word tokens.

    For each sample, tokens whose offset is ``(0, k)`` with ``k != 0`` receive
    the word labels in order; every other token gets ``-100``. Surplus word
    labels are dropped.

    Args:
        labels: One list of word labels per sample.
        encodings: Object exposing ``offset_mapping``, one list of
            ``(start, end)`` pairs per sample.

    Returns:
        One list of integer token labels per sample.
    """
    encoded = []
    for word_labels, offsets in zip(labels, encodings.offset_mapping):
        offsets = np.array(offsets)
        # Tokens that start at character 0 and are not empty.
        first_subword = (offsets[:, 0] == 0) & (offsets[:, 1] != 0)
        token_labels = np.full(len(offsets), -100, dtype=int)
        token_labels[first_subword] = word_labels[:int(first_subword.sum())]
        encoded.append(token_labels.tolist())
    return encoded

In [None]:
# Inference configuration for the token-classification model.
BERT_MODEL_PATH = '../input/bert-base-cased'
# Keep the original casing of the text when tokenizing.
tokenizer = BertTokenizerFast.from_pretrained(BERT_MODEL_PATH,do_lower_case=False)
# Number of text fragments sent through the model per batch.
batch_size_test = 100
# Use the GPU when one is available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# NOTE(review): Sigmoid is not referenced anywhere in the visible code.
Sigmoid = torch.nn.Sigmoid()

# Prediction

In [None]:

# Build a 3-label token classifier and move it to the selected device.
model = BertForTokenClassification.from_pretrained(BERT_MODEL_PATH,num_labels=3)
model = model.to(device)

# Replace the weights with the fine-tuned checkpoint.
# NOTE: torch.load unpickles the file, so only trusted checkpoints should be loaded.
ckpt = torch.load("../input/07-model/0.94model.bin",map_location=device)
model.load_state_dict(ckpt)   
# Inference mode (disables dropout).
model.eval()
# A multi-word prediction is kept only if it contains one of these words.
# NOTE(review): 'student' is listed twice; harmless for membership tests.
good_word = ['resource','report','research','survey','agriculture','service',"study","database","program","data","dataset","assessment",'monitoring','surveys','initiative','system','student',
    'observation','census','directory','reports','statistics','codes','student','students','baccalaureate','sample','project','initiatives']
# BERT pass: cut every publication into text fragments, keep the fragments
# that could mention a dataset, score their tokens with the model and merge
# the decoded names into `sub`.

# Fragments extracted from sentences of 400+ characters.
# Pattern 1: up to 150 characters on each side of a run of capitalised words.
# The year alternative is `[0-9]{4}`; it used to carry a stray closing brace
# (`[0-9]{4}}`), which could never match because text_cleaning removes braces.
_name_fragment_re = re.compile(r'(?: |^).{0,150}[A-Z][a-z]{2,20} (?:(?:[A-Z][a-z]{2,20}|of|up|to|and|the|in|on|COVID-19|s|for|[0-9]{4})[- \.,]){0,10}(?:[A-Z][a-z]{2,20})(?: data| survey| sample| study| [0-9]{2,4})*.{0,150}(?:[\. ]|$)')
# Pattern 2: up to 200 characters on each side of a dataset-like keyword.
_keyword_fragment_re = re.compile(r'(?: |^).{0,200}(?: [Dd]ata| [Rr]egistry|[Gg]enome [Ss]equence| [Mm]odel| [Ss]tudy| [Ss]urvey).{0,200}(?:[\. ]|$)')
# Pattern 3: up to 200 characters on each side of 4-10 consecutive capitals.
_acronym_fragment_re = re.compile(r'(?: |^).{0,200}[A-Z]{4,10}.{0,200}(?:[\. ]|$)')

# A fragment is sent to the model only if at least one of these matches.
_candidate_hint_res = (
    re.compile(r'(?:(?:[A-Z][a-z]{2,20}|of|in|COVID-19|s|for|and) ){3,6}'),
    re.compile(r'(?: [Dd]ata| [Rr]egistry|[Gg]enome [Ss]equence| [Mm]odel| [Ss]tudy| [Ss]urvey)'),
    re.compile(r'[A-Z]{4,10}'),
)


def _long_sentence_fragments(sentence):
    """Return the fragments of an over-long sentence matched by the three fragment patterns."""
    fragments = _name_fragment_re.findall(sentence)
    fragments.extend(_keyword_fragment_re.findall(sentence))
    fragments.extend(_acronym_fragment_re.findall(sentence))
    return fragments


def _chunk_sentences(sentences):
    """Group sentences into model-sized text fragments.

    Sentences of 201-399 characters pass through unchanged, sentences of 400+
    characters are replaced by regex-selected fragments, and shorter ones are
    accumulated (joined by '. ') until adding the next sentence would bring
    the accumulated characters to 400 or more.
    """
    chunks = []
    pending = ''
    for sentence in sentences:
        if pending:
            if len(pending + sentence) < 400:
                pending = pending + '. ' + sentence
                continue
            chunks.append(pending)
            pending = ''
        if 200 < len(sentence) < 400:
            chunks.append(sentence)
        elif len(sentence) >= 400:
            chunks.extend(_long_sentence_fragments(sentence))
        else:
            pending = sentence
    if pending:
        chunks.append(pending)
    return chunks


def _extract_names(token_ids, start_scores, end_scores):
    """Decode dataset-name candidates from one tokenized fragment.

    Args:
        token_ids: Input ids of the fragment (padding included).
        start_scores: Per-token logits of class 1 (first word of a name).
        end_scores: Per-token logits of class 2 (last word of a name).

    Returns:
        Cleaned candidate names, in order of appearance.
    """
    starts, = np.where(start_scores > 1)
    ends, = np.where(end_scores > 0)
    ends.sort()
    token_ids = np.array(token_ids)

    found = []
    max_a = 0  # end position of the previous span; starts before it are skipped
    for min_a in starts:
        if max_a > min_a:
            continue
        name = ''
        max_a = 0
        # Pair the start with the first end position that follows it.
        for min_b in ends:
            if min_b > min_a:
                max_a = min_b
                break
        # No end found: accept a single-word name when the start is confident.
        if max_a == 0 and start_scores[min_a] > 2:
            max_a = min_a
        # Spans longer than 10 tokens are discarded.
        if max_a - min_a > 10:
            continue
        if max_a >= min_a and min_a > 0:
            tokens = tokenizer.convert_ids_to_tokens(token_ids[min_a:])
            for j, w in enumerate(tokens):
                # Take the span, plus trailing word pieces of its last word.
                if j <= max_a - min_a or "##" in w:
                    name += w + ' '
                else:
                    break
        name = name.replace(" ##", "").strip()

        # Require more than one upper-case letter per two words.
        if name and sum(map(str.isupper, name)) / len(name.split()) > 0.5:
            name = clean_text(name)
            words = name.split()
            if any(w in bad_w for w in words):
                continue
            # Single words pass as-is; longer names need a keyword.
            if len(words) == 1 or any(w in good_word for w in words):
                found.append(name)
    return found


for _, row in sample_sub.iterrows():
    doc_id = row['Id']
    chunks = _chunk_sentences(text_cleaning(row['text']).split(". "))
    candidates = [s for s in chunks if any(p.search(s) for p in _candidate_hint_res)]

    # Start from an empty result for every document: when there is nothing to
    # score, no state from a previously processed document may be reused.
    ans = []
    if candidates:
        t_x = [s.split() for s in candidates]
        # Dummy word labels; encode_tags turns them into a mask that is
        # negative on every token that is not the first piece of a word.
        valid_y = [[1] * len(x) for x in t_x]

        val_encodings = tokenizer(t_x, is_split_into_words=True, return_offsets_mapping=True, padding=True, truncation=True, max_length=256)
        val_labels = encode_tags(valid_y, val_encodings)
        val_encodings.pop("offset_mapping")
        token_rows = val_encodings['input_ids']

        valid_dataset = VTBDataset(val_encodings, len(t_x), val_labels)
        valid_loader = torch.utils.data.DataLoader(valid_dataset, batch_size=batch_size_test, shuffle=False)

        n_rows = len(token_rows)
        seq_len = len(token_rows[0])
        valid_preds1 = np.zeros((n_rows, seq_len), dtype=np.float32)
        valid_preds2 = np.zeros((n_rows, seq_len), dtype=np.float32)

        with torch.no_grad():
            for i, batch in enumerate(valid_loader):
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                ignore = (batch['labels'] < 0).to(device)
                logits = model(input_ids, attention_mask=attention_mask, labels=None)[0]
                # Push ignored positions far below both decoding thresholds.
                logits = logits.masked_fill(ignore.unsqueeze(-1), -10)

                rows = slice(i * batch_size_test, (i + 1) * batch_size_test)
                valid_preds1[rows, :] = logits[:, :, 1].cpu().numpy()
                valid_preds2[rows, :] = logits[:, :, 2].cpu().numpy()

        for row_idx, token_ids in enumerate(token_rows):
            ans.extend(_extract_names(token_ids, valid_preds1[row_idx], valid_preds2[row_idx]))

    # Merge the predictions into the document's label string, skipping
    # near-duplicates of labels that are already stored.
    sub.setdefault(doc_id, '')
    for s in ans:
        if not sub[doc_id]:
            sub[doc_id] = s
        elif s not in sub[doc_id]:
            is_duplicate = any(
                p in s or Levenshtein.distance(p, s) / len(s) < 0.2
                for p in sub[doc_id].split("|")
            )
            if not is_duplicate:
                sub[doc_id] += '|' + s


# Answer

In [None]:
# Turn the {Id: 'label1|label2|...'} mapping into the two-column submission
# table and write it without the row index.
submission = pd.DataFrame(list(sub.items()), columns=["Id", "PredictionString"])
submission.to_csv('submission.csv', index=False)