In [None]:
import os
import re
import json
import string
import numpy as np
import pandas as pd

from spacy.lang.en import English
from spacy.lang.en.stop_words import STOP_WORDS
from nltk.tokenize import word_tokenize, sent_tokenize

In [None]:
# Load the training labels and collapse them to one row per publication Id;
# after .agg(list) the two selected columns each hold the list of that Id's values.
train_df = (
    pd.read_csv('../input/coleridgeinitiative-show-us-the-data/train.csv')
    .groupby('Id')[['dataset_title', 'dataset_label']]
    .agg(list)
    .reset_index()
)

train_df.head()

In [None]:
def get_all_datalabels(row):
    """Return the distinct dataset names attached to one publication row.

    `row` must support item access for 'dataset_title' and 'dataset_label',
    each holding a list. The result is a list of the unique values found in
    either list, in whatever order set iteration yields.
    """
    combined = row['dataset_label'] + row['dataset_title']
    return list(set(combined))
    
# Add a per-publication column holding the merged, de-duplicated titles and labels
# (get_all_datalabels is applied to every row because axis=1).
train_df['all_datalabels']=train_df.apply(get_all_datalabels, axis=1)
train_df.head()

In [None]:
# Collect every distinct dataset name that appears in the training labels.
# set.update adds to the set in place, so no copy of the whole set is made
# for each row.
database=set()
for labels in train_df['all_datalabels'].values:
    database.update(labels)
print('Number Of Datasets:', len(database))

In [None]:
def get_train_data(pub_id):
    """Load the parsed JSON content of one training publication.

    Parameters
    ----------
    pub_id : value formatted into the file name; the file that is read is
        '../input/coleridgeinitiative-show-us-the-data/train/<pub_id>.json'.

    Returns
    -------
    Whatever json.load produces for that file.

    Raises
    ------
    OSError if the file cannot be opened, json.JSONDecodeError if it is not
    valid JSON.
    """
    pub_filename='../input/coleridgeinitiative-show-us-the-data/train/{}.json'.format(pub_id)
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default locale encoding.
    with open(pub_filename, encoding='utf-8') as file:
        return json.load(file)

In [None]:
def get_word_properties(word):
    """Describe the surface form of a single token.

    Returns a dict with the token itself under 'word' and these boolean flags:
      is_alpha    - str.isalpha() of the token
      is_title    - str.istitle() of the token
      is_upper    - str.isupper() of the token
      is_lower    - str.islower() of the token
      has_upper   - the token is upper-case or title-case, or it is a purely
                    alphabetic mixed-case token with at least one upper-case
                    character
      is_number   - str.isnumeric() of the token
      is_stopword - the lower-cased token is in STOP_WORDS
      is_punct    - the token occurs as a substring of string.punctuation
      alpha_1     - the first character is alphabetic

    The token must not be empty (its first character is inspected).
    """
    alphabetic = word.isalpha()
    starts_alpha = word[0].isalpha()
    lower_case = word.islower()
    upper_case = word.isupper()
    title_case = word.istitle()

    contains_upper = upper_case or title_case
    # Mixed-case alphabetic tokens are neither upper, title nor lower, so look
    # at the individual characters.
    if alphabetic and not contains_upper and not lower_case:
        contains_upper = any(ch.isupper() for ch in word)

    return {
        'word': word,
        'is_alpha': alphabetic,
        'is_title': title_case,
        'is_upper': upper_case,
        'is_lower': lower_case,
        'has_upper': contains_upper,
        'is_number': word.isnumeric(),
        'is_stopword': word.lower() in STOP_WORDS,
        'is_punct': word in string.punctuation,
        'alpha_1': starts_alpha,
    }


def get_candidate_index(i, word_props, word_len, candidates):
    """Compute the candidate tag of token `i` from its properties and the
    tag already assigned to token `i-1`.

    Parameters
    ----------
    i          : index of the token to tag.
    word_props : list of property dicts as built by get_word_properties.
    word_len   : number of tokens in the sentence.
    candidates : list of tags; only candidates[i-1] is read.

    Returns
    -------
    1 when the token starts a candidate (previous tag 0, token has an
      upper-case form and is not a stop word),
    2 when it continues a candidate (previous tag 1 or 2 and the token is
      '(', ')' or '-', has an upper-case form, or is one of the lower-case
      connectors 'in', 'for', 'of', 'the', 'and' followed by a token with an
      upper-case form),
    0 otherwise.
    """
    current = word_props[i]
    token = current['word'].lower()
    previous_tag = candidates[i-1]

    if previous_tag == 0:
        starts_candidate = current['has_upper'] and not current['is_stopword']
        return 1 if starts_candidate else 0

    if previous_tag not in (1, 2):
        return 0

    # From here on the previous token belongs to a candidate.
    if current['is_punct']:
        return 2 if token in ['(', ')', '-'] else 0
    if current['has_upper']:
        return 2
    if current['is_lower'] and (i+1 < word_len):
        following = word_props[i+1]
        if following['is_lower']:
            return 0
        if following['has_upper'] and token in ['in', 'for', 'of', 'the', 'and']:
            return 2
    return 0
    
def get_candidate_entities(sentence):
    """Tag the tokens of `sentence` that may belong to an entity name.

    The sentence is tokenized with word_tokenize. A leading numeric token is
    left out of the analysis, so the token after it is treated as the start
    of the sentence.

    Returns
    -------
    (candidates, cwords)
      candidates : one int per token of the sentence: 0 = not part of a
                   candidate, 1 = first token of a candidate, 2 = continuation.
                   A skipped leading numeric token is reported as 0 so the
                   list stays index-aligned with the tokenized sentence.
      cwords     : the candidate phrases (tokens joined with single spaces),
                   keeping only phrases longer than two characters that do
                   not start with a numeric character.

    Sentences with five or fewer analysed tokens (including empty ones) yield
    all-zero tags and no phrases. The candidate that starts at the first
    analysed token is always discarded.
    """
    words=[word.strip() for word in word_tokenize(sentence)]
    words=[word for word in words if len(word)>0]

    # Number of leading tokens left out of the analysis (0 or 1).
    skipped=0
    if words and words[0].isnumeric():
        words=words[1:]
        skipped=1

    words_len=len(words)
    candidates=[0]*words_len
    cwords=[]
    if words_len <= 5:
        return ([0]*skipped+candidates, cwords)

    word_props=[get_word_properties(word) for word in words]

    if (not word_props[0]['is_stopword']) and (word_props[0]['has_upper'] and (not word_props[1]['word']==',')):
        candidates[0]=1

    # Each tag depends on the tag of the previous token, so fill left to right.
    for i in range(1, words_len):
        candidates[i]=get_candidate_index(i, word_props, words_len, candidates)

    #Removing the first word sequence as candidate words.
    if candidates[0]==1:
        candidates[0]=0
        for i in range(1, words_len):
            if candidates[i]==1:
                break
            candidates[i]=0

    # Turn runs of 1,2,2,... into phrases; s/e are the first/last token index
    # of the run currently being collected (-1 when there is none).
    s=-1; e=-1
    for i in range(words_len):
        if candidates[i]==1:
            if s!=-1:
                cwords.append(' '.join(words[s: e+1]))
            s=i
            e=i
        elif candidates[i]==2:
            e=i
        elif s!=-1:
            cwords.append(' '.join(words[s: e+1]))
            s=-1
            e=-1
    # A run that reaches the last token has no following 0 tag to close it.
    if s!=-1:
        cwords.append(' '.join(words[s: e+1]))

    cwords=[word for word in cwords if (len(word)>2) and (not word[0].isnumeric())]
    return ([0]*skipped+candidates, cwords)

In [None]:
def get_publications_data(pub_id):
    """Load the parsed JSON content of one test publication.

    Parameters
    ----------
    pub_id : value formatted into the path as-is (no extension is added); the
        file that is read is
        '../input/coleridgeinitiative-show-us-the-data/test/<pub_id>'.

    Returns
    -------
    Whatever json.load produces for that file.

    Raises
    ------
    OSError if the file cannot be opened, json.JSONDecodeError if it is not
    valid JSON.
    """
    pub_filename='../input/coleridgeinitiative-show-us-the-data/test/{}'.format(pub_id)
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default locale encoding.
    with open(pub_filename, encoding='utf-8') as file:
        return json.load(file)

def clean_text(txt):
    """Lower-case str(txt) and replace every run of characters other than
    ASCII letters and digits with a single space.

    Leading/trailing runs also become a space; the result is not stripped.
    """
    lowered = str(txt).lower()
    return re.sub('[^A-Za-z0-9]+', ' ', lowered)

In [None]:
def get_abbrevation(sentence_words, sentence_len, i):
    """Return the parenthesised abbreviation that starts at token index `i`.

    Parameters
    ----------
    sentence_words : list of sentence tokens.
    sentence_len   : number of tokens in `sentence_words`.
    i              : index at which an opening '(' token is expected.

    Returns
    -------
    The tokens from '(' up to and including the next ')' token, concatenated
    without separators (e.g. '(ADNI)'), or '' when
      - `i` is at/after the end of the sentence or the token there is not '(',
      - there is no closing ')' token after it,
      - nothing but empty/whitespace tokens sit between the parentheses, or
      - the first token inside the parentheses does not start with an
        upper-case character.
    """
    if i >= sentence_len or sentence_words[i] != '(':
        return ''

    # Locate the closing parenthesis token.
    j = i
    while j < sentence_len and sentence_words[j] != ')':
        j += 1
    if j >= sentence_len:
        # Unclosed parenthesis: do not treat the rest of the sentence as an
        # abbreviation.
        return ''

    abbr = [word.strip() for word in sentence_words[i+1: j]]
    abbr = [word for word in abbr if len(word) != 0]
    if not abbr:
        # Empty parentheses: nothing to inspect.
        return ''
    # NOTE(review): an earlier check here compared the single inner token to
    # the integer 1, which a string never equals, so it had no effect and was
    # dropped. Single-character parentheticals such as "(A)" are therefore
    # still accepted - confirm whether they should be rejected.
    if not abbr[0][0].isupper():
        return ''
    return ''.join(sentence_words[i: j+1])
        

def get_abbrevated_labels(sentence, all_labels):
    """Build 'label (ABBR)' variants for labels that occur in `sentence`.

    Both the sentence and every label are tokenized with word_tokenize. For
    each label whose token sequence occurs in the sentence, the first
    occurrence is used and get_abbrevation is asked for a parenthesised
    abbreviation directly after it.

    Returns
    -------
    A list with one entry per matched label: the label, a space, and the
    abbreviation text. When no abbreviation follows, the entry is the label
    with a trailing space. Labels whose tokens do not occur in the sentence
    contribute nothing.
    """
    sentence_words=[w for w in word_tokenize(sentence) if w!='']
    sentence_len=len(sentence_words)
    abbr_labels=[]
    for label in all_labels:
        label_words=word_tokenize(label)
        span=len(label_words)
        for i in range(sentence_len):
            # Comparing slices cannot index past the end of the sentence when
            # only the beginning of a label matches near the sentence tail.
            if sentence_words[i: i+span] == label_words:
                abbr=get_abbrevation(sentence_words, sentence_len, i+span)
                abbr_labels.append(label+" "+abbr)
                break
    return abbr_labels

In [None]:
# Words checked against the tokens of each newly found entity in
# get_new_entities(); an entity that contains one of them as a token is dropped.
exclude_entities=[
    'table',
    'fig',
    'provide',
    'data',
    'mri',
    'result',
]

In [None]:
def get_new_entities(candidates, all_labels, sentence):
    """Find candidate entities that are listed next to an already known label.

    Parameters
    ----------
    candidates : per-token tags for `sentence` (0 = no candidate, 1 = first
        token of a candidate, 2 = continuation), indexed like the tokens
        produced by word_tokenize(sentence).
    all_labels : known label strings; those whose tokens occur in the
        sentence mark the corresponding tokens as "known entity".
    sentence   : the sentence text.

    Returns
    -------
    A list of new entities, each one a list of tokens (a slice of the
    tokenized sentence). Two situations produce an entity:
      - a candidate directly before a known label, separated by ',';
      - a candidate directly after a known label, separated by ',' or 'and'.
    Entities containing a token from `exclude_entities` and empty entities
    are left out. An empty `all_labels` returns [].

    The scan is best effort: if `candidates` is shorter than the tokenized
    sentence the resulting IndexError stops the scan and the entities found
    so far are returned.
    """
    new_ents=[]
    if len(all_labels)==0:
        return []
    sent_words=word_tokenize(sentence)
    words_len=len(sent_words)
    entity_markers=[0]*words_len
    ent_id=1

    # Mark every token covered by a known label with a non-zero id. Slices
    # are compared so that a label whose first tokens match at the very end
    # of the sentence cannot index past the last token.
    for label in all_labels:
        lwords=word_tokenize(label)
        span=len(lwords)
        for i in range(words_len):
            if sent_words[i: i+span] == lwords:
                for k in range(i, i+span):
                    entity_markers[k]=ent_id
                ent_id+=1

    try:
        for i, em in enumerate(entity_markers):
            if (em==0):
                continue

            # Backward scan: "<candidate> , <known label>".
            # s/e are the first/last token index of the candidate being
            # collected; tol is the number of non-candidate tokens still
            # tolerated before giving up.
            tol=1
            if (i-1)>=0 and entity_markers[i-1]==0 and sent_words[i-1]==',':
                s=-1;e=-1
                for j in range(i-2, -1, -1):
                    if tol==0 or entity_markers[j]!=0:
                        s=-1;e=-1;
                        break
                    if e==-1 and candidates[j]==0 and sent_words[j]!=')':
                        tol-=1

                    if s!=-1 and candidates[j]==1:
                        s=j
                        break

                    if e==-1 and candidates[j]!=0:
                        e=j;s=j;tol=2;
                        if candidates[j]==1:
                            break

                    if candidates[j]==2:
                        s=j
                if s!=-1:
                    new_ents.append(sent_words[s:e+1])

            # Forward scan: "<known label> , <candidate>" or
            # "<known label> and <candidate>".
            # NOTE(review): when the first candidate token met here is tagged
            # 1, the end test below fires in the same iteration (e=j-1 < s)
            # and the slice is empty; such empty results are removed before
            # returning. Confirm whether candidates starting with tag 1
            # should be collected instead.
            tol=1
            if (i+1)<words_len and entity_markers[i+1]==0 and (sent_words[i+1]==',' or sent_words[i+1]=='and'):
                s=-1; e=-1
                for j in range(i+2, words_len):
                    if tol==0 or entity_markers[j]!=0:
                        s=-1; e=-1
                        break
                    if s==-1 and candidates[j]==0 and sent_words[j]!='(':
                        tol-=1
                    if s==-1 and candidates[j]!=0:
                        s=j; e=j; tol=2
                    if s!=-1 and (candidates[j]==1 or candidates[j]==0):
                        e=j-1
                        break
                    else:
                        e=j
                if s!=-1:
                    new_ents.append(sent_words[s:e+1])
    except IndexError:
        # `candidates` does not cover every sentence token; keep what was
        # found before the scan ran out of tags.
        pass

    new_ents_final=[]
    for ent in new_ents:
        if not ent:
            # An empty token list carries no entity text.
            continue
        # NOTE(review): this is an exact, case-sensitive token comparison, so
        # 'table' does not exclude an entity containing the token 'Table' -
        # confirm whether a case-insensitive match is wanted.
        if any(f in ent for f in exclude_entities):
            continue
        new_ents_final.append(ent)
    return new_ents_final

def get_datalabels(pub_id):
    """Collect dataset labels and newly discovered entities for one publication.

    The publication is loaded with get_publications_data and the 'text' of
    every section is split into sentences. For each sentence that contains at
    least one label from `database` (plain substring test):
      - the matching labels and their abbreviated variants are added to the
        label list;
      - if the lower-cased sentence contains one of 'data', 'study',
        'database' or 'taken from', candidate entities are tagged and
        get_new_entities is called with all labels gathered so far.

    Returns
    -------
    (all_labels, new_entities): the accumulated label strings (duplicates
    kept) and the accumulated results of get_new_entities.
    """
    trigger_words = ['data', 'study', 'database','taken from']
    found_labels = []
    found_entities = []

    for section in get_publications_data(pub_id):
        for sentence in sent_tokenize(section['text']):
            matched = [label for label in database if label in sentence]
            if not matched:
                continue

            abbreviated = get_abbrevated_labels(sentence, matched)
            found_labels.extend(matched)
            found_labels.extend(abbreviated)

            #Taken from https://www.kaggle.com/tungmphung/pytorch-bert-for-named-entity-recognition/comments
            lowered = sentence.lower()
            if any(word in lowered for word in trigger_words):
                candidates, _ = get_candidate_entities(sentence)
                found_entities.extend(get_new_entities(candidates, found_labels, sentence))

    return (found_labels, found_entities)

# Submission

In [None]:
# Build one submission record per file in the test directory.
all_data=[]
for filename in os.listdir('../input/coleridgeinitiative-show-us-the-data/test'):
    (all_labels, new_entities)=get_datalabels(filename)
    Id=filename.replace('.json', '')

    # Normalise every prediction with clean_text and de-duplicate.
    # New entities arrive as token lists: join them explicitly instead of
    # cleaning their list repr, which would leak escape sequences of
    # non-printable characters into the label. Labels that are empty after
    # cleaning are not predictions and are dropped.
    cleaned_labels=set()
    for label in all_labels+new_entities:
        if isinstance(label, (list, tuple)):
            label=' '.join(label)
        cleaned=clean_text(label).strip()
        if cleaned:
            cleaned_labels.add(cleaned)
    datalabels=sorted(cleaned_labels)

    predictionString='|'.join(datalabels)
    all_data.append({
        'Id': Id,
        'PredictionString': predictionString
    })

In [None]:
# One row per record in all_data. Naming the columns explicitly keeps the
# 'Id' / 'PredictionString' header even when all_data is empty.
submission_df=pd.DataFrame(all_data, columns=['Id', 'PredictionString'])
submission_df.head()

In [None]:
# Write the predictions to submission.csv in the working directory, without the index column.
submission_df.to_csv('submission.csv', index=False)