In [None]:
import numpy as np 
import pandas as pd 
import json
import seaborn as sns
import re
import nltk

import spacy
from spacy import displacy

from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.text import tokenizer_from_json
from tensorflow.keras.preprocessing.sequence import pad_sequences

from tqdm.notebook import tqdm
tqdm.pandas()


In [None]:
# Load the pickled training sample from the attached dataset.
# NOTE(review): `train_sample` is not referenced by any later cell visible in this
# notebook - confirm whether this load is still needed.
# NOTE(review): unpickling can execute arbitrary code; only load pickles from a trusted source.
train_sample = pd.read_pickle("../input/coleridge-ner-11b-train-full-dataset/train_sample.pkl")

In [None]:
# Load the fitted Keras tokenizers (sentence words, labels, POS tags) stored as JSON.

TOKENIZER_DIR = '../input/coleridge-ner-11b-train-full-dataset'


def load_tokenizer(filename, directory = TOKENIZER_DIR):
    """
    Rebuild a Keras tokenizer from a JSON file.

    The file is decoded with `json.load` and the decoded value is passed to
    `tokenizer_from_json`.

    Parameters
    ----------
    filename : str
        Name of the JSON file inside `directory`.
    directory : str
        Folder that contains the tokenizer files.

    Returns
    -------
    The tokenizer object returned by `tokenizer_from_json`.
    """
    with open(f'{directory}/{filename}') as f:
        return tokenizer_from_json(json.load(f))


tokenizer = load_tokenizer('tokenizer.json')
label_tokenizer = load_tokenizer('label_tokenizer.json')
pos_tokenizer = load_tokenizer('pos_tokenizer.json')

In [None]:
# Word -> id vocabulary of the sentence tokenizer, plus the inverse id -> word lookup.
word_index = tokenizer.word_index
reverse_word_index = {idx: word for word, idx in word_index.items()}

In [None]:
# Passed as `input_dim` to the word Embedding layer in the model cell.
vocab_size = 250000

In [None]:
# Length of the sequences fed to the model: sentences are split into windows of at most
# `max_length` words and token sequences are padded/truncated to this length.
# NOTE: the model cell also reuses `max_length` as the `output_dim` of both embeddings.
max_length = 60
padding_type = 'post'  # `padding` argument of pad_sequences
trunc_type = 'post'  # `truncating` argument of pad_sequences

## MODEL

In [None]:
#!pip install git+https://www.github.com/keras-team/keras-contrib.git

In [None]:
from tensorflow.keras import Model, Input, Sequential
from tensorflow.keras.layers import LSTM, Embedding, Dense, Concatenate
from tensorflow.keras.layers import TimeDistributed, SpatialDropout1D, Bidirectional
#from keras_contrib.layers import CRF

In [None]:
DROPOUT = 0.4

# One softmax unit per entry of the label vocabulary
OUTPUT_LENGTH = len(label_tokenizer.word_index)

# Two parallel inputs of length max_length: word ids and POS-tag ids
input_word = Input(shape = (max_length,), name = 'input_word')
input_pos = Input(shape = (max_length,), name = 'input_pos')

# Word embedding
word_embed = Embedding(
    input_dim = vocab_size,
    output_dim = max_length,
    input_length = max_length,
    name = 'word_embedding',
)(input_word)

# POS embedding; +1 on input_dim to match the embedding
# (presumably because id 0 is reserved for padding - confirm)
pos_embed = Embedding(
    input_dim = len(pos_tokenizer.word_index) + 1,
    output_dim = max_length,
    input_length = max_length,
    name = 'pos_embedding',
)(input_pos)

# Join the two embeddings
conc = Concatenate()([word_embed, pos_embed])

# Dropout on the joined embeddings
hidden = SpatialDropout1D(DROPOUT)(conc)

# Two stacked bidirectional LSTMs. The layer names are kept exactly as they were:
# 'word_LSTM' names the first Bidirectional wrapper, 'pos_LSTM' names the inner
# LSTM of the second one.
first_lstm = LSTM(units = 50, return_sequences = True, recurrent_dropout = DROPOUT)
hidden = Bidirectional(first_lstm, name = 'word_LSTM')(hidden)

second_lstm = LSTM(units = 50, return_sequences = True, recurrent_dropout = DROPOUT, name = 'pos_LSTM')
hidden = Bidirectional(second_lstm)(hidden)

#conv layer later?

# Per-timestep softmax over the labels
out = TimeDistributed(Dense(OUTPUT_LENGTH, activation = 'softmax'))(hidden)

# Final model
model = Model([input_word, input_pos], out)

model.summary()

In [None]:
# Switch between training the model in this notebook and loading saved weights.
# NOTE(review): the training branch references names that are not defined in any cell
# visible in this notebook (w_categorical_crossentropy, f1_m, precision_m, recall_m,
# training_padded, train_pos, train_y_cat). They must be defined before setting
# `trainmodel = True`, otherwise the branch fails with a NameError.
trainmodel = False

BATCH_SIZE = 8  # also used as the batch size of model.predict in predict_dataset
EPOCHS = 15



if trainmodel:
    
    model.compile(optimizer =  'adam', 
              loss = w_categorical_crossentropy, # 'categorical_crossentropy', 
              metrics = ['accuracy',f1_m, precision_m, recall_m])

    #early_stopping = EarlyStopping(monitor = 'val_f1_m', patience = 1, verbose = 0, mode='max', restore_best_weights = True)

    #callbacks = [early_stopping]

    history = model.fit(
        [training_padded, train_pos], np.array(train_y_cat),
        #validation_data = ([validation_padded, val_pos], np.array(val_y_cat)),
        batch_size = BATCH_SIZE,
        epochs = EPOCHS,
        verbose = 1,
        #callbacks = callbacks

        )

    # Persist the freshly trained model to the working directory.
    model.save('./model4.h5')
    
else:
    # Inference-only path: load the weights stored in the attached dataset into the
    # architecture built in the previous cell.
    model.load_weights("../input/coleridge-ner-11b-train-full-dataset/model4.h5")

In [None]:
def clean_text(txt):
    """
    Lowercase `txt` (after casting it to str) and replace every run of characters
    that are not ASCII letters or digits with a single space.

    The result is not stripped, so it may start or end with a space.
    """
    lowered = str(txt).lower()
    non_alphanumeric = re.compile('[^A-Za-z0-9]+')
    return non_alphanumeric.sub(' ', lowered)

In [None]:
#stops = set(stopwords.words('english')).difference(['in', 'from', 'on', 'of', 's', 'at'])

def clean_training_text(txt):
    """
    Case-preserving variant of clean_text.

    Casts `txt` to str, replaces every run of characters other than ASCII letters,
    digits, parentheses and hyphens with a single space, and strips the result.
    """
    as_text = str(txt)
    collapsed = re.sub('[^A-Za-z0-9()-]+', ' ', as_text)

    # Stop-word filtering (see the commented-out `stops` definition above) is disabled.
    return collapsed.strip()

In [None]:
def break_sentence(sentence, max_sentence_length, overlap):
    """
    Split a sentence into overlapping windows of at most `max_sentence_length` words.

    Parameters
    ----------
    sentence : str
        Text to split; words are separated on whitespace.
    max_sentence_length : int
        Maximum number of words per returned chunk.
    overlap : int
        Number of words shared by two consecutive chunks.

    Returns
    -------
    list of str
        `[sentence]` unchanged when it already fits, otherwise the windows taken
        every `max_sentence_length - overlap` words, each re-joined with single spaces.

    Raises
    ------
    ValueError
        If the sentence has to be split and `overlap >= max_sentence_length`.
        The window would then never advance: a negative step used to return an
        empty list, silently dropping the sentence.
    """
    words = sentence.split()

    if len(words) <= max_sentence_length:
        return [sentence]

    step = max_sentence_length - overlap
    if step <= 0:
        raise ValueError(
            "overlap ({}) must be smaller than max_sentence_length ({})".format(overlap, max_sentence_length)
        )

    return [
        " ".join(words[start:start + max_sentence_length])
        for start in range(0, len(words), step)
    ]

In [None]:
def disambiguate_entities(entities_list):
    """
    Keep only the entities that are not contained in another entity of the list.

    When the string of one entity is a substring of another entity, only the
    longer one is kept. Duplicates are removed and the order of first occurrence
    is preserved, so the result is the same on every run.

    Each entity is compared against every other entity individually. Testing
    against one joined string of all the others would also match text that merely
    straddles the boundary between two unrelated entities.

    Parameters
    ----------
    entities_list : iterable of str

    Returns
    -------
    list of str
    """
    unique_entities = list(dict.fromkeys(entities_list))

    return [
        entity
        for entity in unique_entities
        if not any(entity in other for other in unique_entities if other != entity)
    ]

In [None]:
# Display the label -> id mapping; predict_dataset looks up the ids of 'u', 'i', 'b' and 'l' in it.
label_tokenizer.word_index

In [None]:
def predict_dataset(paper_test_sentences, paper_sentences_pos, print_warn_message = False, string_matching = False, existing_labels = None):
    """
    Run the tagger on the sentences of one paper and return the predicted texts.

    Parameters
    ----------
    paper_test_sentences : list of str
        Sentences to tag (whitespace separated words).
    paper_sentences_pos : list of list of str
        POS tags of each sentence, aligned with `paper_test_sentences`.
    print_warn_message : bool
        Print a message when a non-zero label is predicted at a position that has
        no corresponding word/token.
    string_matching : bool
        When True, additionally emit every label of `existing_labels` whose words
        all occur in a sentence.
    existing_labels : iterable of str, optional
        Known labels used by the string-matching step.

    Returns
    -------
    list of str
        Predicted texts passed through `clean_text` (may contain duplicates).
    """
    if existing_labels is None:
        existing_labels = []

    # Nothing to tag: skip the model call, which is not meant to receive empty batches.
    if len(paper_test_sentences) == 0:
        return []

    #preparing data for prediction
    tok = tokenizer.texts_to_sequences(paper_test_sentences)
    pad = pad_sequences(tok, maxlen = max_length, padding = padding_type, truncating = trunc_type)

    pos_tok = pos_tokenizer.texts_to_sequences([" ".join(i) for i in paper_sentences_pos])
    pos_pad = pad_sequences(pos_tok, maxlen = max_length, padding = padding_type, truncating = trunc_type)

    pred = model.predict([pad, pos_pad], batch_size = BATCH_SIZE)

    pred_lab = np.argmax(pred, axis = -1)

    # Predicted class ids are the label tokenizer ids shifted down by one.
    u_label = label_tokenizer.word_index['u'] - 1
    i_label = label_tokenizer.word_index['i'] - 1
    b_label = label_tokenizer.word_index['b'] - 1
    l_label = label_tokenizer.word_index['l'] - 1

    predtexts = []

    #mapping predictions
    for p_idx, p in enumerate(pred_lab):
        predictiontext = ''
        predictionlabels = []

        if len(set([1,2,3,4]).intersection(set(p))) > 0:

            split_sentence = paper_test_sentences[p_idx].split()

            for l, label in enumerate(p):
                if label > 0:
                    try:
                        if len(predictiontext) == 0:
                            predictiontext += split_sentence[l]
                        else:
                            # skip a word whose token already appears in the collected text
                            if reverse_word_index[tok[p_idx][l]] not in predictiontext:
                                predictiontext += " {}".format(split_sentence[l])
                        predictionlabels.append(label)

                    except IndexError:
                        # the label was predicted at a position past the end of the
                        # sentence words / token sequence
                        if print_warn_message:
                            print("Sentence: {}".format(paper_test_sentences[p_idx]), "The model attempted to assign a 'I' or 'B' to a padded character")

        if len(predictionlabels) > 0:

            write = False

            if len(predictionlabels) == 1:
                # if there's only one relevant label, that should be a 'U'. Otherwise avoid producing result
                # (the single label itself is compared; comparing the whole list to the id never matched)
                if predictionlabels[0] == u_label:
                    write = True

            else: #if there are multiple relevant labels
                has_i = i_label in predictionlabels

                # an 'i' is accepted only when a 'b' was predicted before it
                if has_i and b_label in predictionlabels:
                    if predictionlabels.index(b_label) < predictionlabels.index(i_label):
                        write = True

                # NOTE(review): this accepts the text when the first 'l' comes BEFORE the
                # first 'i' and needs an 'i' to be present - confirm this is the intended
                # order; the check is kept exactly as it was.
                if has_i and l_label in predictionlabels:
                    if predictionlabels.index(l_label) < predictionlabels.index(i_label):
                        write = True

            if write:
                predtexts.append(clean_text(predictiontext))

    if string_matching:
        # word set of every known label, computed once instead of once per sentence
        label_word_sets = [
            (known_label, set(clean_training_text(known_label).lower().split()))
            for known_label in existing_labels
        ]

        for txt in paper_test_sentences:
            sentence_words = set(clean_training_text(txt).lower().split())

            for known_label, labelset in label_word_sets:
                # every word of the label must occur in the sentence
                if labelset.issubset(sentence_words):
                    predtexts.append(clean_text(known_label))

    return predtexts

In [None]:
def pos_tagging_nltk(x):
    """
    POS-tag a text with NLTK.

    The text is tokenized with `word_tokenize` and tagged with `nltk.pos_tag`.

    Parameters
    ----------
    x : str

    Returns
    -------
    tuple of str
        One tag per token; an empty tuple when the text yields no tokens.
        Unpacking with `zip(*pairs)[1]` raised IndexError in that case.
    """
    tagged_tokens = nltk.pos_tag(word_tokenize(x))

    return tuple(tag for _, tag in tagged_tokens)


def pos_tagging(x):
    """
    Run the module-level spaCy pipeline `nlp` on `x` and return the `pos_`
    attribute of every token as a list.
    """
    tags = []
    for token in nlp(x):
        tags.append(token.pos_)
    return tags

In [None]:
nlp = spacy.load('en_core_web_sm')

overlap = 20 #number of overlapping words in case a sentence is broken in more sentences

# NOTE(review): this flag is not passed to predict_dataset below, so it currently has no effect.
include_string_matching = False

test_path = '../input/coleridgeinitiative-show-us-the-data/sample_submission.csv'
test = pd.read_csv(test_path)

test_folder = '../input/coleridgeinitiative-show-us-the-data/test'

# Section titles (lowercase) to skip; empty, so every section is processed.
excluded_sections = () #'acknowledgements', 'acknowledgement', 'reference', 'references'

# spaCy components switched off while computing the POS tags (duplicates removed).
# NOTE(review): 'tok2vec' and 'attribute_ruler' are disabled although `token.pos_` is
# read below. On spaCy v3 pipelines these components may take part in producing the POS
# tags - confirm the tags are non-empty and match what the model was trained on.
disabled_components = ['ner', 'parser', 'tok2vec', 'attribute_ruler',
                       'lemmatizer', 'textcat', 'senter', 'sentencizer']


for paper_id in test['Id'].unique():

    paper_test_sentences = []
    paper_sentences_pos = []

    with open(f'{test_folder}/{paper_id}.json', 'r') as f:
        paper = json.load(f)

    # collect the cleaned (sub-)sentences of every section of the paper
    for section in paper:

        section_name = section['section_title']

        if section_name.lower() not in excluded_sections:

            text = section['text']

            for sentence in sent_tokenize(text):

                for sub_sentence in break_sentence(sentence, max_length, overlap):

                    sub_sentence = clean_training_text(sub_sentence)

                    if len(sub_sentence) > 0:
                        paper_test_sentences.append(sub_sentence)

    # POS tags of every collected sentence
    for doc in nlp.pipe(paper_test_sentences, disable = disabled_components):
        paper_sentences_pos.append([token.pos_ for token in doc])

    # a paper without any usable sentence produces no prediction (and no model call)
    if paper_test_sentences:
        predtexts = predict_dataset(paper_test_sentences, paper_sentences_pos)
    else:
        predtexts = []

    # sorted so that the prediction string is identical on every run
    # (iteration order of a set of strings changes between interpreter sessions)
    unique_predictions = sorted(set(predtexts).difference(set([""])))

    test.loc[test.Id == paper_id, 'PredictionString'] = "|".join(unique_predictions)

In [None]:
# Inspect the prediction strings written by the inference loop.
test.PredictionString.values

In [None]:
# Display the submission frame before it is written to disk.
test

In [None]:
#test.to_csv('submission.csv')

In [None]:
# Write the submission file; index=False keeps the DataFrame index out of the CSV.
test.to_csv('submission.csv', index=False)