In [None]:
import numpy as np 
import pandas as pd 
import json
import seaborn as sns
import re
import nltk
import io

import spacy
from spacy import displacy

from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords

from tqdm.notebook import tqdm
tqdm.pandas()


In [None]:
# Merge the training and validation frames into a single training set and
# shuffle it with a fixed seed so the row order is reproducible.
# NOTE(review): unpickling executes code stored in the file; only load trusted inputs.
train_sample = pd.concat(
    [
        pd.read_pickle("../input/coleridge-ner-11-train/training_df.pkl"),
        pd.read_pickle("../input/coleridge-ner-11-train/validation_df.pkl"),
    ]
).sample(frac = 1, random_state = 42)

In [None]:
# Persist the shuffled training frame to the working directory.
train_sample.to_pickle('train_sample.pkl')

In [None]:
# Load the small English pipeline with the listed components switched off.
# NOTE(review): later cells read `token.pos_`; depending on the spaCy version,
# `tok2vec` / `attribute_ruler` may be required for that attribute — confirm.
nlp = spacy.load(
    'en_core_web_sm',
    disable=['ner', 'parser', "tok2vec", "attribute_ruler", "lemmatizer", "textcat"],
)
# The whole corpus is parsed below as one document, hence the raised limit.
nlp.max_length = 3000000
# NOTE(review): the set elements are spaCy Token objects, not strings, so this
# presumably does not de-duplicate words by their text — confirm.
words = set(nlp(" ".join(train_sample.cleaned_text_training)))

In [None]:
# `words` may hold spaCy Token objects rather than strings; count distinct
# surface forms by text so that the "unique" figure is meaningful.
print("Unique words in corpus : {}".format(len({str(w) for w in words})))

In [None]:
# Remove English stop words from the vocabulary source.
# The membership test is done on the token *text*: `words` holds spaCy Token
# objects, which never match the plain strings of the stop-word list.
# Comparison is lower-cased because the stop-word list is lower-case and the
# Keras Tokenizer fitted on `words` lower-cases its input by default.
stops = stopwords.words('english')
stop_set = set(stops)  # built once instead of once per word
words = [str(w) for w in words if str(w).lower() not in stop_set]
#words

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer

vocab_size = 250000  # cap on the tokenizer vocabulary; also used as the word embedding's input_dim
oov_token = "<OOV>" #out of vocabulary token

# The vocabulary is fitted on the filtered word list (one "text" per word),
# not on the raw training sentences.
tokenizer = Tokenizer(num_words = vocab_size, oov_token = oov_token)
tokenizer.fit_on_texts(words) #train_sample.cleaned_text_training.values)

In [None]:
# Save the fitted word tokenizer next to the notebook.
# to_json() already yields a JSON string; it is JSON-encoded once more when
# written, so readers have to json.load() the file before rebuilding the tokenizer.
tokenizer_json = tokenizer.to_json()
with open('tokenizer.json', mode='w', encoding='utf-8') as f:
    json.dump(tokenizer_json, f, ensure_ascii=False)

In [None]:
# Forward (word -> index) and inverse (index -> word) vocabulary lookups.
word_index = tokenizer.word_index
reverse_word_index = {idx: word for word, idx in word_index.items()}

In [None]:
# Convert every training sentence into its list of word indices.
training_sequences = tokenizer.texts_to_sequences(train_sample.cleaned_text_training)
#validation_sequences = tokenizer.texts_to_sequences(val_sample.cleaned_text_training)

In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

max_length = 60  # every sequence is padded / truncated to this many positions
padding_type = 'post'  # pad at the end of the sequence
trunc_type = 'post'  # cut from the end of the sequence

# Fixed-width integer matrix that feeds the word input of the model.
training_padded = pad_sequences(training_sequences, maxlen = max_length, padding = padding_type, truncating = trunc_type)
#validation_padded = pad_sequences(validation_sequences, maxlen = max_length, padding = padding_type, truncating = trunc_type)

In [None]:
# Per-sentence BILUO label sequences (joined with spaces in the next cell).
training_labels = train_sample.BILUO_labels.values
#validation_labels = val_sample.BILUO_labels.values

In [None]:
# Separate tokenizer for the label alphabet; only the space is filtered out.
label_tokenizer = Tokenizer(filters = ' ')

# Each label sequence becomes one space-separated "text" so the tokenizer can index it.
tok_tr_labels = [" ".join(i) for i in training_labels]
label_tokenizer.fit_on_texts(tok_tr_labels)
train_y = label_tokenizer.texts_to_sequences(tok_tr_labels)
# Padding positions receive the 'o' label; the whole matrix is then shifted to start at 0.
# NOTE(review): assumes label positions line up with the word tokenizer's tokens — confirm.
train_y = pad_sequences(train_y, maxlen = max_length, padding = padding_type, truncating = trunc_type, value = label_tokenizer.word_index['o']) - 1 #subtracting 1 to use the 'to_categorical' method

#tok_val_labels = [" ".join(i) for i in validation_labels]
#val_y = label_tokenizer.texts_to_sequences(tok_val_labels)
#val_y = pad_sequences(val_y, maxlen = max_length, padding = padding_type, truncating = trunc_type, value = label_tokenizer.word_index['o']) - 1

In [None]:
# Save the fitted label tokenizer next to the notebook.
# to_json() already yields a JSON string; it is JSON-encoded once more when written.
label_tokenizer_json = label_tokenizer.to_json()
with open('label_tokenizer.json', mode='w', encoding='utf-8') as f:
    json.dump(label_tokenizer_json, f, ensure_ascii=False)

In [None]:
# Sanity check: shape of the padded, zero-based label matrix.
train_y.shape

In [None]:
# Label -> index mapping (model class ids are these values minus one).
label_tokenizer.word_index

In [None]:
# Per-sentence part-of-speech tag sequences taken from the training frame.
training_pos_labels = train_sample.pos_labels.values
#validation_pos_labels = val_sample.pos_labels.values

In [None]:
# Tokenizer for the POS tag alphabet; only the space is filtered out.
pos_tokenizer = Tokenizer(filters = ' ')
# One space-separated "text" of tags per sentence.
pos_tr_labels = [" ".join(i) for i in training_pos_labels]
pos_tokenizer.fit_on_texts(pos_tr_labels)
train_pos = pos_tokenizer.texts_to_sequences(pos_tr_labels)
# Padded to the same width as the word input; no explicit padding value is given here.
train_pos = pad_sequences(train_pos, maxlen = max_length, padding = padding_type, truncating = trunc_type)

#pos_val_labels = [" ".join(i) for i in validation_pos_labels]
#val_pos = pos_tokenizer.texts_to_sequences(pos_val_labels)
#val_pos = pad_sequences(val_pos, maxlen = max_length, padding = padding_type, truncating = trunc_type)


In [None]:
# Save the fitted POS tokenizer next to the notebook.
# to_json() already yields a JSON string; it is JSON-encoded once more when written.
pos_tokenizer_json = pos_tokenizer.to_json()
with open('pos_tokenizer.json', mode='w', encoding='utf-8') as f:
    json.dump(pos_tokenizer_json, f, ensure_ascii=False)

In [None]:
from tensorflow.keras.utils import to_categorical

# One-hot encode the zero-based label ids; one class per entry of the label vocabulary.
train_y_cat = to_categorical(train_y, num_classes = len(label_tokenizer.word_index))
#val_y_cat = to_categorical(val_y, num_classes = len(label_tokenizer.word_index))

#train_pos_cat = to_categorical(train_pos, num_classes = len(pos_tokenizer.word_index))
#val_pos_cat = to_categorical(val_pos, num_classes = len(pos_tokenizer.word_index))

In [None]:
# Flatten the per-sentence BILUO label lists and report the class balance.
trainlabtot = [i for j in train_sample.BILUO_labels for i in j]
#vallabtot = [i for j in val_sample.BILUO_labels for i in j]



#print(set(trainlabtot), set(vallabtot)) 

# Build the array once instead of once per label.
trainlab_arr = np.array(trainlabtot)

no_of_b = int(np.count_nonzero(trainlab_arr == 'B'))
no_of_i = int(np.count_nonzero(trainlab_arr == 'I'))
no_of_l = int(np.count_nonzero(trainlab_arr == 'L'))
no_of_u = int(np.count_nonzero(trainlab_arr == 'U'))
no_of_o = int(np.count_nonzero(trainlab_arr == 'O'))

tot = no_of_b + no_of_i + no_of_l + no_of_u + no_of_o

# The share is printed as a real percentage (fraction * 100) so that it
# matches the '%' sign in the message; an empty label set prints 0.0.
for label_name, label_count in (("B", no_of_b), ("I", no_of_i), ("L", no_of_l), ("U", no_of_u), ("O", no_of_o)):
    label_share = round(100 * label_count / tot, 2) if tot else 0.0
    print("{} : {} - {}%".format(label_name, label_count, label_share))

## MODEL

In [None]:
#!pip install git+https://www.github.com/keras-team/keras-contrib.git

In [None]:
from tensorflow.keras import Model, Input, Sequential
from tensorflow.keras.layers import LSTM, Embedding, Dense, Concatenate
from tensorflow.keras.layers import TimeDistributed, SpatialDropout1D, Bidirectional
#from keras_contrib.layers import CRF

In [None]:
DROPOUT = 0.4  # used for the spatial dropout layer and for the LSTMs' recurrent dropout

OUTPUT_LENGTH = len(label_tokenizer.word_index)  # one softmax unit per label class

#input for word embedding
input_word = Input(shape = (max_length,), name = 'input_word')#

#input for pos embedding
input_pos = Input(shape = (max_length,), name = 'input_pos')

#word embedding layer (the embedding size is set to max_length, same value as the sequence length)
word_embed = Embedding(input_dim = vocab_size, output_dim = max_length, input_length = max_length, name = 'word_embedding')(input_word)

#pos embedding layer
pos_embed = Embedding(input_dim = len(pos_tokenizer.word_index) + 1, output_dim = max_length, input_length = max_length, name = 'pos_embedding')(input_pos) #+1 to match the embedding 

#joining the word and POS embeddings (not LSTM outputs) into one feature tensor
conc = Concatenate()([word_embed, pos_embed])

#dropout layer
model = SpatialDropout1D(DROPOUT)(conc)

#double BLSTM, stacked: both layers process the concatenated word+POS features.
#NOTE(review): 'word_LSTM' names the Bidirectional wrapper while 'pos_LSTM' names the inner LSTM layer.
model = Bidirectional(LSTM(units = 50, return_sequences = True, recurrent_dropout = DROPOUT), name = 'word_LSTM')(model)
model = Bidirectional(LSTM(units = 50, return_sequences = True, recurrent_dropout = DROPOUT, name = 'pos_LSTM'))(model)

#conv layer later?

#output: per-position class probabilities
out = TimeDistributed(Dense(OUTPUT_LENGTH, activation = 'softmax'))(model)

#model (the name `model` is rebound from the intermediate tensor to the final Model)
model = Model([input_word, input_pos], out)

model.summary()

In [None]:
from keras import backend as K

def recall_m(y_true, y_pred):
    """Recall over the given tensors: rounded hits divided by rounded actual positives.

    Both counts are taken after clipping to [0, 1] and rounding; the backend
    epsilon in the denominator avoids a division by zero.
    """
    hits = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    actual_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    return hits / (actual_positives + K.epsilon())

def precision_m(y_true, y_pred):
    """Precision over the given tensors: rounded hits divided by rounded predicted positives.

    Both counts are taken after clipping to [0, 1] and rounding; the backend
    epsilon in the denominator avoids a division by zero.
    """
    hits = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    flagged_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    return hits / (flagged_positives + K.epsilon())

def f1_m(y_true, y_pred):
    """F1 score built from ``precision_m`` and ``recall_m`` (harmonic mean, epsilon-stabilised)."""
    p = precision_m(y_true, y_pred)
    r = recall_m(y_true, y_pred)
    return 2*((p*r)/(p+r+K.epsilon()))

In [None]:
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
#from livelossplot.tf_keras import PlotLossesCallback

In [None]:
# Label -> index mapping, shown again as a reference for the class-weight order used below.
label_tokenizer.word_index

In [None]:
def calc_class_weights(class_props, n_classes, scale = None):
    """Turn class proportions into loss weights that favour rare classes.

    Parameters
    ----------
    class_props : sequence of float
        Share of each class in the data (list, tuple or array).
    n_classes : int
        Kept for backward compatibility; not used in the computation.
    scale : str or None
        ``'log'`` returns ``log(1 / proportion)``; any other value returns
        ``max(proportion) / proportion`` (the most frequent class gets weight 1).

    Returns
    -------
    numpy.ndarray
        One weight per entry of ``class_props``.
    """
    # Convert up front so that plain Python lists work for both scales
    # (``1 / list`` is a TypeError).
    props = np.asarray(class_props, dtype = float)
    if scale == 'log':
        return np.log(1 / props)
    return np.max(props) / props


# Label counts and shares copied from an earlier run of the label-statistics cell.
# The numbers after the dash are fractions of all labels (0.0103 = 1.03 %), despite the '%' sign.
#B : 4090 - 0.0103%
#I : 11517 - 0.0289%
#L : 4090 - 0.0103%
#U : 312 - 0.0008%
#O : 378285 - 0.9498%

# Shares are listed in the order O, I, B, L, U.
# NOTE(review): presumably this is the class-id order of label_tokenizer.word_index (minus one) — confirm.
# The second argument (5) is not used by calc_class_weights.
class_weights = calc_class_weights([0.9498, 0.0289, 0.0103, 0.0103, 0.0008], 5)

In [None]:
from keras import backend as K


def weighted_categorical_crossentropy(weights):
    """
    Build a categorical cross-entropy loss in which every class has its own weight.

    Variables:
        weights: numpy array of shape (C,) where C is the number of classes

    Usage:
        weights = np.array([0.5,2,10]) # Class one at 0.5, class 2 twice the normal weights, class 3 10x.
        loss = weighted_categorical_crossentropy(weights)
        model.compile(loss=loss,optimizer='adam')

    Returns:
        A function ``loss(y_true, y_pred)`` suitable for ``model.compile``.
    """

    per_class_weights = K.variable(weights)

    def loss(y_true, y_pred):
        # renormalise so the class probabilities of each sample sum to 1
        normalised = y_pred / K.sum(y_pred, axis=-1, keepdims=True)
        # keep the values away from 0 and 1 so the logarithm stays finite
        bounded = K.clip(normalised, K.epsilon(), 1 - K.epsilon())
        # weighted log-likelihood, summed over the class axis and negated
        weighted_terms = y_true * K.log(bounded) * per_class_weights
        return -K.sum(weighted_terms, -1)

    return loss

In [None]:
# Loss function instance bound to the class weights computed above.
w_categorical_crossentropy = weighted_categorical_crossentropy(class_weights)

In [None]:
# Peek at the `text` field of the first (shuffled) training row.
train_sample.iloc[0].text

In [None]:
# Switch between training from scratch (True) and loading previously saved weights (False).
trainmodel = True

BATCH_SIZE = 8  # also reused as the batch size at prediction time
EPOCHS = 15

# The weighted cross-entropy replaces plain categorical cross-entropy; the custom
# metrics are reported next to accuracy.
model.compile(optimizer =  'adam', 
              loss = w_categorical_crossentropy, # 'categorical_crossentropy', 
              metrics = ['accuracy',f1_m, precision_m, recall_m])

if trainmodel:

    #early_stopping = EarlyStopping(monitor = 'val_f1_m', patience = 1, verbose = 0, mode='max', restore_best_weights = True)

    #callbacks = [early_stopping]

    # No validation data is passed: the validation frame was merged into the training set.
    history = model.fit(
        [training_padded, train_pos], np.array(train_y_cat),
        #validation_data = ([validation_padded, val_pos], np.array(val_y_cat)),
        batch_size = BATCH_SIZE,
        epochs = EPOCHS,
        verbose = 1,
        #callbacks = callbacks

        )

    model.save('./model4.h5')
    
else:
    # NOTE(review): `history` is not defined on this branch, so cells that plot it will fail;
    # the file loaded here (model2.h5) is also a different name from the one saved above (model4.h5).
    model.load_weights("../input/coleridge-ner-5-train/model2.h5")

In [None]:
from matplotlib import pyplot as plt
# `history` is only created when the model is trained in this session
# (trainmodel = True); skip the plot when saved weights were loaded instead,
# so the notebook still runs from top to bottom.
if trainmodel:
    plt.plot(history.history['loss'])
#plt.plot(history.history['val_loss'])

In [None]:
# `history` is only created when the model is trained in this session
# (trainmodel = True); skip the plot when saved weights were loaded instead,
# so the notebook still runs from top to bottom.
if trainmodel:
    plt.plot(history.history['f1_m'])
#plt.plot(history.history['val_f1_m'])

In [None]:
def clean_text(txt):
    """Lower-case ``txt`` (converted with ``str``) and replace every run of
    characters other than ASCII letters and digits with a single space.

    The result is not stripped, so leading/trailing spaces may remain.
    """
    lowered = str(txt).lower()
    return re.sub('[^A-Za-z0-9]+', ' ', lowered)

In [None]:
#stops = set(stopwords.words('english')).difference(['in', 'from', 'on', 'of', 's', 'at'])

def clean_training_text(txt):
    """
    Clean ``txt`` for the model without changing its case.

    Runs of characters other than ASCII letters, digits, parentheses and the
    hyphen are collapsed into a single space, and the result is stripped.
    """
    as_string = str(txt)
    collapsed = re.sub('[^A-Za-z0-9()-]+', ' ', as_string)
    return collapsed.strip()

In [None]:
def break_sentence(sentence, max_sentence_length, overlap):
    """Split a long sentence into overlapping windows of words.

    Parameters
    ----------
    sentence : str
        Sentence to split on whitespace.
    max_sentence_length : int
        Maximum number of words per returned piece.
    overlap : int
        Number of words shared by two consecutive pieces; must be smaller
        than ``max_sentence_length`` when the sentence has to be split.

    Returns
    -------
    list of str
        ``[sentence]`` unchanged when it is short enough, otherwise the
        windows joined with single spaces. Windowing stops as soon as a
        window reaches the end of the sentence, so no trailing piece that is
        entirely contained in the previous one is produced.

    Raises
    ------
    ValueError
        If a split is needed and ``overlap >= max_sentence_length``.
    """
    words = sentence.split()
    sentence_length = len(words)

    if sentence_length <= max_sentence_length:
        return [sentence]

    step = max_sentence_length - overlap
    if step <= 0:
        # a non-positive stride would either fail in range() or silently drop the sentence
        raise ValueError("overlap must be smaller than max_sentence_length")

    broken_sentences = []
    for p in range(0, sentence_length, step):
        broken_sentences.append(" ".join(words[p:p + max_sentence_length]))
        if p + max_sentence_length >= sentence_length:
            # this window already contains the last word
            break

    return broken_sentences

In [None]:
def disambiguate_entities(entities_list):

    """
    Keep only the entities that are not contained in another entity of the list.

    When the string of one entity occurs inside the string of another entity,
    only the longer one is kept. Duplicates are removed and the order of first
    appearance in ``entities_list`` is preserved.

    Each entity is compared with every other entity individually, so a match
    can never arise from text that spans two neighbouring entities.
    """

    # dict.fromkeys de-duplicates while keeping a deterministic order
    unique_entities = list(dict.fromkeys(entities_list))

    final_list = []

    for entity in unique_entities:
        contained_elsewhere = any(
            entity in other for other in unique_entities if other != entity
        )
        if not contained_elsewhere:
            final_list.append(entity)

    return final_list

In [None]:
# Label -> index mapping, shown again as a reference for the label ids checked in predict_dataset.
label_tokenizer.word_index

In [None]:
def predict_dataset(paper_test_sentences, paper_sentences_pos, print_warn_message = False):
    """Predict dataset mentions for the sentences of one paper (debug-printing variant).

    NOTE(review): a later cell defines ``predict_dataset`` again with extra
    keyword arguments; when the cells run top to bottom that definition
    replaces this one.

    Parameters
    ----------
    paper_test_sentences : list of str
        Sentences to run through the model.
    paper_sentences_pos : list of list of str
        POS tags of each sentence, in the same order as ``paper_test_sentences``.
    print_warn_message : bool
        When True, report every position where a non-'O' label was predicted
        beyond the end of the tokenized sentence.

    Returns
    -------
    list of str
        One ``clean_text``-ed string per sentence whose predicted labels form
        an accepted pattern (a single 'U', or a 'B' together with an 'I' or 'L').
    """

    # nothing to predict for a paper without sentences
    if len(paper_test_sentences) == 0:
        return []

    #preparing data for prediction
    tok = tokenizer.texts_to_sequences(paper_test_sentences)
    pad = pad_sequences(tok, maxlen = max_length, padding = padding_type, truncating = trunc_type)

    pos_tok = pos_tokenizer.texts_to_sequences([" ".join(i) for i in paper_sentences_pos])
    pos_pad = pad_sequences(pos_tok, maxlen = max_length, padding = padding_type, truncating = trunc_type)

    pred = model.predict([pad, pos_pad], batch_size = BATCH_SIZE)

    pred_lab = np.argmax(pred, axis = -1)

    predtexts = []

    #mapping predictions
    for p_idx, p in enumerate(pred_lab):
        predictiontext = ''
        predictionlabels = []

        # class 0 is 'O'; only sentences with at least one entity label are inspected
        if len(set([1,2,3,4]).intersection(set(p)))>0:
            for l in range(len(p)):
                if p[l] > 0:
                    try:
                        word = reverse_word_index[tok[p_idx][l]]
                    except IndexError:
                        # the label was predicted on a padded position
                        if print_warn_message:
                            print("Sentence: {}".format(paper_test_sentences[p_idx]), "The model attempted to assign a 'I' or 'B' to a padded character")
                        continue

                    if len(predictiontext)==0:
                        predictiontext += word
                    elif word not in predictiontext:
                        # substring test kept as in the original: a word already
                        # present in the collected text is not added again
                        predictiontext += " {}".format(word)
                    predictionlabels.append(p[l])

        if len(predictionlabels) >0:

            write = False

            if len(predictionlabels) == 1: #if there's only one relevant label, that should be a 'U'. Otherwise avoid producing result
                # compare the label itself (not the whole list) with the 'U' id
                write = bool(predictionlabels[0] == label_tokenizer.word_index['u']-1)

            #if there are multiple relevant labels: an end ('L') or a middle ('I') must be present ...
            elif (label_tokenizer.word_index['l']-1 in predictionlabels
                  or label_tokenizer.word_index['i']-1 in predictionlabels):
                # ... and there must be the beginning ('B') as well
                write = label_tokenizer.word_index['b']-1 in predictionlabels

            if write:
                print(predictiontext, predictionlabels, paper_test_sentences[p_idx], list(zip(p, [t for t in nlp(paper_test_sentences[p_idx])])))
                predtexts.append(clean_text(predictiontext))

    return predtexts

In [None]:
def predict_dataset(paper_test_sentences, paper_sentences_pos, print_warn_message = False, string_matching = False, existing_labels = []):
    """Predict dataset mentions for the sentences of one paper.

    Parameters
    ----------
    paper_test_sentences : list of str
        Sentences to run through the model.
    paper_sentences_pos : list of list of str
        POS tags of each sentence, in the same order as ``paper_test_sentences``.
    print_warn_message : bool
        When True, report every position where a non-'O' label was predicted
        beyond the end of the tokenized sentence.
    string_matching : bool
        When True, additionally add every label of ``existing_labels`` whose
        words all occur in a sentence.
    existing_labels : list of str
        Known dataset labels used for string matching (never modified here).

    Returns
    -------
    list of str
        ``clean_text``-ed prediction strings: one per sentence whose predicted
        labels form an accepted pattern (a single 'U', or a 'B' together with
        an 'I' or 'L'), followed by the string-matched labels, if requested.
    """

    # nothing to predict (or match) for a paper without sentences
    if len(paper_test_sentences) == 0:
        return []

    #preparing data for prediction
    tok = tokenizer.texts_to_sequences(paper_test_sentences)
    pad = pad_sequences(tok, maxlen = max_length, padding = padding_type, truncating = trunc_type)

    pos_tok = pos_tokenizer.texts_to_sequences([" ".join(i) for i in paper_sentences_pos])
    pos_pad = pad_sequences(pos_tok, maxlen = max_length, padding = padding_type, truncating = trunc_type)

    pred = model.predict([pad, pos_pad], batch_size = BATCH_SIZE)

    pred_lab = np.argmax(pred, axis = -1)

    predtexts = []

    #mapping predictions
    for p_idx, p in enumerate(pred_lab):
        predictiontext = ''
        predictionlabels = []

        # class 0 is 'O'; only sentences with at least one entity label are inspected
        if len(set([1,2,3,4]).intersection(set(p)))>0:
            for l in range(len(p)):
                if p[l] > 0:
                    try:
                        word = reverse_word_index[tok[p_idx][l]]
                    except IndexError:
                        # the label was predicted on a padded position
                        if print_warn_message:
                            print("Sentence: {}".format(paper_test_sentences[p_idx]), "The model attempted to assign a 'I' or 'B' to a padded character")
                        continue

                    if len(predictiontext)==0:
                        predictiontext += word
                    elif word not in predictiontext:
                        # substring test kept as in the original: a word already
                        # present in the collected text is not added again
                        predictiontext += " {}".format(word)
                    predictionlabels.append(p[l])

        if len(predictionlabels) >0:

            write = False

            if len(predictionlabels) == 1: #if there's only one relevant label, that should be a 'U'. Otherwise avoid producing result
                # compare the label itself (not the whole list) with the 'U' id
                write = bool(predictionlabels[0] == label_tokenizer.word_index['u']-1)

            #if there are multiple relevant labels: an end ('L') or a middle ('I') must be present ...
            elif (label_tokenizer.word_index['l']-1 in predictionlabels
                  or label_tokenizer.word_index['i']-1 in predictionlabels):
                # ... and there must be the beginning ('B') as well
                write = label_tokenizer.word_index['b']-1 in predictionlabels

            if write:
                predtexts.append(clean_text(predictiontext))

    if string_matching:
        # clean and split every known label once instead of once per sentence
        known_label_sets = [
            (known_label, set(clean_training_text(known_label).lower().split()))
            for known_label in existing_labels
        ]

        for txt in paper_test_sentences:
            sentence_words = set(clean_training_text(txt).lower().split())

            for known_label, labelset in known_label_sets:
                # a label with no words left after cleaning would match every sentence
                if labelset and labelset.issubset(sentence_words):
                    predtexts.append(clean_text(known_label))

    return predtexts

In [None]:
def pos_tagging_nltk(x):
    """Return the NLTK part-of-speech tags of the words in ``x``.

    ``x`` is tokenized with ``word_tokenize`` and tagged with ``nltk.pos_tag``.

    Returns
    -------
    tuple of str
        One tag per token; an empty tuple when ``x`` contains no tokens
        (unzipping the tagger output directly would raise IndexError there).
    """

    tok = word_tokenize(x)

    pos = nltk.pos_tag(tok)

    return tuple(tag for _, tag in pos)


def pos_tagging(x):
    """Return the spaCy ``pos_`` attribute of every token of ``x`` as a list,
    using the module-level ``nlp`` pipeline."""
    tags = []
    for token in nlp(x):
        tags.append(token.pos_)
    return tags

In [None]:

    
overlap = 20 #number of overlapping words in case a sentence is broken in more sentences


# NOTE(review): defined but not passed to predict_dataset below.
include_string_matching = False

test_path = '../input/coleridgeinitiative-show-us-the-data/sample_submission.csv'
test = pd.read_csv(test_path)

test_folder = '../input/coleridgeinitiative-show-us-the-data/test'


for paper_id in test['Id'].unique():

    paper_test_sentences = []
    paper_sentences_pos = []

    # explicit encoding so that reading the paper does not depend on the platform default
    with open(f'{test_folder}/{paper_id}.json', 'r', encoding='utf-8') as f:
        paper = json.load(f)

    for section in paper:

        section_name = section['section_title']

        # the exclusion tuple is currently empty, so every section is processed
        if section_name.lower() not in (): #'acknowledgements', 'acknowledgement', 'reference', 'references'):

            text = section['text']

            for sentence in sent_tokenize(text):

                # long sentences are split into overlapping windows of at most max_length words
                for sub_sentence in break_sentence(sentence, max_length, overlap):

                    sub_sentence = clean_training_text(sub_sentence)

                    if len(sub_sentence)>0:
                        paper_test_sentences.append(sub_sentence)

    # POS tags for all sentences of the paper in one batched pass
    for txt in nlp.pipe(paper_test_sentences, disable=['ner', 'parser', "tok2vec", "attribute_ruler",
                                "lemmatizer", "textcat", "senter", "sentencizer"]):
        paper_sentences_pos.append([token.pos_ for token in txt])

    # a paper without any usable sentence yields no prediction
    if paper_test_sentences:
        predtexts = predict_dataset(paper_test_sentences, paper_sentences_pos)
    else:
        predtexts = []

    # sorted() makes the order of the joined predictions reproducible between runs
    test.loc[test.Id == paper_id, 'PredictionString'] = "|".join(sorted(set(predtexts).difference(set([""]))))

In [None]:
# Inspect the prediction strings written by the inference loop.
test.PredictionString.values

In [None]:
# Display the submission frame.
test

In [None]:
#test.to_csv('submission.csv')

In [None]:
# Write the submission file without the DataFrame index column.
test.to_csv('submission.csv', index=False)