# Overview

0. <a href='#imports'>Imports</a>
1. <a href='#data'>Data loading and processing</a>
2. <a href='#functions'>Relevant functions</a>
3. <a href='#sentence'>Sentence level</a>
    1. <a href='#flair'>Flair and TARS</a>
    2. <a href='#lstm'>LSTM model</a>
        1. <a href='#lstm_prediction'>Predict relevant sentences</a>
        2. <a href='#lstm_model'>Save Model</a>


<a id='imports'></a>
# Imports

In [None]:
%%capture
import numpy as np
import pandas as pd
import json
import os
import re
import pprint
from tqdm import tqdm
import spacy

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
        
from tensorflow import keras
from tensorflow.keras import layers
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.callbacks import EarlyStopping
from keras.models import Model
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D, Input, GlobalAveragePooling1D

from collections import Counter

import nltk
import tensorflow as tf
from fuzzywuzzy import fuzz

!pip install --upgrade git+https://github.com/zalandoresearch/flair.git
    
from flair.data import Corpus
from flair.datasets import SentenceDataset
from flair.trainers import ModelTrainer
from flair.models.text_classification_model import TARSClassifier
from flair.data import Sentence

from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import f1_score
from sklearn.metrics import auc, roc_curve
from numpy import argmax
import scipy.stats

<a id='data'></a>
# Data loading and processing

In [None]:
train_df_annotated = pd.read_pickle("../input/give-us-the-data-in-sentences/train_df_annotated.pkl")
train_df_annotated.head()

In [None]:
# Collect every known dataset name (label, title and cleaned label), lower-cased
# and de-duplicated, as a NumPy array of strings.
# NOTE(review): CI_train_df is not created in any cell visible in this notebook -
# confirm where it is loaded before running this cell.
title_columns = ('dataset_label', 'dataset_title', 'cleaned_label')
ds_titles = {
    value.lower()
    for column in title_columns
    for value in CI_train_df[column].unique()
}
ds_titles = np.array(list(ds_titles))

In [None]:
train_df = pd.read_pickle("../input/give-us-the-data-in-sentences/train_df.pkl")
train_df.head()

In [None]:
# Label every row of train_df (one sentence per row):
#   'TP'  - the sentence contains a known dataset title (one record per matching title)
#   'N'   - no title in the sentence and row.section_match is False
#   'UNK' - no title in the sentence and row.section_match is not False
sentences = []
for row in tqdm(train_df.itertuples(), total=train_df.shape[0]):
    # Deliberately not called `clean_text`: that name belongs to the clean_text()
    # helper defined in the "Relevant functions" section, and a module-level
    # assignment here would overwrite it whenever this cell is re-run.
    doc_text = row.clean_text
    sentence = row.sentence_text

    # Cheap pre-filter on the document text first, then keep only the titles
    # that also occur in this particular sentence.
    # NOTE(review): ds_titles are lower-cased; this assumes sentence_text is
    # lower-cased as well - confirm, otherwise capitalised mentions are missed.
    matched_titles = [t for t in ds_titles if t in doc_text and t in sentence]

    if matched_titles:
        sentences.extend(
            {'Id': row.Index, 'sentence': sentence, 'match': title, 'group': 'TP'}
            for title in matched_titles
        )
    else:
        group = 'N' if row.section_match == False else 'UNK'
        sentences.append({'Id': row.Index, 'sentence': sentence, 'match': None, 'group': group})

# Explicit columns keep the frame well-formed even when no sentence was collected.
sentence_df = pd.DataFrame(sentences, columns=['Id', 'sentence', 'match', 'group'])
sentence_df['n_chars'] = sentence_df.sentence.str.len()

In [None]:
sentence_df.group.unique()

In [None]:
train_df.info()

<a id='functions'></a>
# Relevant functions

In [None]:
def clean_text(txt):
    """Lower-case `txt` (coerced to str) and replace each run of characters
    outside A-Z, a-z, 0-9 with a single space."""
    lowered = str(txt).lower()
    return re.sub('[^A-Za-z0-9]+', ' ', lowered)

In [None]:
def get_text(sample_id, location='/kaggle/input/coleridgeinitiative-show-us-the-data/train/'):
    """Read `<location><sample_id>.json` and return all of its sections as one string.

    The file is expected to hold a list of objects with 'section_title' and
    'text' keys; each section contributes "<title> <text>" and the sections
    are joined with single spaces.
    """
    json_path = location + sample_id + ".json"
    with open(json_path, "r") as handle:
        sections = json.load(handle)

    pieces = []
    for section in sections:
        pieces.append(section['section_title'] + " " + section['text'])
    return " ".join(pieces)

In [None]:
def tokenize(text, n_words=25000, sequence_length=25, tokenizer=None):
    """Convert texts to fixed-length integer sequences.

    Parameters
    ----------
    text : iterable of str
        Texts to convert.
    n_words : int
        `num_words` passed to a newly created Tokenizer (ignored when an
        existing `tokenizer` is supplied).
    sequence_length : int
        Length every sequence is padded (at the front) or truncated to.
    tokenizer : Tokenizer or None
        An already fitted tokenizer to reuse. When None, a new tokenizer is
        created and fitted on `text`. Pass the tokenizer returned for the
        training data when encoding data for prediction, so that word ids
        match the ones the model was trained on.

    Returns
    -------
    (X, tokenizer)
        The padded sequences and the tokenizer that produced them.
    """
    if tokenizer is None:
        # Previously a fresh tokenizer was always built here, silently
        # discarding whatever the caller passed in.
        tokenizer = Tokenizer(num_words=n_words, filters=r'!"#$%&()*+,-.:;<=>?@[\]^_`{|}~', lower=True)
        tokenizer.fit_on_texts(text)
    X = tokenizer.texts_to_sequences(text)
    X = pad_sequences(X, maxlen=sequence_length, padding='pre')
    return X, tokenizer

<a id='sentence'></a>
# Sentences

In [None]:
sentence_df.to_pickle("./sentence_df.pkl")
sentence_df.head()

#### Analyze sentence data

In [None]:
# Preview: 5 random sentences followed by 5 random 'TP' sentences.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
pd.concat([sentence_df.sample(5), sentence_df[sentence_df.group == 'TP'].sample(5)])

In [None]:
# Build the training frame: all 'TP' sentences first, followed by a random
# sample of 'N' sentences that is `mulitply_negative` times as large.
mulitply_negative = 10
positives = sentence_df[sentence_df.group == 'TP']
negatives = sentence_df[sentence_df.group == 'N']
# Never request more negatives than exist (sample() raises otherwise).
n_negative = min(len(negatives), positives.shape[0] * mulitply_negative)
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
TP_df = pd.concat([positives, negatives.sample(n_negative)])
# Lower-case and strip everything except a-z and spaces.
TP_df['clean'] = TP_df.sentence.str.lower().replace(r"[^a-z ]+","", regex=True)
TP_df['n_words'] = TP_df.clean.apply(lambda x: len(str(x).split()))
TP_df.head()

In [None]:
# Remove sentences with fewer than 5 words.
too_short = TP_df.index[TP_df.n_words < 5]
TP_df = TP_df.drop(index=too_short)
TP_df.head()

In [None]:
print(TP_df['group'].unique())
print(len(TP_df[TP_df.group == 'N']))

In [None]:
print ("Maximum number of characters in sentence:", TP_df.n_chars.max())
print ("Average number of characters in sentence:", int(TP_df.n_chars.mean()))
print ("Total number of characters in sentence:", int(TP_df.n_chars.sum()))
TP_df[TP_df.n_chars < 1000].n_chars.hist(bins=50);

In [None]:
print ("Maximum number of words in sentence:", TP_df.n_words.max())
print ("Average number of words in sentence:", int(TP_df.n_words.mean()))
print ("Total number of words in sentence:", int(TP_df.n_words.sum()))
TP_df[TP_df.n_words < 500].n_words.hist(bins=10);

#### Tokenize 

In [None]:
%time 
tqdm.pandas()
TP_df['tokenized'] = TP_df.clean.progress_apply(lambda x: [ \
    w for w in nltk.word_tokenize(x[:500])])

In [None]:
# Count how often each token occurs across all tokenized sentences.
unique_words = Counter()
for token_list in tqdm(TP_df.tokenized.values):
    unique_words.update(token_list)
print (f"Unique words: {len(unique_words)}")

In [None]:
# Vocabulary: tokens that occur at least `min_occurencies` times and are at
# least `min_word_len` characters long, ordered by descending frequency.
min_occurencies = 10
min_word_len = 3
my_vocab = {
    word: count
    for word, count in unique_words.most_common()
    if count >= min_occurencies and len(word) >= min_word_len
}
vocab_size = len(my_vocab)
print(vocab_size)

In [None]:
# Tokenizer vocabulary size and the padded sequence length used for the model input.
n_words = vocab_size 
# NOTE(review): sequence_length is later used as the padded length of word-token
# sequences (tokenize -> pad_sequences), but it is derived here from the mean
# number of *characters* per sentence (n_chars), not words (n_words) - confirm
# this is intended.
sequence_length = int(TP_df.n_chars.mean())

In [None]:
TP_df.clean

In [None]:
# Fit a tokenizer on the cleaned training sentences and pad them to sequence_length.
# tokenizer_ is the fitted tokenizer returned by tokenize().
padded_df, tokenizer_ = tokenize(TP_df.clean, sequence_length=sequence_length, n_words=vocab_size)

In [None]:
# Model inputs are the padded token sequences; the target is 1 for 'TP'
# sentences and 0 for every other group.
x_train = padded_df
y_train = (TP_df.group == 'TP').astype(int)

In [None]:
# Apply the same cleaning used for TP_df.clean (lower-case, keep only a-z and
# spaces) to every sentence, so the full set can be scored by the model.
sentence_df['clean'] = sentence_df.sentence.str.lower().replace(r"[^a-z ]+","", regex=True)

<a id='flair'></a>
## Flair and TARS

References:
* [https://kishaloyhalder.github.io/pdfs/tars_coling2020.pdf](https://kishaloyhalder.github.io/pdfs/tars_coling2020.pdf)
* [https://github.com/flairNLP/flair](https://github.com/flairNLP/flair)

In [None]:
# Print one random 'TP' sentence. The position is drawn within the filtered
# 'TP' rows, so the result no longer depends on TP rows being stored first
# in TP_df.
tp_clean_sentences = TP_df.loc[TP_df.group == 'TP', 'clean']
chosen_idx_TP_0 = np.random.choice(len(tp_clean_sentences))
print(tp_clean_sentences.iloc[chosen_idx_TP_0])

In [None]:
# Distinct matched dataset titles. dropna() removes the None that the
# negative ('N') rows carry in `match`, which can never select any row.
unique_labels = TP_df.match.dropna().unique()
# Show one random sentence that mentions the first title.
example_idx = np.random.choice(TP_df[TP_df.match == unique_labels[0]].index)
print(Sentence(TP_df.clean[example_idx]))

In [None]:
# Corpus
# Few-shot corpus for TARS: one random sentence for each of the first
# N_TRAIN_TP_LABELS dataset titles (label 'TP') plus N_TRAIN_NEGATIVES random
# negative sentences (label 'N'); the test split holds N_TEST_PER_CLASS
# sentences of each class.
N_TRAIN_TP_LABELS = 42
N_TRAIN_NEGATIVES = 22
N_TEST_PER_CLASS = 2


def _random_clean_sentence(frame):
    """Return the `clean` text of one uniformly sampled row of `frame`."""
    return frame.clean.iloc[np.random.randint(len(frame))]


def _labelled_sentence(text, value):
    """Wrap `text` in a flair Sentence carrying the 'TP_N' label `value`."""
    return Sentence(text).add_label('TP_N', value)


# Sample strictly inside each class. Indexing the whole frame by position with
# a bound of len(N rows) also hits the TP rows stored at the top of TP_df, so
# some "negative" examples were in fact positive sentences.
tp_rows = TP_df[TP_df.group == 'TP']
n_rows = TP_df[TP_df.group == 'N']

# Skip the None entry contributed by negative rows and do not assume that at
# least 42 distinct titles exist.
train_labels = [label for label in unique_labels if pd.notna(label)][:N_TRAIN_TP_LABELS]

train_corpus = SentenceDataset(
    [_labelled_sentence(_random_clean_sentence(TP_df[TP_df.match == label]), 'TP')
     for label in train_labels]
    + [_labelled_sentence(_random_clean_sentence(n_rows), 'N')
       for _ in range(N_TRAIN_NEGATIVES)])

test_corpus = SentenceDataset(
    [_labelled_sentence(_random_clean_sentence(tp_rows), 'TP')
     for _ in range(N_TEST_PER_CLASS)]
    + [_labelled_sentence(_random_clean_sentence(n_rows), 'N')
       for _ in range(N_TEST_PER_CLASS)])

corpus = Corpus(train=train_corpus, test=test_corpus)


In [None]:
%%script false
# TARS + FLAIR
tars = TARSClassifier.load('tars-base')
tars.add_and_switch_to_new_task("TP_or_N", label_dictionary=corpus.make_label_dictionary())

In [None]:
%%script false
trainer = ModelTrainer(tars, corpus)

In [None]:
%%script false
trainer.train(base_path='resources/taggers/tp_or_n',
              learning_rate=0.02,
              mini_batch_size=1,
              max_epochs=10,
              train_with_dev=True,
              )

In [None]:
%%script false
# Load the fine-tuned model written to resources/taggers/tp_or_n
tars = TARSClassifier.load('resources/taggers/tp_or_n/final-model.pt')

# Prepare a test sentence that includes dataset name
tp_clean_sentences = TP_df.loc[TP_df.group == 'TP', 'clean']
sentence = Sentence(tp_clean_sentences.iloc[np.random.choice(len(tp_clean_sentences))])
tars.predict(sentence)
print(sentence)
print(sentence.labels[0])

# ...and one without a dataset name. Sample inside the 'N' rows only:
# positional indexing into the whole frame would also return TP sentences.
n_clean_sentences = TP_df.loc[TP_df.group == 'N', 'clean']
sentence = Sentence(n_clean_sentences.iloc[np.random.choice(len(n_clean_sentences))])
tars.predict(sentence)
print(sentence)
print(sentence.labels[0])

In [None]:
%%script false
# Evaluation subset: 50 random rows plus 50 random 'TP' rows.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
TP_df_part = pd.concat([TP_df.sample(50), TP_df[TP_df.group == 'TP'].sample(50)])
# A 'TP' row can be drawn by both samples; keep a single copy so that label
# based look-ups on this frame return scalars rather than Series.
TP_df_part = TP_df_part[~TP_df_part.index.duplicated()]
TP_df_part.head()

In [None]:
%%script false
# Score every sentence of TP_df_part with TARS and store the probability of
# the 'TP' class in TP_df_part.prediction.
#
# The original branches reduce to: keep the score when the predicted label is
# 'TP', otherwise use 1 - score. The score was a string parsed from the printed
# label, so `1-pred_i` raised TypeError; the label's value and score attributes
# are read directly instead.
predictions = []
for row in tqdm(TP_df_part.itertuples(), total=len(TP_df_part)):
    sentence = Sentence(row.clean)
    tars.predict(sentence)
    if not sentence.labels:
        # NOTE(review): no label returned for this sentence - recorded as NaN;
        # confirm whether 0.0 would be preferable downstream.
        predictions.append(np.nan)
        continue
    label = sentence.labels[0]
    tp_probability = label.score if label.value == 'TP' else 1 - label.score
    predictions.append(float(tp_probability))

# Assigned positionally, so duplicate index labels cannot misalign the values.
TP_df_part['prediction'] = predictions
TP_df_part.head()

In [None]:
%%script false
# Preview: 5 random rows followed by 5 random 'TP' rows.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
pd.concat([TP_df_part.sample(5), TP_df_part[TP_df_part.group == 'TP'].sample(5)])

In [None]:
%%script false
print ("Maximum prediction of sentence:", TP_df_part.prediction.max())
print ("Minimum prediction of sentence:", TP_df_part.prediction.min())
print ("Average prediction of sentence:", TP_df_part.prediction.mean())
TP_df_part.prediction.hist(bins=10);

In [None]:
%%script false
TP_df_part.to_pickle("./part_flair_pred.pkl")

<a id='lstm'></a>
## LSTM model

In [None]:
class Transformer(layers.Layer):
    """Transformer encoder block.

    Applies multi-head self-attention followed by a two-layer feed-forward
    network (hidden size 32). Each sub-layer is followed by dropout, a
    residual connection and layer normalisation.

    Parameters
    ----------
    embed_dim : int
        Size of the last input dimension; also used as the attention key_dim
        and as the output size of the feed-forward network.
    num_heads : int
        Number of attention heads.
    rate : float
        Dropout rate used after both sub-layers.
    **kwargs
        Forwarded to `layers.Layer` (e.g. `name`), which is required for the
        layer to be re-created from its config.
    """

    def __init__(self, embed_dim, num_heads, rate=0.1, **kwargs):
        super().__init__(**kwargs)
        # Kept so get_config() can report the constructor arguments.
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.rate = rate
        self.multihead = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.sequential = keras.Sequential([layers.Dense(32, activation="relu"), layers.Dense(embed_dim),])
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-5)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-5)
        self.dropout1 = layers.Dropout(rate)
        self.dropout2 = layers.Dropout(rate)

    def call(self, inputs, training=None):
        """Run the block on `inputs`; `training` toggles the dropout layers.

        `training` defaults to None so the layer can also be called directly
        without the argument.
        """
        # Self-attention: query and value are both the input sequence.
        multihead_output = self.multihead(inputs, inputs)
        multihead_output = self.dropout1(multihead_output, training=training)
        norm_output = self.layernorm1(inputs + multihead_output)
        sequential_output = self.sequential(norm_output)
        drop_output = self.dropout2(sequential_output, training=training)
        norm_output = self.layernorm2(norm_output + drop_output)
        return norm_output

    def get_config(self):
        """Return the constructor arguments so a saved model can rebuild the layer."""
        config = super().get_config()
        config.update({
            "embed_dim": self.embed_dim,
            "num_heads": self.num_heads,
            "rate": self.rate,
        })
        return config

class PositionTokenEmbedding(layers.Layer):
    """Sum of a learned token embedding and a learned position embedding.

    Parameters
    ----------
    maxlen : int
        Number of positions the position-embedding table holds.
    vocab_size : int
        Number of rows of the token-embedding table.
    embed_dim : int
        Output size of both embeddings.
    **kwargs
        Forwarded to `layers.Layer` (e.g. `name`), which is required for the
        layer to be re-created from its config.
    """

    def __init__(self, maxlen, vocab_size, embed_dim, **kwargs):
        super().__init__(**kwargs)
        # Kept so get_config() can report the constructor arguments.
        self.maxlen = maxlen
        self.vocab_size = vocab_size
        self.embed_dim = embed_dim
        self.token_emb = layers.Embedding(input_dim=vocab_size, output_dim=embed_dim)
        self.pos_emb = layers.Embedding(input_dim=maxlen, output_dim=embed_dim)

    def call(self, x):
        """Embed the token ids in `x` and add the embedding of each position."""
        # Positions 0..L-1, where L is the size of the last axis of x.
        seq_len = tf.shape(x)[-1]
        positions = tf.range(start=0, limit=seq_len, delta=1)
        positions = self.pos_emb(positions)
        x = self.token_emb(x)
        return x + positions

    def get_config(self):
        """Return the constructor arguments so a saved model can rebuild the layer."""
        config = super().get_config()
        config.update({
            "maxlen": self.maxlen,
            "vocab_size": self.vocab_size,
            "embed_dim": self.embed_dim,
        })
        return config

In [None]:
# Build, compile and fit the sentence classifier.
# Despite the `model_LSTM` name, no LSTM layer is used: the network is
# token+position embedding -> Transformer block -> average pooling -> dense head.
embed_dim = 32   # embedding size per token
num_heads = 2    # attention heads

embedding_layers = PositionTokenEmbedding(sequence_length, vocab_size, embed_dim)
transformer_layers = Transformer(embed_dim, num_heads)

inputs = layers.Input(shape=(sequence_length,))
x = embedding_layers(inputs)
x = transformer_layers(x)
x = layers.GlobalAveragePooling1D()(x)
x = layers.Dropout(0.15)(x)
x = layers.Dense(20, activation="relu")(x)
x = layers.Dropout(0.15)(x)
# Two softmax outputs: column 0 = no dataset mention, column 1 = dataset mention.
outputs = layers.Dense(2, activation="softmax")(x)

model_LSTM = keras.Model(inputs=inputs, outputs=outputs)
model_LSTM.summary()

model_LSTM.compile(loss='sparse_categorical_crossentropy',
              optimizer=tf.keras.optimizers.Adam(), metrics=['accuracy'])
callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss', min_delta=0,
                patience=0, verbose=1, mode='auto', baseline=None, restore_best_weights=True)

# EarlyStopping monitors 'val_loss', which only exists when validation data is
# supplied. A stratified hold-out is used instead of `validation_split`,
# because that option takes the *last* rows and the training frame is ordered
# with all positive sentences first.
x_fit, x_val, y_fit, y_val = train_test_split(
    x_train, y_train, test_size=0.1, stratify=y_train, random_state=0)

history_LSTM = model_LSTM.fit(x_fit, y_fit, validation_data=(x_val, y_val),
                              epochs=1, batch_size=32, verbose=1, callbacks=[callback])
history_LSTM.history.values()

<a id='lstm_prediction'></a>
### Predict relevant sentences 

In [None]:
set_threshold = 0.75

In [None]:
sentence_df.dtypes

In [None]:
sentence_df['clean_str'] = sentence_df.clean.apply(str)

In [None]:
%time 
tqdm.pandas()

text = list(sentence_df.clean_str)

padded_LSTM, tokenizer_LSTM = tokenize(text, sequence_length=sequence_length, n_words=vocab_size) 

sentence_df['tokens_tf'] = tokenizer_LSTM.texts_to_sequences(text)

y_pred_logits = model_LSTM.predict(padded_LSTM, batch_size=32)

y_pred_idx = y_pred_logits.argmax(axis=1)
y_pred = y_pred_logits[:,1] #second column is for dataset mention

sentence_df['pred_tf'] = y_pred 

threshold_pred = [1 for p in y_pred if p > set_threshold]

threshold_df = sentence_df[sentence_df['pred_tf'] > set_threshold]
        
print (f"identified {len(threshold_pred)} candidate sentences with possible known dataset titles") #y_pred_idx.sum()

threshold_df.head()

In [None]:
threshold_df.to_pickle("./lstm_sentence_df.pkl")

In [None]:
sentence_df.to_pickle("./sentence_df_pred.pkl")

Evaluate using different thresholds

In [None]:
# Report false negatives and accuracy of the sentence classifier at several
# probability thresholds.
# NOTE(review): lstm_eval is not created in any cell visible in this notebook;
# from its use here it needs the columns `lstm_prediction` and `sentence_match`
# - confirm where it is built.
fn_diff = 0  # number of false negatives at the previous threshold
threshold_i = [0.0, 0.25, 0.5, 0.75, 0.8, 0.9]  # was `0.8. 0.9`, a SyntaxError

# Cast once so `~` is a logical NOT even if the column is not stored as bool.
is_match = lstm_eval.sentence_match.astype(bool)
y_true = is_match.to_numpy()
n_total = len(y_true)

for i in threshold_i:
    prediction = lstm_eval.lstm_prediction > i
    lstm_eval_thres = lstm_eval.assign(class_pred = prediction)

    tp = int((is_match & prediction).sum())
    tn = int((~is_match & ~prediction).sum())
    fp = int((~is_match & prediction).sum())
    fn = int((is_match & ~prediction).sum())

    print("threshold: ", i)
    print("number of fn: ", fn, "/ fn percentage of whole data ",
          fn / n_total if n_total else float('nan'))
    # The 0.0 threshold has no predecessor, so its difference is reported as 0.
    print("fn_diff: ", 0 if i == 0. else fn - fn_diff)
    fn_diff = fn

    accuracy = (tp + tn) / n_total if n_total else float('nan')
    print("accuracy: ", accuracy)

    print("--------------")

<a id='lstm_model'></a>
### Save model

In [None]:
# Save the trained model into the current working directory.
# NOTE(review): newer Keras releases expect a file name ending in ".keras" or
# ".h5" here - confirm against the installed version.
model_LSTM.save("./")