### Sentence-level analysis using a Transformer model in Keras
> Baseline from Poe Dator. See [here](https://www.kaggle.com/poedator/sentence-level-analysis-with-transformer/)

In [None]:
import os
import re
import json
import pickle
from collections import defaultdict, Counter
import gc

import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, plot_confusion_matrix

import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.notebook import tqdm
tqdm.pandas()

%matplotlib inline
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>")) # full screen width of Jupyter notebook
pd.options.display.max_rows, pd.options.display.max_columns = 500, 100

# NLP imports
import nltk

# Neural network imports
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
print( 'tf version:', tf.__version__)

# Fix the seeds of the random generators used by TensorFlow and NumPy so runs are repeatable.
# NOTE(review): PYTHONHASHSEED is read when the interpreter starts, so setting it here
# presumably only affects child processes, not hashing in this kernel — confirm intent.
os.environ['PYTHONHASHSEED']=str(123)
tf.random.set_seed(123)
np.random.seed(123)

In [None]:
%%time
""" Loading data"""
# Root folder of the competition data; the 'train' JSON folder and
# sample_submission.csv are read relative to it further down.
data_path = '../input/coleridgeinitiative-show-us-the-data/'

def read_json_from_folder(folder_name):
    """Load every ``*.json`` file of a folder into a dict keyed by file stem.

    Args:
        folder_name: Path of the directory to scan (sub-folders are not visited).

    Returns:
        dict mapping each file name without its ``.json`` extension to the
        parsed JSON content of that file. Entries that are not ``.json`` files
        are ignored.
    """
    json_dict = {}
    for filename in sorted(os.listdir(folder_name)):
        stem, ext = os.path.splitext(filename)
        # Skip anything that is not a JSON document: blindly cutting the last
        # five characters would mangle its key and json.load would fail on it.
        if ext.lower() != '.json':
            continue
        # JSON is UTF-8 by specification; do not rely on the platform default.
        with open(os.path.join(folder_name, filename), encoding='utf-8') as f:
            json_dict[stem] = json.load(f)
    return json_dict

# Earlier variants, kept for reference: load both JSON folders eagerly and use the
# competition's own train.csv.
# train_dict = read_json_from_folder(os.path.join(data_path, 'train'))
# test_dict = read_json_from_folder(os.path.join(data_path, 'test'))
# train_df = pd.read_csv(os.path.join(data_path, 'train.csv'))
# Training table comes from a separate input dataset; the cells below read its
# 'Id', 'pub_title', 'dataset_title' and 'ext_cleaned_label' columns.
train_df = pd.read_csv('../input/ci-train-with-ext-data/train_df.csv')
# Sample submission; its 'Id' column later defines the test publications to predict.
sample_sub = pd.read_csv(os.path.join(data_path,'sample_submission.csv'))
    
# len(train_dict), len(test_dict), 
train_df.shape, sample_sub.shape

In [None]:
# Turn the '|'-separated label string of every row into a list of labels
def _split_labels(label_string):
    return label_string.split('|')

train_df['ext_cleaned_label'] = train_df['ext_cleaned_label'].map(_split_labels)

# Keep a single row per publication Id: for repeated Ids only the last row survives
train_df = train_df.drop_duplicates(subset='Id', keep='last')
print(train_df.shape)
train_df.sample(5)

In [None]:
# Publication-level table, indexed by publication Id
label_columns = ['pub_title', 'dataset_title', 'ext_cleaned_label']
pub_df = train_df.set_index('Id')[label_columns]
pub_df['n_refs'] = pub_df['ext_cleaned_label'].apply(len).astype(int)
pub_df = pub_df.rename(columns={'ext_cleaned_label':'refs'})

# Attach the publications' texts loaded from the train JSON files (aligned on Id)
pub_df['raw_text'] = pd.Series(read_json_from_folder(os.path.join(data_path, 'train')))
pub_df['n_sections'] = pub_df['raw_text'].apply(len).astype(int)


# Flatten a publication into plain text: every value of every section on its own line
def _sections_to_text(sections):
    return '\n'.join(value for section in sections for value in section.values())


pub_df['text'] = pub_df['raw_text'].apply(_sections_to_text)
pub_df['pub_len'] = pub_df['text'].str.len()

print (pub_df.shape)

pub_df.sample(5)

In [None]:
# The parsed JSON is no longer needed once the flattened 'text' column exists;
# drop it and trigger a garbage collection pass.
del pub_df['raw_text']
gc.collect()

In [None]:
%%time 
""" classify train set sentences that contain references to datasets """
sentences = []

for row in tqdm(pub_df.itertuples(), total = pub_df.shape[0]):
    for sent in nltk.sent_tokenize(re.sub(r'\.?\n', '. ', row.text)):
        found_flag = False
        for r in row.refs:
            if r in sent.lower():
                sentences.append(
                    {'Id':row.Index, 'sentence':sent, 'ref':r, 'group':'train_1'})
                found_flag = True
        # save empty sencences,
        if not found_flag:
            if np.random.rand(1)[0] > 0.0:
                sentences.append(
                    {'Id':row.Index, 'sentence':sent,'group':'train_0'})

print("Num of sentences collected:", len(sentences))

In [None]:
%%time
""" create DF with sentences """
""" group feature indicates:
        - sentences with dataset titles ('train_1'),
        - sentences witout dataset titles ('train_1'),
        - sentences from publications without matches ('val') """
sent_df = pd.DataFrame(sentences)
# sent_df = sent_df.append(pd.DataFrame(sentences_empty)) # append empty sentences
sent_df = sent_df.reset_index(drop=True)  # reset after append
sent_df['Id'] = sent_df['Id'].astype('category')
sent_df['ref'] = sent_df['ref'].astype('category')
sent_df['group'] = sent_df['group'].astype('category')
sent_df['n_chars'] = sent_df.sentence.str.len()

# drop short empty sences (either chapter titles or tables components)
sent_df = sent_df.drop(sent_df[(sent_df.n_chars < 40) & (sent_df.ref.isna())].index)
print (sent_df.shape)

sent_df['group'].value_counts()

In [None]:
# 2 random and 2 positive examples
# pd.concat is used because DataFrame.append was deprecated in pandas 1.4 and removed in 2.0.
pd.concat([sent_df.sample(2), sent_df[sent_df.group == 'train_1'].sample(2)])

In [None]:
""" DATASET CREATION
Due to the abundance of negative examples, we limit their number to `neg_multiple` times the number of positive examples.
This also saves time when training the model.
NB: Consider benefits of using all negative examples
"""
neg_multiple = 9  # number of negative examples kept per positive example
stopword_list = nltk.corpus.stopwords.words('english')

positives = sent_df[sent_df.group == 'train_1']  # positive examples
negatives = sent_df[sent_df.group == 'train_0']
# Never request more negatives than exist: DataFrame.sample raises a ValueError
# when n exceeds the number of rows (sampling is without replacement).
n_negatives = min(len(negatives), len(positives) * neg_multiple)
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
df = pd.concat([positives, negatives.sample(n_negatives)])
# df = df.drop(columns = ['n_chars'])  # keep 'Id', 
# Cleaned sentence: lower-cased, everything except a-z and spaces removed
df['clean'] = df.sentence.str.lower().replace(r"[^a-z ]+","", regex=True)
df['n_words'] = df.clean.apply(lambda x: len(x.split()))
print(df.shape)

df['group'].value_counts()

In [None]:
# Split on publication Id rather than on sentences, so that sentences of one
# publication never end up in both the train and the validation set.
id_train, id_val = train_test_split(df.Id.unique(), test_size=0.1, random_state=42)

# Positions 0..n-1 of the rows of df (df's own index is not consecutive)
row_positions = df.reset_index().index
in_train = df.Id.isin(id_train).values
in_val = df.Id.isin(id_val).values
train_idx = row_positions[in_train]
val_idx = row_positions[in_val]

In [None]:
# While there are very long sentences (split defects?), most are under 500 chars long
print ("max number of characters in sentence:", df.n_chars.max())
# Histogram restricted to sentences shorter than 1000 characters
df[df.n_chars < 1000].n_chars.hist(bins=20);
df.sample(3)

In [None]:
# Tokenize the sentences
%time df['tokenized'] = df.clean.progress_apply(lambda x: [ \
    w for w in nltk.word_tokenize(x[:500]) if w not in stopword_list])

In [None]:
# Count how often every token occurs across all tokenized sentences
unique_words = Counter(
    word for token_list in tqdm(df.tokenized.values) for word in token_list
)
print (f"Unique words: {len(unique_words)}")    

In [None]:
""" assess opportunity to reduce vocab """
n_unique = len(unique_words)

# count rare words: share of distinct words per number of occurrences
# (the 10 most common occurrence counts)
print("Percent of words in corpus by num of occurences")
occurrence_counts = pd.Series(list(unique_words.values()))
print(occurrence_counts.value_counts().head(10)/n_unique, '\n')

# count words by length: share of distinct words per word length (10 shortest lengths)
print("Percent of words in corpus by length")
word_lengths = pd.Series(list(unique_words.keys()), name="words").str.len()
# sort_index() orders by word length directly. The former reset_index() /
# sort_values(by='index') chain relied on the column name reset_index() produces,
# which is no longer 'index' for value_counts() results in pandas 2.0+.
length_share = word_lengths.value_counts().sort_index().head(10)/n_unique
print(length_share.rename_axis('index').to_frame('words'))

In [None]:
# Keep only words that are frequent enough and long enough, most frequent first
min_occurencies = 10
min_word_len = 3

kept_words = [
    (word, count)
    for word, count in unique_words.items()
    if count >= min_occurencies and len(word) >= min_word_len
]
# Stable sort: words with equal counts stay in their original order
kept_words.sort(key=lambda pair: pair[1], reverse=True)
my_vocab = dict(kept_words)
vocab_size = len(my_vocab)
print (f"Words to be used for regression: {vocab_size}") 

In [None]:
""" preparing feed for NN models"""

num_classes = 2

filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\'\n' + '0123456789'
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=25000, lower=True, 
                                                  filters=filters, oov_token='<OOV>')

# tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=len(list(my_vocab)))
tokenizer.fit_on_texts(my_vocab.keys())
# X_tokenized = tokenizer.texts_to_sequences(df.clean)
%time df['tokens_tf'] = tokenizer.texts_to_sequences(df.clean)
print (f"Input sentences tokenized with {tokenizer.get_config()['num_words']} words vocab")
y = ~df.ref.isna()

maxlen = 500
len_max = df.clean.str.len().max()
print (f"Max cleaned title length: {len_max}; limiting/padding sentences to {maxlen} words")

X_padded = tf.keras.preprocessing.sequence.pad_sequences(
    df.tokens_tf, maxlen=maxlen, padding='pre',)

X_train = X_padded[train_idx,:]
X_val = X_padded[val_idx,:]
y = 1 - df.ref.isna().astype(int)
y_train = y.iloc[train_idx]
y_val = y.iloc[val_idx]
print("Subsets shapes: ", X_train.shape, X_val.shape, y_train.shape, y_val.shape)

In [None]:
""" build transformer model"""

# Hyper-parameters shared by the embedding layer and the transformer block below
embed_dim = 32  # Embedding size for each token
num_heads = 2  # Number of attention heads
ff_dim = 32  # Hidden layer size in feed forward network inside transformer

class TransformerBlock(layers.Layer):
    """Transformer encoder block.

    Applies multi-head self-attention and a two-layer feed-forward network; each
    sub-layer is followed by dropout, a residual connection and layer normalization.

    Args:
        embed_dim: Size of the token representation; used as the attention key
            size and as the output size of the feed-forward network.
        num_heads: Number of attention heads.
        ff_dim: Hidden size of the feed-forward network.
        rate: Dropout rate applied after each sub-layer.
        **kwargs: Standard Keras layer arguments (e.g. ``name``), forwarded to
            the base class.
    """

    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1, **kwargs):
        super().__init__(**kwargs)
        self.att = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = keras.Sequential(
            [layers.Dense(ff_dim, activation="relu"), layers.Dense(embed_dim),]
        )
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(rate)
        self.dropout2 = layers.Dropout(rate)

    def call(self, inputs, training=None):
        """Run the block on `inputs`.

        Args:
            inputs: Tensor whose last dimension matches the `embed_dim` given to
                the constructor (required by the residual additions).
            training: Forwarded to the dropout layers. Defaults to None so the
                layer can also be called without an explicit flag.

        Returns:
            Tensor with the same shape as `inputs`.
        """
        # Self-attention: the sequence serves as both query and value
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)

class TokenAndPositionEmbedding(layers.Layer):
    """Embeds token ids and adds a learned embedding of each token's position.

    Args:
        maxlen: Number of positions the position embedding can represent.
        vocab_size: Number of rows of the token embedding table.
        embed_dim: Size of both embeddings.
    """

    def __init__(self, maxlen, vocab_size, embed_dim):
        super().__init__()
        self.token_emb = layers.Embedding(input_dim=vocab_size, output_dim=embed_dim)
        self.pos_emb = layers.Embedding(input_dim=maxlen, output_dim=embed_dim)

    def call(self, x):
        # Positions 0..seq_len-1, taken from the last axis of the id tensor
        seq_len = tf.shape(x)[-1]
        position_ids = tf.range(start=0, limit=seq_len, delta=1)
        position_vectors = self.pos_emb(position_ids)
        token_vectors = self.token_emb(x)
        return token_vectors + position_vectors

# Assemble the classifier:
# embeddings -> transformer block -> average pooling -> dense head -> softmax
inputs = layers.Input(shape=(maxlen,))
# The tokenizer emits id 0 for padding, id 1 for '<OOV>' and then one id per known
# word, capped below tokenizer.num_words. The embedding table needs a row for every
# such id. vocab_size (the number of vocabulary words) alone is too small whenever
# the vocabulary has fewer than num_words words: the rarest words would then index
# past the end of the table.
n_token_ids = min(tokenizer.num_words, len(tokenizer.word_index) + 1)
embedding_layer = TokenAndPositionEmbedding(
    maxlen, max(vocab_size, n_token_ids), embed_dim)
x = embedding_layer(inputs)
transformer_block = TransformerBlock(embed_dim, num_heads, ff_dim)
x = transformer_block(x)
# Average over the sequence axis to get one vector per sentence
x = layers.GlobalAveragePooling1D()(x)
x = layers.Dropout(0.1)(x)
x = layers.Dense(20, activation="relu")(x)
x = layers.Dropout(0.1)(x)
# Two-way softmax: column 1 is the probability that the sentence has a reference
outputs = layers.Dense(2, activation="softmax")(x)

model_t = keras.Model(inputs=inputs, outputs=outputs)
# model_t.summary()

In [None]:
# Integer class labels with a softmax output -> sparse categorical cross-entropy
adam = tf.keras.optimizers.Adam()
model_t.compile(optimizer=adam,
                loss='sparse_categorical_crossentropy',
                metrics=['accuracy'])

# Stop as soon as the validation loss fails to improve and restore the best weights
callback = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss', mode='auto', min_delta=0, patience=0,
    baseline=None, restore_best_weights=True, verbose=1)

In [None]:
# Train the classifier, evaluating on the validation split after every epoch.
# `callbacks` is documented as a list of Callback instances, so the early-stopping
# callback is wrapped in a list instead of being passed bare.
history_t = model_t.fit(X_train, y_train, 
                  validation_data=(X_val, y_val),
                  epochs=1, batch_size=32, verbose=1, callbacks=[callback])

In [None]:
# Per-epoch values recorded by fit() for the loss and the compiled metrics
history_t.history.values()

In [None]:
%%time
# save model weights
model_t.save_weights('./model/sent_transformer')

# Vocab and tf tokenizer
with open('tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)   
with open('my_vocab.pickle', 'wb') as handle:
    pickle.dump(my_vocab, handle, protocol=pickle.HIGHEST_PROTOCOL)

# Dataframe with all selected sentences
sent_df.to_pickle('sent_df.pickle')
pub_df.to_pickle('pub_df.pickle')

In [None]:
!ls * -lh

In [None]:
# Both frames were pickled to disk above; release them before inference
del sent_df
del pub_df
gc.collect()

In [None]:
# Import matching module
from fuzzywuzzy import fuzz

# Sorted array of the distinct dataset labels found in the training table;
# candidate sentences are fuzzy-matched against these titles.
all_labels = np.concatenate(train_df['ext_cleaned_label'].to_numpy())
ds_titles = np.unique(all_labels)
ds_titles.shape

In [None]:
# Inference settings
test_data_path = '../input/coleridgeinitiative-show-us-the-data/test'
candidate_threshold = 0.3  # class-1 probability a sentence must exceed to become a candidate
acceptance_score = 80  # fuzzy-match score a dataset title must exceed to be accepted
test_sentences = {}  # publication Id -> candidate sentences
# One row per test publication, indexed by Id; the prediction column is added later
test_df = sample_sub[['Id']].set_index('Id')

def read_json_pub(Id):
    """Load the parsed JSON of one test publication.

    Args:
        Id: Publication id; the file ``<Id>.json`` is read from `test_data_path`.

    Returns:
        The parsed JSON content of the file.
    """
    filename = os.path.join(test_data_path, Id+'.json')
    # JSON is UTF-8 by specification; do not rely on the platform default encoding.
    with open(filename, encoding='utf-8') as f:
        json_pub = json.load(f)
    return json_pub

# Predictions are gathered here and written to test_df in one step after the loop:
# the 'PredictionString' column then exists even when there are no test publications,
# and test_df is not modified while it is being iterated.
prediction_strings = {}

for row in tqdm(test_df.itertuples(), total = test_df.shape[0]):
    # Load the publication and join the values of all its sections into one text
    raw_text = read_json_pub(row.Index)
    text = '\n'.join([z for y in raw_text for z in y.values()])

    # Split into sentences and clean them the same way as the training sentences
    sentences = nltk.sent_tokenize(re.sub(r'\.?\n', '. ', text))
    sentences = [re.sub(r"[^a-z ]+","", s.lower()) for s in sentences]

    # Nothing to score: skip the model call, which does not accept an empty batch
    if not sentences:
        test_sentences[row.Index] = np.array([], dtype=str)
        prediction_strings[row.Index] = ''
        continue

    # Tokenize and pad to the model's input length
    tokens = tokenizer.texts_to_sequences(sentences)
    tokens = tf.keras.preprocessing.sequence.pad_sequences(
        tokens, maxlen=maxlen, padding='pre',)

    # Predict candidate sentences that may contain dataset references
    # (column 1 of the softmax output is compared with the threshold)
    y_pred = model_t.predict(tokens, batch_size=32)
    sent_candidates = np.array(sentences)[y_pred[:,1] > candidate_threshold]
    test_sentences[row.Index] = sent_candidates

    # For every candidate sentence accept its best fuzzy-matching dataset title,
    # provided the match score exceeds acceptance_score
    ds_candidates = set()
    for sent in sent_candidates:
        scores = [fuzz.partial_ratio(sent, title) for title in ds_titles]
        best_fit_title_index = int(np.argmax(scores))
        if scores[best_fit_title_index] > acceptance_score:
            ds_candidates.add(ds_titles[best_fit_title_index])
    # Sorted so the output does not depend on set iteration order
    prediction_strings[row.Index] = ' | '.join(sorted(ds_candidates))

# Aligned on the Id index of test_df
test_df['PredictionString'] = pd.Series(prediction_strings, dtype=object)

In [None]:
test_df.head()

In [None]:
# Write the submission file; the Id index is written as the first column
test_df[['PredictionString']].to_csv('submission.csv')