<h1 style='background:teal; color:white; padding:20px;'>
Coleridge: Sentence analysis</h1>


**Identifying sentences containing dataset titles**

version history:
- v.1-2 sentence split, Naive Bayes model baseline
- v.3 - added Transformer model for sentence identification
- v.4,5 - code cleanup. Added prediction for test set with simple string matching.
- v.6 - consecutive processing of test pubs to save RAM
- v.7 - removed Naive Bayes model, improved positive match detection

In [None]:
import os
import re
import json
import pickle
from collections import defaultdict, Counter
import gc

import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, plot_confusion_matrix

import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.notebook import tqdm
tqdm.pandas()

%matplotlib inline
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>")) # full screen width of Jupyter notebook
pd.options.display.max_rows, pd.options.display.max_columns = 500, 100

# NLP imports
import nltk

# Neural network imports
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
print( 'tf version:', tf.__version__)

In [None]:
import multiprocessing as mp
import pandas.util.testing as pdt
mp.cpu_count()

In [None]:
%%time
""" Loading data"""
data_path = '../input/coleridgeinitiative-show-us-the-data/'

def read_json_from_folder(folder_name):
    """Load every ``*.json`` file of a folder into a dict.

    Args:
        folder_name: Path of the directory to scan (not recursive).

    Returns:
        Dict mapping each file name, without its ``.json`` extension, to the
        object parsed from that file.
    """
    json_dict = {}
    for filename in os.listdir(folder_name):
        stem, ext = os.path.splitext(filename)
        # Skip anything that is not a JSON file (checkpoints, OS metadata,
        # sub-folders) instead of failing in json.load or storing it under a
        # key with five arbitrary characters chopped off.
        if ext.lower() != '.json':
            continue
        # JSON is UTF-8 by specification; do not depend on the locale default.
        with open(os.path.join(folder_name, filename), encoding='utf-8') as f:
            json_dict[stem] = json.load(f)
    return json_dict

# Only the two CSV tables are read in this cell; the publication JSON files
# are loaded per folder with read_json_from_folder when they are needed.
train_csv = os.path.join(data_path, 'train.csv')
submission_csv = os.path.join(data_path, 'sample_submission.csv')
train_df = pd.read_csv(train_csv)
sample_sub = pd.read_csv(submission_csv)

# cell output: (rows, columns) of both tables
(train_df.shape, sample_sub.shape)

# Publications Dataframe creation `pubs_df`

In [None]:
""" build PUB_DF by repacking data from train_df """
# One row per publication Id: its title, the number of distinct dataset labels
# and the cleaned labels as a sorted list (the ground-truth references).
agg_spec = {
    'pub_title': 'first',
    'dataset_label': 'nunique',
    'cleaned_label': lambda labels: sorted(labels),
}
pub_df = (
    train_df.groupby('Id')
    .agg(agg_spec)
    .rename(columns={'dataset_label': 'n_refs', 'cleaned_label': 'refs'})
)

In [None]:
"""Adding publications' texts from loaded json files"""
def _flatten_sections(sections):
    """Join all values of all section dicts of one publication, one per line."""
    return '\n'.join([value for section in sections for value in section.values()])


# Parsed JSON of every training publication, aligned to pub_df by Id
pub_df['raw_text'] = pd.Series(read_json_from_folder(os.path.join(data_path, 'train')))
pub_df['n_sections'] = pub_df['raw_text'].apply(len).astype(int)

# decoding raw text to simple text
pub_df['text'] = pub_df['raw_text'].apply(_flatten_sections)
pub_df['pub_len'] = pub_df['text'].str.len()

print (pub_df.shape)

In [None]:
""" find and count exact text matches in train_df and pub_df """
# For every (publication, label) row of train_df: does the lower-cased label
# occur verbatim in the lower-cased publication text?
found_flags = []
for row in tqdm(train_df.itertuples(), total=train_df.shape[0]):
    pub_text = pub_df.loc[row.Id, 'text'].lower()
    found_flags.append(row.dataset_label.lower() in pub_text)
train_df['found'] = np.asarray(found_flags, dtype=bool).astype(int)

n_confirmed = (train_df.found != 0).sum()
n_failed = (train_df.found != 1).sum()
failed_examples = train_df[train_df.found == 0].index[:5].values.tolist()
print (f"Confirmed {n_confirmed} matches.")
print (f"Failed to confirm {n_failed} matches. Examples: {failed_examples}")

# Add number of datasets, whose names were found in each publication
pub_df['ref_matched'] = train_df.groupby('Id')['found'].sum()

pub_df.sample(3)

In [None]:
pub_df.sample(3)

# Sentence dataframe creation

In [None]:
# prepare list of dataset titles to match
# Every spelling under which a dataset can appear: the cleaned label plus the
# lower-cased raw label and the lower-cased dataset title.
ds_titles = set(train_df.cleaned_label.unique())
ds_titles.update(train_df.dataset_label.str.lower().unique())
ds_titles.update(train_df.dataset_title.str.lower().unique())
# Sort before converting: bare set iteration order changes from run to run
# (string hash randomisation), which made the order of the matches collected
# from this array non-reproducible.
ds_titles = np.array(sorted(ds_titles))
ds_titles.shape

In [None]:
# One row per dataset title with the unique raw and cleaned labels used for it
dataset_df = (
    train_df.groupby('dataset_title')
    .agg({'dataset_label': 'unique', 'cleaned_label': 'unique'})
    .reset_index()
)

# `labels`: all distinct lower-cased spellings (raw + cleaned) of each dataset.
# The lists are stored directly instead of being joined with '|' and split
# again, which would break any label that itself contains a '|'.
# Sorting only makes the stored order deterministic.
dataset_df['labels'] = pd.Series(
    [
        sorted({lab.lower() for lab in [*raw_labels, *clean_labels]})
        for raw_labels, clean_labels in zip(dataset_df.dataset_label,
                                            dataset_df.cleaned_label)
    ],
    index=dataset_df.index,
)
dataset_df.sample(5)

In [None]:
%%time
""" classify train set sentences that contain references to datasets """
# n = 1000
sentences = []
for row in tqdm(pub_df.itertuples(), total = pub_df.shape[0]):
    clean_text = re.sub(r"[^a-z \.\n]+","", row.text.lower())
    titles_shortlist = [t for t in ds_titles if t in clean_text]  # shortlist saves time

#     capture similar ds title variants
    expanded_refs = []
    for title in row.refs:
        for ds_row in dataset_df.itertuples():
            if title in ds_row.labels:
                expanded_refs.extend(ds_row.labels)
        expanded_refs = list(set(row.refs + expanded_refs))
    
    sentences_list = nltk.sent_tokenize(re.sub(r'\.?\n', '. ', row.text))
    sentences_list = [re.sub(r"[^a-z \.]+","", s.lower()) for s in sentences_list]
    for sent in sentences_list:
        found_flag = False
        for title in titles_shortlist:
            if title in sent:
                found_flag = True
                group = 'TP' if title in expanded_refs else 'FP'
                sentences.append({'Id':row.Index,'sentence':sent,'match':title,'group':group})
        if not found_flag:
            group = 'N' if row.n_refs == row.ref_matched else 'UNK'  
            sentences.append({'Id':row.Index,'sentence':sent,'match':None,'group':group})

sent_df = pd.DataFrame(sentences)
print(sent_df.shape)
sent_df.group.value_counts()
# was 9631 FPs

In [None]:
sent_df['n_chars'] = sent_df.sentence.str.len()

In [None]:
sent_df[sent_df.group=='FP'].sample(10)

In [None]:
# sample false positive
# named pub_id rather than `id`, which would shadow the builtin for the rest
# of the notebook session
pub_id = '84f2ca4e-2d1f-40f5-857b-7d41fe724645'
text = pub_df.loc[pub_id, 'text']
clean_text = re.sub(r"[^a-z \n\.]+","", text.lower())
pub_df.loc[pub_id, 'refs']
# mm in ds_titles

In [None]:
# 2 random and 2 positive examples
# pd.concat replaces DataFrame.append, which is deprecated since pandas 1.4
# and removed in pandas 2.0
pd.concat([sent_df.sample(2), sent_df[sent_df.group == 'TP'].sample(2)])

# MODELS FOR PREDICTION OF SENTENCES WITH DS TITLES

### Dataset creation

In [None]:
""" DATASET CREATION
Due to the abundance of the negative examples, we limit their number to `neg_multiple` times the number of positive examples
This also saves time in model performance.
NB: Consider benefits of using all negative examples
"""
neg_multiple = 9  # multiplier to get number of negative examples
stopword_list = nltk.corpus.stopwords.words('english')

positives = sent_df[sent_df.group == 'TP']  # positive examples
negatives = sent_df[sent_df.group == 'N']   # trusted negative examples
# Never ask for more negatives than exist: DataFrame.sample raises ValueError
# when n exceeds the number of rows (sampling without replacement).
n_negatives = min(positives.shape[0] * neg_multiple, negatives.shape[0])
# pd.concat replaces DataFrame.append, which is deprecated since pandas 1.4
# and removed in pandas 2.0
df = pd.concat([positives, negatives.sample(n_negatives)])
# df = df.drop(columns = ['n_chars'])  # keep 'Id', 
# letters and spaces only: this is the text fed to the models
df['clean'] = df.sentence.str.lower().replace(r"[^a-z ]+","", regex=True)
df['n_words'] = df.clean.apply(lambda x: len(x.split()))
df.shape

In [None]:
# Split by publication Id rather than by sentence, so that sentences of one
# publication never end up in both the train and the validation set.
id_train, id_val = train_test_split(df.Id.unique(), test_size=0.1, random_state=42)

# 0-based row positions of df that belong to each subset
row_positions = df.reset_index().index
train_idx = row_positions[df.Id.isin(id_train).values]
val_idx = row_positions[df.Id.isin(id_val).values]

In [None]:
# While there are very long sentences (split defects?), most are under 500 chars long
# The histogram is restricted to sentences shorter than 1000 characters.
print ("max number of characters in sentence:", df.n_chars.max())
df[df.n_chars < 1000].n_chars.hist(bins=20);

In [None]:
df.sample(3)

## Build vocab

In [None]:
# Tokenize the sentences
%time df['tokenized'] = df.clean.progress_apply(lambda x: [ \
    w for w in nltk.word_tokenize(x[:500]) if w not in stopword_list])

In [None]:
# Frequency of every token over all tokenized sentences
unique_words = Counter(
    word for words in tqdm(df.tokenized.values) for word in words
)
print (f"Unique words: {len(unique_words)}")

In [None]:
""" assess opportunity to reduce vocab """
n_unique = len(unique_words)

# count rare words: share of the vocabulary per number of occurrences
occurrence_counts = pd.Series(unique_words.values())
print("Percent of words in corpus by num of occurences")
print(occurrence_counts.value_counts().head(10)/n_unique, '\n')

# count words by length: share of the vocabulary per word length
word_lengths = pd.Series(unique_words.keys(), name="words").str.len()
length_table = word_lengths.value_counts().to_frame().reset_index()
length_table = length_table.sort_values(by='index').head(10).set_index('index')
print("Percent of words in corpus by length")
print(length_table/n_unique)

In [None]:
# remove short and infrequent words
min_occurencies = 10
min_word_len = 3
kept_words = [
    (word, count)
    for word, count in unique_words.items()
    if count >= min_occurencies and len(word) >= min_word_len
]
# most frequent first; the sort is stable, so ties keep first-seen order
kept_words.sort(key=lambda pair: pair[1], reverse=True)
my_vocab = dict(kept_words)
vocab_size = len(my_vocab)
print (f"Words to be used for regression: {vocab_size}") 

# Naïve Bayes model

In [None]:
# create data for training and validation
nb_X_train = df.iloc[train_idx].tokenized
nb_X_val = df.iloc[val_idx].tokenized

nb_y_train = (df[(df.Id.isin(id_train))].group=='TP').astype('int')
nb_y_val = (df[(df.Id.isin(id_val))].group=='TP').astype('int')

# prepare the vectorizer
vectorizer = CountVectorizer(vocabulary = list(my_vocab.keys()))
%time vectorizer = vectorizer.fit(df.tokenized.apply(lambda x: ' '.join(x)).values)

# vectorize
nb_X_train = vectorizer.transform(nb_X_train.apply(lambda x: ' '.join(x)).values)
nb_X_val = vectorizer.transform(nb_X_val.apply(lambda x: ' '.join(x)).values)

print(nb_X_train.shape, nb_X_val.shape, nb_y_train.shape, nb_y_val.shape)
nb_X_train.shape, nb_X_val.shape, nb_y_train.shape, nb_y_val.shape

In [None]:
# run Naïve Bayes model
nb_model = MultinomialNB()
nb_model.fit(nb_X_train, nb_y_train)

# score on the held-out publications
nb_y_pred = nb_model.predict(nb_X_val)
nb_score = accuracy_score(nb_y_val, nb_y_pred)
print (f"Naïve Bayes baseline accuracy score: {nb_score:.4}")

plot_confusion_matrix (nb_model, nb_X_val, nb_y_val);
plt.title(f"Confusion Matrix. Accuracy = {nb_score*100:.2f}", fontsize=16);

# Transformer model
LSTM would be faster but shows somewhat worse performance

In [None]:
""" preparing feed for NN models"""

num_classes = 2

filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\'\n' + '0123456789'
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=25000, lower=True, 
                                                  filters=filters, oov_token='<OOV>')

# tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=len(list(my_vocab)))
tokenizer.fit_on_texts(my_vocab.keys())
# X_tokenized = tokenizer.texts_to_sequences(df.clean)
%time df['tokens_tf'] = tokenizer.texts_to_sequences(df.clean)
print (f"Input sentences tokenized with {tokenizer.get_config()['num_words']} words vocab")
# y = df.group == 'TP'

maxlen = 500
len_max = df.clean.str.len().max()
print (f"Max cleaned title length: {len_max}; limiting/padding sentences to {maxlen} words")

X_padded = tf.keras.preprocessing.sequence.pad_sequences(
    df.tokens_tf, maxlen=maxlen, padding='pre',)

X_train = X_padded[train_idx,:]
X_val = X_padded[val_idx,:]
y =(df.group == 'TP').astype(int)
y_train = y.iloc[train_idx]
y_val = y.iloc[val_idx]
print("Subsets shapes: ", X_train.shape, X_val.shape, y_train.shape, y_val.shape)

In [None]:
""" build transformer model"""

embed_dim = 32  # Embedding size for each token
num_heads = 2  # Number of attention heads
ff_dim = 32  # Hidden layer size in feed forward network inside transformer

class TransformerBlock(layers.Layer):
    """Transformer encoder block: self-attention followed by a feed-forward net.

    Each of the two sub-layers is followed by dropout, a residual connection
    and layer normalisation.

    Args:
        embed_dim: Size of the token vectors; also used as the key size of the
            attention heads and as the output size of the feed-forward net.
        num_heads: Number of attention heads.
        ff_dim: Width of the hidden layer of the feed-forward net.
        rate: Dropout rate used after the attention and after the
            feed-forward net.
    """

    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
        super().__init__()
        self.att = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = keras.Sequential(
            [layers.Dense(ff_dim, activation="relu"), layers.Dense(embed_dim),]
        )
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(rate)
        self.dropout2 = layers.Dropout(rate)

    def call(self, inputs, training=None):
        """Apply the block to `inputs` and return a tensor of the same shape.

        `training` now defaults to None, so the layer can be called as
        `block(x)` without relying on the framework to inject the argument;
        passing it explicitly works exactly as before. The value is forwarded
        to the two dropout layers only.
        """
        # self-attention: the sequence attends to itself (query == value)
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)  # residual + norm
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)  # residual + norm

class TokenAndPositionEmbedding(layers.Layer):
    """Embed token ids and add a learned embedding of each token's position.

    Args:
        maxlen: Number of distinct positions the position embedding supports.
        vocab_size: Number of distinct token ids the token embedding supports.
        embed_dim: Size of both embedding vectors.
    """

    def __init__(self, maxlen, vocab_size, embed_dim):
        super().__init__()
        self.token_emb = layers.Embedding(input_dim=vocab_size, output_dim=embed_dim)
        self.pos_emb = layers.Embedding(input_dim=maxlen, output_dim=embed_dim)

    def call(self, x):
        # positions 0 .. L-1, where L is the size of the last axis of x
        seq_len = tf.shape(x)[-1]
        position_ids = tf.range(start=0, limit=seq_len, delta=1)
        position_vectors = self.pos_emb(position_ids)
        token_vectors = self.token_emb(x)
        return token_vectors + position_vectors

inputs = layers.Input(shape=(maxlen,))

# Size the embedding table from the ids the tokenizer can actually emit.
# Id 0 is the padding value and id 1 the '<OOV>' token, so the vocabulary
# words start at id 2 and the largest id is len(word_index), capped at
# num_words - 1. Using len(my_vocab) as input_dim left the last ids outside
# the table (an out-of-range embedding lookup).
# NOTE: this changes the shape of the token-embedding matrix, so weights saved
# from a model built with the old size cannot be loaded into this one.
n_token_ids = len(tokenizer.word_index) + 1
if tokenizer.num_words:
    n_token_ids = min(n_token_ids, tokenizer.num_words)

embedding_layer = TokenAndPositionEmbedding(maxlen, n_token_ids, embed_dim)
x = embedding_layer(inputs)
transformer_block = TransformerBlock(embed_dim, num_heads, ff_dim)
x = transformer_block(x)
# average over the sequence axis, then a small dense classifier head
x = layers.GlobalAveragePooling1D()(x)
x = layers.Dropout(0.1)(x)
x = layers.Dense(20, activation="relu")(x)
x = layers.Dropout(0.1)(x)
# two softmax outputs: column 0 = no dataset title, column 1 = dataset title
outputs = layers.Dense(2, activation="softmax")(x)

model_t = keras.Model(inputs=inputs, outputs=outputs)
# model_t.summary()

In [None]:
# Integer class targets -> sparse categorical cross-entropy
model_t.compile(
    optimizer=tf.keras.optimizers.Adam(),
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)
# Stop at the first epoch without improvement of the validation loss
# (patience=0) and restore the best weights seen.
callback = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    min_delta=0,
    patience=0,
    verbose=1,
    mode='auto',
    baseline=None,
    restore_best_weights=True,
)

The model is trained for 1 epoch only: this already gives close to 99% accuracy and saves time, and a 2nd epoch already overfits.

There may be some potential in hyperparameters optimization. 

GPU accelerates training by factor of over 10.

In [None]:
# `callbacks` is documented as a list of Callback instances, so the single
# early-stopping callback is wrapped in a list.
history_t = model_t.fit(X_train, y_train, 
                  validation_data=(X_val, y_val),
                  epochs=1, batch_size=32, verbose=1, callbacks=[callback])

In [None]:
history_t.history.values()

## Analyze pubs with 0 direct matches

For a few publications, none of the dataset names could be found by string search. Let's see if our models can point to the relevant sentences which contain dataset names.

In [None]:
# preparing dataset with pubs with zero reference matches
# (ref_matched is the number of a publication's labels found verbatim in its text)
pubs_0 = pub_df[pub_df.ref_matched == 0]
print(f"There are {pubs_0.shape[0]} pubs with zero dataset title matches")
pubs_0.sample(1)

In [None]:
"""Preparing dataset with sentences """
# Title-free sentences of publications whose labels were not all string-matched
# (group 'UNK'), restricted to lengths of 40..500 characters.
is_unknown = sent_df.group == 'UNK'
too_short = sent_df.n_chars < 40    # drop too short sentences
too_long = sent_df.n_chars > 500    # drop too long sentences
sent_0 = sent_df[is_unknown & ~too_short & ~too_long].copy()
sent_0['clean'] = sent_0.sentence.str.lower().replace(r"[^a-z ]+","", regex=True)
print(sent_0.shape)
sent_0.sample(3)

### Test with Naïve Bayes model

In [None]:
%time sent_0['tokenized'] = sent_0.clean.progress_apply(lambda x: [ \
    w for w in nltk.word_tokenize(x) if w not in stopword_list])

%time sent_0_X = vectorizer.transform(sent_0.tokenized.apply(lambda x: ' '.join(x)).values)
sent_0['pred_nb'] = nb_model.predict(sent_0_X).astype(int)

# count sentences with possible title
print (f"candidates found with Naïve Bayes: {sent_0.pred_nb.sum()}")

### Test with transformers model

In [None]:
# Encode the cleaned sentences as token-id sequences and pad them in front
# with zeros to `maxlen` ids.
# NOTE(review): the next cell repeats these two statements before running the
# model, so this cell's results are recomputed there.
sent_0['tokens_tf'] = tokenizer.texts_to_sequences(sent_0.clean)
X_padded_0 = tf.keras.preprocessing.sequence.pad_sequences(
    sent_0.tokens_tf, maxlen=maxlen, padding='pre',)

In [None]:
%%time 
sent_0['tokens_tf'] = tokenizer.texts_to_sequences(sent_0.clean)
X_padded_0 = tf.keras.preprocessing.sequence.pad_sequences(
    sent_0.tokens_tf, maxlen=maxlen, padding='pre',)
y_pred_logits = model_t.predict(X_padded_0, batch_size=32)
y_pred = y_pred_logits.argmax(axis=1)
sent_0['pred_tf'] = y_pred
print (f"identified {y_pred.sum()} candidate sentences with possible dataset titles")

In [None]:
pubs_0.head()

In [None]:
""" show selected sentences """
tf_candidates = sent_0[sent_0.pred_tf==1]
# Show up to 5 of them: DataFrame.sample raises ValueError when asked for
# more rows than exist, which happened whenever fewer than 5 were predicted.
for row in tf_candidates.sample(min(5, len(tf_candidates))).itertuples():
    print("ID:  ", row.Id)
    print("Sentence:  ", row.sentence)
    print("Ground truth:  ", pub_df.loc[row.Id, 'refs'])
    print()

In [None]:
# Column 1 of the model output is the softmax score of the positive class.
# A number of sentences score between 0.1 and 0.5, i.e. below the argmax
# decision boundary.
pos_scores = y_pred_logits[:, 1]
above_half = pos_scores > 0.5
above_tenth = pos_scores > 0.1
print("Num candidates with logit > 0.5: ",  above_half.sum())
print("Num candidates with logit > 0.1: ",  above_tenth.sum())
sns.histplot(pos_scores[above_tenth], bins = 9);
sns.histplot(pos_scores[above_half], bins = 5, color='g', alpha = 1);

## Saving interim results

In [None]:
%%time
# save model weights
model_t.save_weights('./model/sent_transformer')

# Vocab and tf tokenizer
with open('tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)   
with open('my_vocab.pickle', 'wb') as handle:
    pickle.dump(my_vocab, handle, protocol=pickle.HIGHEST_PROTOCOL)

# # collected sentences
# with open('sentences.pickle', 'wb') as handle:
#     pickle.dump(sentences, handle, protocol=pickle.HIGHEST_PROTOCOL)   
# len(sentences)

# Dataframe with all selected sentences
sent_df.to_pickle('sent_df.pickle')
pub_df.to_pickle('pub_df.pickle')

In [None]:
!ls * -lh

# predict test for submission

In [None]:
del sent_df
del pub_df
gc.collect()

In [None]:
# Import matching module
from fuzzywuzzy import fuzz

# prepare list of dataset titles to match
ds_titles = train_df.cleaned_label.unique()
ds_titles.shape

In [None]:
test_data_path = '../input/coleridgeinitiative-show-us-the-data/test'
test_df = sample_sub.Id.to_frame().set_index('Id')
test_sentences = {}
candidate_threshold = 0.3
acceptance_score = 80

def read_json_pub(Id):
    """Load one test publication.

    Args:
        Id: Publication id; the file read is ``<test_data_path>/<Id>.json``.

    Returns:
        The object parsed from the publication's JSON file.
    """
    filename = os.path.join(test_data_path, Id+'.json')
    # JSON is UTF-8 by specification; do not depend on the locale default.
    with open(filename, encoding='utf-8') as f:
        json_pub = json.load(f)
    return json_pub

# Publications are processed one at a time to keep memory usage low:
# load -> split into sentences -> keep sentences the model scores above
# candidate_threshold -> fuzzy-match each kept sentence against known titles.
for row in tqdm(test_df.itertuples(), total = test_df.shape[0]):
    # Load text
    raw_text = read_json_pub(row.Index)
    text = '\n'.join([z for y in raw_text for z in y.values()])

    # split and clean sentences
    sentences = nltk.sent_tokenize(re.sub(r'\.?\n', '. ', text))
    sentences = [re.sub(r"[^a-z ]+","", s.lower()) for s in sentences]

    # A publication without any text yields no sentences; record an empty
    # prediction rather than calling the model on an empty batch.
    if not sentences:
        test_sentences[row.Index] = np.array([], dtype=str)
        test_df.loc[row.Index, 'PredictionString'] = ''
        continue

    # tokenize
    tokens = tokenizer.texts_to_sequences(sentences)
    tokens = tf.keras.preprocessing.sequence.pad_sequences(
        tokens, maxlen=maxlen, padding='pre',)

    # Predict candidates sentences that may contain DS references
    # (column 1 of the model output is the score of the positive class)
    y_pred = model_t.predict(tokens, batch_size=32)
    sent_candidates = np.array(sentences)[y_pred[:,1] > candidate_threshold]
    test_sentences[row.Index] = sent_candidates

    # process candidate sentences for given pub: keep the best-matching known
    # title of each sentence if its fuzzy score exceeds acceptance_score
    ds_candidates = set()
    for sent in sent_candidates:
        scores = [fuzz.partial_ratio(sent, title) for title in ds_titles]
        best_fit_title_index = int(np.argmax(scores))  # computed once, reused below
        if scores[best_fit_title_index] > acceptance_score:
            ds_candidates.add(ds_titles[best_fit_title_index])
    # sorted: set iteration order is not stable between runs, and a stable
    # order makes the submission file reproducible
    prediction_string = ' | '.join(sorted(ds_candidates))
    test_df.loc[row.Index, 'PredictionString'] = prediction_string

In [None]:
test_df.head()

In [None]:
test_df[['PredictionString']].to_csv('submission.csv')

In [None]:
# !cat submission.csv

# Takeaways:

**Immediate results**
- generated set of sentences containing dataset titles.
- established baseline of 93% accuracy with Naïve Bayes model.
- NEW in v.3: achieved close to 99% accuracy with Transformer model
- NEW in v.5: attempt to submit
- NEW in v.6: test set processed consecutively to save RAM

**Improvement opportunities**
- improve sentence detection / splitting 
- test improvement with stemming
- test potential improvement of sentence selection with models based on neural networks.

**Further use:**
- feed selected sentences to NER models for identification of the dataset titles

**Files for reuse:**
- `sentences_with_refs.pickle`, `sentences_empty.pickle` -- dicts with collected sentences
- `sent_df.pickle` -- dataframe with sentences
- `model folder` -- model weights

TBD:

Multiprocess with SpaCy (thx to @lucabasa): https://stackoverflow.com/questions/44395656/applying-spacy-parser-to-pandas-dataframe-w-multiprocessing