# NLP Contradictory - LASER Embeddings + Keras

This notebook is a port of my [NLP Disaster Tweets - LASER Embeddings + Keras](https://www.kaggle.com/jamesmcguigan/nlp-laser-embeddings-keras) notebook. It encodes each premise and hypothesis using [LASER](https://github.com/yannvgn/laserembeddings) multilingual sentence embeddings,
followed by a [TF Keras](https://www.tensorflow.org/api_docs/python/tf/keras) dense neural network.

In [None]:
!pip install -q laserembeddings laserembeddings[zh] laserembeddings[ja]
!pip install -q ftfy

In [None]:
# import fasttext
import ftfy
import html
import laserembeddings
import numpy as np
import pandas as pd
import re
import tensorflow as tf
import sys

from fastcache import clru_cache
from laserembeddings import Laser
from typing import List, Union
from urllib.parse import unquote
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import OneHotEncoder

In [None]:
# Load the competition train / test CSVs, using the first column as the index
# and replacing missing values with empty strings
df_train = pd.read_csv('../input/contradictory-my-dear-watson/train.csv', index_col=0).fillna('')
df_test  = pd.read_csv('../input/contradictory-my-dear-watson/test.csv',  index_col=0).fillna('')
# Show the training frame as the cell output
df_train

# LASER Embeddings

This encodes each of the strings as a LASER embedding (a 1024-dimensional vector).

In [None]:
%%bash
# DOCS: https://github.com/facebookresearch/LASER/blob/master/install_models.sh
# Downloads the LASER 93-language model files into models/laser/.

# Abort on the first error: without this the cell's exit status is that of the
# last loop iteration, so a failed download of an earlier file goes unnoticed.
set -euo pipefail

mkdir -p models/laser/
# for FILE in bilstm.eparl21.2018-11-19.pt eparl21.fcodes eparl21.fvocab bilstm.93langs.2018-12-26.pt 93langs.fcodes 93langs.fvocab; do
for FILE in bilstm.93langs.2018-12-26.pt 93langs.fcodes 93langs.fvocab; do
    # -c resumes a partial download, -q silences progress output
    wget -cq "https://dl.fbaipublicfiles.com/laser/models/$FILE" -O "models/laser/$FILE"
done

In [None]:
# from config import config
# from src.utils.fasttest_model import language_detect
# from src.utils.punkt_tokenizer import punkt_tokenize_sentences

# Locations of the LASER model files (the directory populated by the download cell)
config = dict(
    laser = dict(
        base_dir  = "./models/laser",
        bpe_codes = "./models/laser/93langs.fcodes",
        bpe_vocab = "./models/laser/93langs.fvocab",
        encoder   = "./models/laser/bilstm.93langs.2018-12-26.pt",
    ),
)

# Build the LASER encoder on first use and hand back the cached instance afterwards
# BUG: CUDA GPU memory is exceeded if both laser and labse are loaded together
@clru_cache(None)
def get_laser_model():
    """
    Returns the LASER encoder configured from ``config['laser']``.

    The function takes no arguments and is wrapped in ``clru_cache``,
    so repeated calls reuse the first constructed model.
    """
    paths = config['laser']
    return Laser(
        bpe_codes=paths['bpe_codes'],
        bpe_vocab=paths['bpe_vocab'],
        encoder=paths['encoder'],
        tokenizer_options=None,
        embedding_options=None,
    )


def laser_encode(text: Union[str, List[str]], lang: Union[str, List[str]] = 'en', normalize=True) -> np.ndarray:
    """
    Encodes a corpus of text using LASER
    :param text:      a single string (embedded as one sentence), or an iterable of sentences
    :param lang:      language code applied to every sentence, or an iterable with one code per sentence
                      (no language autodetection is performed)
    :param normalize: if True, scale every embedding row to unit L2 norm
    :return:          embedding matrix with one row per sentence
    """
    laser_model = get_laser_model()

    # A bare string is embedded as exactly one sentence; no sentence splitting happens here
    sentences = [text] if isinstance(text, str) else list(text)

    # Materialise per-sentence language codes (e.g. a pandas Series) into a plain list
    if not isinstance(lang, str):
        lang = list(lang)

    embedding = laser_model.embed_sentences(sentences, lang=lang)

    if normalize:
        norms = np.linalg.norm(embedding, axis=1, keepdims=True)
        # Leave all-zero rows untouched instead of dividing by zero and producing NaN
        norms[norms == 0] = 1
        embedding = embedding / norms

    return embedding

In [None]:
def encode_X(df):
    """
    Builds the feature matrix for a dataframe of premise / hypothesis pairs.

    Each output row is the premise embedding, the hypothesis embedding and the
    cosine similarity between the two, concatenated in that order.

    :param df: dataframe with 'premise', 'hypothesis' and 'lang_abv' columns
    :return:   2D array with one row per row of df
    """
    premise    = np.asarray(laser_encode(df['premise'],    lang=df['lang_abv']))
    hypothesis = np.asarray(laser_encode(df['hypothesis'], lang=df['lang_abv']))

    # Row-wise cosine similarity in a single vectorised pass,
    # rather than one pairwise call per row inside a Python loop
    norms = np.linalg.norm(premise, axis=1) * np.linalg.norm(hypothesis, axis=1)
    norms[norms == 0] = 1  # a zero vector gets similarity 0 instead of 0/0 = NaN
    cosine = (np.sum(premise * hypothesis, axis=1) / norms).reshape(-1, 1)

    X = np.hstack([ premise, hypothesis, cosine ])

    # Shapes noted in the original debug prints (presumably for the training set):
    # premise.shape     (12120, 1024)
    # hypothesis.shape  (12120, 1024)
    # cosine.shape      (12120, 1)
    # X.shape           (12120, 2049)
    return X
    

def encode_Y(df):
    """One-hot encodes df['label'] over the classes 0, 1 and 2, returning a dense array with one row per label."""
    known_classes = [ [0], [1], [2] ]
    labels        = df['label'].to_numpy().reshape(-1,1)
    one_hot       = OneHotEncoder().fit(known_classes)
    return one_hot.transform(labels).toarray()

def decode_Y(one_hot_encoded):
    """Maps each row of one-hot / per-class scores to the index of its largest entry, returned as an int32 array."""
    class_index = tf.argmax(one_hot_encoded, axis=1).numpy()
    return class_index.astype(np.int32)

In [None]:
%%time
# Encode the training set: X_train is the feature matrix built by encode_X,
# Y_train the one-hot encoded labels built by encode_Y
X_train = encode_X(df_train)
Y_train = encode_Y(df_train)

# Neural Network - TF Keras

Define and train a dense neural network. 

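The input is a 2049-dimensional feature vector (the 1024-dimensional LASER embeddings of the premise and of the hypothesis, plus their cosine similarity), and the output is a prediction over the 3 label classes.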

A triangular shaped architecture is used, including Dropout and BatchNorm.

In [None]:
# DOCS: https://keras.io/examples/keras_recipes/antirectifier/

def model_compile_fit(
    X, Y,
    model      = None,
    test_size  = 0.2,
    epochs     = 1000,
    batch_size = 32,
    verbose    = 2,
):
    """
    Builds (when needed), compiles and trains a dense classifier, then prints its accuracy.

    :param X:          2D feature matrix, one row per sample
    :param Y:          2D one-hot label matrix, one column per class
    :param model:      existing Keras model to (re)compile and keep training; a new one is built if None
    :param test_size:  fraction of rows held out via train_test_split for the final evaluation.
                       When truthy, Keras additionally uses the same fraction of the remaining
                       training rows as validation data for early stopping and checkpointing.
                       When falsy, the model is fitted on all rows without callbacks.
    :param epochs:     maximum number of training epochs
    :param batch_size: mini-batch size passed to model.fit
    :param verbose:    verbosity passed to model.fit
    :return:           the trained model
    """
    # Build the model
    if model is None:
        model = tf.keras.Sequential([
            tf.keras.Input(shape=(X.shape[1],)),
            tf.keras.layers.Dense(512, activation=tf.keras.layers.LeakyReLU(alpha=0.1)),
                tf.keras.layers.BatchNormalization(),
                tf.keras.layers.Dropout(0.25),
            tf.keras.layers.Dense(128, activation=tf.keras.layers.LeakyReLU(alpha=0.1)),
                tf.keras.layers.BatchNormalization(),
                tf.keras.layers.Dropout(0.25),
            tf.keras.layers.Dense(32, activation=tf.keras.layers.LeakyReLU(alpha=0.1)),
                tf.keras.layers.BatchNormalization(),
                tf.keras.layers.Dropout(0.25),
            tf.keras.layers.Dense(8, activation=tf.keras.layers.LeakyReLU(alpha=0.1)),
                tf.keras.layers.BatchNormalization(),
                tf.keras.layers.Dropout(0.25),
            # Softmax: the classes are mutually exclusive, so the outputs form a probability distribution
            tf.keras.layers.Dense(Y.shape[1], activation=tf.keras.activations.softmax),
        ])
        model.summary()

    # Compile the model
    model.compile(
        # The output layer already emits probabilities, so the loss must NOT treat them as logits
        loss      = tf.keras.losses.CategoricalCrossentropy(from_logits=False),
        optimizer = tf.keras.optimizers.Adam(learning_rate=1e-5),
        metrics   = [ tf.keras.metrics.CategoricalAccuracy() ],
    )

    # Train the model
    if test_size:
        # Held-out rows are only used for the final evaluation below
        X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=test_size)

        model.fit(
            X_train, Y_train,
            batch_size = batch_size,
            epochs     = epochs,
            # A further slice of the training rows drives early stopping / checkpointing
            validation_split = test_size,
            callbacks = [
                tf.keras.callbacks.EarlyStopping(
                    monitor  = 'val_loss',
                    mode     = 'min',
                    verbose  = 0,
                    patience = 100
                ),
                tf.keras.callbacks.ModelCheckpoint(
                    'model.h5',
                    monitor = 'val_categorical_accuracy',
                    mode    = 'max',
                    verbose = 0,
                    save_best_only = True
                )
            ],
            verbose = verbose
        )
    else:
        # No hold-out: fit on every row, without validation data or callbacks
        X_train, Y_train = X, Y

        model.fit(
            X, Y,
            batch_size = batch_size,
            epochs     = epochs,
            verbose    = verbose,
        )


    print()
    print('Train Accuracy')
    model.evaluate(X_train, Y_train)

    if test_size:
        print('Test Accuracy')
        model.evaluate(X_test, Y_test)

    return model

In [None]:
%%time
# First pass: train a new model with a 20% hold-out (early stopping + checkpointing enabled)
model = model_compile_fit(X_train, Y_train, test_size=0.2)
# Second pass: keep training the same model on all rows (test_size=0 disables the hold-out and the callbacks)
model = model_compile_fit(X_train, Y_train, test_size=0, epochs=1000, model=model, verbose=0)
# Save the final model; this is the same 'model.h5' path the checkpoint callback writes to
model.save('model.h5')

# Submission

In [None]:
%%time 

# Encode the test set with the same feature pipeline, then turn the model's
# per-class outputs into integer labels
X_test = encode_X( df_test)
Y_test = decode_Y( model.predict(X_test) )

In [None]:
# Fill the sample submission's 'prediction' column with the predicted labels and write it out.
# NOTE(review): Y_test is assigned positionally — this assumes sample_submission.csv lists its rows
# in the same order as test.csv; confirm.
df_submission = pd.read_csv('../input/contradictory-my-dear-watson/sample_submission.csv', index_col=0).fillna('')
df_submission['prediction'] = Y_test
df_submission.to_csv('submission.csv')
# Preview the first lines of the written file
!head submission.csv

# Further Reading

This notebook is part of a series exploring Natural Language Processing

NLP Disaster Tweets
- 0.74164 - [NLP Logistic Regression](https://www.kaggle.com/jamesmcguigan/disaster-tweets-logistic-regression)
- 0.77536 - [NLP TF-IDF Classifier](https://www.kaggle.com/jamesmcguigan/disaster-tweets-tf-idf-classifier)
- 0.78302 - [NLP LASER Embeddings + Keras](https://www.kaggle.com/jamesmcguigan/nlp-laser-embeddings-keras)
- 0.79742 - [NLP Naive Bayes](https://www.kaggle.com/jamesmcguigan/nlp-naive-bayes)

Contradictory, My Dear Watson
- 0.50779 - [NLP Contradictory - LASER Embeddings + Keras](https://www.kaggle.com/jamesmcguigan/nlp-contradictory-laser-embeddings-keras)