# LASER Embeddings Approach
By: Haasitha Pidaparthi

References:
- https://github.com/facebookresearch/LASER
- https://engineering.fb.com/2019/01/22/ai-research/laser-multilingual-sentence-embeddings/
- https://github.com/yannvgn/laserembeddings/tree/ceb3818c998099d315a935210d3962640922fa8b
- https://pypi.org/project/laserembeddings/
- https://www.kaggle.com/kiatweitan/contradictory-oh-my-dear-watson-laser-embedding
- https://www.kaggle.com/camnugent/faa-laser-days-of-the-week-hypothesis-test/comments#214360

## 1. Import libraries and datasets

In [None]:
!pip install laserembeddings

In [None]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

# modeling
import laserembeddings
from laserembeddings import Laser
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import OneHotEncoder

from fastcache import clru_cache
from typing import List, Union

In [None]:
# Read both competition CSVs with the first column as the index; missing
# cells are replaced by empty strings.
df_train = pd.read_csv(
    '../input/contradictory-my-dear-watson/train.csv',
    index_col=0,
)
df_train = df_train.fillna('')
df_test = pd.read_csv(
    '../input/contradictory-my-dear-watson/test.csv',
    index_col=0,
)
df_test = df_test.fillna('')
df_train  # last expression of the cell: rendered as its output

## 2. LASER Embeddings


In [None]:
%%bash
# Fetch the three LASER artifacts (encoder weights, BPE codes, BPE vocabulary)
# into models/laser/. wget -c resumes a partial download, -q silences output.
mkdir -p models/laser/
for ARTIFACT in bilstm.93langs.2018-12-26.pt 93langs.fcodes 93langs.fvocab; do
    wget -cq https://dl.fbaipublicfiles.com/laser/models/$ARTIFACT -O models/laser/$ARTIFACT
done

In [None]:
# Filesystem locations of the LASER artifacts used to build the encoder.
config = dict(
    laser=dict(
        base_dir="./models/laser",
        bpe_codes="./models/laser/93langs.fcodes",
        bpe_vocab="./models/laser/93langs.fvocab",
        encoder="./models/laser/bilstm.93langs.2018-12-26.pt",
    ),
)

@clru_cache(None)
def get_laser_model():
    """Build the LASER sentence encoder from the paths in ``config['laser']``.

    The ``clru_cache`` decorator memoises this zero-argument call, so repeated
    calls return the already-constructed model instead of reloading it.
    """
    paths = config['laser']
    return Laser(
        bpe_codes=paths['bpe_codes'],
        bpe_vocab=paths['bpe_vocab'],
        encoder=paths['encoder'],
        tokenizer_options=None,
        embedding_options=None,
    )

# Template for encoding text
def laser_encode(text: Union[str, List[str]], lang='en', normalize=True) -> np.ndarray:
    """Embed one sentence or a sequence of sentences with the LASER model.

    Args:
        text: A single string, or an iterable of strings (a list, a pandas
            Series, ...).
        lang: A language code applied to every sentence, or an iterable
            holding one code per sentence.
        normalize: When true, scale every embedding row to unit L2 norm.

    Returns:
        A 2-D array with one embedding row per input sentence.
    """
    laser_model = get_laser_model()

    sentences = [text] if isinstance(text, str) else list(text)
    # Materialise per-sentence language codes the same way as the sentences so
    # the encoder receives two plain, positionally aligned lists.
    languages = lang if isinstance(lang, str) else list(lang)

    embedding = laser_model.embed_sentences(sentences, lang=languages)
    if normalize:
        norms = np.linalg.norm(embedding, axis=1, keepdims=True)
        # An all-zero embedding would divide by zero and turn the row into NaN;
        # dividing by 1 instead leaves such a row as zeros.
        norms[norms == 0] = 1
        embedding = embedding / norms

    return embedding

In [None]:
def encode_X(df):
    """Build the feature matrix for the premise/hypothesis pairs in ``df``.

    Each output row is the premise embedding, the hypothesis embedding and the
    cosine similarity between the two, concatenated in that order.

    Args:
        df: DataFrame providing the 'premise', 'hypothesis' and 'lang_abv'
            columns.

    Returns:
        A 2-D array with one feature row per row of ``df``.
    """
    premise    = laser_encode(df['premise'],    lang=df['lang_abv'])
    hypothesis = laser_encode(df['hypothesis'], lang=df['lang_abv'])

    # Row-wise cosine similarity in a single vectorised pass, instead of one
    # scikit-learn call (with its input validation) per row.
    norms = np.linalg.norm(premise, axis=1) * np.linalg.norm(hypothesis, axis=1)
    norms[norms == 0] = 1  # a zero vector yields similarity 0 rather than NaN
    cosine = (np.sum(premise * hypothesis, axis=1) / norms).reshape(-1, 1)

    return np.hstack([premise, hypothesis, cosine])
    
def encode_Y(df):
    """One-hot encode the ``label`` column of ``df``.

    The encoder is fitted on the fixed categories 0, 1 and 2 rather than on
    the labels that happen to occur in ``df``.
    """
    one_hot = OneHotEncoder()
    one_hot.fit([[0], [1], [2]])
    label_column = df['label'].to_numpy().reshape(-1, 1)
    return one_hot.transform(label_column).toarray()

def decode_Y(one_hot_encoded):
    """Return, as int32, the index of the largest entry in each row."""
    return tf.argmax(one_hot_encoded, axis=1).numpy().astype(np.int32)

In [None]:
%%time
# Encode the full training frame into the feature matrix and one-hot targets.
X_train, Y_train = encode_X(df_train), encode_Y(df_train)

## 3. Neural Network

In [None]:
# Training hyperparameters
batch_size = 32   # samples per gradient step (model.fit)
epochs = 100      # upper bound on the number of training epochs
test_size = 0.2   # fraction used for the hold-out split and for validation_split
verbose = 2       # logging verbosity handed to model.fit

In [None]:
# Hold out a `test_size` fraction of the encoded data for final evaluation.
# The fixed random_state makes the shuffle reproducible.
# NOTE: X_train / Y_train are rebound to the reduced training portion, so
# re-running this cell without re-encoding splits the data again.
X_train, X_test, Y_train, Y_test = train_test_split(
    X_train, Y_train, test_size=test_size, random_state=42
)

In [None]:
# Feed-forward classifier: four Dense -> BatchNorm -> Dropout stages of
# decreasing width, followed by one output unit per class.
model = tf.keras.Sequential([
    tf.keras.Input(shape=(X_train.shape[1],)),
    tf.keras.layers.Dense(512, activation=tf.keras.layers.LeakyReLU(alpha=0.1)),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Dropout(0.25),
    tf.keras.layers.Dense(128, activation=tf.keras.layers.LeakyReLU(alpha=0.1)),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Dropout(0.25),
    tf.keras.layers.Dense(32, activation=tf.keras.layers.LeakyReLU(alpha=0.1)),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Dropout(0.25),
    tf.keras.layers.Dense(8, activation=tf.keras.layers.LeakyReLU(alpha=0.1)),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Dropout(0.25),
    # No output activation: the loss is built with from_logits=True and applies
    # softmax itself, so this layer must emit raw logits. A sigmoid here would
    # squash the scores into (0, 1) before that softmax and flatten the
    # predicted class distribution.
    tf.keras.layers.Dense(Y_train.shape[1]),
])
model.summary()

In [None]:
# Configure training. from_logits=True tells the loss to treat the model
# outputs as unnormalised scores and to apply softmax internally.
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-5),
    loss=tf.keras.losses.CategoricalCrossentropy(from_logits=True),
    metrics=[tf.keras.metrics.CategoricalAccuracy()],
)

In [None]:
%%time
# Train the classifier; `validation_split` carves the validation data out of
# the arrays passed in here.
history = model.fit(
    X_train, Y_train,
    batch_size=batch_size,
    epochs=epochs,
    validation_split=test_size,
    callbacks=[
        # Patience is kept well below the epoch budget so the callback can
        # actually trigger; a patience equal to the number of epochs can never
        # be exhausted. When training is stopped early, restore_best_weights
        # rolls the model back to the epoch with the lowest val_loss.
        tf.keras.callbacks.EarlyStopping(
            monitor='val_loss',
            mode='min',
            verbose=0,
            patience=10,
            restore_best_weights=True,
        ),
        # Keep the weights with the best validation accuracy on disk.
        tf.keras.callbacks.ModelCheckpoint(
            'model.h5',
            monitor='val_categorical_accuracy',
            mode='max',
            verbose=0,
            save_best_only=True,
        ),
    ],
    verbose=verbose,
)

In [None]:
# Evaluate the model on the training portion and on the hold-out split.
# model.evaluate reports the loss and the metrics configured in model.compile;
# only the last call's return value is rendered as the cell output.
print()
print('Train Accuracy')
model.evaluate(X_train, Y_train)

print('Test Accuracy')
model.evaluate(X_test, Y_test)

## 4. Visualize Model Training

In [None]:
# Training vs. validation loss per epoch, as recorded in the fit history.
plt.title('Model Loss')
plt.xlabel('epoch')
plt.ylabel('loss')
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.legend(['train', 'val'], loc='upper left')
plt.show()

## 5. Submission

In [None]:
%%time 
# Encode the competition test set and predict one class index per row.
# NOTE(review): this rebinds X_test and Y_test, which earlier cells use for the
# hold-out split; re-running the evaluation cell after this one would evaluate
# against these values instead.
X_test = encode_X( df_test)
Y_test = decode_Y( model.predict(X_test) )

In [None]:
# Fill the sample submission's `prediction` column with the predicted labels
# and write the result to submission.csv.
df_submission = pd.read_csv(
    '../input/contradictory-my-dear-watson/sample_submission.csv',
    index_col=0,
)
df_submission = df_submission.fillna('')
df_submission['prediction'] = Y_test
df_submission.to_csv('submission.csv')
!head submission.csv