# Goal: Intent classification with Universal Sentence Encoder

The beauty and magic of USE is that it takes care of the text cleaning, tokenization and embedding for you, with amazing results. 

In this notebook, I test USE on ATIS (Airline Travel Information System), a small, unbalanced dataset. 
The task is to predict the user intent - one out of 8 mutually exclusive classes.


The USE part was borrowed from https://www.kaggle.com/xhlulu/disaster-nlp-train-a-universal-sentence-encoder

In [None]:
import os
import numpy as np  
import pandas as pd  
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report
from tqdm.auto import tqdm
tqdm.pandas()

import tensorflow as tf
import tensorflow_hub as hub
from tensorflow.keras.optimizers import Adam, SGD
from tensorflow.keras.layers import Dense, Input, Flatten, BatchNormalization, Dropout, Concatenate
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.callbacks import ModelCheckpoint



# Data

In [None]:
# Load the training split and give its two columns descriptive names.
# NOTE(review): read_csv is called with its default header handling, so the
# file's first line is consumed as column names before being overwritten
# below. If the CSV has no header row, that first sample is silently lost -
# confirm against the file, and pass header=None if so.
train = pd.read_csv('../input/atis-airlinetravelinformationsystem/atis_intents_train.csv')
train.columns = ['intent', 'snippet']

# Quick sanity check: (rows, columns) and the first few rows.
print(train.shape)
train.head()

In [None]:
# Class distribution of the training split: absolute counts, then proportions.
(
    train['intent'].value_counts(),
    train['intent'].value_counts(normalize=True),
)

In [None]:
# Load the test split and give its two columns descriptive names.
# NOTE(review): read_csv is called with its default header handling, so the
# file's first line is consumed as column names before being overwritten
# below. If the CSV has no header row, that first sample is silently lost -
# confirm against the file, and pass header=None if so.
test = pd.read_csv('../input/atis-airlinetravelinformationsystem/atis_intents_test.csv')
test.columns = ['intent', 'snippet']

# Quick sanity check: (rows, columns) and the first few rows.
print(test.shape)
test.head()

In [None]:
# Class distribution of the test split: absolute counts, then proportions.
(
    test['intent'].value_counts(),
    test['intent'].value_counts(normalize=True),
)

Both datasets are unbalanced: always guessing the majority class already gives an accuracy of 0.79.

In [None]:
# Pull the raw text (model input) and the intent label (target) out of each
# split as plain arrays.
train_data, train_labels = train['snippet'].values, train['intent'].values
test_data, test_labels = test['snippet'].values, test['intent'].values

# Inputs and labels of a split must have matching lengths.
tuple(len(part) for part in (train_data, train_labels, test_data, test_labels))

In [None]:
# Show one arbitrary training example: its text first, then its intent label.
for column in (train_data, train_labels):
    print(column[123])

### One-hot encode (OHE) the labels

In [None]:
# One-hot encode the training labels: one indicator column per distinct
# intent, one row per training example. These columns are what the model is
# fitted against.
y_train = pd.get_dummies(train_labels)
print(y_train.shape)
y_train.head()

In [None]:
# One-hot encode the test labels, then align the columns with the training
# encoding. Encoding the two splits independently only yields matching
# columns when both contain exactly the same set of intents; aligning
# explicitly guarantees the same classes in the same order as y_train.
# An intent absent from the test split becomes an all-zero column; an intent
# that only occurs in the test split (unknown to the model) is dropped.
y_test = pd.get_dummies(test_labels).reindex(columns=y_train.columns, fill_value=0)
print(y_test.shape)
y_test.head()

# Model - Universal Sentence Encoder

In [None]:
%%time
# Wrap the Universal Sentence Encoder (large, v4) from TF-Hub as a Keras layer.
# trainable=True lets the encoder weights be fine-tuned together with the
# classification head.
module_url = 'https://tfhub.dev/google/universal-sentence-encoder-large/4'
embed = hub.KerasLayer(module_url, trainable=True, name='USE_embedding')

# With trainable=False (frozen encoder) the author observed poor performance:
# accuracy of 0.79, i.e. no better than always guessing the majority class.

In [None]:
def build_model(embed, num_classes=8, learning_rate=1e-5):
    """Build and compile a sentence classifier on top of an embedding layer.

    The model takes raw strings (scalar ``tf.string`` inputs), passes them
    through ``embed`` and applies a single softmax layer over the classes.

    Args:
        embed: Keras-compatible layer that turns a batch of strings into
            fixed-size vectors (here the TF-Hub USE layer).
        num_classes: Number of output classes, i.e. the width of the softmax
            layer. Must match the number of one-hot label columns.
        learning_rate: Learning rate passed to the Adam optimizer.

    Returns:
        The compiled ``Sequential`` model, using categorical cross-entropy
        loss (one-hot targets) and reporting accuracy.
    """
    model = Sequential([
        Input(shape=[], dtype=tf.string),
        embed,
        Dense(num_classes, activation='softmax'),
    ])
    model.compile(
        Adam(learning_rate),
        loss='categorical_crossentropy',
        metrics=['accuracy'],
    )

    return model


# Size the output layer from the one-hot training labels instead of relying
# on a hard-coded class count.
model = build_model(embed, num_classes=y_train.shape[1])
model.summary()

In [None]:
# Save the model to disk whenever the validation loss reaches a new minimum.
checkpoint = ModelCheckpoint(
    filepath='modelATIS.h5',
    monitor='val_loss',
    save_best_only=True,
)

# Fit on the training snippets, holding out 20% of them for validation.
train_history = model.fit(
    x=train_data,
    y=y_train,
    batch_size=32,
    epochs=10,
    validation_split=0.20,
    callbacks=[checkpoint],
)

In [None]:
# Training vs. validation curves: first the loss, then the accuracy.
history_dict = train_history.history

# One entry per figure:
# (history key, training label, validation label, title, y-axis label)
curve_specs = (
    ('loss', 'Training loss', 'Validation loss',
     'Training and validation loss', 'Loss'),
    ('accuracy', 'Training acc', 'Validation acc',
     'Training and validation accuracy', 'Accuracy'),
)

for key, train_label, val_label, title, y_label in curve_specs:
    plt.clf()
    train_curve = history_dict[key]
    val_curve = history_dict['val_' + key]
    # Epochs are numbered from 1 on the x-axis.
    epochs = range(1, len(train_curve) + 1)
    plt.plot(epochs, train_curve, 'bo', label=train_label)
    plt.plot(epochs, val_curve, 'r', label=val_label)
    plt.title(title)
    plt.xlabel('Epochs')
    plt.ylabel(y_label)
    plt.legend()
    plt.show()

In [None]:
# Score the test snippets with the in-memory model. Note that this is the
# model as left by the last training epoch; the best-val_loss checkpoint
# written to 'modelATIS.h5' is not reloaded in the visible code.
rawPreds = model.predict(test_data)
# One row per test snippet, one column per class (softmax outputs).
rawPreds.shape

In [None]:
# Turn each row of class probabilities into an intent name.
# The model was fitted against the columns of y_train, so output position i
# corresponds to y_train.columns[i]; the names must be looked up there rather
# than in the independently encoded test labels.
class_names = y_train.columns
Preds = [class_names[pos] for pos in rawPreds.argmax(axis=1)]

len(Preds)
    

In [None]:
# classification_report expects (y_true, y_pred); passing them the other way
# round swaps precision and recall (and reports support of the predictions).
print(classification_report(test_labels, Preds))