## Exploration


In [None]:
import pandas as pd

def _read_intents(csv_path):
    """Read one intent CSV, labelling its two columns as label and query."""
    return pd.read_csv(csv_path, names=["label", "query"])


# Full table plus the train and test splits, loaded one after another.
df = _read_intents('atis_intents.csv')
training_df = _read_intents('atis_intents_train.csv')
test_df = _read_intents('atis_intents_test.csv')
df.head(10)

Unnamed: 0,label,query
0,atis_flight,i want to fly from boston at 838 am and arriv...
1,atis_flight,what flights are available from pittsburgh to...
2,atis_flight_time,what is the arrival time in san francisco for...
3,atis_airfare,cheapest airfare from tacoma to orlando
4,atis_airfare,round trip fares from pittsburgh to philadelp...
5,atis_flight,i need a flight tomorrow from columbus to min...
6,atis_aircraft,what kind of aircraft is used on a flight fro...
7,atis_flight,show me the flights from pittsburgh to los an...
8,atis_flight,all flights from boston to washington
9,atis_ground_service,what kind of ground transportation is availab...


In [None]:
# Report the dimensions of the full intent table.
n_rows, n_cols = df.shape
print("There are {} rows and {} columns".format(n_rows, n_cols))

There are 4977 rows and 2 columns


In [None]:
# List every distinct intent label present in the full table.
print(df["label"].unique())

['atis_flight' 'atis_flight_time' 'atis_airfare' 'atis_aircraft'
 'atis_ground_service' 'atis_airport' 'atis_airline' 'atis_distance'
 'atis_abbreviation' 'atis_ground_fare' 'atis_quantity' 'atis_city'
 'atis_flight_no' 'atis_capacity' 'atis_flight#atis_airfare' 'atis_meal'
 'atis_restriction' 'atis_airline#atis_flight_no'
 'atis_ground_service#atis_ground_fare' 'atis_airfare#atis_flight_time'
 'atis_cheapest' 'atis_aircraft#atis_flight#atis_flight_no']


In [None]:
# Keep only rows whose label holds a single intent (no "#" separator), then
# count the examples per remaining intent.
# na=True makes rows with a missing label count as "contains #", so they are
# dropped as well.
is_multi_intent = df["label"].str.contains("#", na=True)
df = df[~is_multi_intent]
df["label"].value_counts()

atis_flight            3665
atis_airfare            423
atis_ground_service     255
atis_airline            157
atis_abbreviation       147
atis_aircraft            81
atis_flight_time         54
atis_quantity            51
atis_airport             20
atis_distance            20
atis_city                19
atis_ground_fare         18
atis_capacity            16
atis_flight_no           12
atis_meal                 6
atis_restriction          6
atis_cheapest             1
Name: label, dtype: int64

In [None]:
# Preview the first five rows of the test split.
test_df.head(n=5)

Unnamed: 0,label,query
0,atis_airfare,on april first i need a ticket from tacoma to...
1,atis_flight,on april first i need a flight going from pho...
2,atis_flight,i would like a flight traveling one way from ...
3,atis_flight,i would like a flight from orlando to salt la...
4,atis_flight,i need a flight from toronto to newark one wa...


In [None]:
# Report the dimensions of the test split.
n_test_rows, n_test_cols = test_df.shape
print("There are {} rows and {} columns".format(n_test_rows, n_test_cols))

There are 799 rows and 2 columns


In [None]:
# Preview the first five rows of the training split.
training_df.head(n=5)

Unnamed: 0,label,query
0,atis_flight,what flights are available from pittsburgh to...
1,atis_flight_time,what is the arrival time in san francisco for...
2,atis_airfare,cheapest airfare from tacoma to orlando
3,atis_airfare,round trip fares from pittsburgh to philadelp...
4,atis_flight,i need a flight tomorrow from columbus to min...


In [None]:
# Report the dimensions of the training split.
n_train_rows, n_train_cols = training_df.shape
print("There are {} rows and {} columns".format(n_train_rows, n_train_cols))

There are 4833 rows and 2 columns


## Preprocessing

### Text Cleaning

In [None]:
import string

import nltk
from nltk.corpus import stopwords

# Fetch the NLTK stop-word corpus.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
import string

import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from tensorflow.keras.preprocessing.text import Tokenizer

# word_tokenize needs the punkt tokenizer data; fetch it here so this cell
# also works on a freshly started kernel.
nltk.download('punkt')

STOP_WORDS = set(stopwords.words("english"))
PUNCTUATION = set(string.punctuation)
stemmer = PorterStemmer()


def normalize(text):
    """Join a list of tokens back into a single space-separated string."""
    return " ".join(text)


def clean_queries(frame):
    """Add the text-cleaning columns to ``frame`` in place and return it.

    Columns written: lower_text, tokenized, selected, stemmed, normalized.
    """
    # Use ["query"]: attribute access (frame.query) resolves to the
    # DataFrame.query method, not to the column.
    frame["lower_text"] = frame["query"].str.lower()

    # tokenise
    frame["tokenized"] = frame["lower_text"].map(word_tokenize)

    # NOTE(review): the `selected` column was read but never created. The
    # stop-word / punctuation filter below is inferred from the otherwise
    # unused `stopwords` and `string` imports -- confirm this is the intended
    # token selection.
    frame["selected"] = frame["tokenized"].map(
        lambda tokens: [
            token for token in tokens
            if token not in STOP_WORDS and token not in PUNCTUATION
        ]
    )

    # stemming
    frame["stemmed"] = frame["selected"].map(
        lambda tokens: [stemmer.stem(token) for token in tokens]
    )
    frame["normalized"] = frame["stemmed"].apply(normalize)
    return frame


# Both splits go through exactly the same cleaning steps.
clean_queries(training_df)
clean_queries(test_df)

# tokenise with tf: the vocabulary is learned from the training text only
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(training_df["normalized"])

tokenized_train = tokenizer.texts_to_sequences(training_df["normalized"])
# The test sequences must come from the test text, not the training text.
tokenized_test = tokenizer.texts_to_sequences(test_df["normalized"])

# Vocabulary size learned by the tokenizer.
len(tokenizer.word_index)

AttributeError: ignored

### Spacy encoding

In [None]:
# Download the en_core_web_md spaCy pipeline that a later cell loads via spacy.load.
!python -m spacy download en_core_web_md

2023-05-05 21:03:01.229712: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting en-core-web-md==3.5.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.5.0/en_core_web_md-3.5.0-py3-none-any.whl (42.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 MB[0m [31m18.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: en-core-web-md
Successfully installed en-core-web-md-3.5.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_md')


In [None]:
import spacy
import numpy as np

# Load the medium English pipeline; encode_sentences relies on its vectors.
nlp = spacy.load("en_core_web_md")
# vectors_length is the width of each word vector, not a count of vectors.
print("Vector dimensionality: {}".format(nlp.vocab.vectors_length))

Number of vectors: 300


In [None]:
# No earlier cell creates `valid_df`, so a validation split is held out of the
# training frame here. The held-out rows are removed from the training lists
# so the two sets stay disjoint.
# NOTE(review): the 10% fraction and the seed are placeholder choices -- swap
# in a real validation file if one exists. Downstream SVM figures are then
# computed on the reduced training set.
VALID_FRACTION = 0.1
SPLIT_SEED = 42

valid_df = training_df.sample(frac=VALID_FRACTION, random_state=SPLIT_SEED)
train_split_df = training_df.drop(index=valid_df.index)

sen_train = train_split_df['query'].tolist()
labels_train = train_split_df['label'].tolist()

sen_test = test_df['query'].tolist()
labels_test = test_df['label'].tolist()

sen_valid = valid_df['query'].tolist()
labels_valid = valid_df['label'].tolist()

In [None]:
def encode_sentences(sentences):
    """Encode each sentence as its spaCy document vector.

    Args:
        sentences: iterable of strings to encode.

    Returns:
        Array with one row per sentence; row ``i`` holds the ``.vector`` of
        the spaCy document built from ``sentences[i]``. The column count is
        the vector width reported by the loaded pipeline.
    """
    sentences = list(sentences)
    # Take the width from the loaded pipeline rather than hard-coding 300, so
    # the function keeps working if a different model is loaded into `nlp`.
    X = np.zeros((len(sentences), nlp.vocab.vectors_length))
    # nlp.pipe processes the texts as a stream and yields documents in input
    # order.
    for idx, doc in enumerate(nlp.pipe(sentences)):
        X[idx, :] = doc.vector
    return X
# Encode each split with the same sentence encoder. The statements run in
# order, so the train and test matrices exist even if a later line fails.
train_X_spacy = encode_sentences(sen_train)
test_X_spacy = encode_sentences(sen_test)
valid_X_spacy = encode_sentences(sen_valid)

### Label Encoding

In [None]:
from sklearn.preprocessing import LabelEncoder

# Fit the encoder once, on the training labels, and reuse that mapping for the
# test labels. Fitting separately per split only gives matching integer codes
# when both splits happen to contain exactly the same set of classes.
le = LabelEncoder()

labels_train = le.fit_transform(labels_train)
# transform() raises ValueError for a test label never seen in training,
# instead of silently assigning it a conflicting code.
labels_test = le.transform(labels_test)

### One Hot Encoding

In [None]:
from sklearn.preprocessing import OneHotEncoder as OHE
# Learn the one-hot layout of the intent labels from the training split.
train_label_column = training_df.label.to_numpy().reshape(-1, 1)
y_encoder = OHE()
y_encoder.fit(train_label_column)

In [None]:
def _one_hot(labels):
    """Return the dense one-hot matrix of ``labels`` using the fitted y_encoder."""
    label_column = np.array(labels).reshape(-1, 1)
    return y_encoder.transform(label_column).toarray()


ytr_encoded = _one_hot(training_df.label)
yts_encoded = _one_hot(test_df.label)

## MODELS

### SVM

In [None]:
from sklearn.svm import SVC
# RBF-kernel support vector classifier trained on the spaCy sentence vectors.
svm_params = {"kernel": 'rbf', "decision_function_shape": 'ovo'}
clf = SVC(**svm_params)
clf.fit(train_X_spacy, labels_train)

In [None]:
def validate_clf(X, y, model=None):
    """Print how many samples a classifier labels correctly and its accuracy.

    Args:
        X: feature matrix passed to ``predict``.
        y: true labels, one per row of ``X``.
        model: object with a ``predict`` method; defaults to the notebook's
            global ``clf`` when omitted.

    Raises:
        ValueError: if ``y`` is empty, because accuracy is undefined.
    """
    estimator = clf if model is None else model
    n_total = len(y)
    if n_total == 0:
        raise ValueError("validate_clf needs at least one labelled sample")

    y_pred = estimator.predict(X)
    n_correct = sum(1 for predicted, actual in zip(y_pred, y) if predicted == actual)
    print("Predicted {} correctly out of {}".format(n_correct, n_total))
    # The precision argument belongs inside round(); outside it the 2 was just
    # an ignored extra argument to format().
    print("Model accuracy: {}%".format(round(n_correct / n_total * 100, 2)))
# Score the classifier on the data it was trained on.
print('Validation on the train set results:')
validate_clf(X=train_X_spacy, y=labels_train)

Validation on the train set results:
Predicted 4600 correctly out of 4833
Model accuracy: 95%


In [None]:
# Score the classifier on the held-out test split.
print('Validation on the test set results:')
validate_clf(X=test_X_spacy, y=labels_test)

Validation on the test set results:
Predicted 769 correctly out of 799
Model accuracy: 96%


In [None]:
# Predicting on custom data
# The sentence gets its own name: `test_data` is the name of the test
# DataFrame in the LSTM section, and re-running this cell afterwards would
# overwrite that frame with a string.
custom_query = "Would you like to book round trip to India?"
predicted_codes = clf.predict(encode_sentences([custom_query]))
# Map the integer class code back to its intent name.
le.inverse_transform(predicted_codes)

array(['atis_flight'], dtype='<U19')

### LSTM

In [None]:
import pandas as pd
# Re-read the train and test CSVs, this time naming the columns target / text.
lstm_columns = ["target", "text"]
train_data = pd.read_csv('atis_intents_train.csv', names=lstm_columns)
test_data = pd.read_csv('atis_intents_test.csv', names=lstm_columns)

In [None]:
# Resampling is done simply by appending one extra copy of the rows whose
# target is atis_flight_time or atis_quantity.
# DataFrame.append was removed in pandas 2.0; pd.concat is the replacement and,
# like append, keeps the original index values of the copied rows.
# NOTE: re-running this cell duplicates those rows again -- reload train_data first.
minority_rows = train_data.loc[train_data.target.isin(["atis_flight_time", "atis_quantity"]), :]
train_data = pd.concat([train_data, minority_rows])

  train_data= train_data.append(train_data.loc[train_data.target.isin(["atis_flight_time", "atis_quantity"]), :])


In [None]:
import nltk
# Fetch the punkt tokenizer data from NLTK.
# NOTE(review): word_tokenize is already called in the text-cleaning cell above; this download presumably needs to run before that cell -- confirm.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
# pad text
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Bring every token sequence to 20 steps, padding at the front ("pre").
pad_options = {"maxlen": 20, "padding": "pre"}
train_padded = pad_sequences(tokenized_train, **pad_options)
test_padded = pad_sequences(tokenized_test, **pad_options)
train_padded.shape

(4939, 20)

In [None]:
def transform_x(data, tokenizer):
    """One-hot encode padded token ids into a 3D matrix.

    Args:
        data: 2D integer array of token ids with shape (samples, steps).
            An id of 0 is treated as padding.
        tokenizer: fitted tokenizer exposing ``word_index``; its length sets
            the size of the last axis.

    Returns:
        Array of shape (samples, steps, unique_words). For every non-zero id
        ``k`` at position ``[i, j]``, entry ``[i, j, k - 1]`` is 1; all other
        entries, including every padded step, are 0.
    """
    vocab_size = len(tokenizer.word_index)
    results = np.zeros((data.shape[0], data.shape[1], vocab_size))

    # Mark real tokens only. A pad id of 0 would otherwise index slot -1 and
    # be encoded as the last word of the vocabulary.
    sample_idx, step_idx = np.nonzero(data)
    results[sample_idx, step_idx, data[sample_idx, step_idx] - 1] = 1
    return results

# One-hot encode the padded train and test sequences with the shared tokenizer.
xtr_transformed = transform_x(data=train_padded, tokenizer=tokenizer)
xts_transformed = transform_x(data=test_padded, tokenizer=tokenizer)

In [None]:
# Layout produced by transform_x: (samples, steps, unique_words).
xtr_transformed.shape

(4939, 20, 654)

In [None]:
from tensorflow.keras.layers import Dense, LSTM, BatchNormalization, Dropout, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import CategoricalCrossentropy as CC
from tensorflow.keras.activations import relu, softmax
from tensorflow.keras.initializers import he_uniform, glorot_uniform
from tensorflow.keras.metrics import AUC
from tensorflow.keras import Model
from tensorflow.keras.regularizers import l2


class LSTMModel(object):
    """Small wrapper that builds, trains and queries a Keras LSTM classifier."""

    def build_model(self, input_dim, output_shape, steps, dropout_rate, kernel_regularizer, bias_regularizer):
        """Build and compile the network, storing it on ``self.model``.

        Args:
            input_dim: size of the last axis of each input step.
            output_shape: number of units in both Dense layers (the number of
                output classes).
            steps: number of time steps per sample; also used as the LSTM
                unit count.
            dropout_rate: rate passed to the Dropout layer.
            kernel_regularizer: L2 factor applied to the Dense kernels.
            bias_regularizer: L2 factor applied to the Dense biases.
        """
        input_layer = Input(shape=(steps, input_dim))

        # make lstm_layer
        lstm = LSTM(units=steps)(input_layer)
        # The L2 factor is passed positionally because the keyword name is
        # not the same across Keras releases.
        dense_1 = Dense(output_shape, kernel_initializer=he_uniform(),
                        bias_initializer="zeros",
                        kernel_regularizer=l2(kernel_regularizer),
                        bias_regularizer=l2(bias_regularizer))(lstm)
        x = BatchNormalization()(dense_1)
        x = relu(x)
        x = Dropout(rate=dropout_rate)(x)
        # The output layer must consume `x`: wiring it to `dense_1` would
        # leave the BatchNorm / ReLU / Dropout branch disconnected from the
        # model.
        o = Dense(output_shape, kernel_initializer=glorot_uniform(),
                  bias_initializer="zeros",
                  kernel_regularizer=l2(kernel_regularizer),
                  bias_regularizer=l2(bias_regularizer))(x)
        o = BatchNormalization()(o)
        output = softmax(o, axis=1)

        loss = CC()
        metrics = AUC()
        optimizer = Adam()
        self.model = Model(inputs=[input_layer], outputs=[output])
        self.model.compile(optimizer=optimizer, loss=loss, metrics=[metrics])

    def train(self, x, y, validation_split, epochs):
        """Fit the compiled model and return whatever ``Model.fit`` returns.

        Args:
            x: input array passed to ``Model.fit``.
            y: target array passed to ``Model.fit``.
            validation_split: fraction of the data Keras holds out for validation.
            epochs: number of training epochs.
        """
        return self.model.fit(x, y, validation_split=validation_split, epochs=epochs)

    def predict(self, x):
        """Return the model's predictions for ``x``."""
        return self.model.predict(x)

In [None]:
# Build model: input sizes come from the encoded training data.
steps, dim = xtr_transformed.shape[1:]
output_shape = ytr_encoded.shape[1]

model = LSTMModel()
model.build_model(
    input_dim=dim,
    output_shape=output_shape,
    steps=steps,
    dropout_rate=0.5,
    kernel_regularizer=0.3,
    bias_regularizer=0.3,
)

In [None]:
# Train for 60 epochs, holding out 20% of the samples for validation.
model.train(
    x=xtr_transformed,
    y=ytr_encoded,
    validation_split=0.2,
    epochs=60,
)

Epoch 1/60
Epoch 2/60
Epoch 3/60
Epoch 4/60
Epoch 5/60
Epoch 6/60
Epoch 7/60
Epoch 8/60
Epoch 9/60
Epoch 10/60
Epoch 11/60
Epoch 12/60
Epoch 13/60
Epoch 14/60
Epoch 15/60
Epoch 16/60
Epoch 17/60
Epoch 18/60
Epoch 19/60
Epoch 20/60
Epoch 21/60
Epoch 22/60
Epoch 23/60
Epoch 24/60
Epoch 25/60
Epoch 26/60
Epoch 27/60
Epoch 28/60
Epoch 29/60
 19/124 [===>..........................] - ETA: 2s - loss: 0.1988 - auc: 0.9999

In [None]:
# Evaluate on the training split
from sklearn.metrics import classification_report

# xtr_transformed and ytr_encoded are both derived from training_df, so its
# labels are the ground truth that lines up with these predictions.
# train_data carries extra oversampled rows and has a different length.
prediction = y_encoder.inverse_transform(model.predict(xtr_transformed)).ravel()
print(classification_report(training_df["label"], prediction))

In [None]:
# Test: score the model on the held-out test split.
from sklearn.metrics import classification_report

test_scores = model.predict(xts_transformed)
prediction_test = y_encoder.inverse_transform(test_scores)
print(classification_report(test_data.target, prediction_test))