# Import Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
from scipy import spatial
from nltk.corpus import stopwords

In [None]:
from nltk.tokenize import word_tokenize
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical

In [None]:
from keras.layers.embeddings import Embedding
from keras.layers import Flatten, LSTM, Conv1D, MaxPooling1D, Dropout, Activation
from keras.layers import Input, Dense
from keras.models import Sequential

In [None]:
from sklearn.metrics import classification_report
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.manifold import TSNE

In [None]:
# The ATIS intent files are read without a header row, so columns are addressed
# by position: later cells use column 0 as the intent label and column 1 as the
# utterance text.
train, test = (
    pd.read_csv(csv_path, header=None)
    for csv_path in (
        "../input/atis-airlinetravelinformationsystem/atis_intents_train.csv",
        "../input/atis-airlinetravelinformationsystem/atis_intents_test.csv",
    )
)

# **Data Pre-Processing**

In [None]:
# English stop words from the NLTK corpus, held in a set so the per-token
# membership checks used when filtering utterances are fast.
words = set(stopwords.words("english"))

In [None]:
# Peek at a sample of the stop-word set. The loop stops as soon as the running
# position reaches 20, so 19 entries are printed (set iteration order is
# arbitrary).
for count, elem in enumerate(words, start=1):
    if count == 20:
        break
    print(elem)

In [None]:
# Preview the first rows of the raw training frame.
train.head()

In [None]:
# Preview the first rows of the raw test frame.
test.head()

## Stopwords Corpus

In [None]:
def _strip_stopwords(sentence):
    """Return *sentence* with every whitespace-separated stop word removed."""
    kept_tokens = [token for token in sentence.split() if token not in words]
    return ' '.join(kept_tokens)


# Column 1 holds the raw utterance; the cleaned version is stored under 'text'.
train['text'] = train[1].apply(_strip_stopwords)
test['text'] = test[1].apply(_strip_stopwords)

## Digits Removal \d+

In [None]:
# Strip every run of digits from the cleaned utterances.
# The pattern is a raw string so that `\d` is not treated as an (invalid) string
# escape, and `regex=True` is passed explicitly: recent pandas versions default
# to literal matching, which would leave the digits in place.
train['text'] = train['text'].str.replace(r'\d+', '', regex=True)
test['text'] = test['text'].str.replace(r'\d+', '', regex=True)

In [None]:
# Training split: cleaned utterances and their intent labels (column 0).
text, labels = train['text'], train[0]
# Test split, same layout.
test_text, test_labels = test['text'], test[0]

In [None]:
# Number of distinct intent labels in the training split.
labels.nunique()

## Tokenize and Padding

In [None]:
from keras.preprocessing.text import Tokenizer
# Learn the vocabulary from the training utterances only.
tok = Tokenizer()
tok.fit_on_texts(text)
# Mapping word -> integer index; reused later to fill the embedding matrix.
word_index = tok.word_index

Each word is assigned an integer index: the vocabulary contains 631 distinct words, so together with the reserved padding index 0 the indices span 0-631.

In [None]:
# One more slot than there are entries in word_index; this is the row count of
# the embedding matrix and the input dimension of the Embedding layer.
# NOTE(review): presumably the extra slot is index 0, which the Keras Tokenizer
# never assigns to a word and which padding uses — confirm against Keras docs.
max_vocab_size = len(word_index) + 1
# Fixed length every tokenized utterance is padded to.
input_length = 25

In [None]:
# Convert each utterance into its list of word indices using the tokenizer
# fitted on the training text.
train_data_tokens = tok.texts_to_sequences(text)
test_data_tokens = tok.texts_to_sequences(test_text)

### Tokenized each word based off of word index

In [None]:
# Bring every index sequence to exactly `input_length` entries so the batches
# form a rectangular array.
train_input = pad_sequences(train_data_tokens, maxlen=input_length)
test_input = pad_sequences(test_data_tokens, maxlen=input_length)

### Padded each sentence (text) to the same size of 25

## LabelEncoder
LabelEncoder converts the text labels in the data into numbers.
For example, if our data has a column named Gender that takes the values Male, Female, and Other, the LabelEncoder converts these values to 0, 1, and 2, because a model can only operate on numeric data.
Here, Female-0, Male-1, Other-2 (classes are numbered from 0 in sorted order).


In [None]:
# Learn the label -> integer mapping from the training intent labels only.
label_transformer = preprocessing.LabelEncoder()
label_transformer.fit(labels)

In [None]:
# from sklearn.externals import joblib
# joblib.dump(label_transformer, 'atis-airlinetravelinformationsystem/label_encoder.pk1')


In [None]:
# Replace the string intents with the integer codes of the fitted encoder.
# NOTE(review): transform presumably raises if the test split contains an
# intent that was not present in the training labels — confirm.
labels = label_transformer.transform(labels)
test_labels = label_transformer.transform(test_labels)

In [None]:
# One-hot encode the integer intent codes.
# The width is taken from the fitted encoder rather than inferred separately
# from each split: inferring it would make the test matrix narrower than the
# training matrix whenever the highest class id is absent from the test split.
num_classes = len(label_transformer.classes_)
labels = to_categorical(np.asarray(labels), num_classes=num_classes)
test_labels = to_categorical(np.asarray(test_labels), num_classes=num_classes)

## Train Validation Split

In [None]:
# Hold out 20% of the training data for validation; the fixed seed keeps the
# split reproducible between runs.
X_train, X_val, y_train, y_val = train_test_split(
    train_input,
    labels,
    test_size=0.2,
    random_state=1,
)

In [None]:
# Inspect the padded training sequences.
X_train

# Word Embeddings 

The embedded index stores the vectors from the pretrained GloVe model so that they can be reused as word embeddings in our own model.
The embedding matrix starts as a matrix of zeros and is then filled in, row by row,
for every word in our dataset that also appears in the GloVe vocabulary.

In [None]:
# Length of each pretrained vector; must match the GloVe file read below.
embedded_dim = 300
# word -> float32 coefficient vector, filled from the pretrained GloVe file.
embedded_index = {}

with open('../input/glove42b300dtxt/glove.42B.300d.txt', mode='r', encoding='utf-8') as glove:
    for line in glove:
        # Each line is whitespace-separated: the word first, then its coefficients.
        fields = line.split()
        embedded_index[fields[0]] = np.asarray(fields[1:], dtype='float32')

In [None]:
# Actually call close(): the bare attribute reference `glove.close` never
# invoked the method. The with-statement that opened the file has already
# closed it, so this call is a harmless no-op kept for explicitness.
glove.close()

In [None]:
# Row r holds the GloVe vector of the word whose tokenizer index is r.
# Words missing from GloVe keep their all-zero row.
embedded_matrix = np.zeros((max_vocab_size, embedded_dim))
for token, row in word_index.items():
    if token in embedded_index:
        embedded_matrix[row] = embedded_index[token]

# CNN for NLP task

Just as pixels and their order are essential for image tasks, words and their sequence are important for NLP solutions, which is valuable to keep in mind while training a CNN on text.

In [None]:
model = Sequential()
# Frozen embedding layer initialised from the pretrained GloVe matrix.
# output_dim reuses `embedded_dim` instead of repeating the literal 300, so the
# layer cannot drift out of sync with the shape of `embedded_matrix`.
model.add(
    Embedding(
        input_dim=max_vocab_size,
        output_dim=embedded_dim,
        input_length=input_length,
        weights=[embedded_matrix],
        trainable=False,
    )
)

In [None]:
# Convolutional feature extractor over the embedded token sequence.
model.add(Conv1D(filters=32, kernel_size=8, activation='selu'))
model.add(MaxPooling1D(pool_size=2))
model.add(Flatten())
model.add(Dense(10, activation='selu'))
# Output layer: one unit per intent class. `labels` is the one-hot training
# label matrix, so its second dimension is the class count (the original
# hard-coded 8). Softmax replaces sigmoid so the outputs form a probability
# distribution over mutually exclusive intents, which is what the categorical
# cross-entropy loss used at compile time expects.
model.add(Dense(labels.shape[1], activation='softmax'))

In [None]:
# Print the layer-by-layer summary of the assembled model.
model.summary()

In [None]:
# Configure training: Adam optimiser, categorical cross-entropy on the one-hot
# labels, accuracy reported as the metric.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)
# Five passes over the training split; verbose=2 logs one line per epoch.
model.fit(x=X_train, y=y_train, epochs=5, verbose=2)

In [None]:
# Score the trained model on the held-out validation split.
model.evaluate(X_val, y_val)

In [None]:
def acc(y_true, y_pred):
    """Return the fraction of rows whose predicted class matches the true class.

    Both arguments are reduced along their last axis with ``argmax`` (so
    one-hot rows and per-class score rows are both accepted), and the mean of
    the element-wise matches is returned.
    """
    true_classes = np.argmax(y_true, axis=-1)
    predicted_classes = np.argmax(y_pred, axis=-1)
    matches = np.equal(true_classes, predicted_classes)
    return matches.mean()

In [None]:
# Per-class scores for every padded test utterance.
predictions = model.predict(test_input)

In [None]:
# Report test-set accuracy: argmax of the one-hot labels vs. argmax of the scores.
print(acc(test_labels, predictions))

# # Thanks for reading it to the end. Credits to the OpenSourceCommunity