In [16]:
import numpy as np
import pandas as pd
import nltk
import re


from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn import preprocessing


In [6]:
# Load the labelled inquiries. Every line of nlpdata.txt holds an inquiry and
# its answer type separated by the three-character delimiter ",,,"; the file
# has no header row, so the column names are supplied here.
# A multi-character separator is only supported by pandas' python parser, so
# request it explicitly instead of relying on the automatic fallback (which
# emits a ParserWarning).
raw_dataframe = pd.read_csv(
    "nlpdata.txt",
    sep=",,,",
    header=None,
    names=['inquiry', 'ans'],
    engine='python',
)

  """Entry point for launching an IPython kernel.


In [7]:
# Remove leading and trailing whitespace from every label in the 'ans' column
# (whitespace inside a label is left untouched).
raw_dataframe['ans']=raw_dataframe['ans'].str.strip()

In [8]:
def clean_inquiry(text):
    """Normalise one inquiry string for tokenisation.

    The text is lower-cased (case carries no meaning for this task) and every
    character that is not a lowercase ASCII letter, a digit or whitespace is
    removed (deleted, not replaced by a space).

    Parameters:
        text: the raw inquiry string.

    Returns:
        The cleaned string.
    """
    # The character class is spelled a-z on purpose: the previous "A-z" range
    # also spans the punctuation between 'Z' and 'a' ([ \ ] ^ _ `), so those
    # characters were never stripped.
    return re.sub(r'[^a-z0-9\s]', '', text.lower())


raw_dataframe['inquiry'] = raw_dataframe['inquiry'].apply(clean_inquiry)

In [9]:
percentage_validation=0.20 # Fraction of all samples held out as the validation set (20%)

## Main Recurrent Neural Network Architecture

In [13]:
max_words_capacity = 20000 # Passed to the Tokenizer as num_words (cap on the vocabulary it encodes)
max_sentence_length = 30 # Length, in tokens (not characters), that every encoded sentence is padded to

In [14]:
# Work on a copy so that changes made to `data` do not touch raw_dataframe.
data=raw_dataframe.copy()

In [17]:
# Class balance check: show how many samples each answer type has.
label_counts = data['ans'].value_counts()
print(label_counts)

# Build the vocabulary from the inquiries (tokens are separated by spaces),
# encode every inquiry as a list of integer word ids, then pad the lists so
# that they all have length max_sentence_length.
inquiry_texts = data['inquiry'].values
tokenizer = Tokenizer(num_words=max_words_capacity, split=' ')
tokenizer.fit_on_texts(inquiry_texts)
X = pad_sequences(
    tokenizer.texts_to_sequences(inquiry_texts),
    maxlen=max_sentence_length,
)

what           609
who            402
unknown        272
affirmation    104
when            96
Name: ans, dtype: int64


In [19]:
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

# Map the string answer types to integer ids, then one-hot encode them for
# the softmax / categorical cross-entropy output.
le = preprocessing.LabelEncoder()
Y = le.fit_transform(data['ans'])
labels = to_categorical(np.asarray(Y))
print('Shape of data tensor:', X.shape)
print('Shape of label tensor:', labels.shape)


# Split the data into a training set and a validation set.
# The rows are shuffled with a fixed seed so the split is reproducible; X and
# labels are indexed with the same permutation so they stay aligned.
indices = np.arange(X.shape[0])
np.random.seed(0)
np.random.shuffle(indices)
X = X[indices]
labels = labels[indices]
total_val_samples = int(percentage_validation * X.shape[0])

# Slice with a positive boundary: the former X[:-total_val_samples] returns an
# EMPTY training set (and the whole data as validation) when
# total_val_samples is 0, because -0 == 0.
split_at = X.shape[0] - total_val_samples
x_train = X[:split_at]
y_train = labels[:split_at]
x_val = X[split_at:]
y_val = labels[split_at:]

Found 3685 unique tokens.
Shape of data tensor: (1483, 30)
Shape of label tensor: (1483, 5)


In [23]:
# Read the pre-trained GloVe vectors into a dict: word -> float32 vector.
# Each line of the file is the word followed by its whitespace-separated
# coefficients.
embeddings_index = {}
# The context manager closes the file even if a line fails to parse.
with open('glove.6B.100d.txt', encoding="utf8") as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))

Found 400000 word vectors.


In [24]:
# Number of columns of the embedding matrix, i.e. the length of one word vector.
# NOTE(review): must match the vectors in the GloVe file being loaded
# ('glove.6B.100d.txt', presumably 100-dimensional — confirm if the file changes).
word_dim=100

In [25]:
# One row per tokenizer word id, plus one extra row for index 0 (no key in
# word_index maps to it, so it stays zero).
matrix_shape = (len(word_index) + 1, word_dim)
embedding_matrix = np.zeros(matrix_shape)
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is None:
        # Words missing from the embedding index keep their all-zeros row.
        continue
    embedding_matrix[i] = embedding_vector

In [26]:

# Embedding layer initialised with the pre-built embedding_matrix and frozen
# (trainable=False), so these weights are not updated during training.
embedding_layer = Embedding(
    input_dim=len(word_index) + 1,
    output_dim=word_dim,
    weights=[embedding_matrix],
    input_length=max_sentence_length,
    trainable=False,
)

In [35]:
# A single-layer, unidirectional LSTM worked well for this task:
#   frozen embedding -> LSTM (196 units) -> softmax over the answer types,
# trained with categorical cross-entropy and the Adam optimiser.
# Dropout of 0.2 is applied to both the LSTM inputs and its recurrent state.
lstm_out = 196

model = Sequential()
model.add(embedding_layer)
# Keras 2 argument names: `dropout` (formerly dropout_W) acts on the inputs,
# `recurrent_dropout` (formerly dropout_U) acts on the recurrent connections.
model.add(LSTM(lstm_out, dropout=0.2, recurrent_dropout=0.2))
# One output unit per class, taken from the one-hot label matrix instead of a
# hard-coded 5 so the model follows the data.
model.add(Dense(labels.shape[1], activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself and returns None, so it is not wrapped in
# print() (which would add a stray "None" line to the output).
model.summary()

  import sys


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 30, 100)           368600    
_________________________________________________________________
lstm_2 (LSTM)                (None, 196)               232848    
_________________________________________________________________
dense_2 (Dense)              (None, 5)                 985       
Total params: 602,433
Trainable params: 233,833
Non-trainable params: 368,600
_________________________________________________________________
None


In [36]:
# Train for 25 epochs in mini-batches of 50, evaluating on the held-out
# validation split after every epoch.
model.fit(
    x=x_train,
    y=y_train,
    batch_size=50,
    epochs=25,
    validation_data=(x_val, y_val),
)

Train on 1187 samples, validate on 296 samples
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


<keras.callbacks.History at 0x7fa3043181d0>

In [None]:
# Write the model's weights to 'archana.h5' so they can be restored later
# with model.load_weights.
model.save_weights('archana.h5')

In [28]:
# Restore the weights stored in 'archana.h5' into `model`.
# NOTE(review): `model` must already be defined; presumably it has to have the
# same architecture the weights were saved from — confirm.
model.load_weights('archana.h5')

In [29]:
# Encode one sample question with the fitted tokenizer and pad it to the
# same length as the training sequences.
example_text = ["What time does the train leave"]
example = pad_sequences(
    tokenizer.texts_to_sequences(example_text),
    maxlen=max_sentence_length,
)

In [30]:
# model.predict returns one row of class probabilities per input sequence.
# Take the arg-max along axis 1 so each row yields its own class id (without
# an axis, np.argmax flattens the array and returns a single scalar), pass
# the resulting 1-D array to the label encoder, which expects array-like
# input, and show the label of the single example.
predicted_class_ids = np.argmax(model.predict(example), axis=1)
le.inverse_transform(predicted_class_ids)[0]

  if diff:


'when'