In [1]:
#importing libraries
import numpy as np
import pandas as pd
import re
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.models import Sequential, load_model
from keras.layers import Dense, LSTM, Bidirectional, Embedding, Dropout
from keras.callbacks import ModelCheckpoint


Using TensorFlow backend.


In [0]:
#each no  in intent column corresponds to a specific topic label
#there are twenty topics from 0 to 19
#encoding in latin1 where each character is exactly one byte long
df = pd.read_csv("tweets[labeled].csv", encoding = "latin1", names = ["Tweets", "Labelled Intent"])
intent = df["Labelled Intent"]
#it has set of all unique intents which are from 0 to 19
unique_intent = list(set(intent))
tweets = list(df["Tweets"])

In [0]:
#print first 5 rows
print(df.head())

                                              Tweets  Labelled Intent
0  check bank linking status adhar bank adharcard...                6
1  please come village share adharcard get detail...               18
2  modi gonna give adharcard complaining wrong pe...               15
3  adharcard basically model incindia completely ...                8
4  havent updated adharcard book online appointme...               10


In [0]:
#As tweets data is already cleaned and processed
#Converting tweets data into list of tweets where each tweet has list of words
def preprocessing(data):
  set_of_tweets=[]
  for line in data:
    set_of_words=[]
    for word in line.split(' '):
      set_of_words.append(word)
    set_of_tweets.append(set_of_words)
  return set_of_tweets

In [0]:
preprocessed_tweets=preprocessing(tweets)
print(preprocessed_tweets)



In [0]:
#tokenizing
def tokenize(data, filters = ','):
  token = Tokenizer(filters = filters)
  token.fit_on_texts(data)
  return token

In [0]:
tokens = tokenize(preprocessed_tweets)
print(tokens.word_index)



In [0]:
dictionary_size = len(tokens.word_index) + 1
#getting maxmimum length of tweets which is 15
#as we have selected each tweet whose length ranges from 5 to 15 for better results
max_length = (len(max(preprocessed_tweets, key = len)))
print("dictionary size = %d and Maximum length = %d" % (dictionary_size, max_length))

dictionary size = 13263 and Maximum length = 15


In [0]:
#convert words into indexes
vector_data=(tokens.texts_to_sequences(preprocessed_tweets))

In [0]:
vector_data[:5]

[[71, 12, 59, 34, 44, 12, 3, 5],
 [15, 156, 1772, 313, 3, 10, 40, 7, 4, 4917],
 [54, 1033, 64, 3, 2542, 243, 101],
 [3, 1773, 2543, 366, 1774, 148, 195],
 [1238, 102, 3, 430, 9, 556, 4918, 5]]

In [0]:
#making each tweet of same size i.e of 15 by padding
#padding="post" will add 0 at the last index till the size become 15 of each tweet
padded_vector_data=(pad_sequences(vector_data, maxlen = max_length, padding = "post"))

In [0]:
#clearly from the o/p first tweet has 8 words so 7 zero's are padded
print(padded_vector_data[0:2])
#shape of padded vector data
print(padded_vector_data.shape)

[[  71   12   59   34   44   12    3    5    0    0    0    0    0    0
     0]
 [  15  156 1772  313    3   10   40    7    4 4917    0    0    0    0
     0]]
(8390, 15)


In [0]:
print(unique_intent)
print(intent[0:5])

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19]
0     6
1    18
2    15
3     8
4    10
Name: Labelled Intent, dtype: int64


In [0]:
vector_output=np.array(intent).reshape(len(intent), 1)
#8390 rows with 1 column which has value as intent
print(vector_output.shape)

(8390, 1)


In [0]:
output_level_encoding = OneHotEncoder(sparse = False).fit_transform(vector_output)

In [0]:
#shape of output vector after encoding with OneHotEncoder
print(output_level_encoding.shape)
#visualize, clearly first and second tweet has topic level as 6 and 18 respectively. 
#At the respective position value is 1 and for rest nineteen values 0.
print(output_level_encoding[0:5])

(8390, 20)
[[0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]


In [0]:
#data is ready to fit in our model

In [0]:
#spliting dataset into 75% training and 25% validation set
train_X, validation_X, train_Y, validation_Y = train_test_split(padded_vector_data, output_level_encoding, shuffle = True, test_size = 0.25)

In [0]:
#getting dimension of training and validation set
print(train_X.shape, train_Y.shape)
print(validation_X.shape, validation_Y.shape)

(6292, 15) (6292, 20)
(2098, 15) (2098, 20)


In [0]:
#model creation of intent classifier

In [0]:
#defining model
#creating model layers by layers using sequential()
#each layer has exactly one input tensor and one output tensor
def model(dictionary_size,length):
  m=Sequential()
  m.add(Embedding(dictionary_size, 128, input_length=length, trainable=False))
  m.add(Bidirectional(LSTM(128)))
  m.add(Dense(32, activation = "relu"))
  m.add(Dropout(0.5))
  m.add(Dense(20, activation = "softmax"))
  return m

In [0]:
model=model(dictionary_size,max_length)
model.compile(loss = "categorical_crossentropy", optimizer = "adam", metrics = ["accuracy"])
#summarization of created model
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 15, 128)           1697664   
_________________________________________________________________
bidirectional_1 (Bidirection (None, 256)               263168    
_________________________________________________________________
dense_1 (Dense)              (None, 32)                8224      
_________________________________________________________________
dropout_1 (Dropout)          (None, 32)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 20)                660       
Total params: 1,969,716
Trainable params: 272,052
Non-trainable params: 1,697,664
_________________________________________________________________


In [0]:
checkpoint=ModelCheckpoint('model.h',
                             monitor='val_loss', 
                             verbose=1, 
                             save_best_only=True, 
                             mode='min')
hist=model.fit(train_X,
               train_Y,
               epochs=50,
               batch_size=16,
               validation_data = (validation_X, validation_Y),
               callbacks = [checkpoint])

Train on 6292 samples, validate on 2098 samples
Epoch 1/50

Epoch 00001: val_loss improved from inf to 2.33933, saving model to model.h
Epoch 2/50

Epoch 00002: val_loss improved from 2.33933 to 2.09537, saving model to model.h
Epoch 3/50

Epoch 00003: val_loss improved from 2.09537 to 1.95481, saving model to model.h
Epoch 4/50

Epoch 00004: val_loss improved from 1.95481 to 1.86874, saving model to model.h
Epoch 5/50

Epoch 00005: val_loss improved from 1.86874 to 1.79793, saving model to model.h
Epoch 6/50

Epoch 00006: val_loss did not improve from 1.79793
Epoch 7/50

Epoch 00007: val_loss improved from 1.79793 to 1.72948, saving model to model.h
Epoch 8/50

Epoch 00008: val_loss improved from 1.72948 to 1.70119, saving model to model.h
Epoch 9/50

Epoch 00009: val_loss did not improve from 1.70119
Epoch 10/50

Epoch 00010: val_loss did not improve from 1.70119
Epoch 11/50

Epoch 00011: val_loss improved from 1.70119 to 1.65278, saving model to model.h
Epoch 12/50

Epoch 00012: val

In [0]:
 #loading created model.h
 model=load_model("model.h")

In [0]:
def prediction(tweet, labels):
  list_of_word=[]
  for w in tweet.split(' '):
    list_of_word.append(w)
  print(list_of_word)
  vector=(tokens.texts_to_sequences(list_of_word))
  print(vector)
  #Check for unknown words
  if [] in vector:
    vector = list(filter(None, vector))
  vector=np.array(vector).reshape(1, len(vector))
  x = (pad_sequences(vector, maxlen = max_length, padding = "post"))
  print(x)
  predictions=model.predict_proba(x)[0]
  labels=np.array(labels)
  ids=np.argsort(-predictions)
  labels=labels[ids]
  predictions=-np.sort(-predictions)
  for i in range(model.predict_proba(x).shape[1]):
    print("%s has confidence = %s" % (labels[i], (predictions[i])))

In [0]:
#label(0-19) whichever has the highest confidence shows the intent or from which topic out of 20 topics the tweet belongs to
test_tweet = "check bank linking status adhar bank adharcard uidai"
prediction(test_tweet, unique_intent)

['check', 'bank', 'linking', 'status', 'adhar', 'bank', 'adharcard', 'uidai']
[[71], [12], [59], [34], [44], [12], [3], [5]]
[[71 12 59 34 44 12  3  5  0  0  0  0  0  0  0]]
6 has confidence = 0.92977935
5 has confidence = 0.045095954
15 has confidence = 0.016878601
4 has confidence = 0.002817294
2 has confidence = 0.0025134007
13 has confidence = 0.0011386273
0 has confidence = 0.00037292214
10 has confidence = 0.00037291145
9 has confidence = 0.00032905117
8 has confidence = 0.00028335676
19 has confidence = 0.00013737014
12 has confidence = 0.00012903377
3 has confidence = 5.8452002e-05
11 has confidence = 4.8939328e-05
14 has confidence = 3.457728e-05
18 has confidence = 5.296598e-06
1 has confidence = 4.1214166e-06
7 has confidence = 6.1035695e-07
16 has confidence = 1.18580864e-07
17 has confidence = 1.9468023e-12
