# Degree of Urgency based on ticket's comment with LSTM Network in Keras

In [1]:
# Import the necessary libraries, modules
import pandas as pd # Pandas library for reading '.csv' files as dataframes
import numpy as np  # Numpy library for creating and modifying arrays.
from keras.layers import Dense, SimpleRNN, GRU, LSTM, Embedding # Import layers from Keras
from keras.models import Sequential

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


### Reading the data

In [2]:
# Load the training and validation tickets; both CSV files are decoded as latin-1.
raw_data, raw_test_data = (
    pd.read_csv(csv_path, encoding='latin-1')
    for csv_path in ('train.csv', 'Validation.csv')
)

# Report the training frame's dimensions and column names, followed by a blank gap.
print(raw_data.shape)
print(raw_data.columns)
print('\n')

# Last expression of the cell: rendered as the first five training rows.
raw_data.head(5)

(43694, 9)
Index(['title', 'body', 'ticket_type', 'category', 'sub_category1',
       'sub_category2', 'business_service', 'urgency', 'impact'],
      dtype='object')




Unnamed: 0,title,body,ticket_type,category,sub_category1,sub_category2,business_service,urgency,impact
0,car allowance record,october pm allowance record hello think july s...,1,4,3,0,40,3,4
1,project resources decommission write,october pm resources decommission hello please...,1,4,2,87,4,3,4
2,access to the internal,thursday hello writing ask question regarding ...,1,6,22,7,41,3,4
3,new project code fusion,code hi please create code commercial kicking ...,1,4,3,7,70,3,4
4,password reset for,re available has assigned hi guys did till rec...,1,4,2,88,4,3,4


### Check the "urgency" classes and their frequencies

In [3]:
# Count how many training tickets fall into each urgency class.
# Series.value_counts() is used because the top-level pd.value_counts()
# helper is deprecated in recent pandas releases.
raw_data['urgency'].value_counts()

3    31159
1     6073
2     4975
0     1487
Name: urgency, dtype: int64

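The target takes four distinct urgency values (0–3), so this is a 4-class classification problem. Note that the classes are imbalanced: urgency 3 accounts for about 71% of the training tickets (31,159 of 43,694).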

### Converting unstructured text to structured numeric form
This includes:
1. Tokenizing
2. Converting sequence of words to sequence of word indices
3. Converting varying-length sequences to fixed-length sequences through padding

In [4]:
# Hyper-parameters shared by the preprocessing and model cells:
#   max_num_words  -> Tokenizer(num_words=...) and Embedding(input_dim=...)
#   seq_len        -> Embedding(input_length=...)
#   embedding_size -> Embedding(output_dim=...)
max_num_words, seq_len, embedding_size = 10000, 50, 100

In [5]:
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer

# Create a tokenizer limited by max_num_words and fit it on the
# training ticket bodies so it learns the word -> index mapping.
tokenizer = Tokenizer(num_words=max_num_words)
tokenizer.fit_on_texts(raw_data['body'])

In [6]:
# Encode every training body as a list of word indices using the fitted tokenizer.
x_train = tokenizer.texts_to_sequences(raw_data['body'])

In [29]:
# NOTE(review): this cell was executed out of order (In [29]). The output shown
# is the zero-padded 2-D array produced by pad_sequences in the preprocessing
# cell further down, not the raw list returned by texts_to_sequences above.
x_train

array([[   0,    0,    0, ...,  444,  450,    9],
       [   0,    0,    0, ...,    4,   39,   25],
       [   0,    0,    0, ...,   12,    4,  107],
       ...,
       [   0,    0,    0, ..., 1340,    9,  638],
       [   0,    0,    0, ...,   86,   21,    9],
       [   0,    0,    0, ...,  656,  896,   56]])

In [8]:
# Show the complete text of wide columns (e.g. `body`) instead of truncating it.
# None is the supported "no limit" value; the legacy -1 sentinel is deprecated
# and is not accepted by newer pandas versions.
pd.set_option('display.max_colwidth', None)

In [27]:
# Re-display the first five training rows now that the column-width limit has
# been lifted, so the full `body` text of each ticket is visible.
raw_data.head(5)

Unnamed: 0,title,body,ticket_type,category,sub_category1,sub_category2,business_service,urgency,impact
0,car allowance record,october pm allowance record hello think july seems incorrect allowance record amount effective st july inserted about employees sheet attached applies employees possible records corrected by script incorrect each employees record thanks,1,4,3,0,40,3,4
1,project resources decommission write,october pm resources decommission hello please log several calls resources decommission please log call every resource needed closed questions please let thank best regards senior engineer,1,4,2,87,4,3,4
2,access to the internal,thursday hello writing ask question regarding right zone awards application station please provide urgent because preparing demo lot application functionalities based kind regards developer,1,6,22,7,41,3,4
3,new project code fusion,code hi please create code commercial kicking off client code requested vice president,1,4,3,7,70,3,4
4,password reset for,re available has assigned hi guys did till receive also work please status hello since then forward order per procedure please continue follow instructions dear please follow procedure unlock help her ahead best regards senior engineer tuesday pm available has assigned hi did remitted by yourself works nowhere also,1,4,2,88,4,3,4


In [30]:
# Inspect the word -> integer index mapping learned by tokenizer.fit_on_texts.
# NOTE(review): this displays the whole dictionary, so the output is very long.
tokenizer.word_index

{'please': 1,
 'pm': 2,
 'hi': 3,
 'regards': 4,
 'thank': 5,
 'hello': 6,
 'you': 7,
 're': 8,
 'thanks': 9,
 'for': 10,
 'sent': 11,
 'kind': 12,
 'help': 13,
 'tuesday': 14,
 'wednesday': 15,
 'dear': 16,
 'thursday': 17,
 'friday': 18,
 'best': 19,
 'have': 20,
 'by': 21,
 'with': 22,
 'can': 23,
 'july': 24,
 'engineer': 25,
 'error': 26,
 'has': 27,
 'ext': 28,
 'issue': 29,
 'log': 30,
 'be': 31,
 'let': 32,
 'attached': 33,
 'date': 34,
 'change': 35,
 'october': 36,
 'information': 37,
 'we': 38,
 'senior': 39,
 'also': 40,
 'november': 41,
 'add': 42,
 'form': 43,
 'details': 44,
 'name': 45,
 'order': 46,
 'your': 47,
 'analyst': 48,
 'access': 49,
 'leaver': 50,
 'update': 51,
 'december': 52,
 'number': 53,
 'code': 54,
 'could': 55,
 'officer': 56,
 'if': 57,
 'site': 58,
 'provide': 59,
 'leave': 60,
 'work': 61,
 'march': 62,
 'client': 63,
 'create': 64,
 'high': 65,
 'report': 66,
 'issues': 67,
 'si': 68,
 'or': 69,
 'did': 70,
 'but': 71,
 'days': 72,
 'february': 7

### Preprocessing the text data - Input

In [11]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# Fit the tokenizer on the training bodies only; the validation text is then
# encoded with the same word index.
tokenizer = Tokenizer(num_words=max_num_words)
tokenizer.fit_on_texts(raw_data.body)

# texts_to_sequences converts each body to a list of word indices.
# pad_sequences then makes every row exactly seq_len long: shorter sequences
# are padded with 0s at the front and longer ones are cut to seq_len.
# seq_len (rather than a literal 50) keeps the input width in step with the
# Embedding layers, which are built with input_length=seq_len.
x_train = tokenizer.texts_to_sequences(raw_data.body)
x_train = pad_sequences(x_train, maxlen=seq_len)
x_test = tokenizer.texts_to_sequences(raw_test_data.body)
x_test = pad_sequences(x_test, maxlen=seq_len)

x_train.shape, x_test.shape  # Check the dimensions of x_train and x_test

((43694, 50), (4855, 50))

### Target vectors for the network - Output

In [12]:
# Distinct urgency values found in the training data. A label's position in
# this list is what the next cell uses as its class index.
unique_labels = [label for label in raw_data.urgency.unique()]
print(unique_labels)

[3, 2, 1, 0]


In [13]:
from keras.utils import to_categorical # Converts integer class indices to one-hot vectors (dummies)

# Map every urgency value to its class index, i.e. its position in unique_labels.
y_train = np.array([unique_labels.index(i) for i in raw_data.urgency])
y_test = np.array([unique_labels.index(i) for i in raw_test_data.urgency])

# One-hot encode the indices. num_classes is pinned to the number of training
# labels so y_train and y_test always have the same width; without it
# to_categorical infers the width from the largest index present, and a
# validation file missing the last class would yield a narrower matrix.
y_train = to_categorical(y_train, num_classes=len(unique_labels))
y_test = to_categorical(y_test, num_classes=len(unique_labels))

In [14]:
# Display the one-hot encoded training targets produced by to_categorical
# (one row per ticket, one column per class index).
y_train

array([[1., 0., 0., 0.],
       [1., 0., 0., 0.],
       [1., 0., 0., 0.],
       ...,
       [1., 0., 0., 0.],
       [1., 0., 0., 0.],
       [1., 0., 0., 0.]], dtype=float32)

### Building and training Simple RNN model

In [101]:
# Simple RNN classifier: embedding -> two stacked SimpleRNN layers -> softmax output.
model = Sequential([
    # Represent each token index as an embedding_size-dimensional vector.
    Embedding(input_dim=max_num_words, input_length=seq_len, output_dim=embedding_size),
    # return_sequences=True passes the whole sequence of states on to the next recurrent layer.
    SimpleRNN(10, return_sequences=True),
    # Last recurrent layer: return_sequences=False, so a single vector per ticket reaches the output layer.
    SimpleRNN(5, return_sequences=False),
    # Output layer: 4 softmax nodes, one per urgency class.
    Dense(4, activation='softmax'),
])

In [102]:
# Print each layer's output shape and parameter count for the Simple RNN model.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_9 (Embedding)      (None, 50, 100)           1000000   
_________________________________________________________________
simple_rnn_3 (SimpleRNN)     (None, 50, 10)            1110      
_________________________________________________________________
simple_rnn_4 (SimpleRNN)     (None, 5)                 80        
_________________________________________________________________
dense_8 (Dense)              (None, 4)                 24        
Total params: 1,001,214
Trainable params: 1,001,214
Non-trainable params: 0
_________________________________________________________________


In [22]:
# Build the optimizer inside this cell. The notebook's `adam` variable is only
# created in a later cell, so referencing it here raises NameError on a fresh
# kernel (Restart & Run All). The learning rate matches the one used there.
from keras.optimizers import Adam

# Mention the optimizer, loss function and metrics to be computed
model.compile(optimizer=Adam(lr=0.001),        # 'Adam' is a variant of gradient descent technique
              loss='categorical_crossentropy', # categorical_crossentropy for multi-class classification
              metrics=['accuracy'])            # These metrics are computed for evaluating and stored in history

# Train for 5 epochs, holding out the last 25% of the training rows for validation.
model.fit(x_train, y_train, epochs=5, validation_split=0.25)

Train on 32770 samples, validate on 10924 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x1c3f8b02f60>

### Prediction and evaluation on test data

In [23]:
# Predicted class probabilities for every validation ticket (Simple RNN model),
# followed by the shape of the resulting array.
test_prob = model.predict(x_test)
np.shape(test_prob)

(4855, 4)

In [24]:
# Peek at the predicted probabilities for the first five validation tickets
# (one column per class index).
test_prob[:5]

array([[9.9999547e-01, 4.2865267e-06, 1.4892234e-09, 2.2147822e-07],
       [1.0574572e-05, 9.7472048e-01, 2.5190953e-02, 7.7963123e-05],
       [9.9999642e-01, 3.3185306e-06, 1.0920785e-09, 1.9775572e-07],
       [9.9999630e-01, 3.4033394e-06, 1.0949212e-09, 1.9882926e-07],
       [3.1616689e-06, 5.8413321e-01, 4.1544658e-01, 4.1705585e-04]],
      dtype=float32)

In [25]:
# Predicted class index per validation ticket: the position of the largest
# softmax probability. This replaces Sequential.predict_classes, which is
# deprecated and no longer available in newer Keras/TensorFlow releases; for a
# multi-class softmax output it computed exactly this argmax.
test_classes = np.argmax(model.predict(x_test), axis=-1)
test_classes[:10]

array([0, 1, 0, 0, 1, 0, 2, 0, 0, 2], dtype=int64)

### Building and training LSTM model

In [15]:
# Building an LSTM model
model = Sequential() # Call Sequential to initialize a network
model.add(Embedding(input_dim = max_num_words, 
                    input_length = seq_len, 
                    output_dim = embedding_size)) # Add an embedding layer which represents each unique token as a vector.
model.add(LSTM(10, return_sequences=True)) #Adding a LSTM hidden layer.
model.add(LSTM(5, return_sequences=False)) #Adding another LSTM hidden layer. FALSE because nest layer is output layer
model.add(Dense(4, activation='softmax')) #Adding an ouput layer. Since classification, 4 nodes for 4 classes.

Instructions for updating:
Colocations handled automatically by placer.


In [16]:
# Print each layer's output shape and parameter count for the LSTM model.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 50, 100)           1000000   
_________________________________________________________________
lstm_1 (LSTM)                (None, 50, 10)            4440      
_________________________________________________________________
lstm_2 (LSTM)                (None, 5)                 320       
_________________________________________________________________
dense_1 (Dense)              (None, 4)                 24        
Total params: 1,004,784
Trainable params: 1,004,784
Non-trainable params: 0
_________________________________________________________________


In [17]:
# Adam optimizer with its `lr` argument set explicitly to 0.001; it is passed
# to model.compile for the LSTM model in the next cell.
from keras.optimizers import Adam
adam = Adam(lr=0.001)

In [18]:
# Mention the optimizer, Loss function and metrics to be computed
model.compile(optimizer=adam,                  # 'Adam' is a variant of gradient descent technique
              loss='categorical_crossentropy', # categorical_crossentropy for multi-class classification
              metrics=['accuracy'])            # These metrics are computed for evaluating and stored in history

model.fit(x_train, y_train, epochs=40, validation_split=0.25)

Instructions for updating:
Use tf.cast instead.
Train on 32770 samples, validate on 10924 samples
Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40


<keras.callbacks.History at 0x1c3f1a74588>

### Prediction and evaluation on test data
1. Predict the class labels on test data
2. Evaluate the model on test data

Hint: Check model.predict, model.predict_classes, model.evaluate in keras

In [19]:
# Predicted class probabilities for every validation ticket (LSTM model),
# followed by the shape of the resulting array.
test_prob = model.predict(x_test)
np.shape(test_prob)

(4855, 4)

Predict the class labels on test data

In [20]:
# Peek at the LSTM's predicted probabilities for the first five validation
# tickets (one column per class index).
test_prob[:5]

array([[9.9999654e-01, 3.2157525e-06, 1.5644884e-09, 2.4079398e-07],
       [7.2934668e-06, 9.8166478e-01, 1.8230893e-02, 9.7125368e-05],
       [9.9999666e-01, 3.0581512e-06, 1.4685635e-09, 2.3275894e-07],
       [9.9999678e-01, 2.9832042e-06, 1.4320785e-09, 2.2635416e-07],
       [1.9292272e-06, 3.5250369e-01, 6.4709890e-01, 3.9546384e-04]],
      dtype=float32)

In [21]:
# Predicted class index per validation ticket: the position of the largest
# softmax probability. This replaces Sequential.predict_classes, which is
# deprecated and no longer available in newer Keras/TensorFlow releases; for a
# multi-class softmax output it computed exactly this argmax.
test_classes = np.argmax(model.predict(x_test), axis=-1)
test_classes[:5]

array([0, 1, 0, 0, 2], dtype=int64)