### Text Classification - Call summary

Given the text of telephone conversations and the category (purpose) of each call, the task is to classify incoming calls into one of those categories based on the conversation text.

Multi-class classification problem

#### Importing Necessary Libraries

In [2]:
import os

import pandas as pd
import numpy as np

from sklearn import model_selection, preprocessing
from sklearn.feature_extraction.text import TfidfVectorizer

from keras.models import Sequential,Model
from keras.layers import Dense,Flatten,Embedding,Input,Conv1D,MaxPooling1D,LSTM,SpatialDropout1D,Dropout
from keras.utils import to_categorical
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.models import load_model


# Seed NumPy's global random generator.
# NOTE(review): no seed is set for the TensorFlow/Keras backend anywhere in this
# notebook - confirm whether training runs are expected to be reproducible.
np.random.seed(1512)

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [3]:
# Remember the working directory and list what it contains
PATH = os.getcwd()
os.listdir(PATH)

['.ipynb_checkpoints',
 'backup_submissions',
 'cnn_embed_model.csv',
 'Cute_Embeddings_CNN.ipynb',
 'Cute_Embeddings_Final_Code.ipynb',
 'Cute_Embeddings_MLP.ipynb',
 'Cute_Embeddings_RNN.ipynb',
 'Cute_mlp_tfidf.ipynb',
 'description.pdf',
 'glove.6B',
 'glove.6B.zip',
 'mlp_embed_model.csv',
 'mlp_embed_model_1.csv',
 'mlp_embed_model_2.csv',
 'References',
 'rnn_do_model_1.csv',
 'rnn_do_model_2.csv',
 'rnn_model_1.csv',
 'rnn_model_2.csv',
 'samplesubmission.csv',
 'samplesubmissionbestmlp.csv',
 'samplesubmissiontest.csv',
 'SampleSubmission_mlp.csv',
 'test.csv',
 'train.csv']

#### Reading the data

In [4]:
# Load the labelled training calls, indexed by ID; a lone space counts as missing
inputData = pd.read_csv(
    "train.csv",
    index_col="ID",
    na_values=" ",
)
inputData.columns  # not rendered: only a cell's last expression is displayed

# Load the test calls, keeping ID as an ordinary column
testData = pd.read_csv(
    "test.csv",
    na_values=" ",
)
testData.columns

Index(['ID', 'converse'], dtype='object')

In [5]:
# Replace missing values (blank calls) with ONE shared placeholder so the
# training and test sets are pre-processed identically. The original used
# "Blank calls" for train and "Blank Calls" for test.
BLANK_CALL_TEXT = "Blank calls"
inputData = inputData.replace(np.nan, BLANK_CALL_TEXT)
testData = testData.replace(np.nan, BLANK_CALL_TEXT)

In [6]:
# Class distribution of the target column (number of calls per category)
inputData['categories'].value_counts()

PRESCRIPTION     12077
APPOINTMENTS     11098
MISCELLANEOUS     9736
ASK_A_DOCTOR      9440
LAB               3457
JUNK                17
Name: categories, dtype: int64

#### Getting list of categories

In [7]:
# Sorted array of the distinct category names. convertClassToName indexes into
# this array with the arg-max of the model output, so position k must be the
# name of class k.
categories = np.sort(inputData['categories'].unique())

In [8]:
# Build the word -> integer-id vocabulary from every conversation in inputData
tokenizer = Tokenizer()
tokenizer.fit_on_texts(inputData['converse'])
word_Index = tokenizer.word_index

# One extra slot on top of the vocabulary; presumably for id 0, which the
# padded sequences use as filler - confirm against the Keras Tokenizer docs.
vocab_size = 1 + len(word_Index)

print("unique words ", vocab_size)

unique words  34770


### Train/Validation Split (75:25)

In [9]:
# Stratified hold-out split on the category label. No test_size is passed, so
# scikit-learn's default applies (the shapes printed further down show 25% of
# the rows going to validation).
X_train, X_validate, y_train, y_validate = model_selection.train_test_split(
    inputData['converse'],
    inputData['categories'],
    stratify=inputData['categories'],
    random_state=1512,
)

In [10]:
# Conversation text of the test calls (the input to the final predictions)
X_test = testData['converse']

In [11]:
# Every conversation becomes a fixed-length vector of word ids
MAX_SEQUENCE_LENGTH = 150

# Same two steps for each split: text -> list of word ids -> padded 2-D array
train_seq, valid_seq, test_seq = (
    pad_sequences(tokenizer.texts_to_sequences(texts), maxlen=MAX_SEQUENCE_LENGTH)
    for texts in (X_train, X_validate, X_test)
)

print(train_seq.shape)
print(valid_seq.shape)

(34368, 150)
(11457, 150)


#### Embeddings - Glove 6B 100d 

In [12]:
# embeddings - glove 6B 100d
# Maps each word in the GloVe file to its vector (float32 array).
Embeddings_Index = {}
# The context manager closes the file handle as soon as loading finishes
# (the original bare open() left it open).
with open('glove.6B/glove.6B.100d.txt', encoding='utf-8') as glove_file:
    for line in glove_file:
        values = line.split()
        if not values:
            # Skip blank lines instead of failing on values[0]
            continue
        # First token is the word, the remaining tokens are its coefficients
        Embeddings_Index[values[0]] = np.asarray(values[1:], dtype='float32')



In [13]:
# Row k holds the GloVe vector of the word whose tokenizer id is k.
# Words that GloVe does not contain keep their all-zero row.
embeddings_matrix = np.zeros((vocab_size, 100))
for word, idx in word_Index.items():
    if word in Embeddings_Index:
        embeddings_matrix[idx] = Embeddings_Index[word]
        

In [14]:
# Sanity check: expect (vocab_size, 100)
embeddings_matrix.shape


(34770, 100)

In [15]:
# Frozen embedding layer initialised from the GloVe matrix. This single layer
# object is reused by the MLP, the CNN and the LSTM-with-dropout models below.
embeddingLayer = Embedding(
    input_dim=vocab_size,
    output_dim=100,
    weights=[embeddings_matrix],
    input_length=MAX_SEQUENCE_LENGTH,
    trainable=False,  # keep the pre-trained vectors fixed during training
)


In [24]:
# Function to convert probabilities to categories
def convertClassToName(test_preds, class_names=None):
    """Map each row of class scores to the name of its highest-scoring class.

    Parameters
    ----------
    test_preds : iterable of 1-D score vectors
        One vector per sample, e.g. the 2-D array returned by ``model.predict``.
    class_names : sequence, optional
        ``class_names[k]`` is the name of class index ``k``. When omitted, the
        notebook-level ``categories`` array is used, as before.

    Returns
    -------
    list
        One class name per input row, in input order.
    """
    if class_names is None:
        # Backward-compatible default: fall back to the notebook global
        class_names = categories
    return [class_names[np.argmax(scores)] for scores in test_preds]

# Function to write output of model to csv file
def writeOuputToCsv (model, filename):
    """Predict the test set with ``model`` and save the result as a CSV.

    Uses the notebook-level ``test_seq`` (padded test sequences) and
    ``testData`` (for the ID column). The file written to ``filename`` has two
    columns, ``ID`` and ``categories``, and no index column.
    """
    predicted_names = convertClassToName(model.predict(test_seq))
    submission = pd.DataFrame(
        data={'ID': testData['ID'], 'categories': predicted_names},
        columns=['ID', 'categories'],
    )
    submission.to_csv(filename, index=False)

### MLP model (1  hidden layer)

In [17]:
# Word ids -> frozen GloVe vectors
inputLayer = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype="int32")
embedded_sequences = embeddingLayer(inputLayer)

# The hidden Dense layer is applied to every time step (output is 150 x 64),
# then everything is flattened into one vector for the classifier.
dense_1 = Dense(64, activation='relu')(embedded_sequences)
flatten = Flatten()(dense_1)

# One softmax unit per category
preds = Dense(len(categories), activation='softmax')(flatten)

mlp_embed_model = Model(inputLayer, preds)

Instructions for updating:
keep_dims is deprecated, use keepdims instead


In [18]:
# Print the layers, their output shapes and the parameter counts
mlp_embed_model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 150)               0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 150, 100)          3477000   
_________________________________________________________________
dense_1 (Dense)              (None, 150, 64)           6464      
_________________________________________________________________
flatten_1 (Flatten)          (None, 9600)              0         
_________________________________________________________________
dense_2 (Dense)              (None, 6)                 57606     
Total params: 3,541,070
Trainable params: 64,070
Non-trainable params: 3,477,000
_________________________________________________________________


In [19]:
# Learn the category -> integer mapping from the training labels only, then
# apply that SAME mapping to the validation labels. Calling fit_transform on
# the validation labels (as before) refits the encoder, which would silently
# produce a different mapping if the two label sets ever differed.
encoder = preprocessing.LabelEncoder()
y_train = encoder.fit_transform(y_train)
y_validate = encoder.transform(y_validate)

In [20]:
# One-hot encode the integer labels. The width is pinned to the number of known
# categories so both matrices always match the size of the softmax output, even
# if the highest class index were missing from one of the splits.
y_train = to_categorical(y_train, num_classes=len(categories))
y_validate = to_categorical(y_validate, num_classes=len(categories))

In [21]:
# Multi-class setup: softmax output with one-hot targets
mlp_embed_model.compile(
    loss="categorical_crossentropy",
    optimizer="adam",
    metrics=['accuracy'],
)

Instructions for updating:
keep_dims is deprecated, use keepdims instead
Instructions for updating:
keep_dims is deprecated, use keepdims instead


In [25]:
# Train for a fixed 50 epochs, evaluating on the validation split each epoch.
# NOTE(review): unlike the CNN and LSTM runs below, no EarlyStopping or
# ModelCheckpoint is used here, so the predictions written in the next cell
# come from the final-epoch weights.
hist_mlp_embed_model = mlp_embed_model.fit(
    train_seq,
    y_train,
    validation_data=(valid_seq, y_validate),
    epochs=50,
)

Train on 34368 samples, validate on 11457 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [26]:
# Save the MLP's test-set predictions
writeOuputToCsv(model=mlp_embed_model, filename="mlp_embed_model_2.csv")

### CNN Model - Embeddings

In [27]:
## cnn model with embedding layer
input_seq = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype="int32")
embedding_seq = embeddingLayer(input_seq)

# Three identical stages: Conv1D (64 filters, width 5) followed by MaxPooling1D(4)
x = embedding_seq
for stage in range(3):
    x = Conv1D(64, 5, activation='relu')(x)
    # In the last stage the conv output has length 4, so a pool of size 4
    # collapses it to a single step (it acts like a global max pool there).
    x = MaxPooling1D(4)(x)

# Classifier head
x = Flatten()(x)
x = Dense(64, activation='relu')(x)
preds = Dense(len(categories), activation='softmax')(x)

cnn_embed_model = Model(input_seq, preds)


In [28]:
# Same training configuration as the MLP
cnn_embed_model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [29]:
# Print the layers, their output shapes and the parameter counts
cnn_embed_model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         (None, 150)               0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 150, 100)          3477000   
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 146, 64)           32064     
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 36, 64)            0         
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 32, 64)            20544     
_________________________________________________________________
max_pooling1d_2 (MaxPooling1 (None, 8, 64)             0         
_________________________________________________________________
conv1d_3 (Conv1D)            (None, 4, 64)             20544     
__________

In [31]:
# Stop once validation accuracy has not improved for 3 epochs, and keep only
# the best-scoring weights on disk.
# NOTE(review): 'val_acc' is the metric name this Keras version reports; newer
# releases name it 'val_accuracy' - confirm before upgrading.
callbacks = [
    EarlyStopping(monitor='val_acc', patience=3),
    ModelCheckpoint(filepath='best_cnn_model.h5', monitor='val_acc', save_best_only=True),
]
hist_cnn_embed_model = cnn_embed_model.fit(
    train_seq,
    y_train,
    validation_data=(valid_seq, y_validate),
    epochs=25,
    callbacks=callbacks,
)

Train on 34368 samples, validate on 11457 samples
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25


In [32]:
# Reload the checkpointed (best validation accuracy) CNN and save its predictions
best_cnn_model = load_model(filepath='best_cnn_model.h5')
writeOuputToCsv(model=best_cnn_model, filename="cnn_embed_model.csv")

### RNN- LSTM Embeddings - Model1

In [34]:
# Single-layer LSTM on top of its own frozen GloVe embedding layer
# (a separate Embedding instance, not the shared embeddingLayer).
rnn_model = Sequential([
    Embedding(
        vocab_size,
        100,
        weights=[embeddings_matrix],
        input_length=MAX_SEQUENCE_LENGTH,
        trainable=False,
    ),
    LSTM(100),
    Dense(len(categories), activation='softmax'),
])

In [35]:
# Same training configuration as the other models
rnn_model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [36]:
# Print the layers, their output shapes and the parameter counts
rnn_model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 150, 100)          3477000   
_________________________________________________________________
lstm_1 (LSTM)                (None, 100)               80400     
_________________________________________________________________
dense_5 (Dense)              (None, 6)                 606       
Total params: 3,558,006
Trainable params: 81,006
Non-trainable params: 3,477,000
_________________________________________________________________


In [37]:
# Early stopping (patience 3) on validation accuracy, with the best weights
# checkpointed to disk.
callbacks = [
    EarlyStopping(monitor='val_acc', patience=3),
    ModelCheckpoint(filepath='best_rnn_model.h5', monitor='val_acc', save_best_only=True),
]
hist_rnn_model = rnn_model.fit(
    train_seq,
    y_train,
    validation_data=(valid_seq, y_validate),
    epochs=50,
    callbacks=callbacks,
)

Train on 34368 samples, validate on 11457 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50


In [38]:
# Reload the checkpointed (best validation accuracy) LSTM and save its predictions
best_rnn_model = load_model(filepath='best_rnn_model.h5')
writeOuputToCsv(model=best_rnn_model, filename="rnn_model_2.csv")

### RNN - LSTM Embeddings - Model 2 (dropout and an extra dense layer)


In [39]:
# LSTM classifier regularised with dropout
input_seq = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype="int32")
embedding_seq = embeddingLayer(input_seq)
# Spatial dropout is applied to the embedded sequence before the LSTM
embedding_seq = SpatialDropout1D(rate=0.3)(embedding_seq)

# Recurrent encoder
lstm_layer = LSTM(units=100)(embedding_seq)

# Hidden dense layer with dropout, then the softmax classifier
output_layer1 = Dense(units=50, activation="relu")(lstm_layer)
output_layer1 = Dropout(rate=0.25)(output_layer1)
preds = Dense(units=len(categories), activation='softmax')(output_layer1)

rnn_dropout_model = Model(inputs=input_seq, outputs=preds)
rnn_dropout_model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [40]:
# Print the layers, their output shapes and the parameter counts
rnn_dropout_model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_3 (InputLayer)         (None, 150)               0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 150, 100)          3477000   
_________________________________________________________________
spatial_dropout1d_1 (Spatial (None, 150, 100)          0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 100)               80400     
_________________________________________________________________
dense_6 (Dense)              (None, 50)                5050      
_________________________________________________________________
dropout_1 (Dropout)          (None, 50)                0         
_________________________________________________________________
dense_7 (Dense)              (None, 6)                 306       
Total para

In [41]:
# Early stopping (patience 2 here, not 3) on validation accuracy, with the
# best weights checkpointed to disk.
callbacks = [
    EarlyStopping(monitor='val_acc', patience=2),
    ModelCheckpoint(filepath='best_rnn_dropout_model_1.h5', monitor='val_acc', save_best_only=True),
]
hist_rnn_dropout_model = rnn_dropout_model.fit(
    train_seq,
    y_train,
    validation_data=(valid_seq, y_validate),
    epochs=50,
    callbacks=callbacks,
)

Train on 34368 samples, validate on 11457 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50


In [42]:
# Reload the checkpointed LSTM-with-dropout model and save its predictions
best_rnn_do_model = load_model(filepath='best_rnn_dropout_model_1.h5')
writeOuputToCsv(model=best_rnn_do_model, filename="rnn_do_model_2.csv")