In [34]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import h5py

import keras
from keras import optimizers 
from keras.layers import Dense, Input, Flatten, Dropout, BatchNormalization
from keras.layers import Conv1D, MaxPooling1D, Embedding 
from keras .models import Sequential 
from keras.preprocessing.text import Tokenizer          
from keras.preprocessing.sequence import pad_sequences  
from keras.utils import to_categorical                  

In [2]:
# Read the train and test CSV files from the working directory;
# header=0 takes the column names from the first row of each file.
train, test = (
    pd.read_csv(csv_name, header=0)
    for csv_name in ("train.csv", "test.csv")
)

In [3]:
# Distinct values of the label column, in order of first appearance.
train['categories'].unique()

array(['QUERIES FROM PHARMACY', 'NEW APPOINTMENT', 'OTHERS',
       'MEDICATION RELATED',
       'SHARING OF HEALTH RECORDS (FAX, E-MAIL, ETC.)', 'REFILL',
       'PRIOR AUTHORIZATION', 'RESCHEDULING', 'SYMPTOMS', 'LAB RESULTS',
       'FOLLOW UP ON PREVIOUS REQUEST', 'PROVIDER', 'CHANGE OF PROVIDER',
       'SHARING OF LAB RECORDS (FAX, E-MAIL, ETC.)',
       'QUERY ON CURRENT APPOINTMENT', 'RUNNING LATE TO APPOINTMENT',
       'CANCELLATION', 'CHANGE OF PHARMACY', 'QUERIES FROM INSURANCE FIRM',
       'JUNK', 'CHANGE OF HOSPITAL'], dtype=object)

In [4]:
# Discard every row that contains at least one missing value, then
# report the remaining (rows, columns).
train = train.dropna(axis=0, how='any')
print(train.shape)

(48667, 2)


In [5]:
# Number of training rows per category, most frequent first.
train['categories'].value_counts()

MEDICATION RELATED                               9010
NEW APPOINTMENT                                  8907
REFILL                                           8347
OTHERS                                           6232
SHARING OF HEALTH RECORDS (FAX, E-MAIL, ETC.)    3018
LAB RESULTS                                      2253
PROVIDER                                         1677
QUERIES FROM PHARMACY                            1464
RESCHEDULING                                     1382
SHARING OF LAB RECORDS (FAX, E-MAIL, ETC.)       1212
PRIOR AUTHORIZATION                              1043
SYMPTOMS                                         1021
CHANGE OF PROVIDER                                811
RUNNING LATE TO APPOINTMENT                       590
CANCELLATION                                      563
QUERY ON CURRENT APPOINTMENT                      559
FOLLOW UP ON PREVIOUS REQUEST                     304
CHANGE OF HOSPITAL                                127
QUERIES FROM INSURANCE FIRM 

In [6]:
# Cast the text column to str in both frames so that every entry handed
# to the tokenizer is a string.
train['converse'] = train['converse'].astype(str)
test['converse'] = test['converse'].astype(str)

In [7]:
# Store the label column with pandas' categorical dtype.
train['categories'] = train['categories'].astype('category')

In [14]:
# Length of every encoded message: longer sequences are cropped,
# shorter ones are padded with zeros.
MAX_SEQUENCE_LENGTH = 463

# Vocabulary cap passed to the Tokenizer as num_words.
MAX_NB_WORDS = 20000

In [15]:
# Fit the vocabulary on the training text only; num_words caps how many
# of the most frequent words are used when texts are encoded.
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(train.converse)

# word -> integer index mapping for every word seen while fitting
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

# Encode each message as a list of word indices using the fitted vocabulary.
train_sequences = tokenizer.texts_to_sequences(train.converse)
test_sequences = tokenizer.texts_to_sequences(test.converse)

# Bring every sequence to exactly MAX_SEQUENCE_LENGTH entries
# (crop when longer, zero-pad when shorter).
train_data = pad_sequences(train_sequences, maxlen=MAX_SEQUENCE_LENGTH)
test_data = pad_sequences(test_sequences, maxlen=MAX_SEQUENCE_LENGTH)

print(train_data.shape)
print(test_data.shape)

Found 35747 unique tokens.
(48667, 463)
(8581, 463)


In [16]:
# Ordered list of distinct categories; a label's position in this list
# is used as its integer class id.
train_label = list(train['categories'].unique())

In [17]:
# Integer class id for every training row: the position of the row's
# category inside train_label.
# The lookup table is built once so each row costs one dict access
# instead of a linear list.index() scan over train_label.
label_to_index = {label: position for position, label in enumerate(train_label)}
train_y = np.array([label_to_index[category] for category in train.categories])

In [19]:
# Show the shape of the integer label vector, one-hot encode it, and
# display the shape of the resulting matrix.
# NOTE(review): train_y is overwritten in place, so running this cell a
# second time would encode the already-encoded matrix again.
print(np.shape(train_y))
train_y = to_categorical(train_y)
np.shape(train_y)

(48667,)


(48667, 21)

In [20]:
# Print the shapes of the padded train and test matrices.
# NOTE(review): the recorded output is a single tuple of tuples, which is
# what a Python 2 print statement produces for this line; under Python 3
# the two shapes would be printed space-separated instead. Confirm the
# kernel version before changing this line.
print(train_data.shape, test_data.shape)

((48667, 463), (8581, 463))


In [21]:
# Baseline classifier over the padded token-id sequences.
#
# The rows of train_data are vocabulary indices, i.e. arbitrary integer
# identifiers rather than magnitudes, so they are first mapped to learned
# vectors by an Embedding layer and flattened, instead of being handed
# to a Dense layer as if they were numeric features.
EMBEDDING_DIM = 32  # size of the learned vector for each word index

mlp = Sequential()
# input_dim=MAX_NB_WORDS: the Tokenizer was built with num_words=MAX_NB_WORDS,
# so every emitted index is below MAX_NB_WORDS (0 is the padding value).
mlp.add(Embedding(MAX_NB_WORDS, EMBEDDING_DIM, input_length=MAX_SEQUENCE_LENGTH))
mlp.add(Flatten())
mlp.add(Dropout(0.2))
mlp.add(Dense(30, activation='sigmoid'))
mlp.add(Dropout(0.5))
mlp.add(Dense(21, activation='sigmoid'))
mlp.add(Dropout(0.5))
# One output unit per column of the one-hot label matrix.
mlp.add(Dense(train_y.shape[1], activation='softmax'))

mlp.compile(loss='categorical_crossentropy',
            optimizer='adam',
            metrics=['accuracy'])


Instructions for updating:
keep_dims is deprecated, use keepdims instead
Instructions for updating:
keep_dims is deprecated, use keepdims instead


In [23]:
# Training hyper-parameters.
nb_epoch = 10    # passes over the training data
batch_size = 32  # rows per gradient update

# Fit the classifier on the whole padded training matrix; the object
# returned by fit() is stored in `history`.
history = mlp.fit(
    train_data,
    train_y,
    epochs=nb_epoch,
    batch_size=batch_size,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [24]:
import numpy as np # for array operations
from keras.models import Model, Sequential # for defining the architectures
from keras.layers import Dense, Dropout, Input # layers for building the network
from keras.utils import to_categorical # to_categorical does one-hot encoding


In [26]:
nb_epoch = 10      # number of epochs
batch_size = 32    # batch size

# Network: input -> Dropout(0.5) -> 400 sigmoid units -> linear output.
# The input width and the output width are tied to the values they must
# match (the padded sequence length and the one-hot label width) rather
# than being repeated as literals.
# NOTE(review): the output layer has one unit per label column and the
# model is later fitted against train_y, so despite its name this model
# does not reconstruct its own input - confirm that is intended.
# NOTE(review): the input rows appear to be raw vocabulary indices from
# pad_sequences; treating them as numeric features is questionable -
# consider an Embedding layer here.
input_dat = Input(shape=(MAX_SEQUENCE_LENGTH,))
crrpt_dat = Dropout(0.5)(input_dat)   # randomly zeroes half of the inputs during training
encoded = Dense(400, activation='sigmoid')(crrpt_dat)
decoded = Dense(train_y.shape[1], activation='linear')(encoded)

autoencoder = Model(input_dat, decoded)
autoencoder.compile(optimizer='adam',
                    loss='mean_squared_error')



In [27]:
# Display the (rows, columns) of the padded training matrix.
np.shape(train_data)

(48667, 463)

In [42]:
# Shell escape: install Cython into the per-user site-packages (--user).
# NOTE(review): no version is pinned, and none of the visible cells
# import cython - confirm this install is still required.
! pip install --user cython

Collecting cython
  Downloading Cython-0.27.3-cp27-cp27mu-manylinux1_x86_64.whl (3.0MB)
[K    100% |████████████████████████████████| 3.0MB 234kB/s ta 0:00:01
[?25hInstalling collected packages: cython
Successfully installed cython-0.27.3


In [43]:
# Train the `autoencoder` model for nb_epoch epochs, using the padded
# sequences as input and the one-hot labels as target; the object
# returned by fit() replaces the previous `history`.
history = autoencoder.fit(
    train_data,
    train_y,
    batch_size=batch_size,
    epochs=nb_epoch,
    shuffle=True,
)

import h5py
# Weight saving is currently disabled:
# autoencoder.save_weights('data_model.h5py', 'r')


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [44]:
# Sub-model running from the input tensor up to the 400-unit `encoded`
# layer; it is used to turn each padded training row into a 400-value vector.
encoder = Model(inputs=input_dat, outputs=encoded)
htrain_data = encoder.predict(train_data)

In [46]:
# Classifier trained on the encoder's output instead of the raw sequences.
# Input and output widths are read from the arrays they must match, so a
# change to the encoder's hidden size or to the number of classes does
# not leave stale literals behind.
mlp = Sequential()
mlp.add(Dropout(0.2, input_shape=(htrain_data.shape[1],)))
mlp.add(Dense(400, activation='sigmoid'))
mlp.add(Dropout(0.5))
mlp.add(Dense(train_y.shape[1], activation='softmax'))

mlp.compile(loss='categorical_crossentropy',
            optimizer='adam',
            metrics=['accuracy'])

# Encoded representation of the test rows (not used further in this cell).
htest_data = encoder.predict(test_data)

# Only the first N_FIT_ROWS training rows are used for fitting.
# NOTE(review): the remaining rows are not evaluated anywhere in the
# visible cells - confirm whether they are meant to be a hold-out set.
N_FIT_ROWS = 20000
history = mlp.fit(htrain_data[:N_FIT_ROWS], train_y[:N_FIT_ROWS],
                  batch_size=batch_size,
                  epochs=nb_epoch,
                  )

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
