In [1]:
## Import Libraries 
import numpy as np      # for array operations
import pandas as pd     # for reading data operations

from keras.preprocessing.text import Tokenizer          # for tokenizing text
from keras.preprocessing.sequence import pad_sequences  # for padding sentences with zeros. To make the sentence length same
from keras.utils import to_categorical                  # for one-hot encoding of the labels
from keras.layers import Dense, Input, Flatten, Dropout, BatchNormalization
from keras.layers import Conv1D, MaxPooling1D, Embedding
from keras.models import Sequential

Using TensorFlow backend.


In [2]:
## Define the fixed sequence length and the vocabulary size cap
MAX_SEQUENCE_LENGTH = 463   # Length every tokenized sentence is brought to (used as maxlen for pad_sequences and input_length for the Embedding layer)
MAX_NB_WORDS = 20000        # Vocabulary cap: passed to Tokenizer as num_words and used as the Embedding layer's input size

In [3]:
## Read the training and test sets from CSV with pandas
train, test = map(pd.read_csv, ("train.csv", "test.csv"))

# Plain-text preview of the first six training rows
print(train.head(6))

              categories                                           converse
0  QUERIES FROM PHARMACY  please to verify instructions for drugname pat...
1        NEW APPOINTMENT  lmovm for patients mother to and schd rov trac...
2                 OTHERS  labtype and insurance approval other incoming ...
3                 OTHERS  clinical list changes medfusion secure electro...
4     MEDICATION RELATED  wants to wean off medication work phone name d...
5     MEDICATION RELATED  patient declining following up name patient ap...


In [4]:
# Preview the first rows of the test set (it carries only the message text, no category column)
test.head()

Unnamed: 0,converse
0,request to speak with rn no given patients ref...
1,patients husband ret name spouse for other pat...
2,fyi in his szs mom other he has an appointment...
3,Rx refill drugname from pharmacy name reason f...
4,need more time for testing dad is requesting a...


In [5]:
# List the distinct category labels present in the training data
train["categories"].unique()

array(['QUERIES FROM PHARMACY', 'NEW APPOINTMENT', 'OTHERS',
       'MEDICATION RELATED',
       'SHARING OF HEALTH RECORDS (FAX, E-MAIL, ETC.)', 'REFILL',
       'PRIOR AUTHORIZATION', 'RESCHEDULING', 'SYMPTOMS', 'LAB RESULTS',
       'FOLLOW UP ON PREVIOUS REQUEST', 'PROVIDER', 'CHANGE OF PROVIDER',
       'SHARING OF LAB RECORDS (FAX, E-MAIL, ETC.)',
       'QUERY ON CURRENT APPOINTMENT', 'RUNNING LATE TO APPOINTMENT',
       'CANCELLATION', 'CHANGE OF PHARMACY', 'QUERIES FROM INSURANCE FIRM',
       'JUNK', 'CHANGE OF HOSPITAL'], dtype=object)

In [6]:
from sklearn.preprocessing import LabelEncoder

# Map every distinct category string to an integer id. The fitted encoder
# `le` is kept so predicted ids can be mapped back to category names later.
# Only the training set is encoded: the test file has no 'categories' column.
le = LabelEncoder()
train_labels = le.fit_transform(train['categories'])



In [7]:
# Is any training message missing?
train["converse"].isnull().any()

True

In [8]:
# Remove columns in which every value is missing. This works column-wise only:
# rows with a missing `converse` are NOT removed by this call.
train.dropna(axis='columns', how='all', inplace=True)

In [9]:
# Count the training rows whose message text is missing
train["converse"].isnull().sum()

32

In [10]:
# Drop any column that is entirely NaN (column-wise; rows with missing text stay in the frame)
train.dropna(axis='columns', how='all', inplace=True)

In [11]:
# Replace missing messages with a single space so every row is kept and stays
# aligned with its label.
# The result is assigned back to the column rather than calling
# fillna(inplace=True) on `train['converse']`: an in-place fill on a selected
# column is chained assignment, which pandas warns about and which does not
# update `train` at all when Copy-on-Write is enabled.
train['converse'] = train['converse'].fillna(' ')

In [12]:
# Confirm that no missing messages remain in the training data
train["converse"].isnull().sum()

0

In [13]:
# Training targets: the integer class ids produced by the label encoder
y = train_labels

# Raw message text as NumPy arrays; "_na_" stands in for any missing message
list_sentences_test = test["converse"].fillna("_na_").values
list_sentences_train = train["converse"].fillna("_na_").values

In [14]:
# Inspect the raw training sentences (NumPy abridges the repr of large arrays)
list_sentences_train

array([ 'please to verify instructions for drugname patients wi from pharmacy target reason for call details please to verify instructions for drugname patients will be picking up Rx timephrase clarified Rx with pharmacy lisa gilligan rn',
       'lmovm for patients mother to and schd rov tracy dominguez am kellee currie pm rhonda fanning',
       'labtype and insurance approval other incoming name lpn clinical review for cigna reason for call details to inform that there is not enough information provided but patients can be approved for an at home labtype whitney will fax this will need to be filled out and sent if there are any questions please whintey at rna follow routed to wendy upchurch lisa mohamed rn auth obtained and routed to sleep schedulers tammy byrd fyi wendy upchurch clinical list changes',
       ...,
       'Rx request aricept mg rxrf medfusion secure electronic message received from the medfusion web portal tuesday subject Rx for patsy macon patsy macon is ready for 

In [15]:
from keras.utils import to_categorical

# One-hot encode the integer class ids into a (rows, classes) matrix.
# Encoding from `train_labels` instead of from `y` itself keeps this cell
# idempotent: `y = to_categorical(y)` would, on a second run, try to one-hot
# encode the already one-hot matrix and silently produce the wrong shape.
y = to_categorical(train_labels)

In [16]:
# Shape of the one-hot label matrix: (number of rows, number of classes)
y.shape

(48699, 21)

In [17]:
# True if at least one one-hot column is never set, i.e. some class index has no training rows
(~y.any(axis=0)).any()

False

In [18]:
#np.where(~y.any(axis=0))[0]

In [42]:
# Wrap the one-hot label matrix in a DataFrame (one column per class index)
y = pd.DataFrame(data=y)

In [19]:
#del y[0]

In [20]:
# Confirm the label matrix still has one row per sample and one column per class
y.shape

(48699, 21)

In [21]:
# Learn the vocabulary from the training text only; num_words limits how many
# of the most frequent words are kept when texts are converted to sequences.
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(list_sentences_train)

# Dictionary mapping each word seen during fitting to its integer index
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

# Encode each sentence as a list of word indices
train_sequences = tokenizer.texts_to_sequences(list_sentences_train)
test_sequences = tokenizer.texts_to_sequences(list_sentences_test)

# Bring every sequence to exactly MAX_SEQUENCE_LENGTH entries:
# shorter ones are zero-padded, longer ones are truncated.
train_data = pad_sequences(train_sequences, maxlen=MAX_SEQUENCE_LENGTH)
test_data = pad_sequences(test_sequences, maxlen=MAX_SEQUENCE_LENGTH)

print(train_data.shape)
print(test_data.shape)

Found 35747 unique tokens.
(48699, 463)
(8581, 463)


In [22]:
# Show a small sample of the vocabulary instead of dumping the whole
# dictionary (tens of thousands of entries) into the notebook output.
# Entries are ordered by index; Keras assigns the lowest indices to the most
# frequent words. The function-call form of print works on Python 2 and 3.
print(sorted(word_index.items(), key=lambda item: item[1])[:10])



In [24]:
# Expose the one-hot labels under the name used by the training cells
labels_train = y


# Expect (number of training rows, number of classes)
labels_train.shape


(48699, 21)

In [50]:
!pip install --user sklearn

Collecting sklearn
  Downloading sklearn-0.0.tar.gz
Collecting scikit-learn (from sklearn)
  Downloading scikit_learn-0.19.1-cp27-cp27mu-manylinux1_x86_64.whl (12.2MB)
[K    100% |████████████████████████████████| 12.2MB 87kB/s eta 0:00:01
[?25hBuilding wheels for collected packages: sklearn
  Running setup.py bdist_wheel for sklearn ... [?25ldone
[?25h  Stored in directory: /home/B34usr6/.cache/pip/wheels/d7/db/a3/1b8041ab0be63b5c96c503df8e757cf205c2848cf9ef55f85e
Successfully built sklearn
Installing collected packages: scikit-learn, sklearn
Successfully installed scikit-learn-0.19.1 sklearn-0.0


In [25]:
# Report the shapes of the model inputs and targets.
# The message is formatted into a single string first: calling print() with
# two arguments prints a tuple repr under Python 2, whereas this form prints
# the same readable line on both Python 2 and Python 3.
# (The test set has no labels, so there is no test label tensor to report.)
print('Shape of data tensor: %s' % (train_data.shape,))
print('Shape of label tensor: %s' % (labels_train.shape,))

('Shape of data tensor:', (48699, 463))
('Shape of label tensor:', (48699, 21))


In [26]:
EMBEDDING_DIM = 100  # Size of the vector learned for each word by the Embedding layer
print(MAX_SEQUENCE_LENGTH)  # Reminder of the fixed input length the model will be built for

463


In [27]:
print('Training model.')

# 1-D convolutional text classifier:
#   embedding -> two (conv, max-pool, dropout, batch-norm) stages
#   -> flatten -> dense hidden layer -> softmax over the 21 categories.
model = Sequential([
    Embedding(MAX_NB_WORDS, EMBEDDING_DIM, input_length=MAX_SEQUENCE_LENGTH),
    Dropout(0.5),
    # First convolutional stage
    Conv1D(128, 5, activation='relu'),
    MaxPooling1D(5),
    Dropout(0.5),
    BatchNormalization(),
    # Second convolutional stage
    Conv1D(128, 5, activation='relu'),
    MaxPooling1D(5),
    Dropout(0.5),
    BatchNormalization(),
    # Classifier head
    Flatten(),
    Dense(128, activation='relu'),
    Dense(21, activation='softmax'),
])

# Multi-class setup: one-hot targets with categorical cross-entropy; accuracy is tracked as 'acc'
model.compile(optimizer='rmsprop',
              loss='categorical_crossentropy',
              metrics=['acc'])




Training model.


In [64]:
# Train for 20 epochs in mini-batches of 64, holding out 20% of the samples
# for validation. The trailing semicolon suppresses the returned History repr.
model.fit(
    x=np.array(train_data),
    y=np.array(labels_train),
    batch_size=64,
    epochs=20,
    validation_split=0.2
);


Train on 38959 samples, validate on 9740 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [69]:
# The test set is unlabeled, so the model is used for prediction only.
# Take the argmax over the softmax outputs to get one predicted class id per
# test row. This matches what Sequential.predict_classes returned for a
# softmax output, but also works on Keras/TensorFlow releases where
# predict_classes has been removed.
test_labels = np.argmax(model.predict(test_data), axis=-1)



In [70]:
# Map predicted integer class ids back to category names using the encoder fitted on the training labels
test_pred_classes = le.inverse_transform(test_labels)

In [71]:
# Put the predicted category names in a single-column frame and save it
# (the row index is written alongside the predictions).
df_test_pred_classes = pd.DataFrame(data=test_pred_classes)
df_test_pred_classes.to_csv(path_or_buf="goam.csv")