In [1]:
import os

#%matplotlib inline
#import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf

from sklearn.preprocessing import LabelBinarizer, LabelEncoder
from sklearn.metrics import confusion_matrix

from tensorflow import keras
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout
from keras.preprocessing import text, sequence
from keras import utils

Using TensorFlow backend.


In [2]:
df = pd.read_csv('EP1.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,prescription,specialty
0,3011,peg electrolyte trilyte with flavor packets ...,gastroenterology
1,554,omeprazole lorazepam xarelto exjade folic ...,hematologyoncology
2,20,amoxicillin,generalpractice
3,889,omeprazole colestipol hcl diphenoxylateatrop...,gastroenterology
4,874,bisoprolol fumarate diltiazem hr er crestor ...,cardiovasculardisease


In [3]:
df.specialty.unique()

array(['gastroenterology', 'hematologyoncology', 'generalpractice',
       'cardiovasculardisease', 'neurology', 'psychiatry', 'nephrology'],
      dtype=object)

In [4]:
X = df.prescription.values
Y = df.specialty.values

In [5]:
from sklearn.model_selection import train_test_split
X_train, x_test, Y_train, y_test = train_test_split(X, Y, test_size = 0.2, random_state = 50)

In [6]:
max_words = 1000
tokenize = text.Tokenizer(num_words=max_words, char_level=False)
tokenize.fit_on_texts(X_train)

In [7]:
tokenize.word_index

{'hcl': 1,
 'sodium': 2,
 'er': 3,
 'tartrate': 4,
 'metoprolol': 5,
 'potassium': 6,
 'omeprazole': 7,
 'amlodipine': 8,
 'calcium': 9,
 'gabapentin': 10,
 'furosemide': 11,
 'besylate': 12,
 'lisinopril': 13,
 'succinate': 14,
 'zolpidem': 15,
 'hydrocodoneacetaminophen': 16,
 'prednisone': 17,
 'diltiazem': 18,
 'chloride': 19,
 'pantoprazole': 20,
 'simvastatin': 21,
 'warfarin': 22,
 'clopidogrel': 23,
 'atorvastatin': 24,
 'lorazepam': 25,
 'divalproex': 26,
 'carvedilol': 27,
 'acetate': 28,
 'losartan': 29,
 'maleate': 30,
 'hbr': 31,
 'alprazolam': 32,
 'hydrochlorothiazide': 33,
 'clonazepam': 34,
 'xl': 35,
 'sertraline': 36,
 'clonidine': 37,
 'nexium': 38,
 'fumarate': 39,
 'klorcon': 40,
 'atenolol': 41,
 'bupropion': 42,
 'isosorbide': 43,
 'levothyroxine': 44,
 'citalopram': 45,
 'venlafaxine': 46,
 'spironolactone': 47,
 'allopurinol': 48,
 'quetiapine': 49,
 'pravastatin': 50,
 'diovan': 51,
 'lamotrigine': 52,
 'hr': 53,
 'trazodone': 54,
 'mesylate': 55,
 'escitalop

In [8]:
X_train = tokenize.texts_to_matrix(X_train)
x_test = tokenize.texts_to_matrix(x_test)

In [9]:
encoder = LabelEncoder()
encoder.fit(Y)

LabelEncoder()

In [10]:
Y = encoder.transform(Y)

In [11]:
Y_train = encoder.transform(Y_train)
y_test = encoder.transform(y_test)

In [12]:
encoder.inverse_transform([0,1,2,3,4,5,6])

array(['cardiovasculardisease', 'gastroenterology', 'generalpractice',
       'hematologyoncology', 'nephrology', 'neurology', 'psychiatry'],
      dtype=object)

In [13]:
batch_size = 32
epochs = 10

In [14]:
model = Sequential()
model.add(Dense(512, input_shape=(max_words,)))
model.add(Activation('relu'))
#model.add(Dropout(0.5))
model.add(Dense(7))
model.add(Activation('softmax'))

model.summary()

Instructions for updating:
Colocations handled automatically by placer.
Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 512)               512512    
_________________________________________________________________
activation_1 (Activation)    (None, 512)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 7)                 3591      
_________________________________________________________________
activation_2 (Activation)    (None, 7)                 0         
Total params: 516,103
Trainable params: 516,103
Non-trainable params: 0
_________________________________________________________________


In [15]:
import keras
#import keras_metrics
model.compile(loss='sparse_categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
              
history = model.fit(X_train, Y_train,
                    batch_size=batch_size,
                    epochs=10,
                    verbose=1,
                    validation_split=0.1,
                   shuffle = True)

Train on 1516 samples, validate on 169 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [16]:
_, test_acc = model.evaluate(x_test, y_test)
test_acc



0.9478672742843628

In [17]:
y_pred = model.predict_classes(x_test)

In [18]:
y_test

array([3, 0, 0, 4, 0, 4, 5, 3, 0, 1, 3, 1, 3, 1, 1, 0, 1, 6, 1, 3, 6, 3,
       1, 1, 0, 6, 5, 5, 1, 0, 1, 2, 2, 5, 2, 6, 1, 2, 6, 6, 3, 0, 0, 4,
       5, 2, 5, 2, 1, 4, 1, 5, 1, 2, 2, 3, 6, 5, 6, 0, 2, 2, 0, 3, 0, 1,
       5, 1, 0, 0, 4, 1, 2, 0, 5, 6, 5, 0, 0, 0, 1, 0, 5, 1, 6, 3, 6, 6,
       5, 4, 1, 4, 4, 6, 0, 4, 5, 2, 5, 1, 1, 4, 3, 4, 0, 2, 6, 1, 1, 0,
       6, 4, 2, 3, 0, 4, 6, 4, 1, 0, 3, 5, 4, 3, 6, 5, 0, 0, 1, 0, 2, 5,
       2, 2, 1, 5, 2, 4, 0, 2, 5, 4, 4, 2, 5, 3, 0, 2, 6, 5, 4, 6, 6, 2,
       0, 6, 4, 2, 1, 5, 5, 5, 5, 1, 1, 5, 5, 5, 3, 6, 0, 1, 2, 1, 4, 5,
       1, 0, 1, 0, 3, 4, 4, 2, 3, 4, 1, 6, 3, 4, 6, 3, 1, 1, 0, 1, 0, 6,
       0, 1, 5, 0, 1, 4, 3, 3, 0, 0, 6, 6, 2, 1, 1, 4, 0, 5, 4, 6, 1, 5,
       1, 6, 1, 4, 5, 4, 6, 3, 5, 0, 5, 1, 1, 1, 1, 4, 2, 3, 4, 3, 3, 1,
       3, 5, 2, 0, 4, 5, 4, 5, 6, 1, 0, 3, 5, 2, 3, 5, 1, 1, 6, 2, 0, 3,
       0, 2, 3, 3, 0, 1, 0, 6, 6, 0, 2, 1, 5, 3, 2, 6, 1, 1, 4, 4, 4, 4,
       1, 1, 2, 2, 6, 2, 0, 4, 1, 0, 0, 3, 3, 3, 6,

In [19]:
from sklearn.metrics import precision_score, recall_score, f1_score, classification_report
precision_score(y_test, y_pred, average = 'macro')

0.9480285037529975

In [20]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.92      0.96      0.94        72
           1       0.99      0.96      0.97        75
           2       0.98      0.96      0.97        49
           3       0.92      0.96      0.94        48
           4       0.96      0.92      0.94        60
           5       0.92      0.93      0.92        59
           6       0.95      0.95      0.95        59

    accuracy                           0.95       422
   macro avg       0.95      0.95      0.95       422
weighted avg       0.95      0.95      0.95       422



In [21]:
confusion_matrix(y_test, y_pred)

array([[69,  0,  0,  1,  1,  1,  0],
       [ 2, 72,  1,  0,  0,  0,  0],
       [ 0,  1, 47,  0,  0,  0,  1],
       [ 0,  0,  0, 46,  1,  1,  0],
       [ 2,  0,  0,  1, 55,  2,  0],
       [ 1,  0,  0,  1,  0, 55,  2],
       [ 1,  0,  0,  1,  0,  1, 56]], dtype=int64)

In [22]:
model.save('MLPPrescription.h5')

In [24]:
from keras.models import load_model
predictor = load_model('MLPPrescription.h5')

In [25]:
from io import StringIO 
sample1 = StringIO("""prescription;
                    trilyte with flavor packets  gavilyten  lansoprazole  nexium  omeprazole  prevalite  asacol hd  pantoprazole sodium  ursodiol  spironolactone  azathioprine  dicyclomine hcl;
                    levofloxacin  letrozole  tamoxifen citrate  dexamethasone  exemestane  prochlorperazine maleate  warfarin sodium  anastrozole  hydrocodoneacetaminophen  potassium chloride  megestrol acetate  klorcon m  alprazolam  ondansetron hcl;
                    amoxicillin;
                    calcium acetate  bisoprololhydrochlorothiazide  allopurinol  potassium citrate  amlodipine besylate;
    
                    """)

    #keywords labeled as 
    #1. gasternology(labeled as 1) 
    #2. hematologyoncology(labeled as 3) 
    #3. generalpractice(labeled as 2)
    #4. nephrology (labeled as 4)
    #first we need to preprocess the example. Store the input texts to a dataframe and preprocess

predictDF = pd.read_csv(sample1, sep =";")


predictDF

Unnamed: 0,prescription,Unnamed: 1
0,trilyte with flavor packet...,
1,levofloxacin letrozole t...,
2,amoxicillin,
3,calcium acetate bisoprolo...,


In [26]:
import re
REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;]')
BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')
SPACE_RE = re.compile(' ')
def clean_text(text):
    """
        text: a string
        
        return: modified initial string
    """
    #text = BeautifulSoup(text, "lxml").text # HTML decoding
    text = text.lower() # lowercase text
    text = REPLACE_BY_SPACE_RE.sub(' ', text) # replace REPLACE_BY_SPACE_RE symbols by space in text
    text = BAD_SYMBOLS_RE.sub('', text) # delete symbols which are in BAD_SYMBOLS_RE from text
    text = SPACE_RE.sub(' ', text)
    #text = ' '.join(word for word in text.split() if word not in STOPWORDS) # delete stopwors from text
    return text

predictDF['prescription'] = predictDF['prescription'].apply(clean_text)
predictDF

Unnamed: 0,prescription,Unnamed: 1
0,trilyte with flavor packet...,
1,levofloxacin letrozole t...,
2,amoxicillin,
3,calcium acetate bisoprolo...,


In [27]:
X_sample1 = tokenize.texts_to_matrix(predictDF.prescription)

In [29]:
y_sample1 = predictor.predict_classes(X_sample1)
y_sample1

array([1, 3, 2, 4], dtype=int64)

In [30]:
#4 right 0 wrong

In [34]:
y_predict = predictor.predict_classes(x_test)
print(classification_report(y_predict, y_test))

              precision    recall  f1-score   support

           0       0.96      0.92      0.94        75
           1       0.96      0.99      0.97        73
           2       0.96      0.98      0.97        48
           3       0.96      0.92      0.94        50
           4       0.92      0.96      0.94        57
           5       0.93      0.92      0.92        60
           6       0.95      0.95      0.95        59

    accuracy                           0.95       422
   macro avg       0.95      0.95      0.95       422
weighted avg       0.95      0.95      0.95       422

