In [37]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import csv
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.layers import Embedding
from keras.layers import Conv1D, GlobalMaxPooling1D

In [38]:
# --- Tokenizer settings ---
oov_tok = '<OOV>'      # placeholder token handed to the Tokenizer for out-of-vocabulary words
vocab_size = 1000      # handed to the Tokenizer as num_words

# --- Sequence padding settings ---
max_length = 200                    # every sequence is padded/truncated to this many tokens
trunc_type = padding_type = 'post'  # both truncation and padding happen at the end of a sequence

# --- Settings not referenced by the cells visible in this notebook ---
embedding_dim = 32        # NOTE(review): the model cell uses `embedding_dims` instead
training_portion = 0.8    # NOTE(review): the split cell passes test_size directly

In [39]:
# Load the dataset; later cells read its `prescription` and `specialty` columns.
df = pd.read_csv('EP1.csv')

In [40]:
# Shuffle the rows, then split texts (X) and labels (y) into train and test sets.
# The shuffle is seeded as well as the split: with an unseeded shuffle the
# train/test membership changes on every run even though train_test_split
# itself has a fixed random_state.
df = df.sample(frac=1, random_state=50)
X = df.prescription.values
y = df.specialty.values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 50)

In [41]:
from sklearn.preprocessing import LabelBinarizer, LabelEncoder

# Turn the specialty names into integer class ids. The encoder is fitted on the
# full label array `y`, then applied to the train and test labels separately.
encoder = LabelEncoder()
encoder.fit(y)
y_train, y_test = encoder.transform(y_train), encoder.transform(y_test)

In [42]:
# Build the vocabulary from the training texts only, so the test texts do not
# influence the word index.
tokenizer = Tokenizer(oov_token=oov_tok, num_words=vocab_size)
tokenizer.fit_on_texts(X_train)
word_index = tokenizer.word_index

# Display the word -> index mapping (shown as a copy of word_index).
dict(word_index)

{'<OOV>': 1,
 'hcl': 2,
 'sodium': 3,
 'er': 4,
 'tartrate': 5,
 'metoprolol': 6,
 'potassium': 7,
 'omeprazole': 8,
 'amlodipine': 9,
 'calcium': 10,
 'furosemide': 11,
 'gabapentin': 12,
 'besylate': 13,
 'lisinopril': 14,
 'succinate': 15,
 'zolpidem': 16,
 'pantoprazole': 17,
 'diltiazem': 18,
 'prednisone': 19,
 'chloride': 20,
 'clopidogrel': 21,
 'hydrocodoneacetaminophen': 22,
 'simvastatin': 23,
 'warfarin': 24,
 'atorvastatin': 25,
 'carvedilol': 26,
 'lorazepam': 27,
 'losartan': 28,
 'maleate': 29,
 'acetate': 30,
 'divalproex': 31,
 'hbr': 32,
 'alprazolam': 33,
 'hydrochlorothiazide': 34,
 'clonazepam': 35,
 'xl': 36,
 'sertraline': 37,
 'nexium': 38,
 'clonidine': 39,
 'fumarate': 40,
 'isosorbide': 41,
 'klorcon': 42,
 'atenolol': 43,
 'spironolactone': 44,
 'bupropion': 45,
 'levothyroxine': 46,
 'citalopram': 47,
 'hr': 48,
 'pravastatin': 49,
 'diovan': 50,
 'allopurinol': 51,
 'quetiapine': 52,
 'venlafaxine': 53,
 'escitalopram': 54,
 'oxalate': 55,
 'mesylate': 56

In [43]:
# Training texts -> lists of token ids -> fixed-length matrix of max_length columns.
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_train_padded = pad_sequences(
    X_train_seq,
    maxlen=max_length,
    truncating=trunc_type,
    padding=padding_type,
)

In [44]:
# Spot-check: token-id sequence of the training text at position 500 (before padding).
X_train_seq[500]

[173,
 15,
 379,
 2,
 35,
 161,
 65,
 187,
 256,
 63,
 197,
 317,
 22,
 187,
 210,
 97,
 2,
 419,
 195,
 63,
 4,
 77,
 71,
 12,
 150,
 143,
 227,
 133,
 69,
 2,
 52,
 40,
 21,
 316,
 66,
 2,
 31,
 3,
 4,
 37,
 2,
 99,
 4,
 186,
 85,
 257,
 131,
 2,
 58,
 213,
 74,
 2,
 68]

In [45]:
# Test texts go through the same tokenizer and the same padding settings as the
# training texts.
X_test_seq = tokenizer.texts_to_sequences(X_test)
X_test_padded = pad_sequences(
    X_test_seq,
    maxlen=max_length,
    truncating=trunc_type,
    padding=padding_type,
)

In [46]:
# Show which specialty name each integer class id stands for: position i in
# this array is the name encoded as id i. Reading `classes_` avoids hard-coding
# the number of classes.
encoder.classes_

array(['cardiovasculardisease', 'gastroenterology', 'generalpractice',
       'hematologyoncology', 'nephrology', 'neurology', 'psychiatry'],
      dtype=object)

In [47]:
# --- Embedding layer ---
max_features = 5000     # input dimension (number of distinct token ids) of the Embedding layer
embedding_dims = 128    # size of each embedding vector

# --- Convolution + hidden layer ---
filters = 250           # number of Conv1D filters
kernel_size = 3         # width of each Conv1D filter, in tokens
hidden_dims = 250       # units in the dense hidden layer

# --- Training ---
batch_size = 32
epochs = 10             # intended number of training epochs

# NOTE(review): `maxlen` is not referenced by any cell visible in this notebook;
# sequences are padded to `max_length` instead.
maxlen = 80

In [48]:
# 1-D convolutional text classifier:
# embedding -> Conv1D -> global max pooling -> dense hidden layer -> softmax.

# One output unit per class known to the label encoder, instead of a literal.
n_classes = len(encoder.classes_)

model = Sequential()

# Embedding layer: maps each token id to an embedding_dims-dimensional vector.
# input_length is tied to max_length because that is the width the padded
# sequences are given by pad_sequences.
# NOTE(review): max_features is larger than the tokenizer's vocab_size; the
# extra embedding rows look unused -- confirm before shrinking.
model.add(Embedding(max_features,
                    embedding_dims,
                    input_length=max_length))

# Conv1D learns `filters` filters, each spanning kernel_size consecutive tokens.
model.add(Conv1D(filters,
                 kernel_size,
                 padding='valid',
                 activation='relu',
                 strides=1))
# Global max pooling keeps, per filter, its largest value over the sequence.
model.add(GlobalMaxPooling1D())

# Vanilla hidden layer; dropout is applied before the ReLU activation.
model.add(Dense(hidden_dims))
model.add(Dropout(0.2))
model.add(Activation('relu'))

# Output layer: one unit per class, turned into class probabilities by softmax.
model.add(Dense(n_classes))
model.add(Activation('softmax'))
model.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 200, 128)          640000    
_________________________________________________________________
conv1d_3 (Conv1D)            (None, 198, 250)          96250     
_________________________________________________________________
global_max_pooling1d_3 (Glob (None, 250)               0         
_________________________________________________________________
dense_5 (Dense)              (None, 250)               62750     
_________________________________________________________________
dropout_3 (Dropout)          (None, 250)               0         
_________________________________________________________________
activation_5 (Activation)    (None, 250)               0         
_________________________________________________________________
dense_6 (Dense)              (None, 7)                

In [49]:
# The labels are integer class ids (LabelEncoder output), hence the *sparse*
# categorical cross-entropy loss.
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

In [50]:
# Train the classifier. A tenth of the training arrays is held out for
# validation, and the epoch count comes from the `epochs` setting rather than
# a repeated literal.
model.fit(X_train_padded, y_train,
          batch_size=batch_size,
          epochs=epochs,
          validation_split=0.1)

Train on 1516 samples, validate on 169 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.callbacks.History at 0x13b2785ba58>

In [51]:
# Evaluate on the held-out test set; the result lists the loss followed by the accuracy metric.
model.evaluate(X_test_padded, y_test)



[0.24354841042553038, 0.950236976146698]

In [52]:
# Predicted class id for every test sample: the index of the largest softmax
# probability. `predict_classes` is deprecated and absent from newer Keras
# releases; argmax over `predict` gives the same class ids.
y_pred = np.argmax(model.predict(X_test_padded), axis=-1)

In [53]:
from sklearn import metrics
from sklearn.metrics import precision_score, recall_score, f1_score, classification_report, confusion_matrix

# Share of test samples whose predicted class id equals the true class id.
test_accuracy = metrics.accuracy_score(y_test, y_pred)
print("Accuracy:", test_accuracy)

Accuracy: 0.9502369668246445


In [54]:
# Per-class precision / recall / F1 on the test set; row labels are the integer class ids.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.95      0.94      0.94        63
           1       0.98      0.91      0.94        56
           2       0.96      1.00      0.98        50
           3       1.00      0.97      0.98        60
           4       0.90      0.97      0.93        66
           5       0.97      0.90      0.93        69
           6       0.90      0.98      0.94        58

    accuracy                           0.95       422
   macro avg       0.95      0.95      0.95       422
weighted avg       0.95      0.95      0.95       422



In [55]:
# Confusion matrix on the test set, built from the true ids (y_test) and the predicted ids (y_pred).
confusion_matrix(y_test, y_pred)

array([[59,  0,  0,  0,  3,  0,  1],
       [ 1, 51,  1,  0,  2,  0,  1],
       [ 0,  0, 50,  0,  0,  0,  0],
       [ 0,  1,  0, 58,  0,  1,  0],
       [ 1,  0,  0,  0, 64,  1,  0],
       [ 1,  0,  0,  0,  2, 62,  4],
       [ 0,  0,  1,  0,  0,  0, 57]], dtype=int64)

In [56]:
# Save the trained model to 'CNNPrescription.h5' (path relative to the working directory).
model.save('CNNPrescription.h5')

In [57]:
# Reload the model that was just saved; the following cells repeat the
# test-set report using this reloaded copy.
from keras.models import load_model
predictor = load_model('CNNPrescription.h5')

In [58]:
# Class ids predicted by the reloaded model for the test set. argmax over
# `predict` replaces `predict_classes`, which is deprecated and absent from
# newer Keras releases.
y_predict1 = np.argmax(predictor.predict(X_test_padded), axis=-1)

In [59]:
# Same per-class report as before, this time for the predictions of the reloaded model.
print(classification_report(y_test, y_predict1))

              precision    recall  f1-score   support

           0       0.95      0.94      0.94        63
           1       0.98      0.91      0.94        56
           2       0.96      1.00      0.98        50
           3       1.00      0.97      0.98        60
           4       0.90      0.97      0.93        66
           5       0.97      0.90      0.93        69
           6       0.90      0.98      0.94        58

    accuracy                           0.95       422
   macro avg       0.95      0.95      0.95       422
weighted avg       0.95      0.95      0.95       422



In [60]:
from io import StringIO

# Four hand-written prescriptions used as a smoke test for the saved model.
# Expected specialties, in row order:
#   1. gastroenterology    (class id 1)
#   2. hematologyoncology  (class id 3)
#   3. generalpractice     (class id 2)
#   4. nephrology          (class id 4)
# The texts are written as ';'-terminated CSV rows and parsed into a DataFrame
# so they can be preprocessed before prediction.
sample_csv_text = """prescription;
                    trilyte with flavor packets  gavilyten  lansoprazole  nexium  omeprazole  prevalite  asacol hd  pantoprazole sodium  ursodiol  spironolactone  azathioprine  dicyclomine hcl;
                    levofloxacin  letrozole  tamoxifen citrate  dexamethasone  exemestane  prochlorperazine maleate  warfarin sodium  anastrozole  hydrocodoneacetaminophen  potassium chloride  megestrol acetate  klorcon m  alprazolam  ondansetron hcl;
                    amoxicillin;
                    calcium acetate  bisoprololhydrochlorothiazide  allopurinol  potassium citrate  amlodipine besylate;
    
                    """
sample1 = StringIO(sample_csv_text)

predictDF = pd.read_csv(sample1, sep=";")

predictDF

Unnamed: 0,prescription,Unnamed: 1
0,trilyte with flavor packet...,
1,levofloxacin letrozole t...,
2,amoxicillin,
3,calcium acetate bisoprolo...,


In [61]:
import re

# Raw string literals keep `\[`, `\]` and `\|` as regex escapes; in a normal
# string literal they are invalid Python escape sequences and trigger a warning.
REPLACE_BY_SPACE_RE = re.compile(r'[/(){}\[\]\|@,;]')  # separator characters -> space
BAD_SYMBOLS_RE = re.compile(r'[^0-9a-z #+_]')           # anything outside this set is dropped
SPACE_RE = re.compile(r' ')


def clean_text(text):
    """
    Normalise a prescription string before tokenisation.

    Steps, in order: lowercase the text; replace each separator character
    (/ ( ) { } [ ] | @ , ;) with a space; delete every character that is not
    a digit, a lowercase ASCII letter, a space, '#', '+' or '_'.

    text: a string

    return: the cleaned string (runs of spaces are NOT collapsed)
    """
    text = text.lower()
    text = REPLACE_BY_SPACE_RE.sub(' ', text)
    text = BAD_SYMBOLS_RE.sub('', text)
    # NOTE(review): this replaces a single space with a single space, so it
    # leaves the text unchanged. Kept so the output stays as before; confirm
    # whether collapsing repeated spaces was intended.
    text = SPACE_RE.sub(' ', text)
    return text

# Run clean_text over every sample prescription, overwriting the column in place, then display the frame.
predictDF['prescription'] = predictDF['prescription'].apply(clean_text)
predictDF

Unnamed: 0,prescription,Unnamed: 1
0,trilyte with flavor packet...,
1,levofloxacin letrozole t...,
2,amoxicillin,
3,calcium acetate bisoprolo...,


In [62]:
# Encode the sample prescriptions with the tokenizer fitted on the training
# texts, and pad them with the same settings as the train/test sequences.
X_sample_seq = tokenizer.texts_to_sequences(predictDF.prescription.values)
X_sample_padded = pad_sequences(
    X_sample_seq,
    maxlen=max_length,
    truncating=trunc_type,
    padding=padding_type,
)

In [63]:
# Class id predicted for each sample prescription. argmax over `predict`
# replaces `predict_classes`, which is deprecated and absent from newer Keras
# releases.
y_predict = np.argmax(predictor.predict(X_sample_padded), axis=-1)

In [64]:
# Display the predicted class ids for the sample prescriptions.
y_predict

array([1, 3, 2, 4], dtype=int64)

In [None]:
# Result: all 4 sample prescriptions received the expected class id (4 right, 0 wrong).