In [17]:
import os

#%matplotlib inline
#import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf

from sklearn.preprocessing import LabelBinarizer, LabelEncoder
from sklearn.metrics import confusion_matrix

from tensorflow import keras
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout
from keras.preprocessing import text, sequence
from keras import utils

In [18]:
df = pd.read_csv('cleaned.csv')
df.head()

Unnamed: 0,keywords,label
0,sterilization vas fertile male bilateral ...,urology
1,prostate cancer technetium whole body uri...,urology
2,vasectomy allis clamp catgut hemoclips i...,urology
3,hemiscrotum bilateral vasectomy voluntary ...,urology
4,scrotal incision right vas bleeding anest...,urology


In [19]:
X = df.keywords.values
Y = df.label.values

In [20]:
from sklearn.model_selection import train_test_split
X_train, x_test, Y_train, y_test = train_test_split(X, Y, test_size = 0.2, random_state = 50)

In [21]:
max_words = 3500
tokenize = text.Tokenizer(num_words=max_words, char_level=False)
tokenize.fit_on_texts(X_train)

In [22]:
X_train = tokenize.texts_to_matrix(X_train)
x_test = tokenize.texts_to_matrix(x_test)

In [23]:
encoder = LabelEncoder()
encoder.fit(Y)

LabelEncoder()

In [24]:
Y = encoder.transform(Y)

In [25]:
Y_train = encoder.transform(Y_train)
y_test = encoder.transform(y_test)

In [26]:
encoder.inverse_transform([0,1,2,3,4])

array(['gastroenterology', 'neurology', 'orthopedic', 'radiology',
       'urology'], dtype=object)

In [27]:
batch_size = 32
epochs = 10

In [28]:
model = Sequential()
model.add(Dense(512, input_shape=(max_words,)))
model.add(Activation('relu'))
#model.add(Dropout(0.5))
model.add(Dense(5))
model.add(Activation('softmax'))

model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_3 (Dense)              (None, 512)               1792512   
_________________________________________________________________
activation_3 (Activation)    (None, 512)               0         
_________________________________________________________________
dense_4 (Dense)              (None, 5)                 2565      
_________________________________________________________________
activation_4 (Activation)    (None, 5)                 0         
Total params: 1,795,077
Trainable params: 1,795,077
Non-trainable params: 0
_________________________________________________________________


In [29]:
import keras
#import keras_metrics
model.compile(loss='sparse_categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
              
history = model.fit(X_train, Y_train,
                    batch_size=batch_size,
                    epochs=10,
                    verbose=1,
                    validation_split=0.1,
                   shuffle = True)

Train on 735 samples, validate on 82 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [30]:
_, test_acc = model.evaluate(x_test, y_test)
test_acc



0.7658536434173584

In [31]:
y_pred = model.predict_classes(x_test)

In [17]:
y_test

array([2, 1, 4, 2, 0, 2, 0, 4, 0, 4, 2, 2, 3, 3, 2, 0, 2, 0, 3, 2, 3, 0,
       2, 1, 2, 2, 2, 3, 3, 2, 0, 3, 3, 3, 2, 2, 2, 1, 1, 2, 3, 0, 0, 0,
       0, 3, 3, 2, 3, 1, 2, 3, 4, 3, 1, 3, 1, 2, 2, 2, 4, 0, 1, 3, 3, 3,
       1, 2, 2, 2, 2, 4, 0, 2, 1, 0, 2, 0, 3, 2, 4, 3, 4, 0, 0, 1, 2, 4,
       2, 3, 2, 1, 2, 1, 4, 3, 2, 2, 3, 3, 4, 1, 0, 3, 2, 2, 2, 1, 0, 1,
       2, 2, 0, 3, 2, 3, 0, 2, 4, 0, 2, 3, 3, 2, 1, 1, 2, 2, 1, 0, 0, 2,
       0, 1, 3, 0, 1, 0, 1, 2, 2, 4, 4, 4, 0, 3, 4, 4, 2, 4, 3, 2, 3, 0,
       2, 4, 4, 2, 2, 1, 0, 1, 2, 0, 4, 1, 0, 2, 3, 3, 0, 4, 1, 0, 0, 2,
       4, 3, 2, 2, 1, 3, 0, 4, 3, 0, 3, 3, 2, 4, 1, 0, 2, 3, 4, 0, 4, 3,
       2, 3, 3, 0, 1, 2, 2])

In [32]:
from sklearn import metrics
from sklearn.metrics import precision_score, recall_score, f1_score, classification_report, confusion_matrix
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

Accuracy: 0.7658536585365854


In [33]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.85      0.88      0.86        40
           1       0.69      0.83      0.75        29
           2       0.82      0.80      0.81        64
           3       0.54      0.49      0.51        45
           4       0.96      0.93      0.94        27

    accuracy                           0.77       205
   macro avg       0.77      0.78      0.78       205
weighted avg       0.76      0.77      0.76       205



In [34]:
confusion_matrix(y_test, y_pred)

array([[35,  0,  0,  4,  1],
       [ 0, 24,  2,  3,  0],
       [ 0,  2, 51, 11,  0],
       [ 6,  9,  8, 22,  0],
       [ 0,  0,  1,  1, 25]], dtype=int64)

In [37]:
model.save('MLPNotes.h5')

In [39]:
from keras.models import load_model
predictor = load_model('MLPNotes.h5')

In [49]:
from io import StringIO 
sample1 = StringIO("""keywords;
                    pyeloplasty, ureteral stent placement, nephrolithotomy, ureteropelvic junction obstruction, jackson-pratt drain, foley catheter, renal pelvis, kidney stones, monocryl sutures, pelvis, renal, ureteropelvic, sutures;
                    fascial defect, umbilical hernia repair, curvilinear umbilical, hernia sac, metzenbaum scissors, umbilical hernia, bovie electrocautery, electrocautery, hernia, incision, umbilical;
                    origin of stalk, extensor retinaculum, wrist ganglion, incision, excision, dorsal, tourniquet, wrist, ganglion;
                    referential electrodes, scalp, hyperventilation, photic stimulation, electroencephalogram;
                    """)

    #keywords labeled as 
    #1. urology(labeled as 4) 
    #2. gasternology(labeled as 0) 
    #3. orthopedic(labeled as 2)
    #4. neurology (labeled as 1)
    #first we need to preprocess the example. Store the input texts to a dataframe and preprocess

predictDF = pd.read_csv(sample1, sep =";")
predictDF

Unnamed: 0,keywords,Unnamed: 1
0,"pyeloplasty, ureteral sten...",
1,"fascial defect, umbilical ...",
2,"origin of stalk, extensor ...",
3,"referential electrodes, sc...",


In [50]:
import re
REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;]')
BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')
SPACE_RE = re.compile(' ')
def clean_text(text):
    """
        text: a string
        
        return: modified initial string
    """
    #text = BeautifulSoup(text, "lxml").text # HTML decoding
    text = text.lower() # lowercase text
    text = REPLACE_BY_SPACE_RE.sub(' ', text) # replace REPLACE_BY_SPACE_RE symbols by space in text
    text = BAD_SYMBOLS_RE.sub('', text) # delete symbols which are in BAD_SYMBOLS_RE from text
    text = SPACE_RE.sub(' ', text)
    #text = ' '.join(word for word in text.split() if word not in STOPWORDS) # delete stopwors from text
    return text

predictDF['keywords'] = predictDF['keywords'].apply(clean_text)
predictDF

Unnamed: 0,keywords,Unnamed: 1
0,pyeloplasty ureteral sten...,
1,fascial defect umbilical ...,
2,origin of stalk extensor ...,
3,referential electrodes sc...,


In [51]:
X_sample1 = tokenize.texts_to_matrix(predictDF.keywords)
X_sample1

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [52]:
y_predict = predictor.predict_classes(X_sample1)
y_predict

array([4, 4, 2, 1], dtype=int64)

In [None]:
#3 right 1 wrong(gastreonology)

In [43]:
y_predict1 = predictor.predict_classes(x_test)

In [44]:
print(classification_report(y_test, y_predict1))

              precision    recall  f1-score   support

           0       0.85      0.88      0.86        40
           1       0.69      0.83      0.75        29
           2       0.82      0.80      0.81        64
           3       0.54      0.49      0.51        45
           4       0.96      0.93      0.94        27

    accuracy                           0.77       205
   macro avg       0.77      0.78      0.78       205
weighted avg       0.76      0.77      0.76       205



In [None]:
import matplotlib.pyplot as plt
 
# Create bars
barWidth = 2
bars1 = [0.86, 1, 0.67, 0.65, 0.80]
bars2 = [0.76, 0.96, 0.81, 0.57 ,0.80 ]
bars3 = [0.81 , 0.98, 0.73, 0.61, 0.80]
bars4 = bars1 + bars2 + bars3
 
# The X position of bars
r1 = [1,8,15, 22, 29]
r2 = [3,10,17, 24, 31]
r3 = [5,12,19,26,33]
r4 = r1 + r2 + r3
 
plt.figure(figsize=(9, 5))
# Create barplot
plt.bar(r1, bars1, width = barWidth, color = (0.3,0.1,0.4,0.6), label='Precision')
plt.bar(r2, bars2, width = barWidth, color = (0.3,0.5,0.4,0.6), label='Recall')
plt.bar(r3, bars3, width = barWidth, color = (0.3,0.9,0.4,0.6), label='F1-score')
# Note: the barplot could be created easily. See the barplot section for other examples.
 
# Create legend
plt.legend()
plt.xlabel('Medical specialty', fontsize=10)
plt.ylabel('precision/recall/f1', fontsize=10)
 
# Text below each barplot with a rotation at 90°
#plt.xticks([r + barWidth for r in range(len(r4)/3)], ['DD', 'with himself', 'with DC', 'with Silur', 'DC'], rotation=90)
 
# Create labels
labels = bars4
 
# Text on the top of each barplot
for i in range(len(r4)):
    plt.text(x = r4[i]-0.5 , y = bars4[i]+0.02, s = labels[i], size = 10, rotation = 90)
 
# Adjust the margins
plt.subplots_adjust(bottom=0.25, top = 1.4)
#X = np.arange(15)
plt.xticks([2.75, 9.75, 16.75, 23.75, 30.75], label, fontsize=10, rotation=45)
# Show graphic
plt.show()