In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import csv
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.layers import Embedding
from keras.layers import Conv1D, GlobalMaxPooling1D

Using TensorFlow backend.


In [2]:
vocab_size = 3000
embedding_dim = 128
max_length = 150
trunc_type = 'post'
padding_type = 'post'
oov_tok = '<OOV>'
training_portion = .8

In [3]:
df = pd.read_csv('cleaned.csv')

In [4]:
df = df.sample(frac=1)
X = df.keywords.values
y = df.label.values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 50)

In [5]:
from sklearn.preprocessing import LabelBinarizer, LabelEncoder
encoder = LabelEncoder()
encoder.fit(y)
y_train = encoder.transform(y_train)
y_test = encoder.transform(y_test)

In [6]:
encoder.inverse_transform([0,1,2,3,4])

array(['gastroenterology', 'neurology', 'orthopedic', 'radiology',
       'urology'], dtype=object)

In [7]:
tokenizer = Tokenizer(num_words = vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(X_train) #only train
word_index = tokenizer.word_index
dict(list(word_index.items()))

{'<OOV>': 1,
 'of': 2,
 'and': 3,
 'or': 4,
 'ct': 5,
 'to': 6,
 'sample': 7,
 'cervical': 8,
 'mri': 9,
 'ne': 10,
 'the': 11,
 'nerve': 12,
 'brain': 13,
 'spine': 14,
 'medical': 15,
 'transcription': 16,
 'reports': 17,
 'any': 18,
 'hernia': 19,
 'carpal': 20,
 'disc': 21,
 'lumbar': 22,
 'artery': 23,
 'anterior': 24,
 'fracture': 25,
 'disease': 26,
 'tunnel': 27,
 'carotid': 28,
 'bladder': 29,
 'ligament': 30,
 'joint': 31,
 'laparoscopic': 32,
 'abdomen': 33,
 'ultrasound': 34,
 'inguinal': 35,
 'colon': 36,
 'scan': 37,
 'pain': 38,
 'bilateral': 39,
 'knee': 40,
 'tendon': 41,
 'discectomy': 42,
 'for': 43,
 'type': 44,
 'is': 45,
 'syndrome': 46,
 'stenosis': 47,
 'removed': 48,
 'date': 49,
 'thesetranscribed': 50,
 'examples': 51,
 'are': 52,
 'provided': 53,
 'by': 54,
 'various': 55,
 'users': 56,
 'andare': 57,
 'reference': 58,
 'purpose': 59,
 'only': 60,
 'mthelpline': 61,
 'does': 62,
 'not': 63,
 'certify': 64,
 'accuracy': 65,
 'quality': 66,
 'reportsthese': 67

In [8]:
len(word_index)

3137

In [9]:
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_train_padded = pad_sequences(X_train_seq, maxlen=max_length, padding=padding_type, truncating=trunc_type)

In [10]:
X_test_seq = tokenizer.texts_to_sequences(X_test)
X_test_padded = pad_sequences(X_test_seq, maxlen=max_length, padding=padding_type, truncating=trunc_type)

In [11]:
max_features = 5000
maxlen = 150
batch_size = 32
embedding_dims = 128
filters = 250
kernel_size = 3
hidden_dims = 500
epochs = 10

In [12]:
model = Sequential()

# we start off with an efficient embedding layer which maps
# our vocab indices into embedding_dims dimensions
model.add(Embedding(max_features,
                    embedding_dims,
                    input_length=150))
#model.add(Dropout(0.2))

# we add a Convolution1D, which will learn filters
# word group filters of size filter_length:
model.add(Conv1D(filters,
                 kernel_size,
                 padding='valid',
                 activation='relu',
                 strides=1))

# we use max pooling:
model.add(GlobalMaxPooling1D())

# We add a vanilla hidden layer:
model.add(Dense(hidden_dims))
model.add(Dropout(0.5))
model.add(Activation('relu'))

# We project onto a single unit output layer, and squash it with a sigmoid:
model.add(Dense(5))
model.add(Activation('softmax'))
model.summary()

Instructions for updating:
Colocations handled automatically by placer.
Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 150, 128)          640000    
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 148, 250)          96250     
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 250)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 500)               125500    
_________________________________________________________________
dropout_1 (Dropout)          (None, 500)               0         
_________________________________________________________________
activation_1 (Activation)    (None, 500)               0         
________________________________________________

In [13]:
model.compile(loss='sparse_categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

In [14]:
model.fit(X_train_padded, y_train,
          batch_size=batch_size,
          epochs=10,
          validation_split=0.1)

Instructions for updating:
Use tf.cast instead.
Instructions for updating:
Deprecated in favor of operator or tf.math.divide.
Train on 735 samples, validate on 82 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.callbacks.History at 0x2584252cf28>

In [15]:
model.evaluate(X_test_padded, y_test)



[0.7631252809268673, 0.790243923664093]

In [16]:
y_pred = model.predict_classes(X_test_padded)

In [17]:
from sklearn import metrics
from sklearn.metrics import precision_score, recall_score, f1_score, classification_report, confusion_matrix
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

Accuracy: 0.7902439024390244


In [18]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.85      0.77      0.80        43
           1       1.00      1.00      1.00        29
           2       0.72      0.83      0.78        60
           3       0.57      0.51      0.54        45
           4       0.96      0.96      0.96        28

    accuracy                           0.79       205
   macro avg       0.82      0.82      0.82       205
weighted avg       0.79      0.79      0.79       205



In [19]:
confusion_matrix(y_test, y_pred)

array([[33,  0,  2,  8,  0],
       [ 0, 29,  0,  0,  0],
       [ 0,  0, 50,  9,  1],
       [ 6,  0, 16, 23,  0],
       [ 0,  0,  1,  0, 27]], dtype=int64)

In [20]:
model.save('CNNNotes.h5')

In [21]:
from keras.models import load_model

In [22]:
predictor = load_model('CNNNotes.h5')

In [23]:
y_predict1 = predictor.predict_classes(X_test_padded)

In [24]:
print(classification_report(y_test, y_predict1))

              precision    recall  f1-score   support

           0       0.85      0.77      0.80        43
           1       1.00      1.00      1.00        29
           2       0.72      0.83      0.78        60
           3       0.57      0.51      0.54        45
           4       0.96      0.96      0.96        28

    accuracy                           0.79       205
   macro avg       0.82      0.82      0.82       205
weighted avg       0.79      0.79      0.79       205



In [45]:
from io import StringIO 
sample1 = StringIO("""keywords;
                    pyeloplasty, ureteral stent placement, nephrolithotomy, ureteropelvic junction obstruction, jackson-pratt drain, foley catheter, renal pelvis, kidney stones, monocryl sutures, pelvis, renal, ureteropelvic, sutures;
                    sebaceous cyst, prolene suture, incisional hernia, incisional, abscess, hernia, abdomen, omentum, excision, cyst;
                    origin of stalk, extensor retinaculum, wrist ganglion, incision, excision, dorsal, tourniquet, wrist, ganglion;
                    arteriovenous malformation, avm, brain ct, cerebral angiogram, headache, audiogram, carotid bruits, difficulty ambulating, hemorrhage, interventricular hemorrhage, migraine, tinnitus, vertigo, visual change, weakness, episode of vertigo, evaluation
                    """)

    #keywords labeled as 
    #1. urology(labeled as 4) 
    #2. gasternology(labeled as 0) 
    #3. orthopedic(labeled as 2)
    #4. neurology (labeled as 1)
    #first we need to preprocess the example. Store the input texts to a dataframe and preprocess

predictDF = pd.read_csv(sample1, sep =";")
predictDF

Unnamed: 0,keywords,Unnamed: 1
0,"pyeloplasty, ureteral sten...",
1,"sebaceous cyst, prolene su...",
2,"origin of stalk, extensor ...",
3,arteriovenous malformation...,


In [46]:
import re
REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;]')
BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')
SPACE_RE = re.compile(' ')
def clean_text(text):
    """
        text: a string
        
        return: modified initial string
    """
    #text = BeautifulSoup(text, "lxml").text # HTML decoding
    text = text.lower() # lowercase text
    text = REPLACE_BY_SPACE_RE.sub(' ', text) # replace REPLACE_BY_SPACE_RE symbols by space in text
    text = BAD_SYMBOLS_RE.sub('', text) # delete symbols which are in BAD_SYMBOLS_RE from text
    text = SPACE_RE.sub(' ', text)
    #text = ' '.join(word for word in text.split() if word not in STOPWORDS) # delete stopwors from text
    return text

predictDF['keywords'] = predictDF['keywords'].apply(clean_text)
predictDF

Unnamed: 0,keywords,Unnamed: 1
0,pyeloplasty ureteral sten...,
1,sebaceous cyst prolene su...,
2,origin of stalk extensor ...,
3,arteriovenous malformation...,


In [47]:
X_sample_seq = tokenizer.texts_to_sequences(predictDF.keywords.values)
X_sample_padded = pad_sequences(X_sample_seq, maxlen=max_length, padding=padding_type, truncating=trunc_type)

In [48]:
y_predict = predictor.predict_classes(X_sample_padded)

In [49]:
y_predict

array([4, 0, 2, 3], dtype=int64)

In [50]:
#3 right 1 wrong(neurology)