In [None]:
cd knowledge-aware-med-classification

/content/knowledge-aware-med-classification


In [None]:
import numpy as np
import pandas as pd
import pickle
from collections import defaultdict
import re
from bs4 import BeautifulSoup
import sys
import os
os.environ['KERAS_BACKEND']='theano'
from keras.preprocessing.text import Tokenizer,text_to_word_sequence
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
from keras.layers import Embedding
from keras.layers import Dense, Input, Flatten
from keras.layers import Conv1D, MaxPooling1D, Embedding, Dropout, LSTM, GRU, Bidirectional, TimeDistributed
from keras.models import Model
from keras.callbacks import ModelCheckpoint
import matplotlib.pyplot as plt
plt.switch_backend('agg')
from keras import backend as K
from keras.engine.topology import Layer, InputSpec
from keras import initializers
%matplotlib inline

In [None]:
# Downloading Glove Embeddings
!curl -O -J -L http://nlp.stanford.edu/data/glove.6B.zip
!unzip glove.6B.zip

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0  0   308    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
  0   345    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
100  822M  100  822M    0     0  5265k      0  0:02:39  0:02:39 --:--:-- 5354k
Archive:  glove.6B.zip
  inflating: glove.6B.50d.txt        
  inflating: glove.6B.100d.txt       
  inflating: glove.6B.200d.txt       
  inflating: glove.6B.300d.txt       


In [None]:
# Load the training split: a tab-separated file read into a DataFrame.
df_train = pd.read_csv('data/ichi_dataset/final_train_result.tsv',sep='\t')

In [None]:
# Drop the 'Title' column. Rebinding the result with errors='ignore' (rather
# than dropping in place) keeps this cell re-runnable: a second execution is a
# no-op instead of a KeyError because the column is already gone.
df_train = df_train.drop(columns=['Title'], errors='ignore')
df_train

Unnamed: 0,Category,Question,Concepts
0,SOCL,"I have a lump on my wrist, right below the rig...",right thumb|right|wrist|pain
1,PREG,I am 12w1d pg with twins and for about the pas...,upper abdomen|ribs|out|spasms|uterus|right|bel...
2,GOAL,Please I need help real quick I have done an m...,eye muscles|sever|scan|weak|eyes|pain in eye|p...
3,SOCL,http://www.msnbc.msn.com/id/40820892/ns/techno...,"rights|trial|faces|""miscarriage"
4,TRMT,"Hey Everyone, :)I'm too busy to wait around fo...",dye|DYE|liquid diet|unpacking|all
...,...,...,...
7995,GOAL,Hi :)Just this morning I woke up with blurred ...,symptoms|internal bleeding|head|sickness|eyes|...
7996,FAML,"We gave our 7 years old a journal, as one of h...",old|back
7997,PREG,"My 3.5 yr son does not listen at home, he is a...",old|hand|out|sense
7998,DISE,I think the amount billed to my insurance is r...,


In [None]:
# Load the held-out split (tab-separated); later cells use it as the validation data.
df_test = pd.read_csv('data/ichi_dataset/final_test_result.tsv',sep='\t')

In [None]:
# Drop the 'Title' column. Rebinding the result with errors='ignore' (rather
# than dropping in place) keeps this cell re-runnable: a second execution is a
# no-op instead of a KeyError because the column is already gone.
df_test = df_test.drop(columns=['Title'], errors='ignore')
df_test

Unnamed: 0,Category,Question,Concepts
0,DISE,Hi All! I am new here but have been lurking fo...,
1,SOCL,My girlfriend and i just got through having se...,swelling
2,GOAL,Dr. i have dirty yellow buning eyes since my t...,eyes|condition|buning eyes|age|HAND
3,SOCL,"Hi, a few nights ago I went to a gay sexclub a...",
4,FAML,my 4 year old is a nightmare. me and my husban...,screaming|all|demanding|fits|nightmare|age|old
...,...,...,...
2995,TRMT,I am definitley having my lap band removed in ...,weight loss|all|weight|acid reflux|out|said
2996,PREG,I am in the TWW again. I was on Femara this mo...,follicles
2997,DISE,I've been advised to try visual routine charts...,
2998,PREG,Just looking for some cycle buddies! I star...,


In [None]:
# Text Preprocessing
def clean_text(text):
    """Normalise a raw question string for tokenisation.

    The text is lower-cased, a fixed set of English contractions is expanded,
    every non-word character is replaced by a space, runs of whitespace are
    collapsed to a single space, and surrounding spaces are stripped.

    Args:
        text: The input string.

    Returns:
        The cleaned, lower-case string (possibly empty).
    """
    text = text.lower()
    # The substitutions below are order-sensitive: a more specific pattern must
    # run before any shorter pattern that would consume part of it.
    text = re.sub(r"what's", "what is ", text)
    # "'scuse" has to be handled before the generic "'s" rule; otherwise the
    # "'s" rule strips its prefix first and this rule can never match.
    text = re.sub(r"\'scuse", " excuse ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "can not ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    # Raw strings: '\W' and '\s' are not valid escapes in a normal string
    # literal and trigger invalid-escape warnings on current Python versions.
    text = re.sub(r'\W', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    text = text.strip(' ')
    return text

In [None]:
# Label categories in the dataset.
# NOTE(review): this list is not referenced by the other cells visible here;
# the label-to-id mapping is derived from the training data instead.
categories = ['DEMO','DISE','FAML','GOAL','PREG','SOCL','TRMT']

In [None]:
from nltk import tokenize

In [None]:
# Length every token sequence is padded/truncated to (the `maxlen` passed to
# pad_sequences) and the input length of the model.
MAX_SEQUENCE_LENGTH = 1000
# Passed to the Keras Tokenizer as `num_words`.
MAX_NB_WORDS = 20000
# Width of each word vector; the notebook loads the 100-dimensional GloVe file.
EMBEDDING_DIM = 100
# NOTE(review): not referenced by the cells visible here -- validation uses the
# separate test TSV rather than a split of the training data.
VALIDATION_SPLIT = 0.2

In [None]:
# Assigning an integer id to each category: the distinct training labels are
# sorted and numbered from 0 in that order.
macronum = sorted(set(df_train['Category']))
macro_to_id = {category: idx for idx, category in enumerate(macronum)}

In [None]:
macro_to_id

{'DEMO': 0, 'DISE': 1, 'FAML': 2, 'GOAL': 3, 'PREG': 4, 'SOCL': 5, 'TRMT': 6}

In [None]:
# Function to return the id of a category
def fun(i):
    """Return the integer id that `macro_to_id` assigns to category `i`.

    A value that is already one of the ids in `macro_to_id` is returned
    unchanged, so applying the encoding to an already-encoded column (for
    example when this cell is re-run) is a no-op instead of a KeyError.

    Raises:
        KeyError: if `i` is neither a known category nor a known id.
    """
    if i in macro_to_id:
        return macro_to_id[i]
    if i in macro_to_id.values():
        # Already encoded by a previous run of the cell.
        return i
    raise KeyError(i)

# Replace the category labels in the training frame with their integer ids.
df_train['Category']=df_train['Category'].apply(fun)

In [None]:
# Empty containers for the training labels and the cleaned training texts.
labels, texts = [], []

In [None]:
# Fetch the NLTK 'punkt' tokenizer data.
# NOTE(review): none of the cells visible here call an NLTK tokenizer -- confirm
# whether this download is still needed.
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
# Creating the text array: strip any HTML markup from each question and
# normalise it with clean_text.
# get_text() already returns a str. The previous str(...encode()) round trip
# stored the *repr* of a bytes object, so every document began with a stray
# "b" token and non-ASCII characters and newlines were turned into escape-code
# fragments. The parser is named explicitly so the result does not depend on
# which optional parser libraries happen to be installed.
# The lists are rebuilt (not appended to) so re-running the cell does not
# duplicate rows.
texts = [
    clean_text(BeautifulSoup(question, 'html.parser').get_text())
    for question in df_train['Question']
]

# Creating the labels array (integer category ids, in the same row order as `texts`).
labels = df_train['Category'].tolist()

In [None]:
# Fit the tokenizer on the training texts only, then reuse it everywhere else.
tokenizer = Tokenizer(num_words=MAX_NB_WORDS, oov_token="<UKN>")
tokenizer.fit_on_texts(texts)

# Full vocabulary learned from the training texts (word -> integer index).
word_index = tokenizer.word_index
# Integer-encoded training texts.
sequences = tokenizer.texts_to_sequences(texts)

print('Number of Unique Tokens', len(word_index))

Number of Unique Tokens 37221


In [None]:
# Bring every training sequence to exactly MAX_SEQUENCE_LENGTH entries.
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

# One-hot encode the integer category ids.
labels = to_categorical(np.asarray(labels))
print('Shape of Data Tensor:', data.shape)
print('Shape of Label Tensor:', labels.shape)

# Shuffle samples and labels with one shared random row order.
indices = np.arange(len(data))
np.random.shuffle(indices)
data, labels = data[indices], labels[indices]

Shape of Data Tensor: (8000, 1000)
Shape of Label Tensor: (8000, 7)


In [None]:
# Encode the test-set categories with the mapping built from the training set;
# `fun` raises KeyError for a category the training data never contained.
df_test['Category']=df_test['Category'].apply(fun)

In [None]:
# Empty containers for the validation labels and the cleaned validation texts.
labels_val, texts_val = [], []

In [None]:
# Creating the validation text array: strip any HTML markup from each question
# and normalise it with clean_text.
# get_text() already returns a str. The previous str(...encode()) round trip
# stored the *repr* of a bytes object, so every document began with a stray
# "b" token and non-ASCII characters and newlines were turned into escape-code
# fragments. The parser is named explicitly so the result does not depend on
# which optional parser libraries happen to be installed.
# The lists are rebuilt (not appended to) so re-running the cell does not
# duplicate rows.
texts_val = [
    clean_text(BeautifulSoup(question, 'html.parser').get_text())
    for question in df_test['Question']
]

# Creating the validation labels array (integer category ids, same row order as `texts_val`).
labels_val = df_test['Category'].tolist()

In [None]:
# Integer-encode the validation texts with the tokenizer that was fitted on the
# training texts.
sequences_val = tokenizer.texts_to_sequences(texts_val)

In [None]:
# Padding the validation texts to the maximum sequence length.
data_val = pad_sequences(sequences_val, maxlen=MAX_SEQUENCE_LENGTH)

# One-hot encode with the width fixed to the training label space. Without
# num_classes the width is inferred from the largest id present in this split,
# which would no longer match the model's output layer if the highest-numbered
# category happened to be absent from the validation data.
labels_val = to_categorical(np.asarray(labels_val), num_classes=len(macro_to_id))
print('Shape of Data Tensor:', data_val.shape)
print('Shape of Label Tensor:', labels_val.shape)

# Shuffle samples and labels with one shared random row order.
indices = np.arange(data_val.shape[0])
np.random.shuffle(indices)
data_val = data_val[indices]
labels_val = labels_val[indices]

Shape of Data Tensor: (3000, 1000)
Shape of Label Tensor: (3000, 7)


In [None]:
# Names used by the training call: features/targets for fitting and for validation.
x_train, y_train = data, labels
x_val, y_val = data_val, labels_val

In [None]:
# Creating the embedding dictionary: word -> float32 vector, one entry per line
# of the GloVe text file (first token is the word, the rest are coefficients).
embeddings_index = {}
# Derive the file name from EMBEDDING_DIM so the loaded vectors always have the
# width the embedding matrix is built with.
glove_path = 'glove.6B.%dd.txt' % EMBEDDING_DIM
# The context manager closes the file even if a line fails to parse.
with open(glove_path, encoding='utf8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Total %s word vectors.' % len(embeddings_index))

Total 400000 word vectors.


In [None]:
# Initial embedding weights, one row per tokenizer index plus one extra row.
# Every row starts as uniform random values from np.random.random; rows whose
# word has a GloVe vector are then overwritten with that vector, so words that
# GloVe does not know keep their random initialisation (they are NOT zeroed).
# NOTE(review): the extra row is presumably index 0 used for padding -- confirm.
vocab_size = len(word_index) + 1
embedding_matrix = np.random.random((vocab_size, EMBEDDING_DIM))
for token, row in word_index.items():
    glove_vector = embeddings_index.get(token)
    if glove_vector is None:
        continue
    embedding_matrix[row] = glove_vector

# Defining the embedding layer: starts from the matrix above and is fine-tuned
# during training (trainable=True).
embedding_layer = Embedding(
    vocab_size,
    EMBEDDING_DIM,
    weights=[embedding_matrix],
    input_length=MAX_SEQUENCE_LENGTH,
    trainable=True,
)

In [None]:
# Defining the Model: three Conv1D + max-pooling stages over the embedded
# tokens, then a dense layer and a softmax with one unit per category.
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)
l_cov1= Conv1D(128, 5, activation='relu')(embedded_sequences)
l_pool1 = MaxPooling1D(5)(l_cov1)
l_cov2 = Conv1D(128, 5, activation='relu')(l_pool1)
l_pool2 = MaxPooling1D(5)(l_cov2)
l_cov3 = Conv1D(128, 5, activation='relu')(l_pool2)
# Global max pooling: pool over every remaining time step. The window is read
# from the tensor's static shape (35 when MAX_SEQUENCE_LENGTH is 1000) instead
# of being hard-coded, so the layer stays global if the sequence length changes.
l_pool3 = MaxPooling1D(K.int_shape(l_cov3)[1])(l_cov3)
l_flat = Flatten()(l_pool3)
l_dense = Dense(128, activation='relu')(l_flat)
preds = Dense(len(macronum), activation='softmax')(l_dense)

model = Model(sequence_input, preds)
model.compile(loss='categorical_crossentropy',
              optimizer='rmsprop',
              metrics=['acc'])

print("Simplified convolutional neural network")
model.summary()
# Checkpoint callback: writes model_cnn.hdf5 only when val_acc improves.
cp=ModelCheckpoint('model_cnn.hdf5',monitor='val_acc',verbose=1,save_best_only=True)

Simplified convolutional neural network
Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 1000)]            0         
_________________________________________________________________
embedding (Embedding)        (None, 1000, 100)         3722200   
_________________________________________________________________
conv1d (Conv1D)              (None, 996, 128)          64128     
_________________________________________________________________
max_pooling1d (MaxPooling1D) (None, 199, 128)          0         
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 195, 128)          82048     
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 39, 128)           0         
_________________________________________________________________
conv1d_2 (Conv1D)    

In [None]:
# Training the Model
# Runs 15 epochs with a batch size of 2, evaluating on (x_val, y_val) after each
# epoch; the `cp` callback is passed in so checkpoints are written during training.
# NOTE(review): batch_size=2 is unusually small -- confirm it is intentional.
history=model.fit(x_train, y_train, validation_data=(x_val, y_val),epochs=15, batch_size=2,callbacks=[cp])

Epoch 1/15

Epoch 00001: val_acc improved from -inf to 0.48633, saving model to model_cnn.hdf5
Epoch 2/15

Epoch 00002: val_acc improved from 0.48633 to 0.52067, saving model to model_cnn.hdf5
Epoch 3/15

Epoch 00003: val_acc did not improve from 0.52067
Epoch 4/15

Epoch 00004: val_acc did not improve from 0.52067
Epoch 5/15

Epoch 00005: val_acc improved from 0.52067 to 0.54300, saving model to model_cnn.hdf5
Epoch 6/15

Epoch 00006: val_acc did not improve from 0.54300
Epoch 7/15

Epoch 00007: val_acc did not improve from 0.54300
Epoch 8/15

Epoch 00008: val_acc did not improve from 0.54300
Epoch 9/15

Epoch 00009: val_acc did not improve from 0.54300
Epoch 10/15

Epoch 00010: val_acc did not improve from 0.54300
Epoch 11/15

Epoch 00011: val_acc improved from 0.54300 to 0.54533, saving model to model_cnn.hdf5
Epoch 12/15

Epoch 00012: val_acc did not improve from 0.54533
Epoch 13/15

Epoch 00013: val_acc improved from 0.54533 to 0.55100, saving model to model_cnn.hdf5
Epoch 14/15

