In [None]:
cd knowledge-aware-med-classification

/content/knowledge-aware-med-classification


In [None]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
import numpy as np
import pandas as pd
import pickle
from collections import defaultdict
import re
from bs4 import BeautifulSoup
import sys
import os
os.environ['KERAS_BACKEND']='theano'
from keras.preprocessing.text import Tokenizer,text_to_word_sequence
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
from keras.layers import Embedding
from keras.layers import Dense, Input, Flatten
from keras.layers import Conv1D, MaxPooling1D, Embedding, Dropout, LSTM, GRU, Bidirectional, TimeDistributed
from keras.models import Model
from keras.callbacks import ModelCheckpoint
import matplotlib.pyplot as plt
plt.switch_backend('agg')
from keras import backend as K
from keras.engine.topology import Layer, InputSpec
from keras import initializers
%matplotlib inline

In [None]:
# Downloading Glove Embeddings
!curl -O -J -L http://nlp.stanford.edu/data/glove.6B.zip
!unzip glove.6B.zip

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0   308    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
  0   345    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
100  822M  100  822M    0     0  5222k      0  0:02:41  0:02:41 --:--:-- 5372k
Archive:  glove.6B.zip
  inflating: glove.6B.50d.txt        
  inflating: glove.6B.100d.txt       
  inflating: glove.6B.200d.txt       
  inflating: glove.6B.300d.txt       


In [None]:
df_train = pd.read_csv('data/ichi_dataset/final_train_result.tsv',sep='\t')

In [None]:
df_train.drop(['Title','Concepts'],axis=1,inplace=True)
df_train

Unnamed: 0,Category,Question
0,SOCL,"I have a lump on my wrist, right below the rig..."
1,PREG,I am 12w1d pg with twins and for about the pas...
2,GOAL,Please I need help real quick I have done an m...
3,SOCL,http://www.msnbc.msn.com/id/40820892/ns/techno...
4,TRMT,"Hey Everyone, :)I'm too busy to wait around fo..."
...,...,...
7995,GOAL,Hi :)Just this morning I woke up with blurred ...
7996,FAML,"We gave our 7 years old a journal, as one of h..."
7997,PREG,"My 3.5 yr son does not listen at home, he is a..."
7998,DISE,I think the amount billed to my insurance is r...


In [None]:
df_test = pd.read_csv('data/ichi_dataset/final_test_result.tsv',sep='\t')
df_test

Unnamed: 0,Category,Title,Question,Concepts
0,DISE,Interrupt TX ???,Hi All! I am new here but have been lurking fo...,
1,SOCL,Swollen vagina clit,My girlfriend and i just got through having se...,swelling
2,GOAL,burning yellow eyes,Dr. i have dirty yellow buning eyes since my t...,eyes|condition|buning eyes|age|HAND
3,SOCL,Drug test and ws,"Hi, a few nights ago I went to a gay sexclub a...",
4,FAML,4 year old is out of control,my 4 year old is a nightmare. me and my husban...,screaming|all|demanding|fits|nightmare|age|old
...,...,...,...,...
2995,TRMT,lap band removal is scheduled,I am definitley having my lap band removed in ...,weight loss|all|weight|acid reflux|out|said
2996,PREG,I had my IUI today,I am in the TWW again. I was on Femara this mo...,follicles
2997,DISE,daily routine charts?,I've been advised to try visual routine charts...,
2998,PREG,anyone starting shots for an IUI?,Just looking for some cycle buddies! I star...,


In [None]:
df_test.drop(['Title','Concepts'],axis=1,inplace=True)
df_test

Unnamed: 0,Category,Question
0,DISE,Hi All! I am new here but have been lurking fo...
1,SOCL,My girlfriend and i just got through having se...
2,GOAL,Dr. i have dirty yellow buning eyes since my t...
3,SOCL,"Hi, a few nights ago I went to a gay sexclub a..."
4,FAML,my 4 year old is a nightmare. me and my husban...
...,...,...
2995,TRMT,I am definitley having my lap band removed in ...
2996,PREG,I am in the TWW again. I was on Femara this mo...
2997,DISE,I've been advised to try visual routine charts...
2998,PREG,Just looking for some cycle buddies! I star...


In [None]:
# Text Preprocessing
def clean_text(text):
    text = text.lower()
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "can not ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r"\'scuse", " excuse ", text)
    text = re.sub('\W', ' ', text)
    text = re.sub('\s+', ' ', text)
    text = text.strip(' ')
    return text

In [None]:
# Label Categories in the dataset
categories = ['DEMO','DISE','FAML','GOAL','PREG','SOCL','TRMT']

In [None]:
from nltk import tokenize

In [None]:
MAX_SENT_LENGTH = 100
MAX_SENTS = 15
MAX_NB_WORDS = 20000
EMBEDDING_DIM = 100
VALIDATION_SPLIT = 0.2

In [None]:
# Assiging integer ID to each category
macronum=sorted(set(df_train['Category']))
macro_to_id = dict((note, number) for number, note in enumerate(macronum))

In [None]:
macro_to_id

{'DEMO': 0, 'DISE': 1, 'FAML': 2, 'GOAL': 3, 'PREG': 4, 'SOCL': 5, 'TRMT': 6}

In [None]:
#Function to return id of a category
def fun(i):
    return macro_to_id[i]

df_train['Category']=df_train['Category'].apply(fun)

In [None]:
sentence = [] #sent_tokenized train text
labels = [] #train labels
texts = [] #train texts

In [None]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
# Creating text and sent_tokenized text array
for i in range(df_train.Question.shape[0]):
  text = BeautifulSoup(df_train.Question[i])
  text=clean_text(str(text.get_text().encode()).lower())
  texts.append(text)
  sentences = tokenize.sent_tokenize(text)
  sentence.append(sentences)

# Creating Label array
for i in df_train['Category']:
    labels.append(i)

In [None]:
tokenizer = Tokenizer(num_words=MAX_NB_WORDS, oov_token="<UKN>")
tokenizer.fit_on_texts(texts)
# Creating an array data containing tokenized values of words in the format (text,sentence,word)
data = np.zeros((len(texts), MAX_SENTS, MAX_SENT_LENGTH), dtype='int32')

for i, sentences in enumerate(sentence):
    for j, sent in enumerate(sentences):
        if j< MAX_SENTS:
            wordTokens = text_to_word_sequence(sent)
            k=0
            wordSeq = tokenizer.texts_to_sequences(wordTokens)
            for _, word in enumerate(wordSeq):
                if k<MAX_SENT_LENGTH and word[0]<MAX_NB_WORDS:
                    data[i,j,k] = word[0]
                    k=k+1

In [None]:
word_index = tokenizer.word_index
print('No. of %s unique tokens.' % len(word_index))

No. of 37221 unique tokens.


In [None]:
labels = to_categorical(np.asarray(labels))
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)


indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = labels[indices]

Shape of data tensor: (8000, 15, 100)
Shape of label tensor: (8000, 7)


In [None]:
df_test['Category']=df_test['Category'].apply(fun)

In [None]:
sentence_val = [] #sent_tokenized test text
labels_val = [] #test labels
texts_val = [] #test texts

In [None]:
# Creating text and sent_tokenized text array
for i in range(df_test.Question.shape[0]):
  text_val = BeautifulSoup(df_test.Question[i])
  text_val=clean_text(str(text_val.get_text().encode()).lower())
  texts_val.append(text_val)
  sentences = tokenize.sent_tokenize(text_val)
  sentence_val.append(sentences)

# Creating Label array
for i in df_test['Category']:
    labels_val.append(i)

In [None]:
data_val = np.zeros((len(texts_val), MAX_SENTS, MAX_SENT_LENGTH), dtype='int32')
# Creating an array data containing tokenized values of words in the format (text,sentence,word)
for i, sentences in enumerate(sentence_val):
    for j, sent in enumerate(sentences):
        if j< MAX_SENTS:
            wordTokens = text_to_word_sequence(sent)
            k=0
            wordSeq = tokenizer.texts_to_sequences(wordTokens)
            for _, word in enumerate(wordSeq):
                if k<MAX_SENT_LENGTH and word[0]<MAX_NB_WORDS:
                    data_val[i,j,k] = word[0]
                    k=k+1

In [None]:
labels_val = to_categorical(np.asarray(labels_val))
print('Shape of data tensor:', data_val.shape)
print('Shape of label tensor:', labels_val.shape)


indices = np.arange(data_val.shape[0])
np.random.shuffle(indices)
data_val = data_val[indices]
labels_val = labels_val[indices]

Shape of data tensor: (3000, 15, 100)
Shape of label tensor: (3000, 7)


In [None]:
x_train = data
y_train = labels
x_val = data_val
y_val = labels_val

In [None]:
# Creating the Embedding Dictionary
embeddings_index = {}
f = open('glove.6B.100d.txt',encoding='utf8')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('Total %s word vectors.' % len(embeddings_index))

Total 400000 word vectors.


In [None]:
embedding_matrix = np.random.random((len(word_index) + 1, EMBEDDING_DIM))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector
# Defining the embedding Layer
embedding_layer = Embedding(len(word_index) + 1,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SENT_LENGTH,
                            trainable=True)

In [None]:
# Defining the Model
sentence_input = Input(shape=(MAX_SENT_LENGTH,), dtype='int32')
embedded_sequences = embedding_layer(sentence_input)
l_lstm = Bidirectional(LSTM(100))(embedded_sequences)
sentEncoder = Model(sentence_input, l_lstm)

review_input = Input(shape=(MAX_SENTS,MAX_SENT_LENGTH), dtype='int32')
review_encoder = TimeDistributed(sentEncoder)(review_input)
l_lstm_sent = Bidirectional(LSTM(100))(review_encoder)
preds = Dense(len(macronum), activation='softmax')(l_lstm_sent)
model = Model(review_input, preds)

model.compile(loss='categorical_crossentropy',
              optimizer='rmsprop',
              metrics=['acc'])

print("Hierachical LSTM")
model.summary()

Hierachical LSTM
Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         [(None, 15, 100)]         0         
_________________________________________________________________
time_distributed (TimeDistri (None, 15, 200)           3883000   
_________________________________________________________________
bidirectional_1 (Bidirection (None, 200)               240800    
_________________________________________________________________
dense (Dense)                (None, 7)                 1407      
Total params: 4,125,207
Trainable params: 4,125,207
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Training the Model
cp=ModelCheckpoint('model_han_.hdf5',monitor='val_acc',verbose=1,save_best_only=True)
history=model.fit(x_train, y_train, validation_data=(x_val, y_val),
          epochs=10, batch_size=2,callbacks=[cp])

Epoch 1/10

Epoch 00001: val_acc improved from -inf to 0.54167, saving model to model_han_.hdf5
Epoch 2/10

Epoch 00002: val_acc improved from 0.54167 to 0.59167, saving model to model_han_.hdf5
Epoch 3/10

Epoch 00003: val_acc improved from 0.59167 to 0.61600, saving model to model_han_.hdf5
Epoch 4/10

Epoch 00004: val_acc improved from 0.61600 to 0.61900, saving model to model_han_.hdf5
Epoch 5/10

Epoch 00005: val_acc improved from 0.61900 to 0.62033, saving model to model_han_.hdf5
Epoch 6/10

Epoch 00006: val_acc did not improve from 0.62033
Epoch 7/10

Epoch 00007: val_acc did not improve from 0.62033
Epoch 8/10

Epoch 00008: val_acc did not improve from 0.62033
Epoch 9/10

Epoch 00009: val_acc did not improve from 0.62033
Epoch 10/10

Epoch 00010: val_acc did not improve from 0.62033
