In [253]:
import pandas as pd
import numpy as np

In [254]:
data = pd.read_csv('spacy.csv')

In [255]:
class SentenceGetter(object):
    
    def __init__(self, dataset):
        self.n_sent = 1
        self.dataset = dataset
        self.empty = False
        agg_func = lambda s: [(w, t) for w,t in zip(s["Word"].values.tolist(),
                                                        s["Tag"].values.tolist())]
        self.grouped = self.dataset.groupby("sentence_id").apply(agg_func)
        self.sentences = [s for s in self.grouped]
    
    def get_next(self):
        try:
            s = self.grouped["Sentence: {}".format(self.n_sent)]
            self.n_sent += 1
            return s
        except:
            return None
getter = SentenceGetter(data)


In [256]:
sentences = getter.sentences

In [257]:
from math import nan

words = list(set(data["Word"].values))
n_words = len(words)
print(words)
tags = []
for tag in set(data["Tag"].values):
    if tag is nan or isinstance(tag, float):
        tags.append('unk')
    else:
        tags.append(tag)
n_tags = len(tags)

['Azithromycin', '500 mg', 'three', 'time', 'MG', 'sleeping', 'Atorvastatin', 'to', 'Prescribing', 'milligram', '2', 'once', 'Amoxycillin', 'for', 'three ', 'two', 'once ', 'one ', 'Metformin', 'would', 'I ', 'thrice ', 'weeks', 'twice', ' days', 'Combiflam', '1', '40', 'Tab', '850', 'two ', 'like', '250', 'Gelusil ', 'mg', '3', 'a', 'Lisinopril', 'day', 'ML', '.', 'days', 'prescribe', '5ml', 'Crocin', '800', '5', 'times', 'thrice', '10', 'Aspirin', 'for ', 'a week', 'I', 'month', 'Robitussin', 'liquid', 'before ', '7', ' times', 'a day']


In [258]:
from future.utils import iteritems

word2idx = {w: i for i, w in enumerate(words)}
tag2idx = {t: i for i, t in enumerate(tags)}
idx2tag = {v: k for k, v in iteritems(tag2idx)}
tag2idx

{'O': 0, 'DUR': 1, 'DOS': 2, 'STR': 3, 'MED': 4}

In [259]:
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from sklearn.model_selection import train_test_split

maxlen = max([len(s) for s in sentences])
X = [[word2idx[w[0]] for w in s] for s in sentences]
print(X)
X = pad_sequences(maxlen=maxlen, sequences=X, padding="post",value=n_words - 1)

y = [[tag2idx[w[1]] for w in s] for s in sentences]
y = pad_sequences(maxlen=maxlen, sequences=y, padding="post", value=tag2idx["O"])
y = [to_categorical(i, num_classes=n_tags) for i in y]

# Split train and test data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

[[53, 42, 25, 23, 36, 38, 13, 30, 41, 40], [0, 1, 17, 3, 13, 35, 41, 40], [20, 19, 31, 7, 42, 55, 43, 48, 36, 38, 13, 46, 24, 40], [8, 6, 27, 9, 10, 47, 51, 26, 54, 40], [50, 45, 11, 36, 38, 51, 22, 40], [53, 42, 44, 30, 47, 36, 38, 13, 2, 24, 40], [28, 12, 32, 34, 21, 60, 13, 52, 40], [8, 18, 29, 4, 16, 36, 38, 14, 22, 40], [20, 19, 31, 7, 42, 33, 56, 49, 39, 15, 59, 40], [28, 37, 11, 57, 5, 13, 58, 41, 40]]


In [260]:
from keras.models import Sequential
from keras_contrib.layers import CRF
model = Sequential()
model.add(Dense(65, input_shape=(65,)))
model.add(Embedding(input_dim=n_words, output_dim=word_embedding_size, input_length=65))
model.add(Bidirectional(LSTM(units=word_embedding_size, 
                           return_sequences=True, 
                           dropout=0.5, 
                           recurrent_dropout=0.5, 
                           kernel_initializer=k.initializers.he_normal())))
#model.add(LSTM(units=word_embedding_size * 2, 
             return_sequences=True, 
             dropout=0.5, 
             recurrent_dropout=0.5, 
             kernel_initializer=k.initializers.he_normal()))
#model.add(TimeDistributed(Dense(n_tags, activation="relu")))
model.add(CRF(n_tags))

IndentationError: unexpected indent (<ipython-input-260-5e5f91f090dc>, line 12)

In [261]:
from keras.models import Model, Input
from keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional
import keras as k
from keras_contrib.layers import CRF

input = Input(shape=(14,))
word_embedding_size = 150

# Embedding Layer
model = Embedding(input_dim=n_words, output_dim=word_embedding_size, input_length=14)(input)

# BI-LSTM Layer
model = Bidirectional(LSTM(units=word_embedding_size, 
                           return_sequences=True, 
                           dropout=0.5, 
                           recurrent_dropout=0.5, 
                           kernel_initializer=k.initializers.he_normal()))(model)
model = LSTM(units=word_embedding_size * 2, 
             return_sequences=True, 
             dropout=0.5, 
             recurrent_dropout=0.5, 
             kernel_initializer=k.initializers.he_normal())(model)

# TimeDistributed Layer
model = TimeDistributed(Dense(n_tags, activation="relu"))(model)  

# CRF Layer
crf = CRF(n_tags)

out = crf(model)  # output
model = Model(input, out)


In [None]:
from keras.callbacks import ModelCheckpoint
import matplotlib.pyplot as plt

#Optimiser 
adam = k.optimizers.Adam(lr=0.0005, beta_1=0.9, beta_2=0.999)

# Compile model
model.compile(optimizer=adam, loss=crf.loss_function, metrics=[crf.accuracy, 'accuracy'])

model.summary()

model.fit(X, np.array(y), batch_size=256, epochs=100, validation_split=0.2, verbose=1)

In [277]:
s = 'I prescribe Combiflam twice a day for two days'
words = s.split()
maxlen = len(words)
word2idx = {w: i for i, w in enumerate(words)}
print(word2idx)
X = [[word2idx[w] for w in words]]
print(X)
X = pad_sequences(maxlen=14, sequences=X, padding="post",value=n_words - 1)
print(X)
y = model.predict(X)

{'I': 0, 'prescribe': 1, 'Combiflam': 2, 'twice': 3, 'a': 4, 'day': 5, 'for': 6, 'two': 7, 'days': 8}
[[0, 1, 2, 3, 4, 5, 6, 7, 8]]
[[ 0  1  2  3  4  5  6  7  8 60 60 60 60 60]]


In [278]:
np.delete(y, slice(len(words), 14), 1)

array([[[1., 0., 0., 0., 0.],
        [0., 1., 0., 0., 0.],
        [0., 0., 1., 0., 0.],
        [1., 0., 0., 0., 0.],
        [0., 1., 0., 0., 0.],
        [1., 0., 0., 0., 0.],
        [0., 1., 0., 0., 0.],
        [1., 0., 0., 0., 0.],
        [0., 1., 0., 0., 0.]]], dtype=float32)

In [279]:
y==y2

array([[[ True,  True,  True,  True,  True],
        [ True,  True,  True,  True,  True],
        [ True,  True,  True,  True,  True],
        [ True,  True,  True,  True,  True],
        [ True,  True,  True,  True,  True],
        [ True,  True,  True,  True,  True],
        [ True,  True,  True,  True,  True],
        [ True,  True,  True,  True,  True],
        [ True,  True,  True,  True,  True],
        [ True,  True,  True,  True,  True],
        [ True,  True,  True,  True,  True],
        [ True,  True,  True,  True,  True],
        [ True,  True,  True,  True,  True],
        [ True,  True,  True,  True,  True]]])

In [280]:
[print(w) for w in words]

I
prescribe
Combiflam
twice
a
day
for
two
days


[None, None, None, None, None, None, None, None, None]

In [281]:
l = y.tolist()
g = [ent for ent in l[0]]

In [282]:
d = dict()
count =0 
for i in g:
    if([1]==1):
        d['DURATION'] = words[count]
    if(i[2]==1):
        d['DOSAGE'] = words[count]
    if(i[3]==1):
        d['STRENGTH'] = words[count]
    if(i[4]==1):
        d['MEDICINE'] = words[count]
    count+=1
        

In [283]:
tag2idx

{'O': 0, 'DUR': 1, 'DOS': 2, 'STR': 3, 'MED': 4}

In [285]:
d

{'DOSAGE': 'Combiflam'}