In [None]:
import numpy as np
import pandas as pd
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

from keras.layers import Input, Embedding, Activation, Flatten, Dense
from keras.layers import Conv1D, MaxPooling1D, Dropout
from keras.models import Model

In [None]:
# Path to the drug-name CSV; the file is read without a header row.
train_data_source = './drug.names.csv'
train_df = pd.read_csv(train_data_source, header=None)

# Take column 1 as the raw text and lower-case every entry.
train_texts = list(map(lambda text: text.lower(), train_df[1].values))

In [None]:
# Data cleaning: keep only the first whitespace-separated token of each entry,
# which drops the trailing dosage amounts from the medicine data.
# Entries that are empty or whitespace-only have no first token and are
# skipped (indexing their empty split result would raise IndexError).
first_tokens = [tokens[0] for tokens in (text.split() for text in train_texts) if tokens]
train_texts = np.array(first_tokens, dtype=object)
train_texts

array(['omeprazole_cap', 'dressit', 'flaminal', ..., 'coloplast_assura',
       'slow-fe_tab', 'sure-amp_bupivac'], dtype=object)

In [None]:
# ======================= Convert strings to index sequences ================
# Character-level tokenizer with 'UNK' as the out-of-vocabulary token.
tk = Tokenizer(num_words=None, char_level=True, oov_token='UNK')
tk.fit_on_texts(train_texts)

# The fitted vocabulary is replaced below by a fixed, hand-written alphabet:
# each character maps to its 1-based position, and '$' maps to index 0.
alphabet = "abcdefghijklmnopqrstuvwxyz 0123456789,;.!?:'\"/\\|_@#%^&*~`+-=<>()[]{}"
char_len = len(alphabet)
char_dict = {char: position for position, char in enumerate(alphabet, start=1)}
char_dict['$'] = 0

# Install the fixed vocabulary on the tokenizer and give 'UNK' the next free index.
# NOTE(review): that index equals char_len + 1, which is outside the
# length-(char_len + 1) one-hot vectors built later -- confirm the data never
# contains out-of-alphabet characters.
tk.word_index = dict(char_dict)
tk.word_index[tk.oov_token] = max(char_dict.values()) + 1

# Strings -> index sequences -> fixed-width (50) post-padded float32 matrix.
train_sequences = tk.texts_to_sequences(train_texts)
train_data = np.array(
    pad_sequences(train_sequences, maxlen=50, padding='post'),
    dtype='float32',
)

In [None]:
# Inverse lookup (index -> character) used to turn model outputs back into text.
# Alphabet characters sit at their 1-based position; index 0 maps to '$'.
inverse_dict = {position: char for position, char in enumerate(alphabet, start=1)}
inverse_dict[0] = '$'

In [None]:
# one hot encoding of singular char, a string and a position

def one_hot(val,len=char_len+1):
  """One-hot encode every index in ``val``.

  Args:
    val: iterable of index values; each is converted with ``int()``.
    len: width of each one-hot vector (defaults to ``char_len + 1``).

  Returns:
    A list with one integer NumPy vector of size ``len`` per element of
    ``val``, holding a single 1 at that element's index.
  """
  def encode(index):
    row = np.zeros(len,dtype=int)
    row[int(index)] = 1
    return row

  return [encode(index) for index in val]

def one_hot_char(val,len=char_len+1):
  """One-hot encode a single character.

  The character is looked up in the module-level ``char_dict``; a character
  that is not a key of that dict raises ``KeyError``.

  Args:
    val: the character to encode.
    len: width of the one-hot vector (defaults to ``char_len + 1``).

  Returns:
    Integer NumPy vector of size ``len`` with a 1 at the character's index.
  """
  encoded = np.zeros(len,dtype=int)
  position = char_dict[val]
  encoded[position] = 1
  return encoded

def one_hot_value(pos,len=char_len+1):
  """One-hot encode a single index.

  Args:
    pos: index to mark; converted with ``int()``.
    len: width of the one-hot vector (defaults to ``char_len + 1``).

  Returns:
    Integer NumPy vector of size ``len`` with a 1 at ``int(pos)``.
  """
  encoded = np.zeros(len,dtype=int)
  index = int(pos)
  encoded[index] = 1
  return encoded

In [None]:
# Preparing datasets for training
def data_prep(dataset,lookback):
  """Build sliding-window samples for next-character prediction.

  For every row, each window of ``lookback`` consecutive columns that ends
  on a non-zero value becomes one input sample (one-hot encoded with
  ``one_hot``), and the value in the following column becomes its one-hot
  target. Scanning of a row stops at the first window that ends on 0.

  Args:
    dataset: 2-D array of character indices, indexed as ``dataset[row, col]``.
    lookback: number of consecutive columns in each input window.

  Returns:
    Tuple ``(train_x, train_y)`` of NumPy arrays built from the collected
    windows and targets.
  """
  train_x = []
  train_y = []
  # Stop one column before the end so the target column j+1 always exists.
  # A fixed upper bound of 50 would read past the row end whenever a row has
  # no trailing 0, and would not fit arrays of any other width.
  last_window_end = dataset.shape[1]-1
  for i in range(dataset.shape[0]):
    for j in range(lookback-1,last_window_end):
      if(dataset[i,j] == 0):
        break
      train_x.append(one_hot(dataset[i,j-lookback+1:j+1]))
      target = np.zeros(char_len+1,dtype = int)
      target[int(dataset[i,j+1])] = 1
      train_y.append(target)
  return np.array(train_x), np.array(train_y)

def data_prep2(dataset,lookback):
  """Build one leading-prefix sample per row for next-character prediction.

  A row contributes a sample only when column ``lookback - 1`` is non-zero.
  The input is the one-hot encoding (via ``one_hot``) of the first
  ``lookback`` columns; the target is the one-hot of column ``lookback``.

  Args:
    dataset: 2-D array of character indices, indexed as ``dataset[row, col]``.
    lookback: number of leading columns used as the input.

  Returns:
    Tuple ``(train_x, train_y)`` of NumPy arrays built from the collected
    inputs and targets.
  """
  train_x = []
  train_y = []
  for row_idx in range(dataset.shape[0]):
    # Skip rows whose prefix is not completely filled.
    if(dataset[row_idx,lookback-1] == 0):
      continue
    train_x.append(one_hot(dataset[row_idx,:lookback]))
    target = np.zeros(char_len+1,dtype = int)
    target[int(dataset[row_idx,lookback])] = 1
    train_y.append(target)
  return np.array(train_x), np.array(train_y)

In [None]:
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Flatten
from keras.layers import Dropout
from keras.layers import TimeDistributed
from keras.layers import Embedding
import keras.models
from keras.layers.normalization import BatchNormalization

# Building Neural Network model
def build_model(n_shape,l1=100,l2=100,d1=0.4,d2=0.4,len=char_len+1):
  """Create and compile the stacked-LSTM next-character classifier.

  Layer order: LSTM(l1, returns sequences) -> Dropout(d1) -> LSTM(l2) ->
  Dropout(d2) -> Dense(1024, relu) -> Dense(256, relu) -> Dense(len, softmax).

  Args:
    n_shape: ``input_shape`` passed to the first LSTM layer.
    l1, l2: unit counts of the first and second LSTM layers.
    d1, d2: dropout rates applied after the first and second LSTM layers.
    len: size of the softmax output layer (defaults to ``char_len + 1``).

  Returns:
    The Sequential model, compiled with categorical cross-entropy loss, the
    Adam optimizer and the accuracy metric.
  """
  stack = [
    LSTM(l1,input_shape= n_shape,return_sequences=True,activation='tanh'),
    Dropout(d1),
    LSTM(l2,return_sequences=False,activation='tanh'),
    Dropout(d2),
    Dense(1024, activation='relu'),
    Dense(256,activation='relu'),
    Dense(len, activation='softmax'),
  ]
  model = Sequential()
  for layer in stack:
    model.add(layer)
  model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
  return model

def model_fit(model,train_x,train_y,bs=32,ep=100,shf=False):
  """Train ``model`` and return whatever its ``fit`` call returns.

  Args:
    model: object exposing ``fit(x, y, batch_size=..., epochs=..., shuffle=...)``.
    train_x: training inputs.
    train_y: training targets.
    bs: batch size.
    ep: number of epochs.
    shf: forwarded as ``shuffle``.
  """
  fit_options = {'batch_size': bs, 'epochs': ep, 'shuffle': shf}
  return model.fit(train_x,train_y,**fit_options)

In [None]:
# Build one model per input length, 3 to 8 characters.
# Lengths 3-7: samples come from data_prep2 (the leading characters of each name).
# Length 8: samples come from data_prep (every 8-character sliding window).
# models[k], train_x[k] and train_y[k] all belong to input length k + 3.
models = []
train_x = []
train_y = []
for lookback in range(3,8):
  prefix_x, prefix_y = data_prep2(train_data,lookback)
  train_x.append(prefix_x)
  train_y.append(prefix_y)
  models.append(build_model(n_shape = (prefix_x.shape[1],prefix_x.shape[2])))

window_x, window_y = data_prep(train_data,8)
train_x.append(window_x)
train_y.append(window_y)
models.append(build_model(n_shape = (window_x.shape[1],window_x.shape[2])))


In [None]:
# Train the model for 3-character inputs, then persist it to disk.
hist_3words = model_fit(models[0],train_x[0],train_y[0])
models[0].save("model_3word.h5")

In [None]:
# Train the model for 4-character inputs, then persist it to disk.
hist_4words = model_fit(models[1],train_x[1],train_y[1])
models[1].save("model_4word.h5")

In [None]:
# Train the model for 5-character inputs, then persist it to disk.
hist_5words = model_fit(models[2],train_x[2],train_y[2])
models[2].save("model_5word.h5")

In [None]:
# Train the model for 6-character inputs, then persist it to disk.
hist_6words = model_fit(models[3],train_x[3],train_y[3])
models[3].save("model_6word.h5")

In [None]:
# Train the model for 7-character inputs, then persist it to disk.
hist_7words = model_fit(models[4],train_x[4],train_y[4])
models[4].save("model_7word.h5")

In [None]:
# Train the model for 8-character sliding windows, then persist it to disk.
hist_8words = model_fit(models[5],train_x[5],train_y[5])
models[5].save("model_8word.h5")

In [None]:
# Summarise the 3-character model (first entry of `models`); no bare `model`
# variable is defined anywhere in this notebook.
models[0].summary()

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_8 (LSTM)                (None, 3, 100)            68000     
_________________________________________________________________
dropout_8 (Dropout)          (None, 3, 100)            0         
_________________________________________________________________
lstm_9 (LSTM)                (None, 100)               80400     
_________________________________________________________________
dropout_9 (Dropout)          (None, 100)               0         
_________________________________________________________________
dense_12 (Dense)             (None, 1024)              103424    
_________________________________________________________________
dense_13 (Dense)             (None, 256)               262400    
_________________________________________________________________
dense_14 (Dense)             (None, 69)               

In [None]:
# Load the trained models (if any) in order of input length, so that
# word_predict[k] is the model saved for input length k + 3.
saved_model_files = (
  "model_3word.h5",
  "model_4word.h5",
  "model_5word.h5",
  "model_6word.h5",
  "model_7word.h5",
  "model_8word.h5",
)
word_predict = [keras.models.load_model(path) for path in saved_model_files]


In [None]:
# Creating functions to predict output

# Take the raw characters and convert them to a one-hot encoded character numpy array
def Vectorize(word):
  """Turn a typed prefix into a one-hot encoded array.

  The word is lower-cased, each character is mapped to its index through the
  module-level ``char_dict`` (a missing character raises ``KeyError``), and
  the indices are one-hot encoded with ``one_hot``.

  Args:
    word: the prefix string.

  Returns:
    Array of shape ``(1, number_of_characters, one_hot_width)``. For inputs
    shorter than 3 characters a message is printed and ``np.zeros((1, 1))``
    is returned instead.
  """
  word = word.lower()
  if(len(word)<3):
    print("Enter more letters")
    return np.zeros((1,1))
  indices = [char_dict[char] for char in word]
  encoded = np.array(one_hot(indices))
  n_chars, width = encoded.shape[0], encoded.shape[1]
  return np.reshape(encoded,(1,n_chars,width))

def one_hot_output(vect):
  """Pick the predicted character from one model output.

  Selection rule: the first position whose score exceeds 0.5 wins; if no
  position does, the position with the highest score is used. (The earlier
  fallback compared the running maximum the wrong way round, so it never
  updated and always selected the last position of the vector.)

  Args:
    vect: array whose second dimension holds the per-character scores; it is
      reshaped to a flat vector of length ``vect.shape[1]``.

  Returns:
    Tuple ``(one_hot, eof)``: the one-hot vector of the chosen position (built
    with ``one_hot_value``) and a flag that is True when that position is 0,
    the end marker.
  """
  scores = np.reshape(vect,(vect.shape[1],))
  # Fallback choice: highest score (first occurrence on ties).
  best = int(np.argmax(scores))
  for position,score in enumerate(scores):
    if(score>0.5):
      best = position
      break
  return one_hot_value(best),best == 0

# predicting next character while looping and checking for end of char or max word length
def word_pred(word_vect):
  """Extend a one-hot encoded prefix one character at a time.

  While the sequence is shorter than 8 characters, the model stored at
  ``word_predict[length - 3]`` is fed the whole sequence; from 8 characters
  on, ``word_predict[5]`` is fed the last 8 characters. Each prediction is
  converted with ``one_hot_output`` and appended. Generation stops when the
  end marker is predicted or the sequence reaches 50 characters.

  Args:
    word_vect: array of shape ``(1, n_chars, one_hot_width)``.

  Returns:
    The extended array, or ``np.zeros((1, 1))`` when ``word_vect.shape[1]``
    is 1 (the "input too short" sentinel).
  """
  if(word_vect.shape[1] == 1):
    return np.zeros((1,1))
  finished = False
  while((not finished) and (word_vect.shape[1]<50)):
    seq_len = word_vect.shape[1]
    if(seq_len<8):
      scores = word_predict[seq_len-3].predict(word_vect)
    else:
      scores = word_predict[5].predict(word_vect[:,-8:,:])
    next_char,finished = one_hot_output(scores)
    word_vect = np.append(word_vect,np.reshape(next_char,(1,1,char_len+1)),axis=1)
  return word_vect

# Convert one hot encoded word to readable characters
def deencode(one_vect):
  """Map a one-hot vector back to its character.

  Returns the ``inverse_dict`` entry for the first position whose value
  equals 1, or ``None`` when no position equals 1.
  """
  hot_position = next((idx for idx,flag in enumerate(one_vect) if flag==1), None)
  if hot_position is None:
    return None
  return inverse_dict[hot_position]

def decode(vect):
  """Convert a one-hot encoded word into a readable string.

  Args:
    vect: array of shape ``(1, n_chars, one_hot_width)``.

  Returns:
    The characters produced by ``deencode`` for each position, concatenated.
    When ``vect.shape[1]`` is 1 the "input too short" sentinel
    ``np.zeros((1, 1))`` is returned instead of a string.
  """
  if(vect.shape[1] == 1):
    return np.zeros((1,1))
  n_chars, width = vect.shape[1], vect.shape[2]
  return "".join(
    deencode(np.reshape(vect[:,step,:],(width)))
    for step in range(n_chars)
  )

def Prediction(word):
  """Complete a typed prefix into a single predicted word.

  Pipeline: ``Vectorize`` (text -> one-hot array), ``word_pred`` (extend the
  array with the trained models), ``decode`` (array -> human-readable word).

  Returns:
    Whatever ``decode`` returns for the extended array.
  """
  encoded_prefix = Vectorize(word)
  completed = word_pred(encoded_prefix)
  return decode(completed)

In [None]:
Prediction("ran")

'ranitidine$'

In [None]:
Prediction("Inda")

'indapamide_liq$'

In [None]:
Prediction("Amil")

'amiloride$'

In [None]:
Prediction("Peri")

'perindopril$'

In [None]:
Prediction("mal")

'malarone_}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}'

In [None]:
# Multiple-output-prediction

def one_hot_output_n(vect,n_pred_left):
  """Pick up to ``n_pred_left`` candidate characters from one model output.

  Positions are visited from highest to lowest score:
    * a score above 0.7 is taken as the final candidate and ends the search;
    * otherwise a position is kept while candidates remain and its score is
      within 0.3 of the top score;
    * the first position failing both tests ends the search.

  Args:
    vect: array whose second dimension holds the per-character scores.
    n_pred_left: maximum number of candidates to keep.

  Returns:
    Tuple ``(word_val, eof)``: the one-hot vectors of the kept candidates
    (built with ``one_hot_value``) and a list of ``n_pred_left`` booleans
    marking which candidate slots hold position 0, the end marker.
  """
  eof = [False] * n_pred_left
  word_val = []
  slots_left = n_pred_left

  scores = np.reshape(vect,(vect.shape[1],))
  ranked = np.argsort(scores)[::-1].tolist()
  top_score = scores[ranked[0]]

  for position in ranked:
    if(scores[position]>0.7):
      if(position == 0):
        eof[0] = True
      word_val.append(one_hot_value(position))
      break

    close_enough = (slots_left != 0) and (top_score - scores[position] <= .3)
    if not close_enough:
      break

    word_val.append(one_hot_value(position))
    if(position == 0):
      eof[n_pred_left-slots_left] = True
    slots_left = slots_left-1

  return word_val,eof

# get all possible character outcomes with 30% difference to original predicted character
def get_possib(word_vect,n_pred):
  """Follow the best prediction path and collect alternative branches.

  The prefix is extended one character at a time: while it is shorter than 8
  characters the model at ``word_predict[length - 3]`` sees the whole
  sequence, afterwards ``word_predict[5]`` sees the last 8 characters. At
  each step ``one_hot_output_n`` supplies up to two candidates; the first
  extends the main path, and the second (if any) is stored as a
  one-character branch until ``n_pred - 1`` branches have been collected.
  The walk stops at the end marker or at 50 characters.

  Args:
    word_vect: array of shape ``(1, n_chars, one_hot_width)``.
    n_pred: total number of words wanted (main path plus branches).

  Returns:
    A list whose first entry is the finished main path and whose remaining
    entries are unfinished branches, or ``np.zeros((1, 1))`` when
    ``word_vect.shape[1]`` is 1 (the "input too short" sentinel).
  """
  if(word_vect.shape[1] == 1):
    return np.zeros((1,1))

  # Slot 0 always holds the main path, so the result is never empty even if
  # no prediction step runs (input already at the 50-character limit).
  t_list = [word_vect]
  rem = max(n_pred-1,0)
  # Initialised up front: inputs of 8 or more characters skip the
  # short-prefix models, so the stop flag must exist before the first step.
  finished = False

  while((not finished) and (word_vect.shape[1]<50)):
    seq_len = word_vect.shape[1]
    if(seq_len<8):
      scores = word_predict[seq_len-3].predict(word_vect)
    else:
      scores = word_predict[5].predict(word_vect[:,-8:,:])
    next_word_list,eof_list = one_hot_output_n(scores,2)

    # Store the runner-up as a branch off the path before this step.
    if(len(next_word_list)>1 and rem>0):
      t_list.append(np.append(word_vect,np.reshape(next_word_list[1],(1,1,char_len+1)),axis=1))
      rem = rem-1

    word_vect = np.append(word_vect,np.reshape(next_word_list[0],(1,1,char_len+1)),axis=1)
    t_list[0] = word_vect
    finished = eof_list[0]

  return t_list


def word_pred_n(vect,n_pred):
  """Produce the main predicted word plus completed alternative words.

  ``get_possib`` supplies the finished main path followed by unfinished
  branches; each branch is completed with ``word_pred``.

  Args:
    vect: array of shape ``(1, n_chars, one_hot_width)``.
    n_pred: forwarded to ``get_possib``.

  Returns:
    List of one-hot encoded words, main path first. An empty list is returned
    when ``get_possib`` yields no paths or its "input too short" sentinel.
  """
  temp_list = get_possib(vect,n_pred)
  # get_possib signals a too-short input with an ndarray instead of a list;
  # indexing that as a list of paths would produce meaningless rows.
  if(isinstance(temp_list,np.ndarray) or len(temp_list) == 0):
    return []

  words = [temp_list[0]]
  for branch in temp_list[1:]:
    # A branch whose last character is the end marker (one-hot position 0)
    # is already a complete word; extending it would add characters after
    # the terminator.
    if(branch[0,-1,0] == 1):
      words.append(branch)
    else:
      words.append(word_pred(branch))

  return words

def Prediction_n(word,n_pred=3):
  """Complete a typed prefix into several candidate words.

  Pipeline: ``Vectorize`` (text -> one-hot array), ``word_pred_n`` (main path
  plus alternatives), ``decode`` on every returned array.

  Args:
    word: the prefix string.
    n_pred: requested number of words; values below 1 return ``[]``
      immediately.

  Returns:
    List with the ``decode`` result of each predicted word.
  """
  if(n_pred<1):
    return []

  encoded_prefix = Vectorize(word)
  candidate_vects = word_pred_n(encoded_prefix,n_pred)
  return [decode(candidate) for candidate in candidate_vects]

In [None]:
Prediction_n('dicy')

['dicynene_inj$', 'dicycloverine$', 'dicynene_tab$']