#### Process data

In [None]:
import pandas as pd

# Load the tab-separated product listing.
df = pd.read_csv("data/product.txt", sep="\t", encoding="unicode_escape")
# Keep single-word proprietary names in the human-prescription category.
# .copy() makes df2 an independent frame, so the column assignment below
# cannot trigger pandas' SettingWithCopyWarning.
df2 = df[(df['PROPRIETARYNAME'].str.count(' ') == 0) & (df['PRODUCTTYPENAME'] == 'HUMAN PRESCRIPTION DRUG')].copy()
# Upper-case every name and strip (not drop) each character outside A-Z.
df2['PROPRIETARYNAME'] = df2['PROPRIETARYNAME'].str.upper().str.replace('[^A-Z]', '', regex=True)
# A name made only of non-alphabetic characters is now an empty string;
# discard it so it never reaches the training file.
df2 = df2[df2['PROPRIETARYNAME'] != '']
# Drop duplicate names.
unique_drugs = df2.drop_duplicates(subset=['PROPRIETARYNAME'])
# Save one name per line, without header or index.
unique_drugs['PROPRIETARYNAME'].to_csv("data/drug_names.txt", header=False, index=False)

#### TF imports

In [1]:
import tensorflow as tf
from tensorflow.python.keras.optimizers import rmsprop_v2
from tensorflow.python.keras.callbacks import LambdaCallback
from tensorflow.python.keras.models import Sequential
from tensorflow.python.keras.layers import Dense
from tensorflow.python.keras.layers import LSTM
import numpy as np

#### Reading data/prep vocab

In [None]:
# Read the whole name list produced by the preprocessing step and lower-case it.
with open("data/drug_names.txt", encoding='utf-8') as names_file:
    raw_text = names_file.read()
text = raw_text.lower()

print('num chars: ', len(text))

In [None]:
# Vocabulary: every distinct character in the text (newline included), sorted.
vocab = sorted(set(text))
print(len(vocab))
print(vocab)

# One entry per non-empty line of the text.
lines = [line for line in text.split('\n') if line]
print("total lines: ", len(lines))
# Length of the longest line; used as the fixed sequence length downstream.
max_length = max(map(len, lines))

# Character <-> index lookup tables in vocabulary order.
char_ind = {c: i for i, c in enumerate(vocab)}
ind_char = {i: c for i, c in enumerate(vocab)}

#### Prepare dataset

In [None]:
substrings = []
target_char = []

for line in lines:
    # The complete name, left-padded with '0' to max_length, predicts the
    # end-of-name marker '\n'.
    substrings.append(line.rjust(max_length, '0'))
    target_char.append('\n')
    # Every proper prefix (longest first, down to one character), left-padded
    # the same way, predicts the character that follows it.
    for cut in range(len(line) - 1, 0, -1):
        substrings.append(line[:cut].rjust(max_length, '0'))
        target_char.append(line[cut])

print(len(substrings))

In [None]:
# One-hot inputs (sample, time step, vocab index) and one-hot targets
# (sample, vocab index).
X = np.zeros((len(substrings), max_length, len(vocab)), dtype=float)
Y = np.zeros((len(substrings), len(vocab)), dtype=float)
for i, (substr, target) in enumerate(zip(substrings, target_char)):
    for t, char in enumerate(substr):
        # '0' is the padding character: its time step stays all zeros.
        if char == '0':
            continue
        X[i, t, char_ind[char]] = 1
    Y[i, char_ind[target]] = 1

In [None]:
# Pair each one-hot input sequence in X with its one-hot target in Y.
dataset = tf.data.Dataset.from_tensor_slices((X, Y))

##### Batch prep

In [None]:
BATCH_SIZE = 64
BUFFER_SIZE = 10000

# Shuffle, cut into fixed-size batches (the final partial batch is dropped),
# then prefetch. Note this rebinds `dataset`, so re-running the cell stacks
# the pipeline again on top of the already-batched dataset.
dataset = dataset.shuffle(BUFFER_SIZE)
dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)
dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)

#### Create the model

##### String lookup functions

In [None]:
# Character -> integer id lookup built from `vocab` (no OOV slot, no mask token).
ids_from_chars = tf.keras.layers.StringLookup(
    vocabulary=list(vocab), num_oov_indices=0, mask_token=None)

# Inverse lookup (integer id -> character) sharing the vocabulary above.
chars_from_ids = tf.keras.layers.StringLookup(
    vocabulary=ids_from_chars.get_vocabulary(), invert=True,
    num_oov_indices=0, mask_token=None)

# Character -> one-hot vector lookup; '0' (the padding character used when
# building the training substrings) is declared as the mask token.
one_hot_chars = tf.keras.layers.StringLookup(
    vocabulary=list(vocab), num_oov_indices=0, mask_token='0',
    output_mode='one_hot')


def text_from_ids(ids):
    """Convert integer ids to characters and join them along the last axis."""
    return tf.strings.reduce_join(chars_from_ids(ids), axis=-1)

##### Class definition

In [None]:
class DrugNameModel(tf.keras.Model):
    """Next-character predictor: a single LSTM layer followed by a softmax Dense layer.

    Args:
        vocab_size: number of output classes of the Dense layer and size of
            the feature axis declared for the LSTM input.
        chars_from_ids: stored on the instance as-is; not used by the forward pass.
        lstm_units: number of units of the LSTM layer.
        max_length: number of time steps declared for the LSTM input.
    """

    def __init__(self, vocab_size, chars_from_ids, lstm_units, max_length):
        # The base initializer takes no positional arguments; the instance is
        # already bound by super(), so it must not be passed a second time.
        super().__init__()
        # input_shape describes one sample (time steps, features) and does not
        # include a batch dimension.
        self.lstm = tf.keras.layers.LSTM(lstm_units,
                                         input_shape=(max_length, vocab_size))
        self.dense = tf.keras.layers.Dense(vocab_size,
                                           activation='softmax')
        self.vocab_size = vocab_size
        self.chars_from_ids = chars_from_ids

    # NOTE(review): Keras models conventionally override `call`, not
    # `__call__`; kept as-is because the rest of the notebook invokes and
    # exports the model through this entry point -- confirm before changing.
    @tf.function
    def __call__(self, inputs, training=False):
        """Run the LSTM, then the softmax layer, and return the softmax output."""
        x = inputs
        x = self.lstm(x, training=training)
        x = self.dense(x, training=training)
        return x

##### Instance

In [None]:
# Model hyper-parameters: one output class per vocabulary character.
vocab_size = len(vocab)
lstm_units = 64

model = DrugNameModel(
    vocab_size=vocab_size,
    chars_from_ids=chars_from_ids,
    lstm_units=lstm_units,
    max_length=max_length,
)

##### Test untrained model

In [None]:

# Run the untrained model on one batch; samp_X, samp_Y and preds are reused
# by later cells.
first_batch = dataset.take(1)
for samp_X, samp_Y in first_batch:
    preds = model(inputs=samp_X, training=False)

print(preds)

In [None]:
# tf.random.categorical expects unnormalized log-probabilities (logits), but
# `preds` is the output of a softmax layer, so take the log first. Sampling
# from the raw probabilities would draw from softmax(preds) instead of preds.
sampled_ids = tf.random.categorical(tf.math.log(preds), num_samples=1)
# Drop the trailing num_samples axis so there is one id per batch element.
sampled_ids = tf.squeeze(sampled_ids, axis=-1)
sampled_ids

In [None]:
# Decode the sampled ids back to characters and show them joined as one string.
print("Next Char Predictions:\n", text_from_ids(sampled_ids))

##### Attach optimizer, loss

In [None]:
from tensorflow.keras.optimizers import RMSprop
from tensorflow.keras.losses import CategoricalCrossentropy

# Targets are one-hot vectors and the model outputs softmax probabilities,
# so categorical cross-entropy is used with its default settings.
opt = RMSprop(learning_rate=0.005)
loss = CategoricalCrossentropy()
model.compile(loss=loss, optimizer=opt)

In [None]:
# Loss of the untrained predictions on the sample batch.
batch_mean_loss = loss(samp_Y, preds)
# The value printed here is the shape of the loss, not of the predictions,
# so the label says so.
print("loss shape: ", batch_mean_loss.shape)
print("mean loss: ", batch_mean_loss)

In [None]:
# Sanity check: for the untrained model, the exponential of the mean loss
# should be approximately equal to the vocabulary size (~27).
tf.exp(batch_mean_loss)

##### Train

In [None]:
# Train for 8 passes over the batched dataset, showing per-epoch progress.
history = model.fit(dataset, epochs=8, verbose=1)

In [None]:
# Export the trained model with the low-level SavedModel API to ./drug_model.
tf.saved_model.save(model, 'drug_model')

#### SavedModel API syntax

##### Low level api format
save: `tf.saved_model.save(model, 'model_name')`  
load: `model = tf.saved_model.load('model_name')`

##### High level api format
save: `model_subclassed.save('model_name')`  
load: `model = tf.keras.models.load_model('model_name')`

#### Name generator subclass

In [None]:
class OneName(tf.keras.Model):
    """Generates one name, character by character, from a next-character model.

    Args:
        model: callable taking a (1, max_length, vocab_size) one-hot tensor and
            returning per-character scores for the next character.
        one_hot_chars: lookup that turns a tensor of characters into one-hot rows.
        chars_from_ids: lookup that turns an integer id into its character.
        max_length: fixed number of time steps the model input is padded to.
        vocab_size: width of each one-hot row.
        max_name_length: generation stops once the name reaches this many characters.

    The three numeric defaults reproduce the values that were previously
    hard-coded in the method bodies.
    """

    def __init__(self, model, one_hot_chars, chars_from_ids,
                 max_length=31, vocab_size=27, max_name_length=25):
        super().__init__()
        self.model = model
        self.one_hot_chars = one_hot_chars
        self.chars_from_ids = chars_from_ids
        self.max_length = max_length
        self.vocab_size = vocab_size
        self.max_name_length = max_name_length

    def vectorize_input(self, curr_string, i):
        """One-hot encode `curr_string` and left-pad it with zero rows.

        Args:
            curr_string: scalar string tensor holding the name so far.
            i: current number of characters in `curr_string`.

        Returns:
            Tensor reshaped to (1, max_length, vocab_size).
        """
        split = tf.strings.unicode_split(curr_string, 'UTF-8')
        # Use the lookup owned by this instance rather than a notebook global.
        one_hot = self.one_hot_chars(split)
        # Zero rows go in front of the sequence; nothing is added after it.
        paddings = ([self.max_length - i, 0], [0, 0])
        x_pad = tf.pad(one_hot, paddings, "CONSTANT")
        one_hot_x = tf.reshape(x_pad, [1, self.max_length, self.vocab_size])
        return one_hot_x

    @tf.function(input_signature=[tf.TensorSpec(shape=(), dtype=tf.string)])
    def generate_word(self, inputs):
        """Extend the prefix `inputs` into a full name and return it.

        An empty prefix is seeded with one character whose id is drawn
        uniformly from [1, vocab_size). Each step then picks at random among
        the model's three highest-scoring next characters, stopping at a
        newline or once max_name_length characters have been reached.
        """
        name = inputs
        if (tf.strings.length(name) == 0):
            # Id 0 is never drawn -- presumably it is the newline character,
            # the first entry of the sorted vocabulary; verify against the
            # vocabulary the lookup was built from.
            rand_int = tf.random.uniform(shape=[1], dtype=tf.int32, minval=1,
                                         maxval=self.vocab_size)[0]
            name += self.chars_from_ids(rand_int)

        length = tf.strings.length(name)
        for i in tf.range(length, self.max_name_length):
            x = self.vectorize_input(name, i)
            y = self.model(x, training=False)
            y = tf.squeeze(y)
            # Pick one of the three most likely next characters at random.
            top_ids = tf.math.top_k(y, k=3).indices
            next_id = tf.random.shuffle(top_ids)[0]
            x_char = self.chars_from_ids(next_id)
            if (x_char == '\n'):
                break
            name += x_char
        return name

In [None]:
# Wrap the trained model together with the two lookup layers generation needs.
name_generator = OneName(model, one_hot_chars, chars_from_ids)

In [None]:
# Export the generator with the low-level SavedModel API to ./drug_generator.
tf.saved_model.save(name_generator, 'drug_generator')

#### Develop Drugs!

In [23]:
# Reload the exported generator from disk.
generator = tf.saved_model.load('drug_generator')

In [40]:
# Generate 500 candidate names from an empty prefix and keep the distinct
# ones shorter than 12 characters.
new_drugs = set()
prefix = ''
for attempt in range(500):
    name = generator.generate_word(prefix).numpy().decode('utf-8')
    if len(name) < 12:
        new_drugs.add(name)

In [41]:
# Write the generated names to disk, one per line, in set iteration order.
with open('fake_drugs.txt', 'w') as f:
    f.writelines(f'{s}\n' for s in new_drugs)