In [98]:
import pandas as pd
import numpy as np
import time

In [33]:
df_train = pd.read_csv('data/drugsComTest_train.csv')
df_test = pd.read_csv('data/drugsComTest_test.csv')

In [34]:
df_train.head(1)

Unnamed: 0,review,condition
0,"""I&#039;ve been taking it for a few years, so ...",Birth Control


In [35]:
df_test.head(1)

Unnamed: 0,review,condition
0,"""Unfortunetly abilify didn&#039;t work for me,...",Depression


In [36]:
#can experiment with more processing
import re
df_train['review'] = df_train['review'].apply(lambda x: re.sub(r'[^A-Za-z0-9 ]+', '', x.lower()))
df_test['review'] = df_test['review'].apply(lambda x: re.sub(r'[^A-Za-z0-9 ]+', '', x.lower()))

In [37]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
train_y = le.fit_transform(df_train['condition'].values)
test_y = le.transform(df_test['condition'].values)

In [78]:
from util import *

In [79]:
vocab_list = prepare_vocab(df_train['review'])
w2id_dict = prepare_word_dict(vocab_list)
id2w_dict = {i:w for w, i in w2id_dict.items()}

In [83]:
padded_sents_train = sent_to_idx(df_train['review'], w2id_dict)
padded_sents_test = sent_to_idx(df_test['review'], w2id_dict)

MODEL

LSTM

In [85]:
import torch.nn as nn
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [128]:
class LSTM(nn.Module):
    
    def __init__(self, n_vocab, n_embed, n_hidden, n_output, n_layers=1, drop_p = 0.8):
        super().__init__()
        self.n_vocab = n_vocab  
        self.n_layers = n_layers
        self.n_hidden = n_hidden
        self.embedding = nn.Embedding(n_vocab, n_embed)
        self.lstm = nn.LSTM(n_embed, n_hidden, n_layers, batch_first = True, dropout = drop_p)
        self.dropout = nn.Dropout(drop_p)
        #concat of avg_pool and max_pool generate twice the dimension
        self.fc = nn.Linear(n_hidden*2, n_hidden//2)
        self.relu = nn.ReLU()
        self.out = nn.Linear(n_hidden//2, n_output) 
        
    def forward(self, tokenized_idx):
        embeddings = self.embedding(tokenized_idx)
        lstm_out, _ = self.lstm(embeddings)
        avg_pool = torch.mean(lstm_out, 1)
        max_pool, _ = torch.max(lstm_out, 1)
        conc = torch.cat(( avg_pool, max_pool), 1)
        conc = self.relu(self.fc(conc))
        conc = self.dropout(conc)
        out = self.out(conc)
        return out

In [129]:
le.classes_

array(['Acne', 'Anxiety', 'Bipolar Disorde', 'Birth Control',
       'Depression', 'Pain'], dtype=object)

In [130]:
n_epochs = 10
n_vocab = len(w2id_dict)
n_embed = 64
n_hidden = 100
n_output = 6
n_layers = 2
batch_size = 100

model = LSTM(n_vocab, n_embed, n_hidden, n_output, n_layers)
loss_fn = nn.CrossEntropyLoss(reduction='sum')
optimizer = torch.optim.Adam(filter(lambda x: x.requires_grad, model.parameters()), lr=0.001)

In [131]:
model.cuda()
x_train = torch.tensor(padded_sents_train, dtype=torch.long).cuda()
y_train = torch.tensor(train_y, dtype=torch.long).cuda()
x_test = torch.tensor(padded_sents_test, dtype=torch.long).cuda()
y_test = torch.tensor(test_y, dtype=torch.long).cuda()

In [132]:
# Create Torch datasets
train = torch.utils.data.TensorDataset(x_train, y_train)
valid = torch.utils.data.TensorDataset(x_test, y_test)

# Create Data Loaders
train_loader = torch.utils.data.DataLoader(train, batch_size=batch_size, shuffle=True)
valid_loader = torch.utils.data.DataLoader(valid, batch_size=batch_size, shuffle=False)

In [134]:
n_epochs = 100
train_loss = []
valid_loss = []

for epoch in range(n_epochs):
    start_time = time.time()
    model.train()
    avg_loss = 0.  
    for x_batch, y_batch in train_loader:
        # Predict/Forward Pass
        y_pred = model(x_batch)
        # Compute loss
        loss = loss_fn(y_pred, y_batch)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        avg_loss += loss.item() / len(train_loader)
    # Set model to validation configuration -Doesn't get trained here
    model.eval()        
    avg_val_loss = 0.
    val_preds = np.zeros((len(x_test),len(le.classes_)))
    for i, (x_batch, y_batch) in enumerate(valid_loader):
        y_pred = model(x_batch).detach()
        avg_val_loss += loss_fn(y_pred, y_batch).item() / len(valid_loader)
        # keep/store predictions
        val_preds[i * batch_size:(i+1) * batch_size] = nn.functional.softmax(y_pred).cpu().numpy()

    # Check Accuracy
    val_accuracy = sum(val_preds.argmax(axis=1)==test_y)/len(test_y)
    train_loss.append(avg_loss)
    valid_loss.append(avg_val_loss)
    elapsed_time = time.time() - start_time 
    print('Epoch {}/{} \t loss={:.4f} \t val_loss={:.4f}  \t val_acc={:.4f}  \t time={:.2f}s'.format(
                epoch + 1, n_epochs, avg_loss, avg_val_loss, val_accuracy, elapsed_time))

  val_preds[i * batch_size:(i+1) * batch_size] = nn.functional.softmax(y_pred).cpu().numpy()


Epoch 1/100 	 loss=68.3196 	 val_loss=77.5651  	 val_acc=0.7349  	 time=1.58s
Epoch 2/100 	 loss=63.7386 	 val_loss=71.3206  	 val_acc=0.7364  	 time=1.33s
Epoch 3/100 	 loss=62.0838 	 val_loss=69.9812  	 val_acc=0.7379  	 time=1.33s
Epoch 4/100 	 loss=60.9789 	 val_loss=66.9399  	 val_acc=0.7640  	 time=1.34s
Epoch 5/100 	 loss=56.8759 	 val_loss=77.6947  	 val_acc=0.7610  	 time=1.33s
Epoch 6/100 	 loss=53.7744 	 val_loss=76.4093  	 val_acc=0.7667  	 time=1.31s
Epoch 7/100 	 loss=52.0550 	 val_loss=72.7950  	 val_acc=0.7737  	 time=1.32s
Epoch 8/100 	 loss=54.6803 	 val_loss=65.6027  	 val_acc=0.7587  	 time=1.31s
Epoch 9/100 	 loss=55.0412 	 val_loss=61.5821  	 val_acc=0.7750  	 time=1.33s
Epoch 10/100 	 loss=51.0044 	 val_loss=65.5553  	 val_acc=0.8025  	 time=1.34s
Epoch 11/100 	 loss=48.9758 	 val_loss=69.5631  	 val_acc=0.7927  	 time=1.32s
Epoch 12/100 	 loss=46.3535 	 val_loss=66.2062  	 val_acc=0.8068  	 time=1.33s
Epoch 13/100 	 loss=47.0988 	 val_loss=73.7277  	 val_acc=0.8

In [135]:
filepath = 'model/lstm_condition'
torch.save(model.state_dict(), filepath)

INFERENCE

In [137]:
class Model(nn.Module):
    
    def __init__(self, n_vocab, n_embed, n_hidden, n_output, n_layers=1, drop_p = 0.8):
        super().__init__()
        self.n_vocab = n_vocab  
        self.n_layers = n_layers
        self.n_hidden = n_hidden
        self.embedding = nn.Embedding(n_vocab, n_embed)
        self.lstm = nn.LSTM(n_embed, n_hidden, n_layers, batch_first = True, dropout = drop_p)
        self.dropout = nn.Dropout(drop_p)
        #concat of avg_pool and max_pool generate twice the dimension
        self.fc = nn.Linear(n_hidden*2, n_hidden//2)
        self.relu = nn.ReLU()
        self.out = nn.Linear(n_hidden//2, n_output) 
        
    def forward(self, tokenized_idx):
        embeddings = self.embedding(tokenized_idx)
        lstm_out, _ = self.lstm(embeddings)
        avg_pool = torch.mean(lstm_out, 1)
        max_pool, _ = torch.max(lstm_out, 1)
        conc = torch.cat(( avg_pool, max_pool), 1)
        conc = self.relu(self.fc(conc))
        conc = self.dropout(conc)
        out = self.out(conc)
        return out

In [139]:
n_vocab = len(w2id_dict)
n_embed = 64
n_hidden = 100
n_output = 6
n_layers = 2
batch_size = 100

model = Model(n_vocab, n_embed, n_hidden, n_output, n_layers)
model.load_state_dict(torch.load(filepath))
model.eval()

Model(
  (embedding): Embedding(5001, 64)
  (lstm): LSTM(64, 100, num_layers=2, batch_first=True, dropout=0.8)
  (dropout): Dropout(p=0.8, inplace=False)
  (fc): Linear(in_features=200, out_features=50, bias=True)
  (relu): ReLU()
  (out): Linear(in_features=50, out_features=6, bias=True)
)

In [153]:
tmp =['taken it for a long time it is the only thing that works for me',
 'tried all the benzodiazepines with no luck then they tried buspar and it changed my life']

sent_idx = sent_to_idx(tmp, w2id_dict)
sent_idx = torch.tensor(sent_idx, dtype=torch.long)
pred = model(sent_idx).detach()
pred = nn.functional.softmax(pred, 1).numpy()
pred = pred.argmax(axis=1)
pred = le.classes_[pred]
pred

array(['Pain', 'Anxiety'], dtype=object)