In [1]:
import pandas as pd
import numpy as np
import time

In [2]:
# Read the training and evaluation splits from CSV, in that order.
df_train, df_test = map(
    pd.read_csv,
    ('data/drugsComTest_train.csv', 'data/drugsComTest_test.csv'),
)

In [3]:
# Minimal text normalisation for now; further preprocessing steps can be tried here.
import re
def _normalize_review(text):
    """Lower-case ``text`` and drop every run of characters other than
    ASCII letters, digits and spaces."""
    return re.sub(r'[^A-Za-z0-9 ]+', '', text.lower())


# Apply the same normalisation to both splits (training split first).
for _frame in (df_train, df_test):
    _frame['review'] = _frame['review'].apply(_normalize_review)

In [4]:
from sklearn.preprocessing import LabelEncoder
# Map the values of the 'condition' column to integer class ids.
# The encoder is fitted on the training split only; the test split is
# transformed with that already-fitted mapping.
le = LabelEncoder()
train_y, test_y = (
    le.fit_transform(df_train['condition'].values),
    le.transform(df_test['condition'].values),
)

In [5]:
from util import *

In [6]:
# Build the vocabulary from the training reviews only, then derive the
# word -> id mapping and its inverse (id -> word).
vocab_list = prepare_vocab(df_train['review'])
w2id_dict = prepare_word_dict(vocab_list)
id2w_dict = dict((idx, word) for word, idx in w2id_dict.items())

In [7]:
# Convert the reviews of each split to index sequences using the shared
# word -> id mapping (training split first, then test split).
padded_sents_train, padded_sents_test = (
    sent_to_idx(frame['review'], w2id_dict) for frame in (df_train, df_test)
)

In [8]:
import torch.nn as nn
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [9]:
class MLP(nn.Module):
    """Multi-layer perceptron over a fixed-length sequence of token ids.

    Each token id is embedded, the embeddings of all ``max_len`` positions
    are concatenated into one flat vector, and that vector is passed through
    three hidden dense layers (ReLU + dropout after each) followed by a
    linear output layer that produces unnormalised class scores.
    """

    def __init__(self, n_vocab, n_embed, n_output, max_len, hidden_sizes=(256, 128, 64), dropout=0.2):
        """Create the layers.

        Args:
            n_vocab: number of rows in the embedding table.
            n_embed: size of each embedding vector.
            n_output: number of output scores per sample.
            max_len: number of token positions per sample; the first dense
                layer expects ``n_embed * max_len`` input features.
            hidden_sizes: sequence whose first three entries are the widths
                of the three hidden layers. The default is an immutable
                tuple so it cannot be shared and mutated between instances.
            dropout: dropout probability applied after each hidden layer.
        """
        super().__init__()

        # Embedding table followed by the dense stack (there is no
        # convolution in this model). Attribute names and creation order are
        # kept as-is so saved state_dicts remain loadable.
        self.embedding = nn.Embedding(n_vocab, n_embed)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(dropout)
        self.fc1 = nn.Linear(n_embed * max_len, hidden_sizes[0])
        self.fc2 = nn.Linear(hidden_sizes[0], hidden_sizes[1])
        self.fc3 = nn.Linear(hidden_sizes[1], hidden_sizes[2])
        self.fc4 = nn.Linear(hidden_sizes[2], n_output)

    def forward(self, tokenized_idx):
        """Return the raw (pre-softmax) scores for a batch of token ids.

        Args:
            tokenized_idx: integer tensor of token ids; its first dimension
                is treated as the batch dimension and everything after it is
                flattened, so each sample must hold ``max_len`` ids.

        Returns:
            Tensor with one row of ``n_output`` scores per sample.
        """
        embedded = self.embedding(tokenized_idx)
        # Concatenate the per-position embeddings into one vector per sample.
        x = embedded.view(embedded.shape[0], -1)
        for hidden_layer in (self.fc1, self.fc2, self.fc3):
            x = self.dropout(self.relu(hidden_layer(x)))
        return self.fc4(x)

In [10]:
# Training hyper-parameters and model/optimizer construction.
n_epochs = 10
n_vocab = len(w2id_dict)  # one embedding row per entry of the word -> id mapping
n_embed = 64
# NOTE(review): presumably the padded sequence length produced by sent_to_idx;
# the first dense layer of MLP is sized n_embed * max_len — confirm they agree.
max_len = 50
# Derived from the fitted label encoder instead of a hard-coded 6, so the
# output layer always matches the prediction buffer, which is sized with
# len(le.classes_) in the training loop.
n_output = len(le.classes_)
batch_size = 100

model = MLP(n_vocab, n_embed, n_output, max_len)
# reduction='sum': the loss of a batch is the sum over its samples, not the mean.
loss_fn = nn.CrossEntropyLoss(reduction='sum')
# Only parameters that require gradients are handed to the optimizer.
optimizer = torch.optim.Adam(filter(lambda x: x.requires_grad, model.parameters()), lr=0.001)

In [11]:
# Use the GPU when one is available and fall back to the CPU otherwise, so
# the notebook also runs on machines without CUDA.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Token ids and labels are stored as int64 tensors on the same device as the model.
x_train = torch.tensor(padded_sents_train, dtype=torch.long, device=device)
y_train = torch.tensor(train_y, dtype=torch.long, device=device)
x_test = torch.tensor(padded_sents_test, dtype=torch.long, device=device)
y_test = torch.tensor(test_y, dtype=torch.long, device=device)

In [12]:
# Pair every input tensor with its label tensor; the test split serves as
# the validation set during training.
train, valid = (
    torch.utils.data.TensorDataset(inputs, labels)
    for inputs, labels in ((x_train, y_train), (x_test, y_test))
)

# Mini-batch iterators: training batches are reshuffled, validation batches
# keep the dataset order.
train_loader = torch.utils.data.DataLoader(dataset=train, batch_size=batch_size, shuffle=True)
valid_loader = torch.utils.data.DataLoader(dataset=valid, batch_size=batch_size, shuffle=False)

In [14]:
# Per-epoch history. Each entry is the mean over batches of the loss value
# returned by loss_fn for that batch.
train_loss = []
valid_loss = []

for epoch in range(n_epochs):
    start_time = time.time()

    # ---- Training pass ----
    model.train()
    avg_loss = 0.
    for x_batch, y_batch in train_loader:
        # Predict/Forward Pass
        y_pred = model(x_batch)
        # Compute loss
        loss = loss_fn(y_pred, y_batch)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        avg_loss += loss.item() / len(train_loader)

    # ---- Validation pass: eval mode, no parameter updates ----
    model.eval()
    avg_val_loss = 0.
    val_preds = np.zeros((len(x_test), len(le.classes_)))
    # No gradients are needed here, so skip building the autograd graph.
    with torch.no_grad():
        for i, (x_batch, y_batch) in enumerate(valid_loader):
            y_pred = model(x_batch)
            avg_val_loss += loss_fn(y_pred, y_batch).item() / len(valid_loader)
            # Store class probabilities for this batch. The loader is not
            # shuffled, so batch i fills rows [i*batch_size, (i+1)*batch_size).
            # dim=1 (the class axis) is passed explicitly to avoid the
            # implicit-dimension deprecation warning.
            val_preds[i * batch_size:(i + 1) * batch_size] = nn.functional.softmax(y_pred, dim=1).cpu().numpy()

    # Check Accuracy: fraction of samples whose most probable class matches the label.
    val_accuracy = np.mean(val_preds.argmax(axis=1) == test_y)
    train_loss.append(avg_loss)
    valid_loss.append(avg_val_loss)
    elapsed_time = time.time() - start_time
    print('Epoch {}/{} \t loss={:.4f} \t val_loss={:.4f}  \t val_acc={:.4f}  \t time={:.2f}s'.format(
                epoch + 1, n_epochs, avg_loss, avg_val_loss, val_accuracy, elapsed_time))

  val_preds[i * batch_size:(i+1) * batch_size] = nn.functional.softmax(y_pred).cpu().numpy()


Epoch 1/10 	 loss=8.0528 	 val_loss=151.9132  	 val_acc=0.7222  	 time=0.58s
Epoch 2/10 	 loss=7.3833 	 val_loss=155.2295  	 val_acc=0.7274  	 time=0.52s
Epoch 3/10 	 loss=6.8750 	 val_loss=152.5064  	 val_acc=0.7284  	 time=0.52s
Epoch 4/10 	 loss=6.2262 	 val_loss=154.1834  	 val_acc=0.7309  	 time=0.54s
Epoch 5/10 	 loss=5.5124 	 val_loss=155.8282  	 val_acc=0.7344  	 time=0.52s
Epoch 6/10 	 loss=4.5409 	 val_loss=171.2732  	 val_acc=0.7322  	 time=0.54s
Epoch 7/10 	 loss=4.8694 	 val_loss=165.3856  	 val_acc=0.7364  	 time=0.51s
Epoch 8/10 	 loss=4.5360 	 val_loss=167.0269  	 val_acc=0.7387  	 time=0.51s
Epoch 9/10 	 loss=4.4156 	 val_loss=174.4077  	 val_acc=0.7382  	 time=0.51s
Epoch 10/10 	 loss=5.1412 	 val_loss=171.8922  	 val_acc=0.7412  	 time=0.51s


In [15]:
import os

# Persist only the learned parameters (state_dict), not the whole module.
filepath = 'model/mlp_condition'
# torch.save does not create missing directories, so make sure the target
# folder exists before writing.
os.makedirs(os.path.dirname(filepath), exist_ok=True)
torch.save(model.state_dict(), filepath)