In [1]:
import os
import time

import numpy as np
import pandas as pd

In [2]:
# Read the train and test splits from the local data directory.
train_csv, test_csv = 'data/drugsComTest_train.csv', 'data/drugsComTest_test.csv'
df_train, df_test = (pd.read_csv(csv_path) for csv_path in (train_csv, test_csv))

In [3]:
# Normalise the review text: lower-case it and drop every character that is
# not an ASCII letter, a digit or a space. More preprocessing can be tried here.
import re

_NON_ALNUM_RUN = re.compile(r'[^A-Za-z0-9 ]+')


def _clean_review(text):
    """Return ``text`` lower-cased with all runs of non-alphanumeric, non-space characters removed."""
    return _NON_ALNUM_RUN.sub('', text.lower())


df_train['review'] = df_train['review'].apply(_clean_review)
df_test['review'] = df_test['review'].apply(_clean_review)

In [4]:
from sklearn.preprocessing import LabelEncoder
# The encoder is fitted on the training conditions only and then reused,
# unchanged, to encode the test conditions.
le = LabelEncoder()
train_conditions = df_train['condition'].values
test_conditions = df_test['condition'].values
train_y = le.fit_transform(train_conditions)
test_y = le.transform(test_conditions)

In [5]:
from util import *

In [6]:
# Build the vocabulary from the training reviews only, then derive the
# word -> id mapping and its inverse (id -> word).
# NOTE(review): prepare_vocab / prepare_word_dict are not defined in this
# notebook (presumably they come from `util`) -- confirm their exact output.
vocab_list = prepare_vocab(df_train['review'])
w2id_dict = prepare_word_dict(vocab_list)
id2w_dict = dict((idx, word) for word, idx in w2id_dict.items())

In [7]:
# Run both splits through sent_to_idx with the mapping built from the training data.
# NOTE(review): sent_to_idx is not defined in this notebook; judging by the
# variable names it returns padded id sequences -- confirm against its source.
padded_sents_train, padded_sents_test = [
    sent_to_idx(frame['review'], w2id_dict) for frame in (df_train, df_test)
]

In [8]:
import torch.nn as nn
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [17]:
class CNN_Text(nn.Module):
    """Convolutional text classifier over batches of token-id sequences.

    Token ids are embedded, several parallel convolutions (one per entry of
    ``filter_sizes``) slide over the token axis, each feature map is max-pooled
    over time, and the pooled features are concatenated, passed through
    dropout and projected to ``n_output`` logits.

    Args:
        n_vocab: Number of rows in the embedding table.
        n_embed: Embedding dimension (also the width of every convolution kernel).
        n_filters: Number of output channels of each convolution.
        n_output: Number of logits produced per example.
        filter_sizes: Kernel heights, i.e. how many consecutive tokens each
            convolution looks at. Any iterable of positive ints.
        drop_p: Dropout probability applied to the concatenated pooled features.

    Raises:
        ValueError: If ``filter_sizes`` is empty.
    """

    def __init__(self, n_vocab, n_embed, n_filters, n_output, filter_sizes=(1, 2, 3, 5), drop_p = 0.3):
        super().__init__()
        # Immutable default + local copy: the caller's sequence is never shared or mutated.
        filter_sizes = tuple(filter_sizes)
        if not filter_sizes:
            raise ValueError("filter_sizes must contain at least one window size")
        # Shortest sequence every convolution can process without failing.
        self._min_seq_len = max(filter_sizes)

        self.embedding = nn.Embedding(n_vocab, n_embed)
        self.convs1 = nn.ModuleList([nn.Conv2d(1, n_filters, (K, n_embed)) for K in filter_sizes])
        self.dropout = nn.Dropout(drop_p)
        self.fc1 = nn.Linear(len(filter_sizes)*n_filters, n_output)
        self.relu = nn.ReLU()

    def forward(self, tokenized_idx):
        """Compute class logits for a batch of token-id sequences.

        Args:
            tokenized_idx: Integer tensor of shape ``(batch, seq_len)``.

        Returns:
            Tensor of shape ``(batch, n_output)`` holding unnormalised logits.
        """
        x = self.embedding(tokenized_idx)  # (batch, seq_len, n_embed)

        # Sequences shorter than the largest kernel would make Conv2d raise;
        # extend them with zero vectors at the end of the token axis.
        deficit = self._min_seq_len - x.size(1)
        if deficit > 0:
            x = nn.functional.pad(x, (0, 0, 0, deficit))

        x = x.unsqueeze(1)  # (batch, 1, seq_len, n_embed): single input channel
        # Each kernel spans the full embedding width, so the last axis collapses to 1.
        x = [self.relu(conv(x)).squeeze(3) for conv in self.convs1]
        # Max over time: one value per filter regardless of sequence length.
        x = [nn.functional.max_pool1d(i, i.size(2)).squeeze(2) for i in x]
        x = torch.cat(x, 1)  # (batch, len(filter_sizes) * n_filters)
        x = self.dropout(x)
        logit = self.fc1(x)
        return logit

In [18]:
# Hyper-parameters for the model and the optimiser.
n_epochs = 10
n_vocab = len(w2id_dict)     # one embedding row per entry of the word -> id mapping
n_embed = 64
n_filters = 36
# One logit per encoded condition; derived from the label encoder so it always
# matches the width of the prediction buffer used during validation.
n_output = len(le.classes_)
batch_size = 100

model = CNN_Text(n_vocab, n_embed, n_filters, n_output)
# reduction='sum': each batch's loss is the sum (not the mean) over its examples.
loss_fn = nn.CrossEntropyLoss(reduction='sum')
# Only parameters that require gradients are handed to the optimiser.
optimizer = torch.optim.Adam(filter(lambda x: x.requires_grad, model.parameters()), lr=0.001)

In [19]:
# Use the GPU when one is available; otherwise fall back to the CPU so the
# notebook still runs on machines without CUDA.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Token ids and class labels are stored as int64 tensors on the chosen device.
x_train = torch.tensor(padded_sents_train, dtype=torch.long).to(device)
y_train = torch.tensor(train_y, dtype=torch.long).to(device)
x_test = torch.tensor(padded_sents_test, dtype=torch.long).to(device)
y_test = torch.tensor(test_y, dtype=torch.long).to(device)

In [20]:
# Short local aliases for the two torch.utils.data classes used below.
tensor_dataset = torch.utils.data.TensorDataset
batch_loader = torch.utils.data.DataLoader

# Pair every input sequence with its label; the test split doubles as the
# validation set.
train = tensor_dataset(x_train, y_train)
valid = tensor_dataset(x_test, y_test)

# Batches are reshuffled for training only; validation order stays fixed.
train_loader = batch_loader(train, batch_size=batch_size, shuffle=True)
valid_loader = batch_loader(valid, batch_size=batch_size, shuffle=False)

In [21]:
# Train for n_epochs epochs, evaluating on the validation loader after each one.
n_epochs = 100
train_loss = []   # per-epoch training loss, averaged over batches
valid_loss = []   # per-epoch validation loss, averaged over batches

for epoch in range(n_epochs):
    start_time = time.time()

    # ---- training pass ----
    model.train()
    avg_loss = 0.
    for x_batch, y_batch in train_loader:
        # Predict/Forward Pass
        y_pred = model(x_batch)
        # Compute loss
        loss = loss_fn(y_pred, y_batch)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        avg_loss += loss.item() / len(train_loader)

    # ---- validation pass: evaluation mode, no parameter updates ----
    model.eval()
    avg_val_loss = 0.
    val_preds = np.zeros((len(x_test), len(le.classes_)))
    # no_grad avoids building autograd graphs that are never back-propagated.
    with torch.no_grad():
        for i, (x_batch, y_batch) in enumerate(valid_loader):
            y_pred = model(x_batch)
            avg_val_loss += loss_fn(y_pred, y_batch).item() / len(valid_loader)
            # Store class probabilities; dim=1 normalises across classes for
            # each example and silences the implicit-dim deprecation warning.
            val_preds[i * batch_size:(i+1) * batch_size] = nn.functional.softmax(y_pred, dim=1).cpu().numpy()

    # Fraction of validation examples whose arg-max class matches the label.
    val_accuracy = np.mean(val_preds.argmax(axis=1) == test_y)
    train_loss.append(avg_loss)
    valid_loss.append(avg_val_loss)
    elapsed_time = time.time() - start_time
    print('Epoch {}/{} \t loss={:.4f} \t val_loss={:.4f}  \t val_acc={:.4f}  \t time={:.2f}s'.format(
                epoch + 1, n_epochs, avg_loss, avg_val_loss, val_accuracy, elapsed_time))

  val_preds[i * batch_size:(i+1) * batch_size] = nn.functional.softmax(y_pred).cpu().numpy()


Epoch 1/100 	 loss=129.3258 	 val_loss=84.4121  	 val_acc=0.6986  	 time=3.39s
Epoch 2/100 	 loss=78.9565 	 val_loss=62.7320  	 val_acc=0.7720  	 time=3.29s
Epoch 3/100 	 loss=62.5451 	 val_loss=54.1366  	 val_acc=0.8083  	 time=3.30s
Epoch 4/100 	 loss=54.0054 	 val_loss=49.6843  	 val_acc=0.8270  	 time=3.31s


KeyboardInterrupt: 

In [22]:
# Persist only the learned weights (state_dict). The target directory is
# created first so saving does not fail on a fresh checkout.
filepath = 'model/cnn_condition'
os.makedirs(os.path.dirname(filepath), exist_ok=True)
torch.save(model.state_dict(), filepath)