# Preamble
raw csv $\rightarrow$ tokenized data $\rightarrow$ training/validation/test splits

In [3]:
from __future__ import division
import pandas as pd
import numpy as np

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

from sklearn.model_selection import StratifiedShuffleSplit

# Load raw csv
labels = pd.read_csv('../tmp/1_METONLY_vs_METX/matched_CEM_table.csv', header=0).rename({'Unnamed: 0': 'PIN'}, axis=1)[['PIN', 'CLASS']].set_index('PIN')
data = pd.read_csv('../tmp/raw_sequences2.csv', header=0, index_col=0).loc[labels.index, 'seq']
df = pd.DataFrame(columns=['Seq', 'Class'], index=data.index)
df.loc[:, 'Seq'] = data
df.loc[:, 'Class'] = labels['CLASS']

# Define tokenizer object
tokenizer = Tokenizer(char_level=False, lower=False, split=' ')

# Fit on corpus and extract tokenized sequences
tokenizer.fit_on_texts(df['Seq'])
n_feat = len(tokenizer.word_index.keys())
seq = tokenizer.texts_to_sequences(df['Seq'])

# Pad tokenized sequences
lengths = [len(x) for x in seq]
maxlen = int(np.percentile(lengths, 95))
p_seq = pad_sequences(seq, maxlen=maxlen)

# Full dataset
y = df['Class'].ravel()
X = p_seq

# Learn / Test
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.5, random_state=0)
learn_idx, test_idx = next(sss.split(X, y))

X_learn, y_learn = X[learn_idx, :], y[learn_idx]
X_test, y_test = X[test_idx, :], y[test_idx]

print('{} learn / {} test'.format(len(y_learn), len(y_test)))

# Training / Validation
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.1, random_state=1)
train_idx, valid_idx = next(sss.split(X_learn, y_learn))

X_train, y_train = X_learn[train_idx, :], y_learn[train_idx]
X_valid, y_valid = X_learn[valid_idx, :], y_learn[valid_idx]

print('{} training / {} validation'.format(len(y_train), len(y_valid)))

Using TensorFlow backend.


5872 learn / 5872 test
5284 training / 588 validation


In [4]:
X_train.shape

(5284, 889)

# Simple pytorch LSTM

In [5]:
import torch

X_train_ft = torch.LongTensor(X_train)
y_train_ft = torch.LongTensor(y_train)
X_valid_ft = torch.LongTensor(X_valid)
y_valid_ft = torch.LongTensor(y_valid)
X_test_ft = torch.LongTensor(X_test)
y_test_ft = torch.LongTensor(y_test)

In [6]:
from torch.utils.data import DataLoader, TensorDataset
batch_size = 128

training_set = TensorDataset(X_train_ft, y_train_ft)
train_loader = DataLoader(training_set,
                          batch_size=batch_size,
                          shuffle=True,
                          num_workers=4)

valid_set = TensorDataset(X_valid_ft, y_valid_ft)
valid_loader = DataLoader(valid_set,
                          batch_size=batch_size,
                          shuffle=False,
                          num_workers=4)

test_set = TensorDataset(X_test_ft, y_test_ft)
test_loader = DataLoader(test_set,
                         batch_size=batch_size,
                         shuffle=False,
                         num_workers=4)

In [19]:
import torch.nn as nn
from torch import optim
from torch.autograd import Variable

class LSTMClassifier(nn.Module):

    def __init__(self, embedding_dim, hidden_dim, vocab_size,
                 label_size, batch_size, use_gpu, lstm_dropout):
        super(LSTMClassifier, self).__init__()
        self.hidden_dim = hidden_dim
        self.batch_size = batch_size
        self.use_gpu = use_gpu

        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, dropout=lstm_dropout)
        self.hidden2label = nn.Linear(hidden_dim, label_size)
        self.hidden = self.init_hidden()

    def init_hidden(self):
        if self.use_gpu:
            h0 = Variable(torch.zeros(1, self.batch_size, self.hidden_dim).cuda())
            c0 = Variable(torch.zeros(1, self.batch_size, self.hidden_dim).cuda())
        else:
            h0 = Variable(torch.zeros(1, self.batch_size, self.hidden_dim))
            c0 = Variable(torch.zeros(1, self.batch_size, self.hidden_dim))
        return (h0, c0)

    def forward(self, sentence):
        embeds = self.word_embeddings(sentence)
        x = embeds.view(len(sentence), self.batch_size, -1)
        lstm_out, self.hidden = self.lstm(x, self.hidden)
        y  = self.hidden2label(lstm_out[-1])
        return y

In [21]:
embedding_dim = 64
hidden_dim = 32
sentence_len = maxlen
learning_rate = 0.05
lstm_droput = 0.25

### create model
model = LSTMClassifier(embedding_dim=embedding_dim,
                       hidden_dim=hidden_dim,
                       lstm_dropout=lstm_droput,
                       vocab_size=n_feat,
                       label_size=2,
                       batch_size=128,
                       use_gpu=True)
model = model.cuda()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
loss_function = nn.CrossEntropyLoss()

In [22]:
n_epochs = 5
train_loss_ = []
valid_loss_ = []
train_acc_ = []
valid_acc_ = []

### training procedure
for i, epoch in enumerate(range(n_epochs)):
    ## training epoch
    total_acc = 0.0
    total_loss = 0.0
    total = 0.0
    
    ## Trainining epoch
    for _iter, traindata in enumerate(train_loader):
        train_inputs, train_labels = traindata
        train_labels = torch.squeeze(train_labels)
        
        train_inputs, train_labels = Variable(train_inputs.cuda()), Variable(train_labels.cuda())

        model.zero_grad()
        model.batch_size = len(train_labels)
        model.hidden = model.init_hidden()
        output = model(train_inputs.t())
        
        loss = loss_function(output, train_labels)
        loss.backward()
        optimizer.step()
        
        # calc training acc
        _, predicted = torch.max(output.data, 1)
        total_acc += (predicted == train_labels.data).sum()
        total += len(train_labels)
        total_loss += loss.data[0]
        
    train_loss_.append(total_loss / total)
    train_acc_.append(total_acc / total)
    
    ## Validation epoch
    total_acc = 0.0
    total_loss = 0.0
    total = 0.0
    for _iter, validdata in enumerate(valid_loader):
        valid_inputs, valid_labels = validdata
        valid_labels = torch.squeeze(valid_labels)

        valid_inputs, valid_labels = Variable(valid_inputs.cuda()), Variable(valid_labels.cuda())

        model.batch_size = len(valid_labels)
        model.hidden = model.init_hidden()
        output = model(valid_inputs.t())

        loss = loss_function(output, valid_labels)

        # calc testing acc
        _, predicted = torch.max(output.data, 1)
        total_acc += (predicted == valid_labels.data).sum()
        total += len(valid_labels)
        total_loss += loss.data[0]
    
    valid_loss_.append(total_loss / total)
    valid_acc_.append(total_acc / total)
    
    print('[epoch {}]: loss: {:1.5e} acc: {:1.5f} '
          '| valid loss: {:1.5e} valid acc {:1.5f}'.format(i+1, train_loss_[-1],
                                                           train_acc_[-1], valid_loss_[-1],
                                                           valid_acc_[-1]))

[epoch 1]: loss: 5.04656e-03 acc: 0.63531 | valid loss: 5.34040e-03 valid acc 0.67517
[epoch 2]: loss: 4.63650e-03 acc: 0.69550 | valid loss: 5.33416e-03 valid acc 0.67687
[epoch 3]: loss: 4.48323e-03 acc: 0.70988 | valid loss: 5.28331e-03 valid acc 0.68878
[epoch 4]: loss: 4.41308e-03 acc: 0.70742 | valid loss: 5.38647e-03 valid acc 0.66837
[epoch 5]: loss: 4.32866e-03 acc: 0.71915 | valid loss: 5.39054e-03 valid acc 0.65816
