In [1]:
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable
from torch.nn.utils.rnn import pad_packed_sequence, pack_padded_sequence

import pandas as pd
import numpy as np

import nltk

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

import os 
import time
import joblib
import shutil

## Define adjustable parameters here

In [15]:
BATCH_SIZE=32
EMBEDDING_DIM = 64
LSTM_OUT_DIM = 64
BIDIRECTIONAL = True
DROPOUT_PROB = 0.5
LINEAR_HIDDEN_SIZE = 64

TOKENIZER_TOP_N_WORDS = 2000  # Increasing this would increase embedding size
TEXT_WORD_LIMIT = 2000

CUDA_AVAILABLE = torch.cuda.is_available()

PRINT_FREQUENCY = 1

CHECKPOINT_FOLDER = 'pytorchckpts/4newarchsmall'

NUM_EPOCHS = 5  # You can change this to number of epochs you want the model to go through

In [3]:
# Do not change unless you know what this does. No need to change this even when resuming
BEST_LOSS = np.inf 
EPOCH = 0 
CHECKPOINT_NAME = os.path.join(CHECKPOINT_FOLDER, 'checkpoint.pth.tar')
BEST_CHECKPOINT_NAME = os.path.join(CHECKPOINT_FOLDER, 'model_best.pth.tar')

## Load and merge datasets

In [4]:
df_train_txt = pd.read_csv('training_text', sep='\|\|', header=None, skiprows=1, names=["ID","Text"], engine='python')
df_train_var = pd.read_csv('training_variants')
df_test_txt = pd.read_csv('test_text', sep='\|\|', header=None, skiprows=1, names=["ID","Text"], engine='python')
df_test_var = pd.read_csv('test_variants')
df_train = pd.merge(df_train_var, df_train_txt, how='left', on='ID')
df_test = pd.merge(df_test_var, df_test_txt, how='left', on='ID')

In [5]:
# Split off a validation set
df_train, df_val = train_test_split(df_train, test_size = 0.2, random_state = 42, stratify=df_train['Class'].values)


## Build dataset loader

In [6]:
class SentencesDataset(Dataset):
    def __init__(self, df, word_to_ix, word_limit=2000):
        """
        Args:
            df: pandas dataframe of same format as df_train/df_test
            
            word_to_ix: word to index dictionary
            
            word_limit: Number of words to limit
        """
        self.df = df
        self.word_to_ix = word_to_ix
        self.word_limit = word_limit
        if 'Class' in df:
            self.le = LabelEncoder().fit(df['Class'].values)
        
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, idx):
        text = self.df['Text'].values[idx]
        gene = self.df['Gene'].values[idx]
        variation = self.df['Variation'].values[idx]
        
        # Tokenize text. If word count becomes greater than limit, break.
        num_words = 0
        encoded_tokenized_text = []
        gene_same = []
        variation_same = []
        for sent in nltk.sent_tokenize(text):
            for word in nltk.word_tokenize(sent):
                if word in self.word_to_ix:
                    encoded_tokenized_text.append(self.word_to_ix[word])
                    num_words += 1
                    if word == gene:
                        gene_same.append(1)
                    else:
                        gene_same.append(0)
                    if word == variation:
                        variation_same.append(1)
                    else:
                        variation_same.append(0)
                    
                else:  # Word not in word_to_ix. Still, you must check if word is the gene/variation or not
                    if word == gene or word == variation:
                        if word == gene:
                            gene_same.append(1)
                        else:
                            gene_same.append(0)
                        if word == variation:
                            variation_same.append(1)
                        else:
                            variation_same.append(0)
                        encoded_tokenized_text.append(self.word_to_ix['Unknown'])
                        num_words += 1
                        
                if num_words >= self.word_limit: break
                
            if num_words >= self.word_limit: break
        
        # Special case: number of tokenized words = 0. Change to single word of unknown
        if num_words == 0:
            encoded_tokenized_text = [self.word_to_ix['Unknown']]
            gene_same.append(0)
            variation_same.append(0)
            print('FOUND NULL SENTENCE!!!')
            num_words += 1
        
        # Pad tokenized text if needed
        if num_words < self.word_limit:
            encoded_tokenized_text += [0] * (self.word_limit - num_words)
            gene_same += [0] * (self.word_limit - num_words)
            variation_same += [0] * (self.word_limit - num_words)
        
        # Create sample
        sample = {'text': encoded_tokenized_text,
                  'gene_same': gene_same,
                  'variation_same': variation_same,
                  'length': num_words,
                  'gene': self.word_to_ix[gene] if gene in self.word_to_ix else self.word_to_ix['Unknown'],
                  'variation': self.word_to_ix[variation] if variation in self.word_to_ix else self.word_to_ix['Unknown']}
        
        # If contains class, include class in sample
        if 'Class' in self.df:
            sample['class'] =  self.le.transform([self.df['Class'].values[idx]])[0]
        
        return sample

## Build word to index dictionary

In [7]:
def build_word_to_ix(df_train, location, top_n_words=TOKENIZER_TOP_N_WORDS):
    """Builds word_to_ix dictionary and saves it in location"""
    if os.path.exists(location):
        print('found pickled word_to_ix')
        return joblib.load(location)
    
    word_counts = dict()

    for doc in df_train['Text'].values:
        for sent in nltk.sent_tokenize(doc):
            for word in nltk.word_tokenize(sent):
                if word not in word_counts:
                    word_counts[word] = 1
                else:
                    word_counts[word] += 1
    
    wcounts = list(word_counts.items())
    wcounts.sort(key=lambda x: x[1], reverse=True)
    sorted_voc = [wc[0] for wc in wcounts]
    word_to_ix = dict(list(zip(sorted_voc, list(range(top_n_words)))))
    
    ix = len(word_to_ix)

    for gene in df_train['Gene'].values:
        if gene not in word_to_ix:
            word_to_ix[gene] = ix
            ix += 1

    for variation in df_train['Variation'].values:
        if variation not in word_to_ix:
            word_to_ix[variation] = ix
            ix += 1
    
    if 'Unknown' not in word_to_ix:
        word_to_ix['Unknown'] = ix
    
    joblib.dump(word_to_ix, location, compress=3)
    
    return word_to_ix

In [8]:
word_to_ix = build_word_to_ix(df_train, 'word_to_ix_3.pkl')

found pickled word_to_ix


In [9]:
train_dataset = SentencesDataset(df_train, word_to_ix, word_limit=TEXT_WORD_LIMIT)
train_dataset_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=0)
val_dataset = SentencesDataset(df_val, word_to_ix, word_limit=TEXT_WORD_LIMIT)
val_dataset_loader = DataLoader(val_dataset, batch_size=45, shuffle=False)

In [41]:
for i, batch in enumerate(train_dataset_loader):
    print(i)
    print(torch.stack(batch['gene_same']).size())
    print(np.unique(torch.stack(batch['gene_same']).numpy(), return_counts=True))
    print(np.unique(torch.stack(batch['variation_same']).numpy(), return_counts=True))
    print(len(batch['text']))
    if i > 1: 
        break

0
torch.Size([3000, 32])
(array([0, 1]), array([94808,  1192]))
(array([0, 1]), array([95952,    48]))
3000
1
torch.Size([3000, 32])
(array([0, 1]), array([94515,  1485]))
(array([0, 1]), array([95901,    99]))
3000
2
torch.Size([3000, 32])
(array([0, 1]), array([94840,  1160]))
(array([0, 1]), array([95859,   141]))
3000


## Build model

In [10]:
class MyLSTM(nn.Module):
    def __init__(self, vocab_size, embedding_dim, lstm_out_dim, bidirectional, prob):
        super(MyLSTM, self).__init__()
        self.embedding_dim = embedding_dim
        self.lstm_out_dim = lstm_out_dim
        self.num_directions = 2 if bidirectional else 1
        
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim+2, lstm_out_dim, bidirectional=bidirectional)
        self.linear = nn.Linear(lstm_out_dim*self.num_directions+2*embedding_dim, 9)
        self.dropout = nn.Dropout(p=prob)  
        
    def forward(self, batch):
        batch_size = len(batch['length'])
        
        # Sort all tensors by length of the sequence in decreasing order (for packed padded sequences to work)
        length, indices = torch.sort(batch['length'], dim=0, descending=True)
        gene = batch['gene'][indices]
        variation = batch['variation'][indices]
        text_batch = torch.stack(batch['text'], 0)[:, indices]
        gene_same = torch.stack(batch['gene_same'], 0)[:, indices]
        variation_same = torch.stack(batch['variation_same'], 0)[:, indices]
        
        # Wrap all tensors around a variable. Send to GPU if possible.
        text_batch = Variable(text_batch)
        length = Variable(length)
        gene = Variable(gene)
        variation = Variable(variation)
        gene_same = Variable(gene_same)
        variation_same = Variable(variation_same)
        if CUDA_AVAILABLE:
            text_batch, length, gene, variation, gene_same, variation_same = \
                text_batch.cuda(), length.cuda(), gene.cuda(), variation.cuda(), gene_same.cuda(), variation_same.cuda()
        
        # Pass text, gene, and variation to embedding
        embedded_text = self.embedding(text_batch)
        embedded_gene = self.embedding(gene)
        embedded_variation = self.embedding(variation)
        
        # Concatenate gene + variation with embedded text
        concatenated_embedded = torch.cat([embedded_text,
                                           torch.unsqueeze(gene_same, dim=2).float(),
                                           torch.unsqueeze(variation_same, dim=2).float()
                                          ],
                                          dim=2)
        
        # Initialize hidden and cell states for LSTM
        h0, c0 = self.init_hidden_and_cell_states(batch_size, 1, self.num_directions, self.lstm_out_dim)
        
        # Pack sequence, run through LSTM, and unpack output
        packed_embedded_text = pack_padded_sequence(concatenated_embedded, list(length.data))
        packed_h, (packed_h_t, packed_c_t) = self.lstm(packed_embedded_text, (h0, c0))
        h, _ = pad_packed_sequence(packed_h, batch_first=True)
        
        # Use fancy indexing to retrieve LSTM outputs for last timestep in each sequence
        h = h[
            np.arange(batch_size).reshape(-1, 1).tolist(), 
            length.data.view(-1, 1) - 1, 
            list(range(self.lstm_out_dim*self.num_directions))
        ]
        
        concatenated_h = torch.cat(
            [
                h,
                embedded_gene,
                embedded_variation
            ],
            dim=1
        )
        
        # RELU
        concatenated_h = F.relu(concatenated_h)
        
        concatenated_h = self.dropout(concatenated_h)
        
        output = self.linear(concatenated_h)
        
        log_probs = F.log_softmax(output)
        return log_probs, indices
    
    @staticmethod
    def init_hidden_and_cell_states(batch_size, num_layers, num_directions, hidden_size):
        hidden = Variable(torch.randn(num_layers*num_directions, batch_size, hidden_size))
        cell = Variable(torch.randn(num_layers*num_directions, batch_size, hidden_size))
        
        if CUDA_AVAILABLE:
            hidden, cell = hidden.cuda(), cell.cuda()
        
        return hidden, cell

In [11]:
#model = MyLSTM(len(word_to_ix), EMBEDDING_DIM, LSTM_OUT_DIM, BIDIRECTIONAL, DROPOUT_PROB)
#if CUDA_AVAILABLE: model.cuda()
#    
#for i, batch in enumerate(train_dataset_loader):
#    print(i)
#    out = model.forward(batch)
#    break

## Define a few training helpers

In [12]:
class AverageMeter(object):
    """Computes and stores the average and current value"""
    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count


def validate(val_loader, model, criterion):
    batch_time = AverageMeter()
    losses = AverageMeter()

    # switch to evaluate mode
    model.eval()

    end = time.time()
    for i, batch in enumerate(val_loader):

        # compute output
        
        log_probas, indices = model.forward(batch)
        labels = Variable(batch['class'][indices])
        if CUDA_AVAILABLE: labels = labels.cuda()
        loss = loss_fn(log_probas, labels)

        losses.update(loss.data[0], len(batch['length']))

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        if i % PRINT_FREQUENCY == 0:
            print('Test: [{0}/{1}]\t'
                  'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                  'Loss {loss.val:.4f} ({loss.avg:.4f})\t'.format(
                   i+1, len(val_loader), batch_time=batch_time, loss=losses))
    
    print(' * Loss {losses.avg:.3f}'.format(losses=losses))
    return losses.avg


def train(train_loader, model, criterion, optimizer, epoch, val_loader=None):
    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()
    
    model.train()

    end = time.time()
    for i, batch in enumerate(train_loader):
        
        # measure data loading time
        data_time.update(time.time() - end)

        # compute output
        model.zero_grad()
        log_probas, indices = model.forward(batch)
        
        labels = Variable(batch['class'][indices])
        if CUDA_AVAILABLE: labels = labels.cuda()
        
        loss = criterion(log_probas, labels)
        loss.backward()
        optimizer.step()

        # measure accuracy and record loss
        losses.update(loss.data[0], len(batch['length']))

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        if i % PRINT_FREQUENCY == 0:
            print('Epoch: [{0}][{1}/{2}]\t'
                  'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                  'Data {data_time.val:.3f} ({data_time.avg:.3f})\t'
                  'Loss {loss.val:.4f} ({loss.avg:.4f})\t'.format(
                   epoch, i+1, len(train_loader), batch_time=batch_time,
                   data_time=data_time, loss=losses))
            
        if i % 10 == 0:
            validate(val_loader, model, criterion)
            model.train()


def save_checkpoint(state, is_best, filename=CHECKPOINT_NAME):
    if not os.path.exists(os.path.dirname(CHECKPOINT_NAME)):
        os.makedirs(os.path.dirname(CHECKPOINT_NAME))
    torch.save(state, filename)
    if is_best:
        shutil.copyfile(filename, BEST_CHECKPOINT_NAME)
        



## Define model, loss function, and optimizer

In [13]:
loss_fn = nn.NLLLoss()

model = MyLSTM(len(word_to_ix), EMBEDDING_DIM, LSTM_OUT_DIM, BIDIRECTIONAL, DROPOUT_PROB)
if CUDA_AVAILABLE:
    model.cuda()

optimizer = optim.Adam(model.parameters(), lr=0.01)

## Start main loop. If checkpoint exists, start from there

In [16]:
if os.path.exists(CHECKPOINT_NAME):
    print("=> loading checkpoint '{}'".format(CHECKPOINT_NAME))
    checkpoint = torch.load(CHECKPOINT_NAME)
    EPOCH = checkpoint['epoch']
    BEST_LOSS = checkpoint['best_loss']
    model.load_state_dict(checkpoint['state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer'])
    print("=> loaded checkpoint '{}' (epoch {})"
          .format(CHECKPOINT_NAME, checkpoint['epoch']))
else:
    print("=> no checkpoint found at '{}'. Starting from scratch".format(CHECKPOINT_NAME))
    
for epoch in range(EPOCH, NUM_EPOCHS):
    train(train_dataset_loader, model, loss_fn, optimizer, epoch + 1, val_dataset_loader)
    loss = validate(val_dataset_loader, model, loss_fn)
    
    if loss < BEST_LOSS:
        print('{} better than previous best loss of {}'.format(loss, BEST_LOSS))
        BEST_LOSS = loss
        is_best = True
    else:
        is_best = False
    
    save_checkpoint({
            'epoch': epoch + 1,
            'state_dict': model.state_dict(),
            'best_loss': BEST_LOSS,
            'optimizer' : optimizer.state_dict(),
        }, is_best
    )

=> loading checkpoint 'pytorchckpts/4newarchsmall/checkpoint.pth.tar'
=> loaded checkpoint 'pytorchckpts/4newarchsmall/checkpoint.pth.tar' (epoch 4)
Epoch: [5][1/83]	Time 5.429 (5.429)	Data 1.407 (1.407)	Loss 0.8788 (0.8788)	
Test: [1/15]	Time 2.079 (2.079)	Loss 1.1905 (1.1905)	
Test: [2/15]	Time 2.132 (2.106)	Loss 1.4684 (1.3294)	
Test: [3/15]	Time 2.378 (2.196)	Loss 1.5748 (1.4112)	
Test: [4/15]	Time 2.484 (2.268)	Loss 1.4875 (1.4303)	
Test: [5/15]	Time 2.111 (2.237)	Loss 1.2381 (1.3919)	
Test: [6/15]	Time 2.443 (2.271)	Loss 0.9248 (1.3140)	
Test: [7/15]	Time 2.123 (2.250)	Loss 1.1377 (1.2888)	
FOUND NULL SENTENCE!!!
Test: [8/15]	Time 2.197 (2.243)	Loss 1.1806 (1.2753)	
Test: [9/15]	Time 2.048 (2.222)	Loss 1.2378 (1.2711)	
FOUND NULL SENTENCE!!!
Test: [10/15]	Time 2.159 (2.215)	Loss 1.1005 (1.2541)	
Test: [11/15]	Time 2.256 (2.219)	Loss 1.2305 (1.2519)	
Test: [12/15]	Time 2.039 (2.204)	Loss 0.8356 (1.2172)	
Test: [13/15]	Time 2.276 (2.210)	Loss 1.5837 (1.2454)	
Test: [14/15]	Time 2.2

1.4554716895397444

## Generate submissions

In [17]:
if os.path.exists(CHECKPOINT_NAME):
    print("=> loading checkpoint '{}'".format(BEST_CHECKPOINT_NAME))
    checkpoint = torch.load(BEST_CHECKPOINT_NAME)
    EPOCH = checkpoint['epoch']
    BEST_LOSS = checkpoint['best_loss']
    model.load_state_dict(checkpoint['state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer'])
    print("=> loaded checkpoint '{}' (epoch {})"
          .format(BEST_CHECKPOINT_NAME, checkpoint['epoch']))
else:
    raise Exception("=> no checkpoint found at '{}'. Cannot generate submission".format(BEST_CHECKPOINT_NAME))

model.eval()

=> loading checkpoint 'pytorchckpts/4newarchsmall/model_best.pth.tar'
=> loaded checkpoint 'pytorchckpts/4newarchsmall/model_best.pth.tar' (epoch 2)


MyLSTM (
  (embedding): Embedding(4588, 64)
  (lstm): LSTM(66, 64, bidirectional=True)
  (linear): Linear (256 -> 9)
  (dropout): Dropout (p = 0.5)
)

In [18]:
test_dataset = SentencesDataset(df_test, word_to_ix, word_limit=TEXT_WORD_LIMIT)
test_dataset_loader = DataLoader(test_dataset, batch_size=100, shuffle=False)

probas = []
for i, test_batch in enumerate(test_dataset_loader):
    if i % 10 == 0:
        print(i)
    log_probas, indices = model.forward(test_batch)
    _, orig_indices = torch.sort(indices)
    log_probas = log_probas.data.cpu()[orig_indices]
    probas.append(log_probas)


0
10
FOUND NULL SENTENCE!!!
20
30
40
50


In [19]:
probas = torch.exp(torch.cat(probas, dim=0))
probas



 0.0622  0.1158  0.0011  ...   0.7752  0.0004  0.0012
 0.1170  0.0347  0.0009  ...   0.7903  0.0005  0.0008
 0.2793  0.0555  0.0008  ...   0.3545  0.0007  0.0016
          ...             ⋱             ...          
 0.1399  0.0802  0.0005  ...   0.6757  0.0009  0.0022
 0.2980  0.0875  0.0012  ...   0.4194  0.0013  0.0015
 0.1367  0.1685  0.0023  ...   0.5363  0.0006  0.0057
[torch.FloatTensor of size 5668x9]

In [20]:
probas.numpy()[0]

array([  6.21822216e-02,   1.15822256e-01,   1.07325823e-03,
         2.51004081e-02,   1.43704694e-02,   4.66867723e-03,
         7.75180221e-01,   3.96610150e-04,   1.20594550e-03], dtype=float32)

In [21]:
np.unique(np.argmax(probas.numpy(), axis=1), return_counts=True)

(array([0, 1, 3, 4, 5, 6, 8]),
 array([ 212,   23,  134,   19,   27, 5252,    1]))

In [22]:
submission_df = pd.DataFrame(probas.numpy(), columns=['class'+str(c+1) for c in range(9)])
submission_df['ID'] = df_test['ID']
submission_df.head()

Unnamed: 0,class1,class2,class3,class4,class5,class6,class7,class8,class9,ID
0,0.062182,0.115822,0.001073,0.0251,0.01437,0.004669,0.77518,0.000397,0.001206,0
1,0.116994,0.034654,0.000936,0.023114,0.028921,0.003864,0.790295,0.000465,0.000756,1
2,0.279304,0.055468,0.000751,0.217349,0.075128,0.015165,0.354532,0.000658,0.001646,2
3,0.270914,0.126977,0.001761,0.088126,0.066954,0.017781,0.419938,0.004516,0.003032,3
4,0.32593,0.068644,0.001481,0.09563,0.081464,0.039915,0.381639,0.001762,0.003535,4


In [23]:
submission_df.to_csv('submission.csv', index=False)