In [1]:
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable
from torch.nn.utils.rnn import pad_packed_sequence, pack_padded_sequence

import pandas as pd
import numpy as np

import nltk

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

import os 
import time
import joblib
import shutil

## Define adjustable parameters here

In [2]:
BATCH_SIZE=32
EMBEDDING_DIM = 32
LSTM_OUT_DIM = 64
BIDIRECTIONAL = True
DROPOUT_PROB = 0.5

TOKENIZER_TOP_N_WORDS = 2000  # Increasing this would increase embedding size
TEXT_WORD_LIMIT = 3000

CUDA_AVAILABLE = torch.cuda.is_available()

PRINT_FREQUENCY = 1

CHECKPOINT_FOLDER = 'pytorchckpts/3smallnet'

NUM_EPOCHS = 3  # You can change this to number of epochs you want the model to go through

In [3]:
# Do not change unless you know what this does. No need to change this even when resuming
BEST_LOSS = np.inf 
EPOCH = 0 
CHECKPOINT_NAME = os.path.join(CHECKPOINT_FOLDER, 'checkpoint.pth.tar')
BEST_CHECKPOINT_NAME = os.path.join(CHECKPOINT_FOLDER, 'model_best.pth.tar')

## Load and merge datasets

In [4]:
df_train_txt = pd.read_csv('training_text', sep='\|\|', header=None, skiprows=1, names=["ID","Text"], engine='python')
df_train_var = pd.read_csv('training_variants')
df_test_txt = pd.read_csv('test_text', sep='\|\|', header=None, skiprows=1, names=["ID","Text"], engine='python')
df_test_var = pd.read_csv('test_variants')
df_train = pd.merge(df_train_var, df_train_txt, how='left', on='ID')
df_test = pd.merge(df_test_var, df_test_txt, how='left', on='ID')

In [5]:
# Split off a validation set
df_train, df_val = train_test_split(df_train, test_size = 0.2, random_state = 42, stratify=df_train['Class'].values)


## Build dataset loader

In [6]:
class SentencesDataset(Dataset):
    def __init__(self, df, word_to_ix, word_limit=2000):
        """
        Args:
            df: pandas dataframe of same format as df_train/df_test
            
            word_to_ix: word to index dictionary
            
            word_limit: Number of words to limit
        """
        self.df = df
        self.word_to_ix = word_to_ix
        self.word_limit = word_limit
        if 'Class' in df:
            self.le = LabelEncoder().fit(df['Class'].values)
        
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, idx):
        text = self.df['Text'].values[idx]
        gene = self.df['Gene'].values[idx]
        variation = self.df['Variation'].values[idx]
        
        # Tokenize text. If word count becomes greater than limit, break.
        num_words = 0
        encoded_tokenized_text = []
        for sent in nltk.sent_tokenize(text):
            for word in nltk.word_tokenize(sent):
                if word in self.word_to_ix:
                    encoded_tokenized_text.append(self.word_to_ix[word])
                    num_words += 1
                    if num_words >= self.word_limit: break
            if num_words >= self.word_limit: break
        
        # Special case: number of tokenized words = 0. Change to single word of unknown
        if num_words == 0:
            encoded_tokenized_text = [self.word_to_ix['Unknown']]
            print('FOUND NULL SENTENCE!!!')
            num_words += 1
        
        # Pad tokenized text if needed
        if num_words < self.word_limit:
            encoded_tokenized_text += [0] * (self.word_limit - len(encoded_tokenized_text))
        
        # Create sample
        sample = {'text': encoded_tokenized_text,
                  'length': num_words,
                  'gene': self.word_to_ix[gene] if gene in self.word_to_ix else self.word_to_ix['Unknown'],
                  'variation': self.word_to_ix[variation] if variation in self.word_to_ix else self.word_to_ix['Unknown']}
        
        # If contains class, include class in sample
        if 'Class' in self.df:
            sample['class'] =  self.le.transform([self.df['Class'].values[idx]])[0]
        
        return sample

## Build word to index dictionary

In [7]:
def build_word_to_ix(df_train, location, top_n_words=TOKENIZER_TOP_N_WORDS):
    """Builds word_to_ix dictionary and saves it in location"""
    if os.path.exists(location):
        print('found pickled word_to_ix')
        return joblib.load(location)
    
    word_counts = dict()

    for doc in df_train['Text'].values:
        for sent in nltk.sent_tokenize(doc):
            for word in nltk.word_tokenize(sent):
                if word not in word_counts:
                    word_counts[word] = 1
                else:
                    word_counts[word] += 1
    
    wcounts = list(word_counts.items())
    wcounts.sort(key=lambda x: x[1], reverse=True)
    sorted_voc = [wc[0] for wc in wcounts]
    word_to_ix = dict(list(zip(sorted_voc, list(range(top_n_words)))))
    
    ix = len(word_to_ix)

    for gene in df_train['Gene'].values:
        if gene not in word_to_ix:
            word_to_ix[gene] = ix
            ix += 1

    for variation in df_train['Variation'].values:
        if variation not in word_to_ix:
            word_to_ix[variation] = ix
            ix += 1
    
    if 'Unknown' not in word_to_ix:
        word_to_ix['Unknown'] = ix
    
    joblib.dump(word_to_ix, location, compress=3)
    
    return word_to_ix

In [8]:
word_to_ix = build_word_to_ix(df_train, 'word_to_ix_3.pkl')

found pickled word_to_ix


In [9]:
train_dataset = SentencesDataset(df_train, word_to_ix, word_limit=TEXT_WORD_LIMIT)
train_dataset_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=0)
val_dataset = SentencesDataset(df_val, word_to_ix, word_limit=TEXT_WORD_LIMIT)
val_dataset_loader = DataLoader(val_dataset, batch_size=45, shuffle=False)

In [15]:
for i, batch in enumerate(train_dataset_loader):
    print(i)
    if i > 1: 
        break

0
1
2


## Build model

In [10]:
class MyLSTM(nn.Module):
    def __init__(self, vocab_size, embedding_dim, lstm_out_dim, bidirectional, prob):
        super(MyLSTM, self).__init__()
        self.embedding_dim = embedding_dim
        self.lstm_out_dim = lstm_out_dim
        self.num_directions = 2 if bidirectional else 1
        
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim*3, lstm_out_dim, bidirectional=bidirectional)
        self.linear = nn.Linear(lstm_out_dim*self.num_directions, 9)
        self.dropout = nn.Dropout(p=prob)  
        
    def forward(self, batch):
        batch_size = len(batch['length'])
        
        # Sort all tensors by length of the sequence in decreasing order (for packed padded sequences to work)
        length, indices = torch.sort(batch['length'], dim=0, descending=True)
        gene = batch['gene'][indices]
        variation = batch['variation'][indices]
        text_batch = torch.stack(batch['text'], 0)[:, indices]
        
        # Wrap all tensors around a variable. Send to GPU if possible.
        text_batch = Variable(text_batch)
        length = Variable(length)
        gene = Variable(gene)
        variation = Variable(variation)
        if CUDA_AVAILABLE:
            text_batch, length, gene, variation = text_batch.cuda(), length.cuda(), gene.cuda(), variation.cuda()
        
        # Pass text, gene, and variation to embedding
        embedded_text = self.embedding(text_batch)
        embedded_gene = self.embedding(gene)
        embedded_variation = self.embedding(variation)
        
        # Concatenate gene + variation with embedded text
        concatenated_embedded = torch.cat([embedded_text,
                                           torch.unsqueeze(embedded_gene, dim=0).expand(
                                               text_batch.data.shape[0], batch_size, self.embedding_dim),
                                           torch.unsqueeze(embedded_variation, dim=0).expand(
                                               text_batch.data.shape[0], batch_size, self.embedding_dim)
                                          ],
                                          dim=2)
        
        # Initialize hidden and cell states for LSTM
        h0, c0 = self.init_hidden_and_cell_states(batch_size, 1, self.num_directions, self.lstm_out_dim)
        
        # Pack sequence, run through LSTM, and unpack output
        packed_embedded_text = pack_padded_sequence(concatenated_embedded, list(length.data))
        packed_h, (packed_h_t, packed_c_t) = self.lstm(packed_embedded_text, (h0, c0))
        h, _ = pad_packed_sequence(packed_h, batch_first=True)
        
        # Use fancy indexing to retrieve LSTM outputs for last timestep in each sequence
        h = h[
            np.arange(batch_size).reshape(-1, 1).tolist(), 
            length.data.view(-1, 1) - 1, 
            list(range(self.lstm_out_dim*self.num_directions))
        ]
        
        # RELU
        h = F.relu(h)
        
        h = self.dropout(h)
        
        output = self.linear(h)
        
        log_probs = F.log_softmax(output)
        return log_probs, indices
    
    @staticmethod
    def init_hidden_and_cell_states(batch_size, num_layers, num_directions, hidden_size):
        hidden = Variable(torch.randn(num_layers*num_directions, batch_size, hidden_size))
        cell = Variable(torch.randn(num_layers*num_directions, batch_size, hidden_size))
        
        if CUDA_AVAILABLE:
            hidden, cell = hidden.cuda(), cell.cuda()
        
        return hidden, cell

## Define a few training helpers

In [11]:
class AverageMeter(object):
    """Computes and stores the average and current value"""
    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count


def validate(val_loader, model, criterion):
    batch_time = AverageMeter()
    losses = AverageMeter()

    # switch to evaluate mode
    model.eval()

    end = time.time()
    for i, batch in enumerate(val_loader):

        # compute output
        
        log_probas, indices = model.forward(batch)
        labels = Variable(batch['class'][indices])
        if CUDA_AVAILABLE: labels = labels.cuda()
        loss = loss_fn(log_probas, labels)

        losses.update(loss.data[0], len(batch['length']))

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        if i % PRINT_FREQUENCY == 0:
            print('Test: [{0}/{1}]\t'
                  'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                  'Loss {loss.val:.4f} ({loss.avg:.4f})\t'.format(
                   i+1, len(val_loader), batch_time=batch_time, loss=losses))
    
    print(' * Loss {losses.avg:.3f}'.format(losses=losses))
    return losses.avg


def train(train_loader, model, criterion, optimizer, epoch, val_loader=None):
    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()
    
    model.train()

    end = time.time()
    for i, batch in enumerate(train_loader):
        
        # measure data loading time
        data_time.update(time.time() - end)

        # compute output
        model.zero_grad()
        log_probas, indices = model.forward(batch)
        
        labels = Variable(batch['class'][indices])
        if CUDA_AVAILABLE: labels = labels.cuda()
        
        loss = criterion(log_probas, labels)
        loss.backward()
        optimizer.step()

        # measure accuracy and record loss
        losses.update(loss.data[0], len(batch['length']))

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        if i % PRINT_FREQUENCY == 0:
            print('Epoch: [{0}][{1}/{2}]\t'
                  'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                  'Data {data_time.val:.3f} ({data_time.avg:.3f})\t'
                  'Loss {loss.val:.4f} ({loss.avg:.4f})\t'.format(
                   epoch, i+1, len(train_loader), batch_time=batch_time,
                   data_time=data_time, loss=losses))
            
        if i % 10 == 0:
            validate(val_loader, model, criterion)
            model.train()


def save_checkpoint(state, is_best, filename=CHECKPOINT_NAME):
    if not os.path.exists(os.path.dirname(CHECKPOINT_NAME)):
        os.makedirs(os.path.dirname(CHECKPOINT_NAME))
    torch.save(state, filename)
    if is_best:
        shutil.copyfile(filename, BEST_CHECKPOINT_NAME)
        



## Define model, loss function, and optimizer

In [12]:
loss_fn = nn.NLLLoss()

model = MyLSTM(len(word_to_ix), EMBEDDING_DIM, LSTM_OUT_DIM, BIDIRECTIONAL, DROPOUT_PROB)
if CUDA_AVAILABLE:
    model.cuda()

optimizer = optim.Adam(model.parameters(), lr=0.01)

## Start main loop. If checkpoint exists, start from there

In [13]:
if os.path.exists(CHECKPOINT_NAME):
    print("=> loading checkpoint '{}'".format(CHECKPOINT_NAME))
    checkpoint = torch.load(CHECKPOINT_NAME)
    EPOCH = checkpoint['epoch']
    BEST_LOSS = checkpoint['best_loss']
    model.load_state_dict(checkpoint['state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer'])
    print("=> loaded checkpoint '{}' (epoch {})"
          .format(CHECKPOINT_NAME, checkpoint['epoch']))
else:
    print("=> no checkpoint found at '{}'. Starting from scratch".format(CHECKPOINT_NAME))
    
for epoch in range(EPOCH, NUM_EPOCHS):
    train(train_dataset_loader, model, loss_fn, optimizer, epoch + 1, val_dataset_loader)
    loss = validate(val_dataset_loader, model, loss_fn)
    
    if loss < BEST_LOSS:
        print('{} better than previous best loss of {}'.format(loss, BEST_LOSS))
        BEST_LOSS = loss
        is_best = True
    else:
        is_best = Falsee
    
    save_checkpoint({
            'epoch': epoch + 1,
            'state_dict': model.state_dict(),
            'best_loss': BEST_LOSS,
            'optimizer' : optimizer.state_dict(),
        }, is_best
    )

=> no checkpoint found at 'pytorchckpts/3smallnet/checkpoint.pth.tar'. Starting from scratch
Epoch: [1][1/83]	Time 11.982 (11.982)	Data 1.759 (1.759)	Loss 2.1749 (2.1749)	
Test: [1/15]	Time 2.565 (2.565)	Loss 2.1008 (2.1008)	
Test: [2/15]	Time 2.555 (2.560)	Loss 2.0802 (2.0905)	
Test: [3/15]	Time 2.961 (2.693)	Loss 2.0988 (2.0933)	
Test: [4/15]	Time 2.966 (2.762)	Loss 2.1320 (2.1030)	
Test: [5/15]	Time 2.664 (2.742)	Loss 2.0771 (2.0978)	
Test: [6/15]	Time 2.851 (2.760)	Loss 2.0800 (2.0948)	
Test: [7/15]	Time 2.655 (2.745)	Loss 2.1077 (2.0967)	
FOUND NULL SENTENCE!!!
Test: [8/15]	Time 2.679 (2.737)	Loss 2.0955 (2.0965)	
Test: [9/15]	Time 2.563 (2.717)	Loss 2.0939 (2.0962)	
FOUND NULL SENTENCE!!!
Test: [10/15]	Time 2.569 (2.703)	Loss 2.0927 (2.0959)	
Test: [11/15]	Time 2.793 (2.711)	Loss 2.1293 (2.0989)	
Test: [12/15]	Time 2.441 (2.688)	Loss 2.0858 (2.0978)	
Test: [13/15]	Time 2.850 (2.701)	Loss 2.1310 (2.1004)	
Test: [14/15]	Time 2.627 (2.695)	Loss 2.1453 (2.1036)	
Test: [15/15]	Time 2.

In [20]:
loss

1.577482152702217

## Generate submissions

In [15]:
if os.path.exists(CHECKPOINT_NAME):
    print("=> loading checkpoint '{}'".format(BEST_CHECKPOINT_NAME))
    checkpoint = torch.load(BEST_CHECKPOINT_NAME)
    EPOCH = checkpoint['epoch']
    BEST_LOSS = checkpoint['best_loss']
    model.load_state_dict(checkpoint['state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer'])
    print("=> loaded checkpoint '{}' (epoch {})"
          .format(BEST_CHECKPOINT_NAME, checkpoint['epoch']))
else:
    raise Exception("=> no checkpoint found at '{}'. Cannot generate submission".format(BEST_CHECKPOINT_NAME))

model.eval()

=> loading checkpoint 'pytorchckpts/1increasedseqlength/model_best.pth.tar'
=> loaded checkpoint 'pytorchckpts/1increasedseqlength/model_best.pth.tar' (epoch 1)


MyLSTM (
  (embedding): Embedding(4588, 128)
  (lstm): LSTM(384, 196, bidirectional=True)
  (linear): Linear (392 -> 9)
)

In [None]:
test_dataset = SentencesDataset(df_test, word_to_ix, word_limit=TEXT_WORD_LIMIT)
test_dataset_loader = DataLoader(test_dataset, batch_size=100, shuffle=False)

probas = []
for i, test_batch in enumerate(test_dataset_loader):
    if i % 10 == 0:
        print(i)
    log_probas, indices = model.forward(test_batch)
    _, orig_indices = torch.sort(indices)
    log_probas = log_probas.data.cpu()[orig_indices]
    probas.append(log_probas)


0


In [18]:
probas = torch.exp(torch.cat(probas, dim=0))
probas



 5.9359e-02  1.0463e-01  4.0240e-02  ...   2.3442e-01  2.4609e-02  1.1846e-02
 1.1853e-01  8.7576e-02  4.6069e-03  ...   4.1419e-01  8.8140e-03  4.6486e-03
 1.9026e-01  1.6926e-01  2.7235e-02  ...   1.7260e-01  1.2332e-02  8.0794e-03
                ...                   ⋱                   ...                
 6.2367e-02  9.1292e-02  2.1270e-02  ...   1.5810e-01  1.8779e-02  9.2628e-03
 9.6648e-02  1.9787e-01  7.6937e-02  ...   3.8579e-01  2.2144e-02  9.0063e-03
 1.2727e-01  1.4228e-01  2.8646e-02  ...   3.9214e-01  1.8107e-02  5.9765e-03
[torch.FloatTensor of size 5668x9]

In [20]:
probas.numpy()[0]

array([ 0.05935883,  0.1046342 ,  0.04023978,  0.29929909,  0.20350039,
        0.02209731,  0.23441558,  0.02460939,  0.01184551], dtype=float32)

In [21]:
np.unique(np.argmax(probas.numpy(), axis=1), return_counts=True)

(array([0, 1, 2, 3, 4, 5, 6, 8]),
 array([ 160,  120,    5, 1987,  289,   37, 3069,    1]))

In [22]:
submission_df = pd.DataFrame(probas.numpy(), columns=['class'+str(c+1) for c in range(9)])
submission_df['ID'] = df_test['ID']
submission_df.head()

Unnamed: 0,class1,class2,class3,class4,class5,class6,class7,class8,class9,ID
0,0.059359,0.104634,0.04024,0.299299,0.2035,0.022097,0.234416,0.024609,0.011846,0
1,0.11853,0.087576,0.004607,0.160645,0.194606,0.006379,0.414194,0.008814,0.004649,1
2,0.190263,0.169263,0.027235,0.249991,0.149442,0.020791,0.172604,0.012332,0.008079,2
3,0.114072,0.100352,0.042725,0.349307,0.152393,0.023283,0.172244,0.030713,0.014911,3
4,0.071063,0.114701,0.21679,0.182893,0.128843,0.026818,0.224724,0.02538,0.008788,4


In [23]:
submission_df.to_csv('submission.csv', index=False)