In [1]:
import sys, os
import random
import torch
import torch.nn as nn
import torchtext
from torchtext import data
from torchtext import datasets
from tqdm import tqdm

In [4]:
from tensorboardX import SummaryWriter
from datetime import datetime

timestamp = datetime.now().strftime('%m%d%H%M')
tbrun = os.path.join('/homes/du113/scratch/tensorboard/runs', 'alstm_reg')

In [2]:
SEED = 1234

torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

In [3]:
def tokenizer(inputs):
    return inputs.split()

In [4]:
SENT = data.Field(tokenize=tokenizer)
TEXT = data.NestedField(SENT)
LABEL = data.LabelField(dtype=torch.float)

In [5]:
fields = {'text': ('t',TEXT), 'label': ('l',LABEL)}

In [6]:
train_data, test_data = data.TabularDataset.splits(
                            path = 'lstm_data',
                            train = 'train.json',
                            test = 'test.json',
                            format = 'json',
                            fields = fields
)

In [7]:
vars(train_data[0])

{'t': [['KINGSTON',
   'After',
   'more',
   'than',
   'a',
   'decade',
   'in',
   'a',
   'vegetative',
   'state',
   ',',
   '27',
   'year',
   'old',
   'Kingston',
   'man',
   'Alex',
   'Hinds',
   'opened',
   'his',
   'eyes',
   'last',
   'week',
   'much',
   'to',
   'the',
   'surprise',
   'of',
   'healthcare',
   'providers',
   '.'],
  ['When',
   'asked',
   'what',
   'he',
   'planned',
   'to',
   'do',
   'now',
   'that',
   'he',
   'was',
   'awake',
   ',',
   'Hinds',
   'nodded',
   'enthusiastically',
   'towards',
   'an',
   'attractive',
   'young',
   'nurse',
   ',',
   'smiled',
   'and',
   'said',
   ',',
   '"',
   'I',
   "'",
   'm',
   'going',
   'to',
   'ask',
   'her',
   'waaaaaaaaassssssup',
   '?',
   '.',
   '"',
   'Hinds',
   'then',
   'continued',
   ',',
   '"',
   'Maybe',
   ',',
   'I',
   "'",
   'll',
   'ask',
   'her',
   'to',
   'go',
   'see',
   'My',
   'Big',
   'Fat',
   'Greek',
   'Wedding',
   '2',
   '.',
   

In [9]:
vars(test_data[0])

{'t': [['Northern',
   'Ireland',
   'are',
   'relishing',
   'a',
   'rare',
   'chance',
   'to',
   'focus',
   'on',
   'attack',
   'against',
   'the',
   'Faroe',
   'Islands',
   ',',
   'according',
   'to',
   'manager',
   'Michael',
   'O',
   "'",
   'Neill',
   '.'],
  ['O',
   "'",
   'Neill',
   "'",
   's',
   'side',
   'do',
   'not',
   'go',
   'into',
   'many',
   'matches',
   'with',
   'a',
   'heavy',
   'expectation',
   'of',
   'victory',
   'but',
   'Saturday',
   "'",
   's',
   'home',
   'clash',
   'against',
   'a',
   'side',
   'ranked',
   '179th',
   'in',
   'the',
   'world',
   ',',
   'nestling',
   'neatly',
   'between',
   'Chinese',
   'Taipei',
   'and',
   'the',
   'Turks',
   'Caicos',
   'Islands',
   ',',
   'is',
   'one',
   'such',
   'occasion',
   '.'],
  ['Expectations',
   'are',
   'even',
   'loftier',
   'given',
   'last',
   'month',
   "'",
   's',
   'impressive',
   '2',
   '1',
   'away',
   'win',
   'over',
   'H

In [8]:
train_data, valid_data = train_data.split(random_state=random.seed(SEED))

In [10]:
print(f'Number of training examples: {len(train_data)}')
print(f'Number of validation examples: {len(valid_data)}')
print(f'Number of testing examples: {len(test_data)}')

Number of training examples: 10010
Number of validation examples: 4290
Number of testing examples: 3300


In [13]:
sum([t.l for t in train_data])

924

In [14]:
sum([t.l for t in valid_data])

376

In [15]:
sum([t.l for t in test_data])

300

In [11]:
vec = torchtext.vocab.Vectors('glove100d.txt', cache='/homes/du113/scratch')

In [12]:
TEXT.build_vocab(train_data, valid_data, test_data, vectors=vec)
LABEL.build_vocab(train_data, valid_data, test_data)

In [13]:
print(f"Unique tokens in TEXT vocabulary: {len(TEXT.vocab)}")
print(f"Unique tokens in LABEL vocabulary: {len(LABEL.vocab)}")

Unique tokens in TEXT vocabulary: 136395
Unique tokens in LABEL vocabulary: 2


In [14]:
print(TEXT.vocab.itos[:10])
print(LABEL.vocab.stoi)

['<unk>', '<pad>', 'the', ',', '.', 'to', "'", 'of', 'a', 'and']
defaultdict(<function _default_unk_index at 0x7f3f59f232f0>, {0: 0, 1: 1})


In [15]:
import pickle
with open('/homes/du113/scratch/out_dom_data/vocabs/alstm_vocab.pkl', 'wb') as f:
    pickle.dump(TEXT.vocab, f)

In [20]:
BATCH_SIZE = 1

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data), 
    batch_size=BATCH_SIZE,
    device=device, sort=False)

In [60]:
class LSTM(nn.Module):
    def __init__(self, voc_size, emb_dim, hidden_size):
        super(LSTM, self).__init__()
        self.embeds = nn.Embedding(voc_size, emb_dim)
        self.rnn = nn.LSTM(input_size=emb_dim,
                           hidden_size=hidden_size,
                           batch_first=True,
                           bidirectional=False)
        self.linear = nn.Linear(hidden_size, 1)
        
    def forward(self, input):
        # the input should have size [batch, num sent, sent len]
        
        embedded = self.embeds(input)
        
        # [batch, num sent, sent len, embed dim]
        
        embedded = embedded.mean(dim=2) # average over sentence dimension
        
        # [batch, num sent, embed dim]
        
        _, (embedded, _) = self.rnn(embedded)
        
        # dim = [batch, emb]
        
        return self.linear(embedded.squeeze(0))

In [61]:
voc_size = len(TEXT.vocab)
emb_dim = 100
hidden = 256

model = LSTM(voc_size, emb_dim, hidden)
model.embeds.from_pretrained(TEXT.vocab.vectors)

Embedding(136395, 100)

In [23]:
import torch.optim as optim

In [62]:
optimizer = optim.Adam(model.parameters(), lr=1e-3)

In [25]:
criterion = nn.BCEWithLogitsLoss()

In [26]:
class_weights=torch.Tensor([0.55, 5.54]).to(device)

In [63]:
model = model.to(device)
criterion = criterion.to(device)

In [28]:
from sklearn.metrics import f1_score

In [64]:
def train(model, iterator, optimizer, criterion):
    
    epoch_loss = 0
    prediction = []
    label = []
    
    model.train()
    
    for batch in tqdm(iterator):
        
#         print(batch.t.shape)
        
        optimizer.zero_grad()
                
        predictions = model(batch.t).squeeze(1)
        
        criterion.weight = class_weights[batch.l.long()]
        
        loss = criterion(predictions, batch.l)
        
        loss.backward()
        
        optimizer.step()
        
        epoch_loss += loss.item()
        
        prediction.append(
            torch.sigmoid(predictions).round().detach().cpu()
        )
        label.append(batch.l.detach().cpu())
        
    prediction = torch.cat(prediction).numpy()
    label = torch.cat(label).numpy()
    
    f1 = f1_score(label, prediction)
        
    return epoch_loss / len(iterator), f1

In [65]:
def evaluate(model, iterator, criterion):
    
    epoch_loss = 0
    prediction = []
    label = []
    
    model.eval()
    
    with torch.no_grad():
    
        for batch in tqdm(iterator):

            predictions = model(batch.t).squeeze(1)
        
            criterion.weight = class_weights[batch.l.long()]
            
            loss = criterion(predictions, batch.l)

            epoch_loss += loss.item()
        
            prediction.append(
                torch.sigmoid(predictions).round().detach().cpu()
            )
            label.append(batch.l.detach().cpu())
            
    prediction = torch.cat(prediction).numpy()
    label = torch.cat(label).numpy()
    
    f1 = f1_score(label, prediction)
        
    return epoch_loss / len(iterator), f1

In [69]:
N_EPOCHS = 5

for epoch in range(N_EPOCHS):

    train_loss, train_f1 = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_f1 = evaluate(model, valid_iterator, criterion)
    
    print(f'| Epoch: {epoch+1:02} | Train Loss: {train_loss:.3f} | Train f1: {train_f1*100:.2f}% | Val. Loss: {valid_loss:.3f} | Val. f1: {valid_f1*100:.2f}% |')

100%|██████████| 10010/10010 [02:30<00:00, 66.45it/s]
100%|██████████| 4290/4290 [00:25<00:00, 168.60it/s]
  0%|          | 6/10010 [00:00<03:08, 53.05it/s]

| Epoch: 01 | Train Loss: 0.024 | Train f1: 95.92% | Val. Loss: 0.529 | Val. f1: 81.54% |


100%|██████████| 10010/10010 [02:30<00:00, 66.44it/s]
100%|██████████| 4290/4290 [00:25<00:00, 170.37it/s]
  0%|          | 5/10010 [00:00<03:34, 46.62it/s]

| Epoch: 02 | Train Loss: 0.010 | Train f1: 98.14% | Val. Loss: 1.220 | Val. f1: 80.71% |


100%|██████████| 10010/10010 [02:31<00:00, 66.05it/s]
100%|██████████| 4290/4290 [00:25<00:00, 168.26it/s]
  0%|          | 5/10010 [00:00<03:40, 45.32it/s]

| Epoch: 03 | Train Loss: 0.010 | Train f1: 99.19% | Val. Loss: 0.394 | Val. f1: 84.16% |


100%|██████████| 10010/10010 [02:31<00:00, 66.06it/s]
100%|██████████| 4290/4290 [00:24<00:00, 175.19it/s]
  0%|          | 5/10010 [00:00<03:32, 47.00it/s]

| Epoch: 04 | Train Loss: 0.005 | Train f1: 99.25% | Val. Loss: 0.359 | Val. f1: 83.09% |


100%|██████████| 10010/10010 [02:30<00:00, 66.69it/s]
100%|██████████| 4290/4290 [00:25<00:00, 171.09it/s]

| Epoch: 05 | Train Loss: 0.000 | Train f1: 100.00% | Val. Loss: 1.033 | Val. f1: 83.10% |





In [70]:
test_loss, test_f1 = evaluate(model, test_iterator, criterion)

print(f'| Test Loss: {test_loss:.3f} | Test f1: {test_f1*100:.2f}% |')


100%|██████████| 3300/3300 [00:18<00:00, 179.20it/s]

| Test Loss: 1.040 | Test f1: 86.06% |





In [71]:
torch.save({
            'state_dict': model.state_dict(),
            'optimizer' : optimizer.state_dict()
            },
    'lstm_data/models/alstm_1_14_86.06_overfit.pth.tar')

In [1]:
import torch

In [2]:
x1 = torch.randn(2,2)

In [3]:
x2 = torch.randn(2,1)

In [4]:
x1, x2

(tensor([[-0.7026, -0.0826],
         [-0.1960,  0.6961]]), tensor([[-0.1417],
         [ 1.5052]]))

In [5]:
x1*x2

tensor([[ 0.0995,  0.0117],
        [-0.2950,  1.0477]])

In [7]:
import torch.nn.functional as F

In [8]:
x1.unsqueeze_(2)

tensor([[[-0.7026],
         [-0.0826]],

        [[-0.1960],
         [ 0.6961]]])

In [9]:
F.softmax(x1, dim=1)

tensor([[[0.3498],
         [0.6502]],

        [[0.2907],
         [0.7093]]])