In [1]:
!pip install torch torchtext tqdm



# SET LIBRARY

In [2]:
import os, time, sys
from glob import glob
import numpy as np
import pandas as pd
import torch

from torchtext import data, datasets
from torchtext.vocab import GloVe, FastText, CharNGram
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.autograd import Variable

import matplotlib.pyplot as plt
%matplotlib inline

# LOAD DATA

In [None]:
#!pwd
#!tar xvzf nmr-100k.tar.gz

In [3]:
tab = pd.read_csv('/jet/prs/workspace/nmr-100k.csv',delimiter='|')

In [4]:
# SANITY CHECK
print(len(tab.loc[0,'rviw_modd'].split()))

50


In [5]:
tab['label'] = np.where(tab.rviw_rate>7, "pos", np.where(tab.rviw_rate==1, "neg", "neu"))
tab['input'] = tab.rviw_modd

In [6]:
pd.crosstab(tab.is_train, tab.label, margins=False).apply(lambda r: round(r/r.sum(),3), axis=1)

label,neg,neu,pos
is_train,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
False,0.111,0.186,0.703
True,0.1,0.189,0.712


In [None]:
tab[tab.is_train==True][tab.label.isin(['pos','neg'])].loc[:,['label','input']].to_json('train.json',orient='records',lines=True) #.sample(frac=0.1, replace=False)
tab[tab.is_train!=True][tab.label.isin(['pos','neg'])].loc[:,['label','input']].to_json('valid.json',orient='records',lines=True) #.sample(frac=0.1, replace=False)

In [7]:
INPUT = data.Field(fix_length=50, batch_first=True)
LABEL = data.Field(sequential=False,)

fields = {'label': ('label', LABEL), 'input': ('input', INPUT)}

train, valid = data.TabularDataset.splits(
    path = '/jet/prs/workspace',
    train = 'train.json',
    test = 'valid.json',
    format = 'json',
    fields = fields
)

In [8]:
print(vars(train[0]))

{'label': 'pos', 'input': ['24151', '49437', '14146', '29768', '16450', '12248', '12248', '24151', '49437', '44879', '18715', '51458', '43907', '20743', '24116', '27813', '18196', '48998', '43703', '34396', '6247', '40398', '30707', '24173', '7767', '35830', '43062', '56983', '6322', '48998', '21323', '14146', '52824', '10', '23685', '52516', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0']}


In [9]:
INPUT.build_vocab(train, vectors=GloVe(name='6B', dim=300), max_size=10000, min_freq=10)
LABEL.build_vocab(train,)

In [10]:
g_train, g_valid = data.BucketIterator.splits((train, valid), batch_size=32, device=-1, shuffle=True, sort=False)
g_train.repeat = False
g_valid.repeat = False
dataloader = {'train':g_train, 'valid':g_valid}
dataset_sizes = {'train':len(g_train.dataset),'valid':len(g_valid.dataset)}

The `device` argument should be set by using `torch.device` or passing a string as an argument. This behavior will be deprecated soon and currently defaults to cpu.
The `device` argument should be set by using `torch.device` or passing a string as an argument. This behavior will be deprecated soon and currently defaults to cpu.


In [11]:
# SANITY CHECK
batch = next(iter(dataloader['train']))
x_input = batch.input.cuda()
y_label = batch.label.cuda()
print(x_input.size())
print(y_label.size())

torch.Size([32, 50])
torch.Size([32])


In [12]:
class Gph(nn.Module):
        
    def __init__(self, n_vocab, n_hidden, n_label, btch_size=1, kernel_size=3, max_len=50):
        super().__init__()
        self.n_hidden = n_hidden
        self.btch_size = btch_size
        self.embd = nn.Embedding(n_vocab, n_hidden)
        self.conv1d = nn.Conv1d(max_len, n_hidden, kernel_size)
        self.pool = nn.AdaptiveAvgPool1d(10)
        self.fcn = nn.Linear(1000, n_label)
        self.softmax = nn.LogSoftmax(dim=-1)
        
    def forward(self, x_input):
        btch_size = x_input.size()[0]
        if btch_size != self.btch_size:
            self.btch_size = btch_size
        embd = self.embd(x_input)
        conv1d = self.conv1d(embd) 
        pool = self.pool(conv1d)
        flat = pool.view(self.btch_size, -1)
        fcn = F.dropout(self.fcn(flat), p=0.5)
        return self.softmax(fcn)

# SET PARMS

In [13]:
n_vocab = len(INPUT.vocab)
n_hidden = 100

In [14]:
gph = Gph(n_vocab, n_hidden, n_label=3, btch_size=32, kernel_size=2)
if torch.cuda.is_available():
    gph = gph.cuda()

In [15]:
learning_rate = 0.001
optimizer = optim.Adam(gph.parameters(), lr=learning_rate)

#gph.embd.weight.requires_grad = False
#optimizer = optim.SGD([parm for parm in gph.parameters() if parm.requires_grad==True], lr=learning_rate)

In [16]:
def train_gph(gph, optimizer, dataloader, phase='train', volatile=False):
    if phase == 'train':
        gph.train()
    if phase == 'valid':
        gph.eval()
        volatile = True

    running_loss = 0.0
    running_corrects = 0
    running_counts = 0

    for idx, batch in enumerate(dataloader[phase]):
        if torch.cuda.is_available():
            x_input, y_label = batch.input.cuda(), batch.label.cuda()

        if phase == 'train':
            optimizer.zero_grad()
        y_prob = gph(x_input)
        _, y_pred = torch.max(y_prob, 1)
        loss = F.nll_loss(y_prob, y_label)

        if phase == 'train':
            loss.backward()
            optimizer.step()

        running_loss += loss.item()
        running_corrects += torch.sum(y_pred == y_label.data).item()
        running_counts += len(y_pred)

    epoch_loss = running_loss / running_counts
    epoch_acc = running_corrects / running_counts
    print('{} Loss: {:.4f} Acc: {:.4f}'.format(phase, epoch_loss, epoch_acc))
    return epoch_loss, epoch_acc

In [17]:
# DEBUGGED
# https://discuss.pytorch.org/t/data-iterator-failing-on-dev-set-only/11956/4
train_loss, train_acc = [],[]
valid_loss, valid_acc = [],[]
for epoch in range(1,16):
    epoch_loss, epoch_acc = train_gph(gph, optimizer, dataloader, phase='train')
    vld_loss, vld_acc = train_gph(gph, optimizer, dataloader, phase='valid')
    train_loss.append(epoch_loss)
    train_acc.append(epoch_acc)
    valid_loss.append(vld_loss)
    valid_acc.append(vld_acc)

train Loss: 0.0053 Acc: 0.9365
valid Loss: 0.0059 Acc: 0.9281
train Loss: 0.0049 Acc: 0.9409
valid Loss: 0.0060 Acc: 0.9280
train Loss: 0.0049 Acc: 0.9417
valid Loss: 0.0060 Acc: 0.9276
train Loss: 0.0048 Acc: 0.9422
valid Loss: 0.0060 Acc: 0.9278
train Loss: 0.0048 Acc: 0.9428
valid Loss: 0.0058 Acc: 0.9283
train Loss: 0.0048 Acc: 0.9432
valid Loss: 0.0060 Acc: 0.9281
train Loss: 0.0047 Acc: 0.9437
valid Loss: 0.0059 Acc: 0.9296
train Loss: 0.0047 Acc: 0.9442
valid Loss: 0.0059 Acc: 0.9300
train Loss: 0.0046 Acc: 0.9448
valid Loss: 0.0059 Acc: 0.9293
train Loss: 0.0046 Acc: 0.9452
valid Loss: 0.0061 Acc: 0.9292
train Loss: 0.0046 Acc: 0.9457
valid Loss: 0.0061 Acc: 0.9280
train Loss: 0.0045 Acc: 0.9460
valid Loss: 0.0061 Acc: 0.9287
train Loss: 0.0045 Acc: 0.9465
valid Loss: 0.0062 Acc: 0.9278
train Loss: 0.0045 Acc: 0.9469
valid Loss: 0.0062 Acc: 0.9278
train Loss: 0.0045 Acc: 0.9471
valid Loss: 0.0063 Acc: 0.9285
