<a href="https://colab.research.google.com/github/rishubhkhurana/nlp/blob/main/sentiment/SimpleSentiment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Simple sentiment classifier

1. Pretrained embedding

2. LSTM layers -- 3

3. All bi-directional layers

4. packed padded sequences

## BookKeeping

In [1]:
!pip install GPUtil

Collecting GPUtil
  Downloading https://files.pythonhosted.org/packages/ed/0e/5c61eedde9f6c87713e89d794f01e378cfd9565847d4576fa627d758c554/GPUtil-1.4.0.tar.gz
Building wheels for collected packages: GPUtil
  Building wheel for GPUtil (setup.py) ... [?25l[?25hdone
  Created wheel for GPUtil: filename=GPUtil-1.4.0-cp36-none-any.whl size=7411 sha256=c938cbe32f95d1c54cf458ff24418fad8dc46c16a6bceb41e0ea63dc12852120
  Stored in directory: /root/.cache/pip/wheels/3d/77/07/80562de4bb0786e5ea186911a2c831fdd0018bda69beab71fd
Successfully built GPUtil
Installing collected packages: GPUtil
Successfully installed GPUtil-1.4.0


## Importing Libs

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchvision.transforms import ToTensor
import torchtext
from torchtext.data import Field,LabelField
from torchtext.datasets import IMDB
from torchtext import data
from torch.nn.utils.rnn import pack_padded_sequence,pad_packed_sequence
import random
import torch.optim as optim
from fastprogress import master_bar, progress_bar
from GPUtil import showUtilization

## Downloading Data

In [21]:
SEED=1234
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

In [22]:
TEXT = Field(tokenize='spacy',include_lengths=True)
LABEL = LabelField(dtype=torch.float)

In [166]:
train_data,test_data = IMDB.splits(TEXT,LABEL)

In [167]:
print(' '.join(vars(train_data.examples[0]).get('text')))

A great storyline with a message . Joan Plowright is superb as " Phoebe " , Mike Kopsa is hilarious as " coach " and Richard de Klerk plays the role of " Carmine " superbly . Mischa Barton as " Frankie " puts in a good performance and Ingrid as " Hazel " plays her first lead extremely well . This film is superbly directed by Jo - Beth Williams . The editing is first rate .


In [169]:
# reverse the data
for i in range(len(train_data)):
    vars(train_data.examples[i]).get('text').reverse()

In [170]:
print(' '.join(vars(train_data.examples[0]).get('text')))

. rate first is editing The . Williams Beth - Jo by directed superbly is film This . well extremely lead first her plays " Hazel " as Ingrid and performance good a in puts " Frankie " as Barton Mischa . superbly " Carmine " of role the plays Klerk de Richard and " coach " as hilarious is Kopsa Mike , " Phoebe " as superb is Plowright Joan . message a with storyline great A


In [171]:
print(len(train_data),len(test_data))

25000 25000


In [172]:
print(vars(train_data.examples[0]))

{'text': ['.', 'rate', 'first', 'is', 'editing', 'The', '.', 'Williams', 'Beth', '-', 'Jo', 'by', 'directed', 'superbly', 'is', 'film', 'This', '.', 'well', 'extremely', 'lead', 'first', 'her', 'plays', '"', 'Hazel', '"', 'as', 'Ingrid', 'and', 'performance', 'good', 'a', 'in', 'puts', '"', 'Frankie', '"', 'as', 'Barton', 'Mischa', '.', 'superbly', '"', 'Carmine', '"', 'of', 'role', 'the', 'plays', 'Klerk', 'de', 'Richard', 'and', '"', 'coach', '"', 'as', 'hilarious', 'is', 'Kopsa', 'Mike', ',', '"', 'Phoebe', '"', 'as', 'superb', 'is', 'Plowright', 'Joan', '.', 'message', 'a', 'with', 'storyline', 'great', 'A'], 'label': 'pos'}


In [26]:
train_data,valid_data = train_data.split(random_state = random.seed(SEED))

In [27]:
len(train_data)

17500

## Data processing

In [173]:
MAX_VOCAB_SIZE = 25_000
TEXT.build_vocab(train_data,max_size=MAX_VOCAB_SIZE, vectors = 'glove.6B.100d',unk_init = torch.Tensor.normal_)
LABEL.build_vocab(train_data)

In [174]:
print(len(TEXT.vocab))
print(len(LABEL.vocab))

25002
2


In [175]:
print(TEXT.vocab.freqs.most_common(2))

[('the', 289838), (',', 275296)]


In [176]:
type(TEXT.vocab.freqs)

collections.Counter

In [177]:
print(TEXT.vocab.itos[:10])

['<unk>', '<pad>', 'the', ',', '.', 'and', 'a', 'of', 'to', 'is']


In [178]:
print(LABEL.vocab.stoi)

defaultdict(<function _default_unk_index at 0x7f4becbfa510>, {'neg': 0, 'pos': 1})


## Data Loader

In [179]:
## data loaders
BATCH_SIZE = 64
device = 'cuda' if torch.cuda.is_available() else 'cpu'
trn_dl, val_dl, test_dl = data.BucketIterator.splits((train_data,valid_data,test_data),batch_size=BATCH_SIZE, device = device, sort_within_batch=True)

## Model 

In [206]:
class mLSTM(nn.Module):
    def __init__(self,hidden_dims,embed_dims, vocab_size,output_dim, n_layers = 3, p = 0.2,padding_idx=1):
        super().__init__()
        self.embed = nn.Embedding(vocab_size,embed_dims,padding_idx=padding_idx)
        self.n_layers = n_layers
        self.rnn = nn.ModuleList()
        for i in range(n_layers):
            self.rnn.append(nn.LSTM(embed_dims if i==0 else 2*hidden_dims, hidden_dims,num_layers = 1,bidirectional = True))
        self.dropout = nn.Dropout(p)
        self.output_layer = nn.Linear(2*hidden_dims,output_dim)
    
    def forward(self,text,lengths):
        # [seq_len,B]
        embedded = self.embed(text)
        # [seq_len,B,embed_dims]
        packed_output = pack_padded_sequence(embedded,lengths.cpu())
        for i,lyr in enumerate(self.rnn):
            packed_output, (h,c) = lyr(packed_output)
            padded_sequence,lengths = pad_packed_sequence(packed_output,padding_value=1)
            padded_sequence = self.dropout(padded_sequence)
            packed_output = pack_padded_sequence(padded_sequence,lengths.cpu())
            #packed_output.data = self.dropout(packed_output.data)
        # h --> [num_directions,B,hidden_dims]
        out = self.dropout(torch.cat([h[-2,:,:],h[-1,:,:]],dim=1))
        # out --> [B,2*hidden_dims]
        out = self.output_layer(out)
        return out


In [207]:
model = mLSTM(256,100,len(TEXT.vocab),1,padding_idx = TEXT.vocab.stoi[TEXT.pad_token]).cuda()

## Training Loop

In [136]:
class Accuracy(object):
    def __init__(self,summable=True,reduce=True,logits=True):
        self.reduce=True
        self.summable=summable
        self.logits=logits
    def __call__(self,preds,targets):
        if isinstance(preds,torch.Tensor):

            if preds.shape[-1]>1:
                preds = torch.argmax(preds,dim=-1)
            else:
                if self.logits:
                    preds = torch.round(torch.sigmoid(preds))
                else:
                    preds = torch.round(preds)

            return preds.eq(targets.view_as(preds)).float().mean()


In [195]:
class Recorder(object):

    def __init__(self,record_batch=True,record_epoch=True,track_loss=True,metric='accuracy'):
        self.record_batch = record_batch
        self.record_epoch = record_epoch
        self.metric = metric
        self.reset()

    def reset(self):
        self.batch_stats = dict(losses=[],metrics=[])
        self.epoch_stats = dict(losses=[],metrics=[])
        self.batch_count = []

    def update_batch(self,loss,metric,batch_size):
        self.batch_stats['losses'].append(loss)
        self.batch_stats['metrics'].append(metric)
        self.batch_count.append(batch_size)
    
    def update_epoch(self):
        n_batches = len(self.batch_count)
        epoch_loss = sum([loss*bs for loss,bs in zip(self.batch_stats['losses'][-n_batches:],self.batch_count)])/sum(self.batch_count)
        epoch_metric = sum([metric*bs for metric,bs in zip(self.batch_stats['metrics'][-n_batches:],self.batch_count)])/sum(self.batch_count)
        self.epoch_stats['losses'].append(epoch_loss)
        self.epoch_stats['metrics'].append(epoch_metric)
        if not self.record_batch:
            self.batch_stats = dict(losses=[],metrics=[])
        self.batch_count=[]
    
    def get_epoch_stats(self):
        return_str = f'Loss: {self.epoch_stats["losses"][-1]:.3f}, {self.metric.capitalize()}: {self.epoch_stats["metrics"][-1]:.3f}'
        return return_str

def one_batch(model,opt,loss_func,batch,device='cpu',train=True,metric_func=None):        
    xb,yb = batch.text,batch.label
    if not isinstance(xb,(list,tuple)):
        xb=[xb]
    xb,yb = [t.to(device) for t in xb],yb.to(device)
    preds = model(*xb)
    if isinstance(loss_func,nn.CrossEntropyLoss):
        loss = loss_func(preds,yb)
    else:
        loss = loss_func(preds.view_as(yb),yb)
    if train:
        loss.backward()
        opt.step()
        opt.zero_grad()
    if metric_func is not None:
        metric = metric_func(preds.cpu(),yb.cpu())
    else:
        metric = None
    return loss.cpu().item(),metric.cpu().item()

def trainEpoch(model,opt,loss_func,dl,device='cpu',recorder=None,metric_func=None, parent=None): 
    if recorder is None:
        raise RuntimeError("Please pass the recorder")
    # training the model
    pb = progress_bar(dl,total=len(dl),parent=parent)
    for batch in pb:
        loss,metric = one_batch(model,opt,loss_func,batch,device=device,train=True,metric_func=metric_func)
        parent.child.comment = f'Loss: {loss:.3f}, {metric_func.__class__.__name__}: {metric:.3f}'
        recorder.update_batch(loss,metric,batch.label.shape[0])
        del batch

def evalEpoch(model,opt,loss_func,dl,device='cpu',recorder=None,metric_func=None, parent=None): 
    if recorder is None:
        raise RuntimeError("Please pass the recorder")
    #evaluating the model
    with torch.no_grad():
        pb = progress_bar(dl,total=len(dl),parent=parent)
        for batch in pb:
            loss,metric = one_batch(model,opt,loss_func,batch,device=device,train=False,metric_func=metric_func)
            parent.child.comment = f'Loss: {loss:.3f}, {metric_func.__class__.__name__}: {metric:.3f}'
            recorder.update_batch(loss,metric,batch.label.shape[0])

def testEpoch(model,opt,loss_func,dl,device='cpu',recorder=None,metric_func=None, parent=None): 
    if recorder is None:
        raise RuntimeError("Please pass the recorder")
    # evaluating the model
    with torch.no_grad():
        pb = progress_bar(dl,total=len(dl),parent=parent)
        for batch in pb:
            loss,metric = one_batch(model,opt,oss_func,batch,device=device,train=False)
            recorder.update_batch(loss,metric,batch.label.shape[0])

            
def trainModel(model,opt,loss_func,dls,NEpochs=50,device='cpu',metric_func=None):
    if metric_func is None:
        raise RuntimeError("Please pass the metric function")
    mb = master_bar(range(NEpochs),total=NEpochs)
    trainRecorder = Recorder()
    valRecorder = Recorder()
    #trainRecorder = None
    #valRecorder = None
    for epoch in mb:
        model.train()
        trainEpoch(model,opt,loss_func,dls['train'],device=device,metric_func=metric_func,parent=mb,recorder = trainRecorder)
        model.eval()
        evalEpoch(model,opt,loss_func,dls['valid'],device=device,metric_func=metric_func,parent=mb,recorder = valRecorder)
        trainRecorder.update_epoch()
        valRecorder.update_epoch()
        mb.write(f'Epoch[{epoch}]--> Training Stats: {trainRecorder.get_epoch_stats()}, Validation Stats: {valRecorder.get_epoch_stats()}')
        print(showUtilization())
    return trainRecorder,valRecorder

def testModel(model,opt,loss_func,dls,NEpochs=50,device='cpu',metric_func=None):
    if metric_func is None:
        raise RuntimeError("Please pass the metric function")
    testRecorder = Recorder()
    model.eval()
    mb = master_bar(range(NEpochs),total=1)
    for epoch in mb:
        evalEpoch(model,opt,loss_func,dls['test'],device=device,metric_func=metric_func,parent=mb,recorder = testRecorder)
        mb.write(f'Epoch[{epoch}]--> Testing Stats: {testRecorder.get_epoch_stats()}')
    
    return testRecorder





## Training Simple model

In [208]:
model_params = {}
model_params['hidden_dims'] = 256
model_params['vocab_size'] = len(TEXT.vocab)
model_params['output_dim'] = 1
model_params['embed_dims'] = 100
model_params['padding_idx'] = TEXT.vocab.stoi[TEXT.pad_token]
model_params['p'] = 0.2
model_params['n_layers'] = 3
model = mLSTM(**model_params).to('cuda')


In [209]:

model.embed.weight.data.copy_(TEXT.vocab.vectors)
model.embed.weight.data[TEXT.vocab.stoi[TEXT.pad_token]] = torch.zeros(model_params['embed_dims'])
model.embed.weight.data[TEXT.vocab.stoi[TEXT.unk_token]] = torch.zeros(model_params['embed_dims'])

In [210]:
model.embed.weight.data

tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [-0.0382, -0.2449,  0.7281,  ..., -0.1459,  0.8278,  0.2706],
        ...,
        [ 0.9012,  0.7192,  1.2343,  ..., -1.6855,  0.7152, -0.7761],
        [ 0.0210,  2.1859,  0.2976,  ..., -0.9113,  0.0041,  0.2354],
        [-0.4992,  1.0342,  0.4681,  ...,  0.9418, -1.5903,  1.2315]],
       device='cuda:0')

In [211]:
opt = optim.Adam(model.parameters())
criterion = nn.BCEWithLogitsLoss()
dls = {'train':trn_dl,'valid':val_dl,'test':test_dl}
metric_func = Accuracy()

In [212]:
trecorder, vrecorders = trainModel(model,opt,criterion,dls,device='cuda',metric_func=metric_func,NEpochs=5)

| ID | GPU | MEM |
------------------
|  0 | 56% | 70% |
None
| ID | GPU | MEM |
------------------
|  0 | 54% | 70% |
None
| ID | GPU | MEM |
------------------
|  0 | 87% | 70% |
None
| ID | GPU | MEM |
------------------
|  0 | 32% | 70% |
None
| ID | GPU | MEM |
------------------
|  0 | 45% | 70% |
None


In [213]:
vrecorders.get_epoch_stats()

'Loss: 0.189, Accuracy: 0.930'

In [214]:
def binary_accuracy(preds, y):
    """
    Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8
    """

    #round predictions to the closest integer
    rounded_preds = torch.round(torch.sigmoid(preds))
    correct = (rounded_preds == y).float() #convert into float for division 
    acc = correct.sum() / len(correct)
    return acc

In [215]:
def evaluate(model, iterator, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.eval()
    
    with torch.no_grad():
    
        for batch in iterator:

            text, text_lengths = batch.text
            
            predictions = model(text, text_lengths).squeeze(1)
            
            loss = criterion(predictions, batch.label)
            
            acc = binary_accuracy(predictions, batch.label)

            epoch_loss += loss.item()
            epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [216]:
test_loss, test_acc = evaluate(model, dls['test'], criterion)

print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')

Test Loss: 0.378 | Test Acc: 85.54%
