# Model's input

In [17]:
#from transformers import BertTokenizer, BertForSequenceClassification
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from pytorch_pretrained_bert import BertModel
from torch.utils.data import Dataset
import pandas as pd
import logging
import argparse
import math
import os
from time import strftime, localtime
import random
from sklearn.metrics import precision_score, recall_score, f1_score
from torch.utils.data import DataLoader, random_split

### Tokenize for Bert

In [18]:
from pytorch_pretrained_bert import BertTokenizer
import numpy as np


class Tokenizer4Bert:
    def __init__(self, max_seq_len, pretrained_bert_name , max_num_chunks ):
        # Load pretrained model/tokenizer
        self.tokenizer = BertTokenizer.from_pretrained(pretrained_bert_name)
        self.max_seq_len = max_seq_len
        self.max_num_chunks = max_num_chunks

    def long_text_to_chunks(self, text):
        """return an array with shape of [30, max_seq_len], and the element is the token representation of the BERT"""
        import numpy as np
        ls_of_tokens = self.tokenizer.tokenize(text)
        #print('ls of tokens: {},len:{}'.format(ls_of_tokens, len(ls_of_tokens)))
        n = len(ls_of_tokens) // self.max_seq_len
        #print('n: {} '.format(n))
        res = []
        for i in range(self.max_num_chunks):
            if i < n:
                sub_ls_of_tokens = ls_of_tokens[i*self.max_seq_len:i*self.max_seq_len + self.max_seq_len]
            elif i == n:
                tmp_len = len(ls_of_tokens[i*self.max_seq_len:])
                sub_ls_of_tokens = ls_of_tokens[i*self.max_seq_len:]+['[PAD]']*(self.max_seq_len-tmp_len)
            else:
                sub_ls_of_tokens = ['[PAD]']*self.max_seq_len
            # convert ls of toens to sequence of ids
            sub_ls_of_tokens = self.tokenizer.convert_tokens_to_ids(sub_ls_of_tokens)
            res.append(sub_ls_of_tokens)
        return np.array(res)

    def text_to_sequence(self, text, reverse=False, padding='post', truncating='post'):
        sequence = self.tokenizer.convert_tokens_to_ids(self.tokenizer.tokenize(text))
        #print("seq: {}, len:{}".format(sequence,len(sequence)))
        if len(sequence) == 0:
            sequence = [0]
            print('seq:{}'.format(sequence))
        if reverse:
            sequence = sequence[::-1]
            print('seq:{}'.format(sequence))
        print(len(self.pad_and_truncate(sequence, self.max_seq_len, padding=padding, truncating=truncating)))

        return self.pad_and_truncate(sequence, self.max_seq_len, padding=padding, truncating=truncating)

    def pad_and_truncate(self, sequence, maxlen, dtype='int64', padding='post', truncating='post', value=0):
        import numpy as np
        x = (np.ones(maxlen) * value).astype(dtype)
        if truncating == 'pre':
            trunc = sequence[-maxlen:]
        else:
            trunc = sequence[:maxlen]
        trunc = np.asarray(trunc, dtype=dtype)
        if padding == 'post':
            x[:len(trunc)] = trunc
        else:
            x[-len(trunc):] = trunc
        return x



In [19]:
class ABSADataset(Dataset):
    def __init__(self, fname, tokenizer):
        fin = open(fname, 'r', encoding='utf-8', newline='\n', errors='ignore')
        lines = fin.readlines()[:100] # 先看前面50筆 ！！！！！！
        fin.close() 
        print(len(lines))
        all_data = []
        for i in range(0, len(lines), 2):
            text = lines[i].strip()
            polarity = lines[i + 1].strip()
            # single-sentence classification
            text_raw_bert_indices = tokenizer.text_to_sequence("[CLS] " + text)
            # documentation classification
            text_raw_bert_documents = tokenizer.long_text_to_chunks("[CLS] " + text)
            # label
            polarity = int(polarity)  # range betwee 0 to num_class -1

            data = {
                'text_raw_bert_indices': text_raw_bert_indices,
                'text_raw_bert_documents': text_raw_bert_documents,
                'polarity': polarity,
            }
            all_data.append(data)

        self.data = all_data

    def get_dataframe(self, tokenizer):
        """
        Conver dataset into DataFrame(Pandas)
        It's only support for bert-based model.
        """
        df = []
        columns_name = []
        for i in range(len(self.data)):
            tmp = []
            for k, v in self.data[i].items():
                try:
                    to_str = " ".join(tokenizer.tokenizer.convert_ids_to_tokens(v))
                    tmp.append(to_str)
                except:
                    if k == 'aspect_in_text':
                        # it's a 1-D tensor wtih shape of (2,), representing the start and end index of the aspect
                        v = v.numpy()  # 1-D tensor
                        #print (v.shape)
                    tmp.append(v)
                if i <= 0:
                    columns_name.append(k)
            df.append(tmp)
        df = pd.DataFrame(df,columns=columns_name)
        return df

    def __getitem__(self, index):
        return self.data[index]

    def __len__(self):
        return len(self.data)

In [20]:
max_seq_len = 100
pretrained_bert_name = "bert-base-uncased"
max_num_chunks = 3
train_path = 'data/train.txt'
test_path = 'data/test.txt'
tokenizer = Tokenizer4Bert(max_seq_len, pretrained_bert_name, max_num_chunks)

In [21]:
trainset = ABSADataset(train_path, tokenizer)
testset = ABSADataset(test_path, tokenizer)
print ('number of traning data', len(trainset))
print ('number of testing data', len(testset))

Token indices sequence length is longer than the specified maximum  sequence length for this BERT model (626 > 512). Running this sequence through BERT will result in indexing errors


100
100
100
100
100
100
100
100


Token indices sequence length is longer than the specified maximum  sequence length for this BERT model (1117 > 512). Running this sequence through BERT will result in indexing errors
Token indices sequence length is longer than the specified maximum  sequence length for this BERT model (756 > 512). Running this sequence through BERT will result in indexing errors


100
100
100
100


Token indices sequence length is longer than the specified maximum  sequence length for this BERT model (728 > 512). Running this sequence through BERT will result in indexing errors
Token indices sequence length is longer than the specified maximum  sequence length for this BERT model (735 > 512). Running this sequence through BERT will result in indexing errors


100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100


Token indices sequence length is longer than the specified maximum  sequence length for this BERT model (690 > 512). Running this sequence through BERT will result in indexing errors


100
100
100
100
100
100
100
100


Token indices sequence length is longer than the specified maximum  sequence length for this BERT model (566 > 512). Running this sequence through BERT will result in indexing errors


100
100
100
100
100
100
40
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
number of traning data 50
number of testing data 20


In [26]:
train_df = trainset.get_dataframe(tokenizer)
test_df = testset.get_dataframe(tokenizer)

In [27]:
len(test_df), len(train_df)

(20, 50)

In [28]:
train_df.isna().sum()

text_raw_bert_indices      0
text_raw_bert_documents    0
polarity                   0
dtype: int64

In [29]:
trainset

<__main__.ABSADataset at 0x128bf46d8>

### Hyper-parameters

In [129]:
# Model Parameters for train
learning_rate = 2e-5
num_epoch = 10
batch_size = 64
log_step = 5
valset_ratio = 0
get_tokenized_result = True

In [130]:
# Model Parameters for predict
# model_name = 'bert_ssc'
pretrained_bert_name = 'bert-base-uncased'
dropout = 0.1
bert_dim = 768
polarities_dim = 2
device = None
state_dict_path = "artifacts/bert_ssc_val_acc"

### Model
- single sentence classification

In [131]:
import torch
import torch.nn as nn


class BERT_SSC(nn.Module):
    """single sentence classification"""
    def __init__(self, bert, opt):
        super(BERT_SSC, self).__init__()
        # self.squeeze_embedding = SqueezeEmbedding()
        self.bert = bert
        self.dropout = nn.Dropout(opt.dropout)
        self.dense = nn.Linear(opt.bert_dim, opt.polarities_dim)

    def forward(self, inputs):
        text_bert_indices = inputs[0]
        _, pooled_output = self.bert(text_bert_indices, output_all_encoded_layers=False)
        pooled_output = self.dropout(pooled_output)
        logits = self.dense(pooled_output)
        return logits
        

In [132]:
import torch.nn as nn

# Loss and Optimizer
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
bert = BertModel.from_pretrained(pretrained_bert_name)
criterion = nn.CrossEntropyLoss().to(device)
print ("loss",criterion)

class objectview(object):
    def __init__(self, d):
        self.__dict__ = d
        
opt = {
    "model_name": 'bert_ssc',
    "device":device,
    "log_step": 5,
    "dropout":0.1,
    "hidden_dim":768,
    "bert_dim":768,
    "polarities_dim":2,
    "learning_rate":2e-5,
    "l2reg":0.01,
    "num_epoch":10,
    "batch_size":64,
    "optimizer":torch.optim.Adam,
    "inputs_cols":['text_raw_bert_indices']
}
opt = objectview(opt)

model = BERT_SSC(bert,opt)
_params = filter(lambda p: p.requires_grad, model.parameters())
optimizer = opt.optimizer(_params, lr=opt.learning_rate, weight_decay=opt.l2reg)
print ("optimizer", optimizer)

loss CrossEntropyLoss()
optimizer Adam (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    eps: 1e-08
    lr: 2e-05
    weight_decay: 0.01
)


In [133]:
help(model)

Help on BERT_SSC in module __main__ object:

class BERT_SSC(torch.nn.modules.module.Module)
 |  single sentence classification
 |  
 |  Method resolution order:
 |      BERT_SSC
 |      torch.nn.modules.module.Module
 |      builtins.object
 |  
 |  Methods defined here:
 |  
 |  __init__(self, bert, opt)
 |      Initializes internal Module state, shared by both nn.Module and ScriptModule.
 |  
 |  forward(self, inputs)
 |      Defines the computation performed at every call.
 |      
 |      Should be overridden by all subclasses.
 |      
 |      .. note::
 |          Although the recipe for forward pass needs to be defined within
 |          this function, one should call the :class:`Module` instance afterwards
 |          instead of this since the former takes care of running the
 |          registered hooks while the latter silently ignores them.
 |  
 |  ----------------------------------------------------------------------
 |  Methods inherited from torch.nn.modules.module.Modul

### Training precedure

In [134]:
from torch.utils.data import DataLoader, random_split
valset = testset 
train_data_loader = DataLoader(dataset=trainset, batch_size=opt.batch_size, shuffle=True)
test_data_loader = DataLoader(dataset=testset, batch_size=opt.batch_size, shuffle=False)
val_data_loader = DataLoader(dataset=valset, batch_size=opt.batch_size, shuffle=False)

In [135]:
trainset

<__main__.ABSADataset at 0x13d06e7b8>

In [136]:
def _evaluate_acc_f1(data_loader):
    n_correct, n_total = 0, 0
    t_targets_all, t_outputs_all = None, None
    # switch model to evaluation mode
    model.eval()
    with torch.no_grad():
        for t_batch, t_sample_batched in enumerate(data_loader):
            t_inputs = [t_sample_batched[col].to(opt.device) for col in opt.inputs_cols]
            t_targets = t_sample_batched['polarity'].to(opt.device)
            t_outputs = model(t_inputs)

            n_correct += (torch.argmax(t_outputs, -1) == t_targets).sum().item()
            n_total += len(t_outputs)

            if t_targets_all is None:
                t_targets_all = t_targets
                t_outputs_all = t_outputs
            else:
                t_targets_all = torch.cat((t_targets_all, t_targets), dim=0)
                t_outputs_all = torch.cat((t_outputs_all, t_outputs), dim=0)

    acc = n_correct / n_total
    f1 = f1_score(t_targets_all.cpu(), torch.argmax(t_outputs_all, -1).cpu(), labels=[i for i in range(opt.polarities_dim)], average='macro')
    precision = precision_score(t_targets_all.cpu(), torch.argmax(t_outputs_all, -1).cpu(), labels=[i for i in range(opt.polarities_dim)], average='macro')
    recall = recall_score(t_targets_all.cpu(), torch.argmax(t_outputs_all, -1).cpu(), labels=[i for i in range(opt.polarities_dim)], average='macro')
    return acc, f1, precision, recall

In [None]:
max_val_acc = 0
max_val_f1 = 0
global_step = 0
path = None
for epoch in range(opt.num_epoch):
    print('>' * 100)
    print('epoch: {}'.format(epoch))
    n_correct, n_total, loss_total = 0, 0, 0
    # switch model to training mode
    model.train()
    for i_batch, sample_batched in enumerate(train_data_loader):
        if i_batch <= 0 :
            print (type(sample_batched))
        global_step += 1
        # clear gradient accumulators
        optimizer.zero_grad()

        inputs = [sample_batched[col].to(opt.device) for col in opt.inputs_cols]
        if i_batch <= 0 :
            print ("=" * 100)
            print ("inputs")
            print (len(inputs))
            print (inputs)
        outputs = model(inputs)
        if i_batch <= 0 :
            print ("=" * 100)
            print ("outputs")
            print (len(outputs))
            print (outputs)

        targets = sample_batched['polarity'].to(opt.device)

        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()

        n_correct += (torch.argmax(outputs, -1) == targets).sum().item()
        n_total += len(outputs)
        loss_total += loss.item() * len(outputs)
        if global_step % opt.log_step == 0:
            train_acc = n_correct / n_total
            train_loss = loss_total / n_total
            logger.info('loss: {:.4f}, acc: {:.4f}'.format(train_loss, train_acc))

    val_acc, val_f1, val_p, val_r = _evaluate_acc_f1(val_data_loader)
    print('> val_acc: {:.4f}, val_f1: {:.4f}, val_p: {:.4f}, val_r: {:.4f}'.format(val_acc, val_f1, val_p, val_r))
    #logger.info('> val_acc: {:.4f}, val_f1: {:.4f}, val_p: {:.4f}, val_r: {:.4f}'.format(val_acc, val_f1, val_p, val_r))
    if val_acc > max_val_acc:
        max_val_acc = val_acc
        if not os.path.exists('state_dict'):
            os.mkdir('state_dict')
        path = 'state_dict/{0}_val_acc{2}'.format(opt.model_name, round(val_acc, 4))
        torch.save(model.state_dict(), path)
        logger.info('>> saved: {}'.format(path))        
    if val_f1 > max_val_f1:
        max_val_f1 = val_f1
    print ("max_val_acc:{}, max_val_f1:{}".format(max_val_acc, max_val_f1))

>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
epoch: 0
<class 'dict'>
inputs
1
[tensor([[  101,  2899,  1006,  ..., 11670,  3228,  4855],
        [  101,  2899,  1006,  ...,  5290,  1005,  3177],
        [  101,  2089,  2343,  ...,  2190,  2240,  2154],
        ...,
        [  101,  2406,  3220,  ...,  3971,  1024, 11562],
        [  101, 20312,  1006,  ...,     0,     0,     0],
        [  101,  2343,  8398,  ...,  8398,  2134,  1521]])]
outputs
50
tensor([[-0.1916, -0.2758],
        [-0.1141, -0.1732],
        [-0.1051, -0.2890],
        [-0.0659, -0.2030],
        [ 0.0780, -0.1888],
        [ 0.0760, -0.1320],
        [-0.1432, -0.0323],
        [-0.0042, -0.2424],
        [-0.2997,  0.0013],
        [-0.2493, -0.1699],
        [-0.1241, -0.1292],
        [-0.1571, -0.1011],
        [-0.3361, -0.1103],
        [-0.1150, -0.1287],
        [-0.1137, -0.2442],
        [ 0.1023, -0.2832],
        [ 0.0375, -0.4481],
        [ 0.050