# Model's input

In [1]:
pip install pytorch_pretrained_bert

Collecting pytorch_pretrained_bert
[?25l  Downloading https://files.pythonhosted.org/packages/d7/e0/c08d5553b89973d9a240605b9c12404bcf8227590de62bae27acbcfe076b/pytorch_pretrained_bert-0.6.2-py3-none-any.whl (123kB)
[K     |██▋                             | 10kB 21.6MB/s eta 0:00:01[K     |█████▎                          | 20kB 1.8MB/s eta 0:00:01[K     |████████                        | 30kB 2.3MB/s eta 0:00:01[K     |██████████▋                     | 40kB 1.7MB/s eta 0:00:01[K     |█████████████▎                  | 51kB 1.9MB/s eta 0:00:01[K     |███████████████▉                | 61kB 2.3MB/s eta 0:00:01[K     |██████████████████▌             | 71kB 2.5MB/s eta 0:00:01[K     |█████████████████████▏          | 81kB 2.6MB/s eta 0:00:01[K     |███████████████████████▉        | 92kB 2.9MB/s eta 0:00:01[K     |██████████████████████████▌     | 102kB 2.9MB/s eta 0:00:01[K     |█████████████████████████████▏  | 112kB 2.9MB/s eta 0:00:01[K     |██████████████████████

In [0]:
#from transformers import BertTokenizer, BertForSequenceClassification
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from pytorch_pretrained_bert import BertModel
from torch.utils.data import Dataset
import pandas as pd
import logging
import argparse
import math
import os
from time import strftime, localtime
import random
from sklearn.metrics import precision_score, recall_score, f1_score
from torch.utils.data import DataLoader, random_split
from tqdm import tqdm

### Tokenize for Bert

In [0]:
from pytorch_pretrained_bert import BertTokenizer
import numpy as np



class Tokenizer4Bert:
    def __init__(self, max_seq_len, pretrained_bert_name , max_num_chunks ):
        # Load pretrained model/tokenizer
        self.tokenizer = BertTokenizer.from_pretrained(pretrained_bert_name)
        self.max_seq_len = max_seq_len
        self.max_num_chunks = max_num_chunks

    def long_text_to_chunks(self, text):
        """return an array with shape of [30, max_seq_len], and the element is the token representation of the BERT"""
        import numpy as np
        ls_of_tokens = self.tokenizer.tokenize(text)
        #print('ls of tokens: {},len:{}'.format(ls_of_tokens, len(ls_of_tokens)))
        n = len(ls_of_tokens) // self.max_seq_len
        #print('n: {} '.format(n))
        res = []
        for i in range(self.max_num_chunks):
            if i < n:
                sub_ls_of_tokens = ls_of_tokens[i*self.max_seq_len:i*self.max_seq_len + self.max_seq_len]
            elif i == n:
                tmp_len = len(ls_of_tokens[i*self.max_seq_len:])
                sub_ls_of_tokens = ls_of_tokens[i*self.max_seq_len:]+['[PAD]']*(self.max_seq_len-tmp_len)
            else:
                sub_ls_of_tokens = ['[PAD]']*self.max_seq_len
            # convert ls of toens to sequence of ids
            sub_ls_of_tokens = self.tokenizer.convert_tokens_to_ids(sub_ls_of_tokens)
            res.append(sub_ls_of_tokens)
        return np.array(res)

    def text_to_sequence(self, text, reverse=False, padding='post', truncating='post'):
        sequence = self.tokenizer.convert_tokens_to_ids(self.tokenizer.tokenize(text))
        #print("seq: {}, len:{}".format(sequence,len(sequence)))
        if len(sequence) == 0:
            sequence = [0]
            print('seq:{}'.format(sequence))
        if reverse:
            sequence = sequence[::-1]
            print('seq:{}'.format(sequence))
        return self.pad_and_truncate(sequence, self.max_seq_len, padding=padding, truncating=truncating)

    def pad_and_truncate(self, sequence, maxlen, dtype='int64', padding='post', truncating='post', value=0):
        import numpy as np
        x = (np.ones(maxlen) * value).astype(dtype)
        if truncating == 'pre':
            trunc = sequence[-maxlen:]
        else:
            trunc = sequence[:maxlen]
        trunc = np.asarray(trunc, dtype=dtype)
        if padding == 'post':
            x[:len(trunc)] = trunc
        else:
            x[-len(trunc):] = trunc
        return x



In [0]:
class ABSADataset(Dataset):
    def __init__(self, fname, tokenizer):
        fin = open(fname, 'r', encoding='utf-8', newline='\n', errors='ignore')
        lines = fin.readlines()[:] # 先看前面50筆 ！！！！！！
        fin.close() 

        all_data = []
        for i in tqdm(range(0, len(lines), 2)):
            text = lines[i].strip()
            #print(i)
            polarity = lines[i + 1].strip()
            #print(lines[i])

            # single-sentence classification
            text_raw_bert_indices = tokenizer.text_to_sequence("[CLS] " + text)
            # documentation classification
            text_raw_bert_documents = tokenizer.long_text_to_chunks("[CLS] " + text)
            # label
            polarity = int(polarity)  # range betwee 0 to num_class -1

            data = {
                'text_raw_bert_indices': text_raw_bert_indices,
                'text_raw_bert_documents': text_raw_bert_documents,
                'polarity': polarity,
            }
            all_data.append(data)
            # if i >= 50:
            #   break

        self.data = all_data

    def get_dataframe(self, tokenizer):
        """
        Conver dataset into DataFrame(Pandas)
        It's only support for bert-based model.
        """
        df = []
        columns_name = []
        for i in range(len(self.data)):
            tmp = []
            for k, v in self.data[i].items():
                try:
                    to_str = " ".join(tokenizer.tokenizer.convert_ids_to_tokens(v))
                    tmp.append(to_str)
                except:
                    if k == 'aspect_in_text':
                        # it's a 1-D tensor wtih shape of (2,), representing the start and end index of the aspect
                        v = v.numpy()  # 1-D tensor
                        #print (v.shape)
                    tmp.append(v)
                if i <= 0:
                    columns_name.append(k)
            df.append(tmp)
        df = pd.DataFrame(df,columns=columns_name)
        return df

    def __getitem__(self, index):
        return self.data[index]

    def __len__(self):
        return len(self.data)

In [0]:
# from google.colab import files
# uploaded = files.upload()
# to check if we have gpu resources
# import tensorflow as tf
# tf.test.is_gpu_available(
#     cuda_only=False,
#     min_cuda_compute_capability=None
# )

In [6]:
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [7]:
drive.mount('/gdrive')

Mounted at /gdrive


In [0]:
# from google.colab import files
# uploaded = files.upload()

In [9]:
max_seq_len = 100
pretrained_bert_name = "bert-base-uncased"
max_num_chunks = 3
train_path = '/gdrive/My Drive/train.txt'
test_path = '/gdrive/My Drive/test.txt'
tokenizer = Tokenizer4Bert(max_seq_len, pretrained_bert_name, max_num_chunks)

100%|██████████| 231508/231508 [00:00<00:00, 5720433.18B/s]


In [10]:
trainset = ABSADataset(train_path, tokenizer)


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Token indices sequence length is longer than the specified maximum  sequence length for this BERT model (694 > 512). Running this sequence through BERT will result in indexing errors
 23%|██▎       | 8301/35918 [01:37<05:16, 87.27it/s]Token indices sequence length is longer than the specified maximum  sequence length for this BERT model (610 > 512). Running this sequence through BERT will result in indexing errors
Token indices sequence length is longer than the specified maximum  sequence length for this BERT model (784 > 512). Running this sequence through BERT will result in indexing errors
Token indices sequence length is longer than the specified maximum  sequence length for this BERT model (788 > 512). Running this sequence through BERT will result in indexing errors
 23%|██▎       | 8310/35918 [01:37<05:23, 85.39it/s]Token indices sequence length is longer than the specified maximum  sequence length for this BERT m

In [11]:
testset = ABSADataset(test_path, tokenizer)
#print ('number of traning data', len(trainset))
#print ('number of testing data', len(testset))

  0%|          | 0/8980 [00:00<?, ?it/s]Token indices sequence length is longer than the specified maximum  sequence length for this BERT model (658 > 512). Running this sequence through BERT will result in indexing errors
Token indices sequence length is longer than the specified maximum  sequence length for this BERT model (596 > 512). Running this sequence through BERT will result in indexing errors
  0%|          | 11/8980 [00:00<01:25, 104.85it/s]Token indices sequence length is longer than the specified maximum  sequence length for this BERT model (710 > 512). Running this sequence through BERT will result in indexing errors
  0%|          | 20/8980 [00:00<01:31, 97.43it/s] Token indices sequence length is longer than the specified maximum  sequence length for this BERT model (525 > 512). Running this sequence through BERT will result in indexing errors
Token indices sequence length is longer than the specified maximum  sequence length for this BERT model (778 > 512). Running th

In [0]:
train_df = trainset.get_dataframe(tokenizer)
test_df = testset.get_dataframe(tokenizer)

In [13]:
train_df.head()

Unnamed: 0,text_raw_bert_indices,text_raw_bert_documents,polarity
0,[CLS] sydney / wellington ( reuters ) temporar...,"[[101, 3994, 1013, 8409, 1006, 26665, 1007, 57...",0
1,[CLS] washington ( reuters ) u . s . law ##mak...,"[[101, 2899, 1006, 26665, 1007, 1057, 1012, 10...",0
2,[CLS] socialists dream ! check privilege door ...,"[[101, 21633, 3959, 999, 4638, 14293, 2341, 10...",1
3,[CLS] new york ( reuters ) republican presiden...,"[[101, 2047, 2259, 1006, 26665, 1007, 3951, 48...",0
4,"[CLS] timing better . gary tuck ##man , report...","[[101, 10984, 2488, 1012, 5639, 18029, 2386, 1...",1


In [14]:
test_df.head()

Unnamed: 0,text_raw_bert_indices,text_raw_bert_documents,polarity
0,[CLS] seven years right - wing lo ##ons callin...,"[[101, 2698, 2086, 2157, 1011, 3358, 8840, 564...",1
1,[CLS] anonymous hacked number pro - isis twitt...,"[[101, 10812, 28719, 2193, 4013, 1011, 18301, ...",1
2,[CLS] washington ( reuters ) banks reg ##roup ...,"[[101, 2899, 1006, 26665, 1007, 5085, 19723, 2...",0
3,[CLS] beijing ( reuters ) lavish pageant ##ry ...,"[[101, 7211, 1006, 26665, 1007, 22689, 12438, ...",0
4,[CLS] awkward ! progressive hillary clinton as...,"[[101, 9596, 999, 6555, 18520, 7207, 2356, 448...",1


### Hyper-parameters

In [0]:
# Model Parameters for train
learning_rate = 2e-5
num_epoch = 10
batch_size = 32
log_step = 5
valset_ratio = 0
get_tokenized_result = True

In [0]:
# Model Parameters for predict
# model_name = 'bert_ssc'
pretrained_bert_name = 'bert-base-uncased'
dropout = 0.1
bert_dim = 768
polarities_dim = 2
device = None
#state_dict_path = "artifacts/bert_ssc_val_acc"

### Model
- single sentence classification

In [0]:
import torch
import torch.nn as nn


class BERT_SSC(nn.Module):
    """single sentence classification"""
    def __init__(self, bert, opt):
        super(BERT_SSC, self).__init__()
        # self.squeeze_embedding = SqueezeEmbedding()
        self.bert = bert
        self.dropout = nn.Dropout(opt.dropout)
        self.dense = nn.Linear(opt.bert_dim, opt.polarities_dim)

    def forward(self, inputs):
        text_bert_indices = inputs[0]
        _, pooled_output = self.bert(text_bert_indices, output_all_encoded_layers=False)
        pooled_output = self.dropout(pooled_output)
        logits = self.dense(pooled_output)
        return logits
        

In [18]:
import torch.nn as nn

# Loss and Optimizer
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print ("device",device)
bert = BertModel.from_pretrained(pretrained_bert_name)
criterion = nn.CrossEntropyLoss().to(device)
print ("loss",criterion)

class objectview(object):
    def __init__(self, d):
        self.__dict__ = d
        
opt = {
    "model_name": 'bert_ssc',
    "device":device,
    "log_step": 5,
    "dropout":0.1,
    "hidden_dim":768,
    "bert_dim":768,
    "polarities_dim":2,
    "learning_rate":2e-5,
    "l2reg":0.01,
    "num_epoch":10,
    "batch_size":32,
    "optimizer":torch.optim.Adam,
    "inputs_cols":['text_raw_bert_indices']
}
opt = objectview(opt)

model = BERT_SSC(bert,opt).to(opt.device)
_params = filter(lambda p: p.requires_grad, model.parameters())
optimizer = opt.optimizer(_params, lr=opt.learning_rate, weight_decay=opt.l2reg)
print ("optimizer", optimizer)

device cuda


100%|██████████| 407873900/407873900 [00:07<00:00, 56450257.65B/s]


loss CrossEntropyLoss()
optimizer Adam (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    eps: 1e-08
    lr: 2e-05
    weight_decay: 0.01
)


In [0]:
#help(model)
#torch.cuda.empty_cache()
#torch.cuda.empty()



### Training precedure

In [0]:
from torch.utils.data import DataLoader, random_split
valset = testset 
train_data_loader = DataLoader(dataset=trainset, batch_size=opt.batch_size, shuffle=True)
test_data_loader = DataLoader(dataset=testset, batch_size=opt.batch_size, shuffle=False)
val_data_loader = DataLoader(dataset=valset, batch_size=opt.batch_size, shuffle=False)

In [21]:
len(trainset)

35918

In [0]:
from sklearn.metrics import classification_report,confusion_matrix

def _evaluate_acc_f1(data_loader):
    n_correct, n_total = 0, 0
    t_targets_all, t_outputs_all = None, None
    # switch model to evaluation mode
    model.eval()
    with torch.no_grad():
        for t_batch, t_sample_batched in enumerate(data_loader):
            t_inputs = [t_sample_batched[col].to(opt.device) for col in opt.inputs_cols]
            t_targets = t_sample_batched['polarity'].to(opt.device)
            t_outputs = model(t_inputs)

            n_correct += (torch.argmax(t_outputs, -1) == t_targets).sum().item()
            n_total += len(t_outputs)

            if t_targets_all is None:
                t_targets_all = t_targets
                t_outputs_all = t_outputs
            else:
                t_targets_all = torch.cat((t_targets_all, t_targets), dim=0)
                t_outputs_all = torch.cat((t_outputs_all, t_outputs), dim=0)

    acc = n_correct / n_total
    f1 = f1_score(t_targets_all.cpu(), torch.argmax(t_outputs_all, -1).cpu(), labels=[i for i in range(opt.polarities_dim)], average='macro')
    precision = precision_score(t_targets_all.cpu(), torch.argmax(t_outputs_all, -1).cpu(), labels=[i for i in range(opt.polarities_dim)], average='macro')
    recall = recall_score(t_targets_all.cpu(), torch.argmax(t_outputs_all, -1).cpu(), labels=[i for i in range(opt.polarities_dim)], average='macro')
    

    #print(classification_report(t_targets_all.cpu(), t_outputs_all.cpu(), target_names = ['Fake','Not Fake']))
    #print(confusion_matrix(t_targets_all.cpu(), t_outputs_all.cpu()))
    return acc, f1, precision, recall, t_targets_all.cpu(), t_outputs_all.cpu()

In [23]:
import time
start_time = time.time()
# x=1
# print("--- %s seconds ---" % (time.time() - start_time))
opt.device

device(type='cuda')

In [24]:
import time

max_val_acc = 0
max_val_f1 = 0
global_step = 0
#path = None
for epoch in range(opt.num_epoch):
    start_time = time.time()
    print('>' * 100)
    print('epoch: {}'.format(epoch))
    n_correct, n_total, loss_total = 0, 0, 0
    # switch model to training mode
    model.train()
    for i_batch, sample_batched in enumerate(train_data_loader):
        if i_batch <= 0 :
            print (type(sample_batched))
        global_step += 1
        # clear gradient accumulators
        optimizer.zero_grad()
        inputs = [sample_batched[col].to(opt.device) for col in opt.inputs_cols]
        if i_batch <= 0 :
            print ("=" * 100)
            #print ("len inputs : {}".format(len(inputs)))
            #print (len(inputs))
            #print (inputs)
        outputs = model(inputs)
        if i_batch <= 0 :
            print ("=" * 100)
            #print ("len outputs: {}".format(len(outputs)))
            #print (len(outputs))
            #print (outputs)

        targets = sample_batched['polarity'].to(opt.device)

        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()

        n_correct += (torch.argmax(outputs, -1) == targets).sum().item()
        n_total += len(outputs)
        loss_total += loss.item() * len(outputs)
        if global_step % opt.log_step == 0:
            train_acc = n_correct / n_total
            train_loss = loss_total / n_total
            print('{}/{}, loss: {}, acc: {}'.format(global_step*64, len(trainset),train_loss, train_acc))
            #logger.info('loss: {:.4f}, acc: {:.4f}'.format(train_loss, train_acc))
    print("--- {}s seconds for single epoch ---".format(time.time() - start_time))
    val_acc, val_f1, val_p, val_r, test_true, test_pred = _evaluate_acc_f1(val_data_loader)
    print('> val_acc: {}, val_f1: {}, val_p: {}, val_r: {}'.format(val_acc, val_f1, val_p, val_r))
    #logger.info('> val_acc: {:.4f}, val_f1: {:.4f}, val_p: {:.4f}, val_r: {:.4f}'.format(val_acc, val_f1, val_p, val_r))
    if val_acc > max_val_acc:
        max_val_acc = val_acc
        #if not os.path.exists('state_dict'):
            #os.mkdir('state_dict')
        path = '/gdrive/My Drive/state_dict/{0}_val_acc{1}'.format(opt.model_name, round(val_acc, 4))
        torch.save(model.state_dict(), path)
        print('>> saved: {}'.format(path))
    if val_f1 > max_val_f1:
        max_val_f1 = val_f1
    print ("max_val_acc:{}, max_val_f1:{}".format(max_val_acc, max_val_f1))

128640/35918, loss: 0.014058411457358514, acc: 0.9974985907553551
128960/35918, loss: 0.013986453263615278, acc: 0.9975126121076233
129280/35918, loss: 0.013915371129013091, acc: 0.9975264771460424
129600/35918, loss: 0.01396539406706383, acc: 0.9975055432372506
129920/35918, loss: 0.0139782726518243, acc: 0.9974848401323043
130240/35918, loss: 0.014047114856606513, acc: 0.9974643640350878
130560/35918, loss: 0.013984432511007254, acc: 0.9974781897491821
130880/35918, loss: 0.01396178311128937, acc: 0.9974579718004338


KeyboardInterrupt: ignored

In [29]:
test_true.shape

torch.Size([8980])

In [0]:
test_pred.shape

In [0]:
test_pred_ = torch.argmax(test_pred, -1).cpu()

In [0]:
test_pred_.shape

In [0]:
test_pred_

In [0]:
print (classification_report(test_true, test_pred_, target_names = ['Fake','Not Fake']))
    #print(confusion_matrix(t_targets_all.cpu(), t_outputs_all.cpu()))


In [0]:
cm = confusion_matrix(test_true, test_pred_)
cm

In [0]:
cm = pd.DataFrame(cm , index = ['Fake','Not Fake'] , columns = ['Fake','Not Fake'])
cm

In [0]:
import matplotlib.pyplot as plt
import seaborn as sns

plt.figure(figsize = (10,10))
sns.heatmap(cm,cmap= "BuPu_r", linecolor = 'black' ,
            linewidth = 1 , annot = True, fmt='' , 
            xticklabels = ['Fake','Not Fake'] ,
        yticklabels = ['Fake','Not Fake'])


In [0]:
test_df

In [0]:
test_df["if_we_got_correct"] = test_pred_ == test_true


In [0]:
pd.options.display.max_colwidth = 1000
test_df[test_df.if_we_got_correct == False]

In [0]:
test_df.to_csv("/gdrive/My Drive/case_study.csv")