# Bert for sentiment score

In [0]:
!pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/13/33/ffb67897a6985a7b7d8e5e7878c3628678f553634bd3836404fef06ef19b/transformers-2.5.1-py3-none-any.whl (499kB)
[K     |▋                               | 10kB 20.9MB/s eta 0:00:01[K     |█▎                              | 20kB 3.0MB/s eta 0:00:01[K     |██                              | 30kB 4.3MB/s eta 0:00:01[K     |██▋                             | 40kB 2.9MB/s eta 0:00:01[K     |███▎                            | 51kB 3.6MB/s eta 0:00:01[K     |████                            | 61kB 4.2MB/s eta 0:00:01[K     |████▋                           | 71kB 4.8MB/s eta 0:00:01[K     |█████▎                          | 81kB 5.4MB/s eta 0:00:01[K     |██████                          | 92kB 6.1MB/s eta 0:00:01[K     |██████▋                         | 102kB 4.7MB/s eta 0:00:01[K     |███████▏                        | 112kB 4.7MB/s eta 0:00:01[K     |███████▉                        | 122kB 4.7M

In [0]:
import torch

import random
import numpy as np

SEED = 1234

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

In [0]:
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

HBox(children=(IntProgress(value=0, description='Downloading', max=231508, style=ProgressStyle(description_wid…




In [0]:
len(tokenizer.vocab)

30522

In [0]:

tokens = tokenizer.tokenize('Hello WORLD how ARE yoU?')

print(tokens)

['hello', 'world', 'how', 'are', 'you', '?']


In [0]:
indexes = tokenizer.convert_tokens_to_ids(tokens)

print(indexes)

[7592, 2088, 2129, 2024, 2017, 1029]


In [0]:
init_token = tokenizer.cls_token
eos_token = tokenizer.sep_token
pad_token = tokenizer.pad_token
unk_token = tokenizer.unk_token

print(init_token, eos_token, pad_token, unk_token)

[CLS] [SEP] [PAD] [UNK]


In [0]:
init_token_idx = tokenizer.convert_tokens_to_ids(init_token)
eos_token_idx = tokenizer.convert_tokens_to_ids(eos_token)
pad_token_idx = tokenizer.convert_tokens_to_ids(pad_token)
unk_token_idx = tokenizer.convert_tokens_to_ids(unk_token)

print(init_token_idx, eos_token_idx, pad_token_idx, unk_token_idx)

101 102 0 100


In [0]:
init_token_idx = tokenizer.cls_token_id
eos_token_idx = tokenizer.sep_token_id
pad_token_idx = tokenizer.pad_token_id
unk_token_idx = tokenizer.unk_token_id

print(init_token_idx, eos_token_idx, pad_token_idx, unk_token_idx)

101 102 0 100


In [0]:
max_input_length = tokenizer.max_model_input_sizes['bert-base-uncased']

print(max_input_length)

512


In [0]:
def tokenize_and_cut(sentence):
    tokens = tokenizer.tokenize(sentence) 
    tokens = tokens[:max_input_length-2]
    return tokens

In [0]:
from torchtext import data

TEXT = data.Field(batch_first = True,
                  use_vocab = False,
                  tokenize = tokenize_and_cut,
                  preprocessing = tokenizer.convert_tokens_to_ids,
                  init_token = init_token_idx,
                  eos_token = eos_token_idx,
                  pad_token = pad_token_idx,
                  unk_token = unk_token_idx)

LABEL = data.LabelField(dtype = torch.float)

In [0]:
from torchtext import datasets

train_data, test_data = datasets.IMDB.splits(TEXT, LABEL)

train_data, valid_data = train_data.split(random_state = random.seed(SEED))

downloading aclImdb_v1.tar.gz


aclImdb_v1.tar.gz: 100%|██████████| 84.1M/84.1M [00:01<00:00, 67.9MB/s]


In [0]:
print(f"Number of training examples: {len(train_data)}")
print(f"Number of validation examples: {len(valid_data)}")
print(f"Number of testing examples: {len(test_data)}")

Number of training examples: 17500
Number of validation examples: 7500
Number of testing examples: 25000


In [0]:
print(vars(train_data.examples[6]))

{'text': [2821, 6203, 2935, 1012, 2023, 3185, 1012, 1012, 1012, 2009, 2001, 9202, 1012, 1045, 2572, 1037, 4121, 5470, 1997, 5469, 5691, 1012, 1998, 2087, 1997, 1996, 2051, 1010, 5469, 5691, 2060, 2111, 2360, 2024, 2919, 1010, 1045, 2066, 1012, 1996, 3364, 2040, 2209, 1005, 12665, 24375, 1005, 2001, 6429, 1010, 1045, 2097, 2360, 2008, 1012, 2021, 2023, 5436, 2001, 9643, 1012, 2009, 2081, 2053, 3168, 999, 2009, 2018, 2126, 2205, 2172, 13638, 1010, 1998, 2019, 14203, 1006, 1998, 10073, 2075, 1007, 3348, 3496, 2012, 1996, 2927, 1012, 1045, 2079, 2903, 1996, 2472, 2001, 2667, 2000, 2022, 1005, 16880, 1005, 2030, 2054, 17048, 1010, 2021, 2009, 2074, 2234, 2041, 9643, 1012, 2000, 5587, 2000, 1996, 8632, 1997, 17037, 7999, 10231, 2027, 2170, 1037, 5436, 1010, 1996, 5889, 1006, 4661, 1005, 12665, 24375, 1005, 1007, 2057, 1005, 2128, 9643, 1010, 1998, 1045, 8725, 2061, 2210, 2055, 2068, 2008, 1045, 2574, 9471, 2040, 2001, 2040, 1012, 1999, 7091, 1010, 2023, 3185, 2081, 2033, 5305, 1012, 2065, 20

In [0]:
tokens = tokenizer.convert_ids_to_tokens(vars(train_data.examples[5])['text'])

print(tokens)

['this', 'movie', 'started', 'slowly', ',', 'then', 'gained', 'momentum', 'towards', 'the', 'middle', '.', 'however', ',', 'the', 'fact', 'that', 'the', 'movie', 'ran', 'over', 'two', 'nights', 'broke', 'that', 'momentum', 'at', 'its', 'peak', '.', 'the', 'second', 'part', 'really', 'got', 'interesting', ',', 'but', 'then', 'gave', 'way', 'to', 'a', 'simply', 'pathetic', 'ending', '.', 'playing', 'football', 'in', 'the', 'yard', '?', 'really', ',', 'could', 'it', 'get', 'any', 'more', 'sap', '##py', 'and', 'maud', '##lin', '?', 'now', 'i', 'hear', 'plans', 'for', 'a', 'similar', 'movie', 'based', 'on', 'the', "'", '70s', '.', 'i', 'won', "'", 't', 'make', 'any', 'great', 'efforts', 'to', 'tune', 'into', 'that', 'one', 'if', 'it', "'", 's', 'anything', 'like', '"', 'the', "'", '60s', '.', '"']


In [0]:
LABEL.build_vocab(train_data)

In [0]:
print(LABEL.vocab.stoi)

defaultdict(<function _default_unk_index at 0x7f37bf561488>, {'neg': 0, 'pos': 1})


In [0]:
BATCH_SIZE = 128

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data), 
    batch_size = BATCH_SIZE, 
    device = device)

In [0]:
from transformers import BertTokenizer, BertModel

bert = BertModel.from_pretrained('bert-base-uncased')

HBox(children=(IntProgress(value=0, description='Downloading', max=361, style=ProgressStyle(description_width=…




HBox(children=(IntProgress(value=0, description='Downloading', max=440473133, style=ProgressStyle(description_…




In [0]:
import torch.nn as nn

class BERTGRUSentiment(nn.Module):
    def __init__(self,
                 bert,
                 hidden_dim,
                 output_dim,
                 n_layers,
                 bidirectional,
                 dropout):       
        super().__init__()        
        self.bert = bert       
        embedding_dim = bert.config.to_dict()['hidden_size']       
        self.rnn = nn.GRU(embedding_dim,
                          hidden_dim,
                          num_layers = n_layers,
                          bidirectional = bidirectional,
                          batch_first = True,
                          dropout = 0 if n_layers < 2 else dropout)        
        self.out = nn.Linear(hidden_dim * 2 if bidirectional else hidden_dim, output_dim)        
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, text):      
        #text = [batch size, sent len]       
        with torch.no_grad():
            embedded = self.bert(text)[0]         
        #embedded = [batch size, sent len, emb dim] 
        _, hidden = self.rnn(embedded)
        #hidden = [n layers * n directions, batch size, emb dim]
        if self.rnn.bidirectional:
            hidden = self.dropout(torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim = 1))
        else:
            hidden = self.dropout(hidden[-1,:,:])  
        #hidden = [batch size, hid dim]
        output = self.out(hidden)
        #output = [batch size, out dim]
        return output

In [0]:
HIDDEN_DIM = 256
OUTPUT_DIM = 1
N_LAYERS = 2
BIDIRECTIONAL = True
DROPOUT = 0.25

model = BERTGRUSentiment(bert,
                         HIDDEN_DIM,
                         OUTPUT_DIM,
                         N_LAYERS,
                         BIDIRECTIONAL,
                         DROPOUT)

In [0]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 112,241,409 trainable parameters


In [0]:
for name, param in model.named_parameters():                
    if name.startswith('bert'):
        param.requires_grad = False

In [0]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 2,759,169 trainable parameters


In [0]:
for name, param in model.named_parameters():                
    if param.requires_grad:
        print(name)

rnn.weight_ih_l0
rnn.weight_hh_l0
rnn.bias_ih_l0
rnn.bias_hh_l0
rnn.weight_ih_l0_reverse
rnn.weight_hh_l0_reverse
rnn.bias_ih_l0_reverse
rnn.bias_hh_l0_reverse
rnn.weight_ih_l1
rnn.weight_hh_l1
rnn.bias_ih_l1
rnn.bias_hh_l1
rnn.weight_ih_l1_reverse
rnn.weight_hh_l1_reverse
rnn.bias_ih_l1_reverse
rnn.bias_hh_l1_reverse
out.weight
out.bias


In [0]:
import torch.optim as optim

optimizer = optim.Adam(model.parameters())

In [0]:
criterion = nn.BCEWithLogitsLoss()

In [0]:
model = model.to(device)
criterion = criterion.to(device)

In [0]:
def binary_accuracy(preds, y):
    """
    Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8
    """

    #round predictions to the closest integer
    rounded_preds = torch.round(torch.sigmoid(preds))
    correct = (rounded_preds == y).float() #convert into float for division 
    acc = correct.sum() / len(correct)
    return acc

In [0]:
def train(model, iterator, optimizer, criterion):    
    epoch_loss = 0
    epoch_acc = 0
    
    model.train()
    
    for batch in iterator:
        optimizer.zero_grad()
        predictions = model(batch.text).squeeze(1)
        loss = criterion(predictions, batch.label)
        acc = binary_accuracy(predictions, batch.label)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
        epoch_acc += acc.item()
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [0]:
def evaluate(model, iterator, criterion):
    epoch_loss = 0
    epoch_acc = 0
    model.eval()
    with torch.no_grad():
        for batch in iterator:
            predictions = model(batch.text).squeeze(1)
            loss = criterion(predictions, batch.label)            
            acc = binary_accuracy(predictions, batch.label)
            epoch_loss += loss.item()
            epoch_acc += acc.item()       
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [0]:
import time

def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [0]:
N_EPOCHS = 5
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    
    start_time = time.time()
    
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
        
    end_time = time.time()
        
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
        
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'tut6-model.pt')
    
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

Epoch: 01 | Epoch Time: 6m 46s
	Train Loss: 0.485 | Train Acc: 75.80%
	 Val. Loss: 0.303 |  Val. Acc: 87.60%
Epoch: 02 | Epoch Time: 6m 46s
	Train Loss: 0.273 | Train Acc: 88.93%
	 Val. Loss: 0.244 |  Val. Acc: 89.79%
Epoch: 03 | Epoch Time: 6m 46s
	Train Loss: 0.234 | Train Acc: 90.61%
	 Val. Loss: 0.212 |  Val. Acc: 91.50%
Epoch: 04 | Epoch Time: 6m 46s
	Train Loss: 0.203 | Train Acc: 92.07%
	 Val. Loss: 0.225 |  Val. Acc: 90.99%
Epoch: 05 | Epoch Time: 6m 46s
	Train Loss: 0.176 | Train Acc: 93.41%
	 Val. Loss: 0.274 |  Val. Acc: 89.15%


In [0]:
model.load_state_dict(torch.load('tut6-model.pt'))

test_loss, test_acc = evaluate(model, test_iterator, criterion)

print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')

Test Loss: 0.203 | Test Acc: 91.89%


In [0]:
def predict_sentiment(model, tokenizer, sentence):
    model.eval()
    tokens = tokenizer.tokenize(sentence)
    tokens = tokens[:max_input_length-2]
    indexed = [init_token_idx] + tokenizer.convert_tokens_to_ids(tokens) + [eos_token_idx]
    tensor = torch.LongTensor(indexed).to(device)
    tensor = tensor.unsqueeze(0)
    prediction = torch.sigmoid(model(tensor))
    return prediction.item()

In [0]:
predict_sentiment(model, tokenizer, "This film is terrible")

0.01449573040008545

In [0]:
predict_sentiment(model, tokenizer, "This film is great")

0.9313568472862244

In [0]:
predict_sentiment(model, tokenizer, "As far as China and the United States are concerned, a tenuous truce seems to have been declared.")

0.5790801048278809

In [0]:
article = 'As far as China and the United States are concerned, a tenuous truce seems to have been declared. Phase 1, signed last week, eases some Trump administration sanctions on China in return for Beijing’s vow to step up its purchases of American farm products and other goods. The ongoing trade challenges lie not only with China. In an election year, the Europeans, with their trade surpluses in autos and luxury goods, could also be a tempting target for President Trump. Swing voters in the Rust Belt helped put Mr. Trump in the White House. Those states are reeling from a half-century of traumatic structural change: domestic deindustrialization compounded by the way global trade has shifted manufacturing jobs to places like China and Mexico from places like the Midwest. Furthermore, when officials like Robert Lighthizer, the United States trade representative, declare that the system must be changed because it was not designed to deal with the ascent of China, they have a point. China’s scale and pace of development, its no-holds-barred approach to competition and the authoritarian regime that backs it fundamentally put in question the liberal model of globalism and win-win trade relations. The Europeans increasingly look like the last man standing when it comes to free trade. The United States, with its long history of protectionism, was always a somewhat reluctant recruit to the camp of free trade. In the 21st century, with the emergence of China and India, the United States has company on the global stage. China, under its current rulers, is a resurgent and assertive nation-state that poses a fundamental challenge to the power position America built in Asia during the Cold War. What is at stake is more than trade. It is geopolitics. China is ruled by a Communist Party that pays lip service to the cult of Mao. America’s positions in Korea, Japan, Taiwan and the South China Sea are legacies of that era. But in the Cold War with the Soviet Union there was never the depth of economic, technological and cultural interconnection that the West has forged with China since the 1980s. As an alternative historical analogy, some are tempted to invoke the rise of Imperial Germany before 1914 with which the British Empire entertained a similar mixture of rivalry and cooperation. But that comparison belittles the significance of China’s re-emergence. Mr. Trump’s actions and policies put the onus more firmly on the Europeans and the Chinese to find a way of shaping the new environmental politics of trade — if necessary, without the United States. '
sentences = article.split('.')
score = 0
sentence_num = len(sentences)
for sentence in sentences:
  score += predict_sentiment(model, tokenizer, sentence)
final_score = score / sentence_num
print("文章最终得分: " + str(final_score))

if final_score > 0.5:
  print('positive')
else:
  print('negtive')

文章最终得分: 0.6552429030770841
positive


In [0]:
import xlwt,xlrd
from tqdm import tqdm
# from xlutils.copy import copy

rd = xlrd.open_workbook('/content/drive/My Drive/dataset/Final_dataset.xlsx')
table = rd.sheets()[0]
ncols = table.ncols
Contents = table.col(2, start_rowx=1, end_rowx=None)

scores = []

for i in tqdm(Contents):
  sentences = i.value.split('.')
  score = 0
  for sentence in sentences:
    score += predict_sentiment(model, tokenizer, sentence)
  final_score = score / len(sentences)
  scores.append(final_score)

In [0]:
scores

[0.6155946105718613,
 0.5503573179244995,
 0.6480177417397499,
 0.6324603408575058,
 0.6287717938423156,
 0.4910487234592438,
 0.6244489848613739,
 0.5390445291996002,
 0.5543693453073502,
 0.5948373794555664,
 0.521098181605339,
 0.5716308504343033,
 0.6993886431058248,
 0.6731832797328631,
 0.6718828777472178,
 0.5994740469115121,
 0.7019552886486053,
 0.6199848055839539,
 0.6567504370496386,
 0.6199848055839539,
 0.577016847829024,
 0.5694843182961146,
 0.6314482277347928,
 0.6097525060176849,
 0.6199848055839539,
 0.6961329480012258,
 0.514267647266388,
 0.6134646673997243,
 0.5199295461177826,
 0.6780834243847773,
 0.6240794020039695,
 0.5256382301449776,
 0.7083589434623718,
 0.6548911452293396,
 0.6735131502151489,
 0.6670482556025187,
 0.6786004304885864,
 0.563682809472084,
 0.6143944940783761,
 0.5956969559192657,
 0.6717874705791473,
 0.6506858170032501,
 0.5649502310487959,
 0.608856980737887,
 0.5766851007938385,
 0.7505759994188944,
 0.7375001162290573,
 0.640418228777972

In [0]:
!pip install xlutils
import xlwt,xlrd
from xlutils.copy import copy

rd = xlrd.open_workbook('/content/drive/My Drive/dataset/Final_dataset.xlsx')
wt = copy(rd)
sh = wt.get_sheet(0)
line = 1
for i in scores:
  sh.write(line,9,i)
  line += 1
wt.save('/content/drive/My Drive/dataset/Final_dataset.xlsx')

Collecting xlutils
[?25l  Downloading https://files.pythonhosted.org/packages/c7/55/e22ac73dbb316cabb5db28bef6c87044a95914f713a6e81b593f8a0d2f79/xlutils-2.0.0-py2.py3-none-any.whl (55kB)
[K     |██████                          | 10kB 22.9MB/s eta 0:00:01[K     |████████████                    | 20kB 3.1MB/s eta 0:00:01[K     |█████████████████▉              | 30kB 4.4MB/s eta 0:00:01[K     |███████████████████████▉        | 40kB 2.9MB/s eta 0:00:01[K     |█████████████████████████████▊  | 51kB 3.5MB/s eta 0:00:01[K     |████████████████████████████████| 61kB 2.9MB/s 
Installing collected packages: xlutils
Successfully installed xlutils-2.0.0
