In [1]:
import gensim
import torch
from torch.utils.data import DataLoader, TensorDataset
import nltk
import pandas as pd
from collections import Counter
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
from sklearn.model_selection import train_test_split
from sklearn import metrics
from transformers import BertTokenizer
from transformers import BertForSequenceClassification

In [56]:
data_file = ['./rt-polaritydata/rt-polarity-pos.txt','./rt-polaritydata/rt-polarity-neg.txt']

In [57]:
max_length = 68
PAD = 0

In [59]:
x_list = []
y_list = []
tokenizer = BertTokenizer.from_pretrained('./bert_base_uncase/')
for i in range(2):
    count = 0
    with open(data_file[i],'r',encoding='utf-8')as f:
        for line in f.readlines():
            count+=1
            sentence = line.strip().lower()
            x = list(tokenizer.encode(sentence))
            x = x[:max_length]
            n_pad = max_length - len(x)
            x = x + n_pad * [PAD]
            x_list.append(x)
        y_list = y_list + [i] *  count
X = np.array(x_list,dtype=np.int64)
Y = np.array(y_list,dtype=np.int64)

In [60]:
print(X.shape,Y.shape)

(10661, 68) (10661,)


In [61]:
x_train,x_test,y_train,y_test = train_test_split(X,Y,test_size=0.1,random_state=1)

In [62]:
train_dataset = TensorDataset(torch.from_numpy(x_train),torch.from_numpy(y_train))
test_dataset = TensorDataset(torch.from_numpy(x_test),torch.from_numpy(y_test))
train_loader = DataLoader(train_dataset,batch_size=16,shuffle=True)
test_loader = DataLoader(test_dataset,batch_size=16)

In [63]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)
!nvidia-smi

cuda
Tue Jul 14 15:05:27 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 440.100      Driver Version: 440.100      CUDA Version: 10.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  GeForce RTX 208...  Off  | 00000000:02:00.0 Off |                  N/A |
| 85%   73C    P2   215W / 250W |   6190MiB / 11019MiB |     60%      Default |
+-------------------------------+----------------------+----------------------+
|   1  GeForce RTX 208...  Off  | 00000000:04:00.0 Off |                  N/A |
| 93%   78C    P2   242W / 250W |   1384MiB / 11019MiB |     57%      Default |
+-------------------------------+----------------------+----------------------+
|   2  GeForce RTX 208...  Off  | 00000000:83:00.0 Off |                  N/A |
| 9

In [64]:
model = BertForSequenceClassification.from_pretrained('./bert_base_uncase/')

In [65]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
optimizer = torch.optim.Adam(model.parameters(),lr=1e-4)
loss_func = nn.CrossEntropyLoss()

In [76]:
for epoch in range(4):
    model.train()
    for batch_x,batch_y in train_loader:
        batch_x = batch_x.to(device)
        batch_y = batch_y.to(device)
        mask = []
        for single_sentence in batch_x:
            mask.append([1 if i != 0 else 0 for i in single_sentence])
        attention_mask = torch.Tensor(mask).to(device)
        batch_out = model(batch_x, attention_mask=attention_mask)
        #print(batch_out)
        loss = loss_func(batch_out[0],batch_y)
        #print(loss)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    model.eval()
    y_true = []
    y_pred = []
    for batch_x,batch_y in test_loader:
        batch_x = batch_x.to(device)
        batch_y = batch_y.to(device)
        mask = []
        for single_sentence in batch_x:
            mask.append([1 if i != 0 else 0 for i in single_sentence])
        attention_mask = torch.Tensor(mask).to(device)
        batch_pred = model(batch_x,attention_mask=attention_mask)[0].argmax(dim=-1)
        #print(batch_pred)
        for y in batch_pred.cpu().numpy():
            y_pred.append(y)
        for y in batch_y.cpu().numpy():
            y_true.append(y)
    accuracy = metrics.accuracy_score(y_true,y_pred)
    f1_score = metrics.f1_score(y_true,y_pred)
    print("epoch %d\nLoss:%.9f  Test_accuracy: %.9f,  Test_f1_score: %.8f" %(epoch+1,loss,accuracy,f1_score))

tensor([0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1], device='cuda:0')
tensor([1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0], device='cuda:0')
tensor([1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0], device='cuda:0')
tensor([1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0], device='cuda:0')
tensor([1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1], device='cuda:0')
tensor([0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0], device='cuda:0')
tensor([0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1], device='cuda:0')
tensor([1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0], device='cuda:0')
tensor([0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0], device='cuda:0')
tensor([1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0], device='cuda:0')
tensor([0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1], device='cuda:0')
tensor([1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1], device='cuda:0')
tensor([1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0], device='cuda:0')
tensor([1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 

# BERT test

In [2]:
from transformers import BertTokenizer
tokenizer = BertTokenizer.from_pretrained('./bert_base_uncase/')

In [3]:
text_batch = ["I love Pixar.", "I don't care for Pixar."]
tokenizer.encode(text_batch[0].strip().lower())

[101, 1045, 2293, 14255, 18684, 2099, 1012, 102]

In [41]:
tokenizer.encode(text_batch[0])

[101, 1045, 2293, 14255, 18684, 2099, 1012, 102]

In [68]:
input_ids= list(tokenizer.encode(text_batch[0]))
input_ids = np.array(input_ids,dtype=np.int64)

In [69]:
input_ids = torch.from_numpy(input_ids)

In [70]:
input_ids

tensor([  101,  1045,  2293, 14255, 18684,  2099,  1012,   102])

In [84]:
!pip list

Package                    Version            
-------------------------- -------------------
anaconda-client            1.7.2              
anaconda-navigator         1.9.2              
asn1crypto                 1.3.0              
attrs                      19.3.0             
backcall                   0.1.0              
bcrypt                     3.1.7              
beautifulsoup4             4.8.1              
bert-extractive-summarizer 0.3.0              
bleach                     3.1.0              
blis                       0.2.4              
boto                       2.49.0             
boto3                      1.14.12            
botocore                   1.17.12            
Bottleneck                 1.3.2              
cachetools                 4.1.0              
certifi                    2020.6.20          
cffi                       1.14.0             
chardet                    3.0.4              
Click                      7.0                
clyent       

In [4]:
encoding = tokenizer(text_batch, return_tensors='pt', padding=True, truncation=True)
input_ids = encoding['input_ids']
attention_mask = encoding['attention_mask']

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [5]:
encoding

{'input_ids': tensor([[  101,  1045,  2293, 14255, 18684,  2099,  1012,   102,     0,     0,
             0,     0],
        [  101,  1045,  2123,  1005,  1056,  2729,  2005, 14255, 18684,  2099,
          1012,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [71]:
labels = torch.tensor([1]).unsqueeze(0)

In [72]:
labels

tensor([[1]])

In [6]:
model = BertForSequenceClassification.from_pretrained('./bert_base_uncase/')

Some weights of the model checkpoint at ./bert_base_uncase/ were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint 

In [25]:
from transformers import BertModel
model = BertModel.from_pretrained('./bert_base_uncase/')

In [26]:
outputs = model(input_ids)

In [24]:
outputs[0][0]


tensor([[-0.2516, -0.1196, -0.0086,  ..., -0.2398,  0.8604,  0.1466],
        [ 0.2876,  0.2653, -0.4760,  ...,  0.0110,  0.6645,  0.1701],
        [ 1.3843,  0.6453,  0.4205,  ..., -0.3122,  0.6765,  0.2959],
        ...,
        [ 0.1922, -0.7967,  0.7331,  ..., -0.4473, -0.0265, -0.8457],
        [ 0.3355, -0.9831,  0.6524,  ..., -0.5149,  0.1389, -0.7667],
        [ 0.1508, -0.7344,  0.1803,  ..., -0.5091,  0.0310, -1.1885]],
       grad_fn=<SelectBackward>)

In [None]:
loss = outputs[0]
loss.backward()
optimizer.step()