In [1]:
import gensim
import torch
from torch.utils.data import DataLoader, TensorDataset
import nltk
import pandas as pd
from collections import Counter
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
from sklearn.model_selection import train_test_split
from sklearn import metrics
from transformers import BertTokenizer
from transformers import BertForSequenceClassification

In [2]:
# Input corpora, one review per line. The list order matters: the position of
# each path in this list is used as the class label when the data is loaded.
data_file = [
    './rt-polaritydata/rt-polarity-pos.txt',
    './rt-polaritydata/rt-polarity-neg.txt',
]

In [3]:
# Fixed sequence length: every encoded sentence is truncated or padded to this
# many token ids so the rows can be stacked into one array.
max_length = 68
# Id used to fill the padded tail of short sequences.
# NOTE(review): presumably equal to the tokenizer's [PAD] id — confirm against
# tokenizer.pad_token_id.
PAD = 0

In [4]:
# Build the dataset: each review becomes a row of exactly `max_length` BERT
# token ids, and its label is the index of the file it came from in
# `data_file` (0 for the first file, 1 for the second).
x_list = []
y_list = []
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
for label, path in enumerate(data_file):
    with open(path, 'r', encoding='utf-8') as f:
        # Stream the file line by line instead of materializing it with readlines().
        for line in f:
            sentence = line.strip().lower()
            # encode() adds the special tokens: [CLS] ... [SEP].
            x = tokenizer.encode(sentence)
            if len(x) > max_length:
                # A plain slice would drop the closing [SEP]; keep it as the
                # final token of a truncated sequence.
                x = x[:max_length - 1] + [tokenizer.sep_token_id]
            # Right-pad short sequences up to the fixed length.
            x = x + (max_length - len(x)) * [PAD]
            x_list.append(x)
            # Append the label together with its row so the two lists cannot drift apart.
            y_list.append(label)
X = np.array(x_list, dtype=np.int64)
Y = np.array(y_list, dtype=np.int64)

In [5]:
# Sanity check: X holds one fixed-length row of token ids per sentence and Y
# one label per sentence, so their first dimensions must match.
print(X.shape,Y.shape)

(10661, 68) (10661,)


In [6]:
# Hold out 10% of the samples for evaluation; the fixed random_state keeps the
# split identical across runs.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.1,
    random_state=1,
)

In [7]:
# Wrap the numpy splits as tensor datasets (from_numpy shares memory with the
# arrays rather than copying them).
train_dataset = TensorDataset(*(torch.from_numpy(arr) for arr in (x_train, y_train)))
test_dataset = TensorDataset(*(torch.from_numpy(arr) for arr in (x_test, y_test)))

# Mini-batches of 16: training batches are reshuffled every epoch, evaluation
# batches keep the dataset order.
train_loader = DataLoader(dataset=train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=16, shuffle=False)

In [8]:
# Prefer the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)
# IPython shell escape: show the GPUs and their current utilisation.
!nvidia-smi

cuda
Mon Jul 13 14:10:42 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 440.100      Driver Version: 440.100      CUDA Version: 10.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  GeForce RTX 208...  Off  | 00000000:02:00.0 Off |                  N/A |
| 30%   30C    P8    14W / 250W |   1056MiB / 11019MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
|   1  GeForce RTX 208...  Off  | 00000000:04:00.0 Off |                  N/A |
| 30%   37C    P8    17W / 250W |     11MiB / 11019MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
|   2  GeForce RTX 208...  Off  | 00000000:83:00.0 Off |                  N/A |
| 3

In [None]:
# Load the pretrained BERT encoder with a sequence-classification head.
model = BertForSequenceClassification.from_pretrained('bert-base-uncased')

# Pick the compute device (GPU when available) and move the model onto it.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(device)
model = model.to(device)

# All model parameters are optimized with Adam; the loss is computed outside
# the model from its logits.
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-4)
loss_func = nn.CrossEntropyLoss()

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=435779157.0, style=ProgressStyle(descri…

In [None]:
# Fine-tune for 4 epochs; after each epoch, score the model on the held-out split.
for epoch in range(4):
    model.train()
    for batch_x, batch_y in train_loader:
        batch_x = batch_x.to(device)
        batch_y = batch_y.to(device)
        # Attention mask as a tensor on the same device as the inputs:
        # 1 for real tokens, 0 for padding. Built in one vectorized comparison
        # instead of iterating over every element of the batch in Python.
        mask = (batch_x != PAD).long()
        # No labels are passed, so the first output element holds the logits.
        logits = model(batch_x, attention_mask=mask)[0]
        loss = loss_func(logits, batch_y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    model.eval()
    y_true = []
    y_pred = []
    # Evaluation needs no gradients; skipping the autograd graph saves memory and time.
    with torch.no_grad():
        for batch_x, batch_y in test_loader:
            batch_x = batch_x.to(device)
            mask = (batch_x != PAD).long()
            # Index the logits out of the model output before taking the argmax.
            batch_pred = model(batch_x, attention_mask=mask)[0].argmax(dim=-1)
            y_pred.extend(batch_pred.cpu().tolist())
            # Labels are only needed on the host for the metrics.
            y_true.extend(batch_y.tolist())
    accuracy = metrics.accuracy_score(y_true, y_pred)
    f1_score = metrics.f1_score(y_true, y_pred)
    # `loss` is the loss of the last training mini-batch of this epoch;
    # .item() turns it into a plain float for formatting.
    print("epoch %d\nLoss:%.9f  Test_accuracy: %.9f,  Test_f1_score: %.8f" %(epoch+1,loss.item(),accuracy,f1_score))

# BERT test

In [40]:
from transformers import BertTokenizer

# Scratch setup for inspecting the tokenizer: the uncased BERT vocabulary and
# two short example sentences.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
text_batch = [
    "I love Pixar.",
    "I don't care for Pixar.",
]

In [49]:
# Encode the first example after stripping whitespace and lower-casing it by hand.
tokenizer.encode(text_batch[0].strip().lower())

[101, 1045, 2293, 14255, 18684, 2099, 1012, 102]

In [45]:
# Encode the same example without manual normalization, for comparison with
# the stripped/lower-cased variant.
tokenizer.encode(text_batch[0])

[101, 1045, 2293, 14255, 18684, 2099, 1012, 102]

In [43]:
sentence = "I love Pixar."
ids =[tokenizer.convert_tokens_to_ids(i) for i in tokenizer.tokenize(sentence)]

In [44]:
# Display the ids produced by tokenize() + convert_tokens_to_ids().
ids

[1045, 2293, 14255, 18684, 2099, 1012]

In [None]:
# NOTE(review): not referenced by any visible cell — the DataLoaders are built
# with a hard-coded batch_size=16. Confirm whether this is still needed.
batch_size = 64

In [None]:
# One illustrative optimization step using the loss computed inside the model.
# The inputs come from a single training mini-batch so that ids, mask and
# labels are all defined and have matching batch sizes.
input_ids, labels = next(iter(train_loader))
input_ids = input_ids.to(device)
labels = labels.to(device)
# 1 for real tokens, 0 for padding.
attention_mask = (input_ids != PAD).long()

model.train()
# Clear gradients left over from any previous step before accumulating new ones.
optimizer.zero_grad()
outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
# When labels are supplied, the first output element is the classification loss.
loss = outputs[0]
loss.backward()
optimizer.step()