In [1]:
import pandas as pd
import numpy as np
import torch
import tqdm
import os
from transformers import BertConfig, BertTokenizer, BertForSequenceClassification
from bert_utils import DocumentSentimentDataset, DocumentSentimentDataLoader, forward_sequence_classification, get_lr, count_param, metrics_to_string, set_seed, document_sentiment_metrics_fn, evaluate, training, testing

In [2]:
os.environ["CUDA_VISIBLE_DEVICES"]="7"
tf_device='/gpu:7'

In [3]:
tokenizer = BertTokenizer.from_pretrained("indolem/indobertweet-base-uncased")
config = BertConfig.from_pretrained("indolem/indobertweet-base-uncased")
config.num_labels = 2
model = BertForSequenceClassification.from_pretrained("indolem/indobertweet-base-uncased", config=config)

Some weights of the model checkpoint at indolem/indobert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indolem/indober

In [4]:
w2i, i2w = DocumentSentimentDataset.LABEL2INDEX, DocumentSentimentDataset.INDEX2LABEL
print(w2i)
print(i2w)

{0: 0, 1: 1}
{0: 'negative', 1: 'positive'}


In [5]:
train = DocumentSentimentDataset('../Dataset/Split/Two_Label/train_posneg_raw_data_sentiment2.csv', tokenizer)
valid = DocumentSentimentDataset('../Dataset/Split/Two_Label/valid_posneg_raw_data_sentiment2.csv', tokenizer)
test = DocumentSentimentDataset('../Dataset/Split/Two_Label/test_posneg_raw_data_sentiment2.csv', tokenizer)
train_loader = DocumentSentimentDataLoader(dataset=train, max_seq_len=256, batch_size=16, num_workers=1, shuffle=True)
valid_loader = DocumentSentimentDataLoader(dataset=valid, max_seq_len=256, batch_size=16, num_workers=1, shuffle=True)
test_loader = DocumentSentimentDataLoader(dataset=test, max_seq_len=256, batch_size=16, num_workers=1, shuffle=True)

In [6]:
torch.cuda.empty_cache()

In [7]:
optimizer = torch.optim.Adam(model.parameters(), lr=5e-5)
model = model.cuda()

In [8]:
set_seed(9112021)

In [9]:
training(model, 1, train_loader, valid_loader, optimizer, i2w)

(Epoch 1) TRAIN LOSS:0.5592 LR:0.00005000: 100%|██████████████████████████████████████| 298/298 [00:26<00:00, 11.45it/s]


(Epoch 1) TRAIN LOSS:0.5592 ACC:0.72 F1:0.72 REC:0.72 PRE:0.72 LR:0.00005000


VALID LOSS:0.4484 ACC:0.79 F1:0.79 REC:0.79 PRE:0.79: 100%|█████████████████████████████| 75/75 [00:02<00:00, 30.48it/s]


26.189090728759766
5.766054153442383


In [10]:
testing(model, test_loader, i2w)

TEST LOSS:0.4144 ACC:0.81 F1:0.81 REC:0.81 PRE:0.81: 100%|██████████████████████████████| 94/94 [00:02<00:00, 33.21it/s]

2.9615983963012695
TEST LOSS:1.0000 0.4143971654962986



