In [1]:
from transformers import pipeline
import torch
import torch.nn as nn
import torch.optim as opt
import torch.nn.functional as F
from torch.utils.data import Dataset,DataLoader
from torch.nn.utils.rnn import pad_sequence
import os
import re
import numpy as np
from tqdm import tqdm
import torch.utils.data as data
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')
# Single place that names the pretrained checkpoint used by the pipeline,
# the tokenizer and the classification model below.
BERT_CHECKPOINT = "bert-base-uncased"

# Fill-mask pipeline built on the checkpoint.
pipe = pipeline("fill-mask", model=BERT_CHECKPOINT)

# Load the tokenizer and the sequence-classification model directly.
from transformers import AutoTokenizer, AutoModelForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained(BERT_CHECKPOINT)
model = AutoModelForSequenceClassification.from_pretrained(BERT_CHECKPOINT)


  from .autonotebook import tqdm as notebook_tqdm
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.La

In [2]:
# Location of the IMDB review CSV. Set the IMDB_CSV environment variable to
# point somewhere else; the default is the path this notebook was written with.
IMDB_CSV_PATH = os.environ.get(
    'IMDB_CSV', '/home/P78081057/NCKU_NLP_Practice/IMDB Dataset.csv'
)
# Text label -> integer class id.
SENTIMENT_TO_ID = {'positive': 1, 'negative': 0}

df = pd.read_csv(IMDB_CSV_PATH)

# Series.map turns every value that is missing from the mapping into NaN,
# which would only show up later as float labels. Fail here, naming the
# offending values, instead of letting them through silently.
unmapped = df.loc[~df['sentiment'].isin(list(SENTIMENT_TO_ID)), 'sentiment']
if not unmapped.empty:
    raise ValueError(
        f"Unexpected sentiment values in {IMDB_CSV_PATH!r}: "
        f"{unmapped.unique().tolist()!r}"
    )

df['sentiment'] = df['sentiment'].map(SENTIMENT_TO_ID)
print(df['sentiment'])

# Plain Python lists of review texts and their integer labels.
reviews = df['review'].tolist()
sentiments = df['sentiment'].tolist()


0        1
1        1
2        1
3        0
4        1
        ..
49995    1
49996    0
49997    0
49998    0
49999    0
Name: sentiment, Length: 50000, dtype: int64


In [3]:
# Random seed used to control the random process.
random_seed = 42

# Fraction of the data to set aside as validation data.
valid_ratio = 0.2

# train_test_split hands back the pieces in the order
# (train texts, validation texts, train labels, validation labels).
split_parts = train_test_split(
    reviews, sentiments, random_state=random_seed, test_size=valid_ratio
)
train_texts, val_texts, train_labels, val_labels = split_parts

In [4]:
# Both splits are tokenized with the same options: truncation and padding
# are switched on for the whole list of texts in each call.
tokenize_options = {'truncation': True, 'padding': True}

train_encodings = tokenizer(train_texts, **tokenize_options)
val_encodings = tokenizer(val_texts, **tokenize_options)

In [5]:

# Inspect the validation encodings: first the available fields, then the
# token ids and the attention mask of the first validation review.
print(val_encodings.keys())
for field in ('input_ids', 'attention_mask'):
    print(val_encodings[field][0])

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])
[101, 1045, 2428, 4669, 2023, 10945, 10278, 2349, 2000, 1996, 2298, 1997, 1996, 5196, 1010, 1996, 14694, 1998, 2074, 1996, 2298, 3452, 2001, 5875, 2000, 2033, 2005, 2070, 3114, 1012, 4312, 2015, 1010, 2023, 2071, 2031, 2042, 2028, 1997, 1996, 2190, 10945, 10278, 1005, 1055, 2412, 2065, 1996, 16779, 2134, 1005, 1056, 2031, 17244, 11320, 4590, 1999, 1996, 2364, 2724, 2114, 28758, 9759, 2532, 1010, 2085, 2005, 2009, 1005, 1055, 2051, 2009, 2001, 7929, 2000, 2031, 1037, 4121, 6638, 2158, 5443, 1037, 2844, 2158, 2021, 1045, 1005, 1049, 5580, 2335, 2031, 2904, 1012, 2009, 2001, 1037, 6659, 2364, 2724, 2074, 2066, 2296, 2674, 11320, 4590, 2003, 1999, 2003, 6659, 1012, 2060, 3503, 2006, 1996, 4003, 2020, 15082, 12716, 5443, 6945, 4487, 11607, 3366, 1010, 21264, 3428, 5443, 16581, 4230, 1010, 13218, 17784, 5443, 20099, 21863, 2075, 1010, 2023, 2001, 1996, 2724, 2073, 13218, 2315, 2010, 2502, 6071, 1997, 1037, 2303, 3457, 7937, 1010, 2

In [8]:
class IMDbDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenized reviews with their labels.

    Args:
        encodings: Mapping from field name to a per-example sequence; the
            tokenizer output is used for this in the notebook.
        labels: Sequence of labels, one per example, indexed like the
            encodings.
    """

    def __init__(self, encodings, labels):
        # Keep references only; tensors are created lazily per item.
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        # The number of examples is the number of labels.
        return len(self.labels)

    def __getitem__(self, idx):
        # Build one example: every encoding field at position idx becomes a
        # tensor, and the label is added last under the 'labels' key.
        sample = {}
        for field, column in self.encodings.items():
            sample[field] = torch.tensor(column[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# Wrap each split's encodings and labels in the dataset class.
train_dataset, val_dataset = (
    IMDbDataset(train_encodings, train_labels),
    IMDbDataset(val_encodings, val_labels),
)
# test_dataset = IMDbDataset(test_encodings, test_labels)  # disabled in the original cell

In [9]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def compute_metrics(pred):
    """Compute classification metrics for one evaluation pass.

    Args:
        pred: Object exposing ``label_ids`` (reference labels) and
            ``predictions`` (per-class scores; the argmax is taken along
            axis 1 to obtain the predicted class).

    Returns:
        dict: ``accuracy``, ``f1``, ``precision`` and ``recall``, where the
        last three are computed with ``average='binary'``.
    """
    y_true = pred.label_ids
    # Predicted class = index of the largest score in each row.
    y_pred = np.argmax(pred.predictions, axis=1)

    acc = accuracy_score(y_true, y_pred)
    # The fourth returned element is not needed here and is dropped.
    prf = precision_recall_fscore_support(y_true, y_pred, average='binary')
    precision, recall, f1 = prf[:3]

    return dict(accuracy=acc, f1=f1, precision=precision, recall=recall)


In [10]:
import transformers
# Training configuration, grouped by concern.
training_args = transformers.TrainingArguments(
    # --- reproducibility, outputs and logging ---
    seed=random_seed,
    output_dir='./results',              # output directory
    logging_dir='./logs',                # directory that holds the logs
    report_to='tensorboard',             # report training results to TensorBoard
    logging_steps=10,
    # --- optimisation ---
    num_train_epochs=3,                  # total number of training epochs
    learning_rate=2e-5,                  # learning rate
    weight_decay=0.01,                   # weight decay used by the optimizer
    warmup_steps=500,                    # learning-rate scheduler parameter
    per_device_train_batch_size=8,       # per-device batch size for training
    gradient_accumulation_steps=2,       # number of gradient accumulation steps
    # --- evaluation and checkpointing ---
    per_device_eval_batch_size=64,       # per-device batch size for evaluation
    evaluation_strategy='steps',         # when to evaluate
    eval_steps=500,                      # evaluate every this many steps
    save_strategy='steps',               # when to save
    save_steps=500,                      # save the model every this many steps
    save_total_limit=10,                 # maximum number of saved models
    load_best_model_at_end=True,         # load the best model when training ends
    metric_for_best_model='eval_loss',   # metric that defines the best model
)

trainer = transformers.Trainer(
    model=model,                         # the 🤗 model
    args=training_args,                  # arguments required by the Trainer
    compute_metrics=compute_metrics,     # custom evaluation metrics
    train_dataset=train_dataset,         # training set (a PyTorch Dataset)
    eval_dataset=val_dataset,            # validation set (a PyTorch Dataset), evaluated during training
)

# Restrict training to 1 GPU.
# NOTE(review): `_n_gpu` is an underscore-prefixed attribute of the arguments
# object; confirm this override is still honoured by the installed version.
trainer.args._n_gpu = 1

# Start training; as the last expression its return value is displayed.
trainer.train()


Step,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
500,0.2495,0.230097,0.9111,0.912491,0.905273,0.919825
1000,0.2508,0.217491,0.9195,0.922499,0.895849,0.950784
1500,0.1681,0.199067,0.9278,0.926775,0.947729,0.906728
2000,0.1584,0.188306,0.9315,0.932023,0.932116,0.931931
2500,0.2097,0.177908,0.9382,0.937939,0.94938,0.926771
3000,0.0556,0.212629,0.94,0.940828,0.935111,0.946616
3500,0.122,0.224284,0.9334,0.932165,0.957522,0.908117
4000,0.0754,0.203566,0.9422,0.943133,0.93522,0.951181
4500,0.087,0.212819,0.9413,0.941023,0.952991,0.929351
5000,0.0558,0.199207,0.9428,0.944053,0.930762,0.95773


TrainOutput(global_step=7500, training_loss=0.1554348524181793, metrics={'train_runtime': 15452.8842, 'train_samples_per_second': 7.766, 'train_steps_per_second': 0.485, 'total_flos': 3.15733266432e+16, 'train_loss': 0.1554348524181793, 'epoch': 3.0})

In [11]:
# Display the model object (its module tree) as the cell output; this is the
# same `model` that was passed to the Trainer above.
model

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,