<a href="https://colab.research.google.com/github/siting1206/NLP_FinalProject/blob/main/DeBERTa_CoLA_test.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Installation & Import Package

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import torch
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

## Download the CoLA Dataset


In [3]:
!pip install git+https://github.com/Adapter-Hub/adapter-transformers.git
!pip install wget

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting git+https://github.com/Adapter-Hub/adapter-transformers.git
  Cloning https://github.com/Adapter-Hub/adapter-transformers.git to /tmp/pip-req-build-f3i3nbh2
  Running command git clone -q https://github.com/Adapter-Hub/adapter-transformers.git /tmp/pip-req-build-f3i3nbh2
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 36.2 MB/s 
Collecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.11.1-py3-none-any.whl (182 kB)
[K     |████████████████████████████████| 182 kB 79.3 MB/s 
Building wheels for collected packages: adapter-transformers
  Building wheel for adapte

In [4]:
import wget
import os

print('Downloading dataset...')

# 資料集的下載連結
url = 'https://nyu-mll.github.io/CoLA/cola_public_1.1.zip'

# 如本地沒有，則下載資料集 
if not os.path.exists('/content/drive/MyDrive/NLP_final_project/cola_public_1.1.zip'):
    wget.download(url, '/content/drive/MyDrive/NLP_final_project/cola_public_1.1.zip')

Downloading dataset...


In [5]:
# 如果沒解壓過，則解壓zip包
if not os.path.exists('/content/drive/MyDrive/NLP_final_project/cola_public/'):
    !unzip /content/drive/MyDrive/NLP_final_project/cola_public_1.1.zip

In [6]:
import pandas as pd

# 載入資料集到 pandas 的 dataframe 中
df = pd.read_csv("/content/drive/MyDrive/NLP_final_project/cola_public/raw/in_domain_train.tsv", delimiter='\t', header=None, names=['sentence_source', 'label', 'label_notes', 'sentence'])

# 列印資料集的記錄數
print('Number of training sentences: {:,}\n'.format(df.shape[0]))

# 隨機預覽10筆資料 主要看sentence, label(0-語法不可接受, 1-語法可接受)
df.sample(10)

Number of training sentences: 8,551



Unnamed: 0,sentence_source,label,label_notes,sentence
3505,ks08,1,,There is eager to be fifty students in this cl...
227,cj99,1,,"Once Janet left, Fred became much crazier."
2250,l-93,1,,The mouse nibbled the cheese.
5332,b_73,1,,I've never seen a man taller than my father.
1220,r-67,1,,That gangsters had bribed him was denied by th...
7704,ad03,0,*,What I said that was we would go.
1539,r-67,0,*,John is prouder of having gone than nobody exp...
6772,m_02,0,*,Because Dr Jones ate too much rich food didn't...
6609,m_02,1,,Which club did you hit the winning putt with?
6368,d_98,1,,Any philosopher is sometimes wrong.


In [7]:
# 隨機抽樣五筆語法上不可接受的資料
df.loc[df.label == 0].sample(5)[['sentence', 'label']]

Unnamed: 0,sentence,label
6933,We are knowing this theory.,0
360,How do you wonder whether John said Mary solve...,0
466,John was unknown to be the murderer.,0
4690,Loren was relied on by Pavarotti and Hepburn o...,0
7108,Kim likes Sandy and Lee likes to Leslie.,0


In [8]:
# 構建 sentences 和 labels 列表
sentences = df.sentence.values
labels = df.label.values

In [9]:
from transformers import DebertaTokenizer
print('Loading DeBERTa tokenizer...')
tokenizer = DebertaTokenizer.from_pretrained("dweb/deberta-base-CoLA")

Loading DeBERTa tokenizer...


Downloading:   0%|          | 0.00/798k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/778 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

In [10]:
print(' Original: ', sentences[0])

print('Tokenized: ', tokenizer.tokenize(sentences[0]))

print('Token IDs: ', tokenizer.convert_tokens_to_ids(tokenizer.tokenize(sentences[0])))

 Original:  Our friends won't buy this analysis, let alone the next one we propose.
Tokenized:  ['Our', 'Ġfriends', 'Ġwon', "'t", 'Ġbuy', 'Ġthis', 'Ġanalysis', ',', 'Ġlet', 'Ġalone', 'Ġthe', 'Ġnext', 'Ġone', 'Ġwe', 'Ġpropose', '.']
Token IDs:  [2522, 964, 351, 75, 907, 42, 1966, 6, 905, 1937, 5, 220, 65, 52, 15393, 4]


In [11]:
max_len = 0
for sent in sentences:

    # 將文字分詞，並新增 `[CLS]` 和 `[SEP]` 符號
    input_ids = tokenizer.encode(sent, add_special_tokens=True)
    max_len = max(max_len, len(input_ids))

print('Max sentence length: ', max_len)

Max sentence length:  47


In [12]:
# 將資料集分完詞後儲存到列表中
input_ids = []
attention_masks = []
token_type_ids = []

for sent in sentences:
    encoded_dict = tokenizer.encode_plus(
        sent,              # 輸入文字
        add_special_tokens = True,   # 新增 '[CLS]' 和 '[SEP]'
        max_length = 64,        # 填充 & 截斷長度
        pad_to_max_length = True,
        return_attention_mask = True, # 返回 attn. masks.
        return_tensors = 'pt',     # 返回 pytorch tensors 格式的資料
    )
     
    input_ids.append(encoded_dict['input_ids'])
    
    attention_masks.append(encoded_dict['attention_mask'])

    token_type_ids.append(encoded_dict['token_type_ids'])



# 將列表轉換為 tensor
input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
token_type_ids = torch.cat(token_type_ids, dim=0)

# 輸出第 1 行文字的原始和編碼後的資訊
print('Original: ', sentences[0])
print('Token IDs:', input_ids[0])

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Original:  Our friends won't buy this analysis, let alone the next one we propose.
Token IDs: tensor([    1,  2522,   964,   351,    75,   907,    42,  1966,     6,   905,
         1937,     5,   220,    65,    52, 15393,     4,     2,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0])


### 將 90% 的資料集作為訓練集，剩下的 10% 作為驗證集

In [14]:
from torch.utils.data import TensorDataset, random_split

# 將輸入資料合併為 TensorDataset 物件
dataset = TensorDataset(input_ids, attention_masks, token_type_ids)

# 計算訓練集和驗證集大小
train_size = int(0.9 * len(dataset))
val_size = len(dataset) - train_size

# 按照資料大小隨機拆分訓練集和測試集
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

print('{:>5,} training samples'.format(train_size))
print('{:>5,} validation samples'.format(val_size))

7,695 training samples
  856 validation samples


In [15]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

# 在 fine-tune 的訓練中，BERT 建議的小批量為 16 或 32
batch_size = 32

# 為訓練和驗證集建立 Dataloader，對訓練樣本隨機洗牌
train_dataloader = DataLoader(
            train_dataset,  # 訓練樣本
            sampler = RandomSampler(train_dataset), # 隨機小批量
            batch_size = batch_size # 以小批量進行訓練
        )

# 驗證集不需要隨機化，這裡順序讀取就好
validation_dataloader = DataLoader(
            val_dataset, # 驗證樣本
            sampler = SequentialSampler(val_dataset), # 順序選取小批量
            batch_size = batch_size 
        )

In [16]:
from transformers import DebertaConfig, DebertaModel, AdamW, AdapterType

# 載入 BertForSequenceClassification, 預訓練 BERT 模型 + 頂層的線性分類層 
model = DebertaModel.from_pretrained(
    "dweb/deberta-base-CoLA", # 預訓練模型
    num_labels = 2, # 分類數 --2 表示二分類
                    # 你可以改變這個數字，用於多分類任務  
    output_attentions = False, # 模型是否返回 attentions weights.
    output_hidden_states = False, # 模型是否返回所有隱層狀態.
    return_dict=False
)

# 在 gpu 中執行該模型
model.cuda()

Downloading:   0%|          | 0.00/787 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/557M [00:00<?, ?B/s]

Some weights of the model checkpoint at dweb/deberta-base-CoLA were not used when initializing DebertaModel: ['classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight', 'classifier.bias']
- This IS expected if you are initializing DebertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


DebertaModel(
  (shared_parameters): ModuleDict()
  (invertible_adapters): ModuleDict()
  (embeddings): DebertaEmbeddings(
    (word_embeddings): Embedding(50265, 768, padding_idx=0)
    (LayerNorm): DebertaLayerNorm()
    (dropout): StableDropout()
  )
  (encoder): DebertaEncoder(
    (layer): ModuleList(
      (0): DebertaLayer(
        (attention): DebertaAttention(
          (self): DisentangledSelfAttention(
            (in_proj): MergedLinear(
              in_features=768, out_features=2304, bias=False
              (loras): ModuleDict()
            )
            (pos_dropout): StableDropout()
            (pos_proj): Linear(in_features=768, out_features=768, bias=False)
            (pos_q_proj): Linear(in_features=768, out_features=768, bias=True)
            (dropout): StableDropout()
            (prefix_tuning): PrefixTuningShim(
              (prefix_gates): ModuleDict()
              (pool): PrefixTuningPool(
                (prefix_tunings): ModuleDict()
              )
   

In [17]:
optimizer = AdamW(model.parameters(), lr = 2e-5, eps = 1e-8)



In [18]:
from transformers import get_linear_schedule_with_warmup

# 訓練 epochs。 BERT 作者建議在 2 和 4 之間，設大了容易過擬合 
epochs = 5

# 總訓練樣本數
total_steps = len(train_dataloader) * epochs

scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps = 0, num_training_steps = total_steps)

In [19]:
import numpy as np

# 根據預測結果和標籤資料來計算準確率
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

In [20]:
import time
import datetime

def format_time(elapsed):
    '''
    Takes a time in seconds and returns a string hh:mm:ss
    '''
    # 四捨五入到最近的秒
    elapsed_rounded = int(round((elapsed)))
    
    # 格式化為 hh:mm:ss
    return str(datetime.timedelta(seconds=elapsed_rounded))

In [22]:
import random
import numpy as np

# 設定隨機種子值，以確保輸出是確定的
seed_val = 42

random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

# 儲存訓練和評估的 loss、準確率、訓練時長等統計指標, 
training_stats = []

# 統計整個訓練時長
total_t0 = time.time()


for epoch_i in range(0, epochs):

    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    # 統計單次 epoch 的訓練時間
    t0 = time.time()

    total_train_loss = 0

    # 將模型設定為訓練模式。
    # dropout、batchnorm 層在訓練和測試模式下的表現是不同的 (source: https://stackoverflow.com/questions/51433378/what-does-model-train-do-in-pytorch)
    model.train()

    # 訓練集小批量迭代
    for step, batch in enumerate(train_dataloader):

        # 每經過40次迭代，就輸出進度資訊
        if step % 40 == 0 and not step == 0:
            elapsed = format_time(time.time() - t0)
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

        # 準備輸入資料，並將其拷貝到 gpu 中
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_token = batch[2].to(device)

        # 每次計算梯度前，都需要將梯度清 0，因為 pytorch 的梯度是累加的
        model.zero_grad()        

        # 該函式會根據不同的引數，會返回不同的值。 本例中, 會返回 loss 和 logits -- 模型的預測結果
        loss, logits = model(b_input_ids, 
                attention_mask=b_input_mask,
                token_type_ids=b_token)

        # 累加 loss
        total_train_loss += loss.item()

        # 反向傳播
        loss.backward()

        # 梯度裁剪，避免出現梯度爆炸情況
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # 更新引數
        optimizer.step()

        # 更新學習率
        scheduler.step()

    # 平均訓練誤差
    avg_train_loss = total_train_loss / len(train_dataloader)            
    
    # 單次 epoch 的訓練時長
    training_time = format_time(time.time() - t0)

    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epcoh took: {:}".format(training_time))
        
    # ========================================
    #               Validation
    # ========================================
    # 完成一次 epoch 訓練後，就對該模型的效能進行驗證

    print("")
    print("Running Validation...")

    t0 = time.time()

    # 設定模型為評估模式
    model.eval()

    # Tracking variables 
    total_eval_accuracy = 0
    total_eval_loss = 0
    nb_eval_steps = 0

    # Evaluate data for one epoch
    for batch in validation_dataloader:
        
        # 將輸入資料載入到 gpu 中
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_token = batch[2].to(device)
        
        # 評估的時候不需要更新引數、計算梯度
        with torch.no_grad():        
            (loss, logits) = model(b_input_ids, 
                attention_mask=b_input_mask,
                token_type_ids=b_token)
        

        # 累加 loss
        total_eval_loss += loss.item()

        # 將預測結果和 labels 載入到 cpu 中計算
        logits = logits.detach().cpu().numpy()

        # 計算準確率
        total_eval_accuracy += flat_accuracy(logits)
        

    # 列印本次 epoch 的準確率
    avg_val_accuracy = total_eval_accuracy / len(validation_dataloader)
    print("  Accuracy: {0:.2f}".format(avg_val_accuracy))

    # 統計本次 epoch 的 loss
    avg_val_loss = total_eval_loss / len(validation_dataloader)
    
    # 統計本次評估的時長
    validation_time = format_time(time.time() - t0)
    
    print("  Validation Loss: {0:.2f}".format(avg_val_loss))
    print("  Validation took: {:}".format(validation_time))

    # 記錄本次 epoch 的所有統計資訊
    training_stats.append(
        {
            'epoch': epoch_i + 1,
            'Training Loss': avg_train_loss,
            'Valid. Loss': avg_val_loss,
            'Valid. Accur.': avg_val_accuracy,
            'Training Time': training_time,
            'Validation Time': validation_time
        }
    )

print("")
print("Training complete!")
print("Total training took {:} (h:mm:ss)".format(format_time(time.time()-total_t0)))


Training...


ValueError: ignored

In [None]:
import pandas as pd

# 保留 2 位小數
pd.set_option('precision', 2)

# 載入訓練統計到 DataFrame 中
df_stats = pd.DataFrame(data=training_stats)

# 使用 epoch 值作為每行的索引
df_stats = df_stats.set_index('epoch')

# 展示表格資料
df_stats

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# 繪圖風格設定
sns.set(style='darkgrid')

# Increase the plot size and font size.
sns.set(font_scale=1.5)
plt.rcParams["figure.figsize"] = (12,6)

# 繪製學習曲線
plt.plot(df_stats['Training Loss'], 'b-o', label="Training")
plt.plot(df_stats['Valid. Loss'], 'g-o', label="Validation")

# Label the plot.
plt.title("Training & Validation Loss")
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.legend()
plt.xticks([1, 2, 3, 4])

plt.show()

## Test

In [None]:
import pandas as pd

# 載入資料集
df = pd.read_csv("/content/drive/MyDrive/NLP_final_project/cola_public/raw/out_of_domain_dev.tsv", delimiter='\t', header=None, names=['sentence_source', 'label', 'label_notes', 'sentence'])

# 列印資料集大小
print('Number of test sentences: {:,}\n'.format(df.shape[0]))
# 將資料集轉換為列表
sentences = df.sentence.values
labels = df.label.values

# 分詞、填充或截斷
input_ids = []
attention_masks = []
for sent in sentences:
    encoded_dict = tokenizer.encode_plus(
         sent,                      
         add_special_tokens = True, 
         max_length = 64,           
         pad_to_max_length = True,
         return_attention_mask = True,   
         return_tensors = 'pt',     
         )
    input_ids.append(encoded_dict['input_ids'])
    attention_masks.append(encoded_dict['attention_mask'])

input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
labels = torch.tensor(labels)

batch_size = 32  

# 準備好資料集
prediction_data = TensorDataset(input_ids, attention_masks, labels)
prediction_sampler = SequentialSampler(prediction_data)
prediction_dataloader = DataLoader(prediction_data, sampler=prediction_sampler, batch_size=batch_size)

In [None]:
# 預測測試集

print('Predicting labels for {:,} test sentences...'.format(len(input_ids)))

model.eval()

# Tracking variables 
predictions , true_labels = [], []

# 預測
for batch in prediction_dataloader:
  # 將資料載入到 gpu 中
  batch = tuple(t.to(device) for t in batch)
  b_input_ids, b_input_mask, b_labels = batch
  
  # 不需要計算梯度
  with torch.no_grad():
      # 前向傳播，獲取預測結果
      outputs = model(b_input_ids, token_type_ids=None, 
                      attention_mask=b_input_mask)

  logits = outputs[0]

  # 將結果載入到cpu
  logits = logits.detach().cpu().numpy()
  label_ids = b_labels.to('cpu').numpy()
  
  # 儲存預測結果和labels
  predictions.append(logits)
  true_labels.append(label_ids)

print('DONE.')

In [None]:
print('Positive samples: %d of %d (%.2f%%)' % (df.label.sum(), len(df.label), (df.label.sum() / len(df.label) * 100.0)))

In [None]:
from sklearn.metrics import matthews_corrcoef

matthews_set = []

# 計算每個 batch 的 MCC
print('Calculating Matthews Corr. Coef. for each batch...')

# For each input batch...
for i in range(len(true_labels)):
  pred_labels_i = np.argmax(predictions[i], axis=1).flatten()
  
  # 計算該 batch 的 MCC  
  matthews = matthews_corrcoef(true_labels[i], pred_labels_i)                
  matthews_set.append(matthews)

In [None]:
# 建立柱狀圖來顯示每個 batch 的 MCC 分數
ax = sns.barplot(x=list(range(len(matthews_set))), y=matthews_set, ci=None)

plt.title('MCC Score per Batch')
plt.ylabel('MCC Score (-1 to +1)')
plt.xlabel('Batch #')

plt.show()

In [None]:
# 合併所有 batch 的預測結果
flat_predictions = np.concatenate(predictions, axis=0)

# 取每個樣本的最大值作為預測值
flat_predictions = np.argmax(flat_predictions, axis=1).flatten()

# 合併所有的 labels
flat_true_labels = np.concatenate(true_labels, axis=0)

# 計算 MCC
mcc = matthews_corrcoef(flat_true_labels, flat_predictions)

print('Total MCC: %.3f' % mcc)