<a href="https://colab.research.google.com/github/seowookim/seowookim.github.io/blob/main/BERT_and_CNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import sklearn
import torch
from sklearn.preprocessing import LabelEncoder
import transformers
import re
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from transformers import AutoTokenizer
import tqdm
from transformers import BertTokenizer
import nltk
from tqdm import tqdm

In [None]:
# Download the 'punkt' tokenizer data before importing the sentence splitter.
# NOTE(review): sent_tokenize is not referenced by any cell visible in this notebook.
nltk.download('punkt')
from nltk.tokenize import sent_tokenize

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
# Patient question split: load the sentence-level sheet
# (columns shown below: Id_Number, sentence, Distorted labeling).

data = pd.read_excel('/content/processed_distorted_sentences_fixed_v5.xlsx')
data.head()

Unnamed: 0,Id_Number,sentence,Distorted labeling
0,1625,"My 19 year old has random, out of the blue, f...",No Distortion
1,1625,My son was seven when we divorced and really w...,No Distortion
2,1625,In high school I believe my son used the “but ...,No Distortion
3,1625,I know it affects him but so out of the blue.,No Distortion
4,1625,"After his father’s death, my son spoke to some...",No Distortion


In [None]:
import os
from getpass import getpass

from huggingface_hub import login

# Never hard-code an access token in a notebook: it ends up in version control
# and in every shared copy. Read it from the HF_TOKEN environment variable and
# fall back to an interactive prompt that does not echo the value.
# SECURITY: the token that used to be embedded in this cell has been published
# and must be revoked at https://huggingface.co/settings/tokens.
hf_token = os.environ.get('HF_TOKEN') or getpass('Hugging Face token: ')
login(token = hf_token)

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [None]:
from transformers import AutoTokenizer, AutoModelForMaskedLM, BertModel
from torch.nn.utils.rnn import pad_sequence

tokenizer = AutoTokenizer.from_pretrained("mental/mental-bert-base-uncased")
# Pick the device at runtime: a hard-coded "cuda" raises on CPU-only runtimes.
model = BertModel.from_pretrained("mental/mental-bert-base-uncased").to("cuda" if torch.cuda.is_available() else "cpu")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/321 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/639 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

Some weights of BertModel were not initialized from the model checkpoint at mental/mental-bert-base-uncased and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
from torch.utils.data import DataLoader, Dataset
from torch import nn
import torch.optim as optim


In [None]:
def sliding_window_tokenize(tokenizer, text, max_len = 512, stride = 256):
  """Tokenize ``text`` and cut the token ids into overlapping fixed-size windows.

  Args:
    tokenizer: callable returning a mapping with an ``'input_ids'`` tensor of
      shape (1, n_tokens) when called with ``return_tensors='pt'``; must expose
      ``pad_token_id``.
    text: the string to tokenize.
    max_len: length of every returned window.
    stride: offset between the starts of consecutive windows.

  Returns:
    A list of 1-D ``torch.long`` tensors, each exactly ``max_len`` long. The
    final window is right-padded with ``tokenizer.pad_token_id``. Windowing
    stops as soon as a window reaches the end of the sequence, so no window is
    made up solely of tokens already covered by the previous one.
  """
  # The whole sequence is needed for windowing, so truncation stays off
  # (``max_length`` has no effect without truncation and is not passed).
  encoding = tokenizer(text, return_tensors = 'pt', truncation = False)
  input_ids = encoding['input_ids'][0]  # first (only) sequence in the batch

  total_length = input_ids.size(0)  # number of tokens in the sequence

  windows = []
  for start in range(0, total_length, stride):
    window = input_ids[start:start + max_len]

    # Right-pad a short (final) window up to max_len
    padding_length = max_len - window.size(0)
    if padding_length > 0:
      padding = torch.full((padding_length,), tokenizer.pad_token_id, dtype=torch.long)
      window = torch.cat((window, padding), dim=0)

    windows.append(window)

    # This window already covers the tail of the sequence; any further window
    # would only repeat tokens contained in it. (The check has to use the
    # un-padded extent: after padding every window is max_len long.)
    if start + max_len >= total_length:
      break
  return windows

In [None]:
class DistortionDataset(Dataset):
    """Dataset yielding sliding-window token ids for one sentence per item.

    Each item is a dict with:
      * ``'input_ids'``      -- long tensor (num_windows, max_len)
      * ``'attention_mask'`` -- long tensor (num_windows, max_len); 0 where the
        id equals the tokenizer's pad id, 1 elsewhere
      * ``'label'``          -- 0-d long tensor, shared by all windows of the item

    ``num_windows`` varies per item; batching code has to pad along that axis.
    """

    def __init__(self, sentences, labels, tokenizer, max_len, stride):
        """Store the indexable ``sentences``/``labels`` and the windowing settings."""
        self.sentences = sentences
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.stride = stride

    def __len__(self):
        """Number of sentences."""
        return len(self.sentences)

    def __getitem__(self, idx):
        """Tokenize sentence ``idx`` into windows and build its attention mask."""
        sentence = str(self.sentences[idx])
        label = self.labels[idx]
        pad_id = self.tokenizer.pad_token_id

        # One tokenizer pass is enough: the windows are used exactly as
        # returned instead of re-tokenizing the sentence to derive a window
        # count and topping the list up with content-free padding windows.
        windows = sliding_window_tokenize(self.tokenizer, sentence, self.max_len, self.stride)
        if not windows:
            # Keep the (num_windows, max_len) contract even for an empty encoding
            windows = [torch.full((self.max_len,), pad_id, dtype=torch.long)]

        input_ids = torch.stack(windows)  # Shape: (num_windows, max_len)
        attention_mask = (input_ids != pad_id).long()  # Padding mask for all windows

        return {
            'input_ids': input_ids,  # Shape: (num_windows, max_len)
            'attention_mask': attention_mask,  # Shape: (num_windows, max_len)
            'label': torch.tensor(label, dtype=torch.long)  # Same label for all windows
        }


In [None]:
# 배치 처리 시 윈도우 패딩 추가하기
from torch.nn.utils.rnn import pad_sequence

def process_batch(batch_data, pad_token_id):
    """Pad a list of 2-D window tensors to a common shape and stack them.

    Args:
        batch_data: list of tensors shaped (num_windows, seq_length); both
            dimensions may differ between items.
        pad_token_id: fill value for the added rows and columns.

    Returns:
        Tensor of shape (len(batch_data), max_num_windows, max_seq_length).
    """
    # Target extent along each axis is the largest one found in the batch
    target_windows = max(item.size(0) for item in batch_data)
    target_len = max(item.size(1) for item in batch_data)

    stacked = []
    for item in batch_data:
        # Start from a canvas filled with the pad id and copy the real ids
        # into its top-left corner; what is left over is the padding.
        canvas = torch.full((target_windows, target_len), pad_token_id, dtype=item.dtype, device=item.device)
        canvas[:item.size(0), :item.size(1)] = item
        stacked.append(canvas)

    return torch.stack(stacked)


In [None]:
# Sliding-window settings passed to DistortionDataset below.
max_len = 512  # tokens per window
stride = 256   # offset between consecutive window starts

In [None]:
# Feature Extraction with BERT + bi-LSTM

# Feature Extraction with BERT + bi-LSTM + Sliding Window

class BertTop4BiLSTMClassifier(nn.Module):
    """BERT (last four hidden layers) + label embedding -> BiLSTM -> linear head.

    Inputs are batches of sliding windows, shape (batch, num_windows, seq_len).
    Every window is scored separately and the per-window logits are averaged
    over the windows that contain at least one attended token.

    NOTE(review): ``forward`` receives the ground-truth ``labels`` and feeds
    their embedding into the network, so the target is visible to the model at
    both training and evaluation time. Metrics obtained this way do not measure
    the ability to predict the label from text alone. The parameter is kept
    because existing callers pass it.
    """

    def __init__(self, n_classes, label_embedding_dim=32):  # small dim: the label embedding is simple
        super(BertTop4BiLSTMClassifier, self).__init__()
        self.bert = BertModel.from_pretrained("mental/mental-bert-base-uncased", output_hidden_states=True)
        self.label_embedding = nn.Embedding(n_classes, label_embedding_dim)
        # Width of one BERT layer, read from the checkpoint config (768 for BERT-base)
        hidden_size = self.bert.config.hidden_size
        self.lstm = nn.LSTM(input_size=hidden_size * 4 + label_embedding_dim, hidden_size=512, num_layers=1, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(512 * 2, n_classes)
        # Not applied in forward(); apply it to the returned logits when
        # probabilities are needed.
        self.softmax = nn.Softmax(dim=1)

    def forward(self, input_ids, attention_mask, labels):
        """Return unnormalized class logits of shape (batch_size, n_classes).

        Args:
            input_ids: long tensor (batch_size, num_windows, seq_length).
            attention_mask: tensor of the same shape, 0 on padding.
            labels: long tensor (batch_size,) of class indices (see class note).

        Raw logits are returned because ``nn.CrossEntropyLoss`` applies
        log-softmax itself; passing probabilities into it squashes the scores
        twice. ``argmax`` over logits equals ``argmax`` over probabilities.
        """
        batch_size, num_windows, seq_length = input_ids.size()

        # Fold the window axis into the batch axis for BERT
        flat_ids = input_ids.reshape(batch_size * num_windows, seq_length)
        flat_mask = attention_mask.reshape(batch_size * num_windows, seq_length)

        bert_output = self.bert(input_ids=flat_ids, attention_mask=flat_mask)

        # Concatenate the last 4 hidden layers: (batch*windows, seq_len, 4*hidden)
        token_features = torch.cat(tuple(bert_output.hidden_states[-4:]), dim=-1)

        # Broadcast each example's label embedding to all of its windows and tokens
        label_features = self.label_embedding(labels)                           # (batch, dim)
        label_features = label_features.repeat_interleave(num_windows, dim=0)   # (batch*windows, dim)
        label_features = label_features.unsqueeze(1).expand(-1, seq_length, -1) # (batch*windows, seq_len, dim)

        combined = torch.cat([token_features, label_features], dim=-1)

        lstm_output, _ = self.lstm(combined)

        # Score each window from the LSTM output at the first ([CLS]) position
        window_logits = self.fc(lstm_output[:, 0, :]).view(batch_size, num_windows, -1)

        # Windows that are padding only (added to equalise window counts inside
        # a batch) carry no text and are excluded from the average.
        window_weight = (flat_mask.sum(dim=-1) > 0).to(window_logits.dtype).view(batch_size, num_windows, 1)
        logits = (window_logits * window_weight).sum(dim=1) / window_weight.sum(dim=1).clamp(min=1.0)

        return logits


In [None]:
def collate_fn(batch):
    """Merge dataset items into one batch, padding along the window axis.

    Items may hold different numbers of windows. ``input_ids`` is padded with
    the pad id of the notebook-level ``tokenizer`` and ``attention_mask`` with
    0, so every item ends up with the batch-wide maximum number of windows.
    Labels are stacked unchanged.

    Returns:
        dict with 'input_ids', 'attention_mask' (batch, max_windows, seq_len)
        and 'label' (batch,).
    """
    # Fill value used for each padded field
    fill_values = {'input_ids': tokenizer.pad_token_id, 'attention_mask': 0}

    collated = {
        field: torch.nn.utils.rnn.pad_sequence(
            [sample[field] for sample in batch],
            batch_first=True,
            padding_value=fill,
        )
        for field, fill in fill_values.items()
    }
    # Labels need no padding
    collated['label'] = torch.stack([sample['label'] for sample in batch])
    return collated

In [None]:
# Sentences and integer label codes taken from `data`.
# The codes come from pandas' category encoding of the 'Distorted labeling' column.
sentences = data['sentence'].values
labels = data['Distorted labeling'].astype('category').cat.codes.values

In [None]:
# Load the evaluation sheet in this cell: it is needed here, and a fresh
# top-to-bottom run would otherwise hit a NameError on `eval_data`.
eval_data = pd.read_excel('/content/processed_distorted_sentences_0924.xlsx')

# Encode evaluation labels with the category list of `data`, so that a given
# integer code refers to the same class name in both sheets.
label_categories = data['Distorted labeling'].astype('category').cat.categories

# NOTE(review): `sentences` / `labels` are rebound here and from now on hold the
# evaluation split, not the arrays previously derived from `data`.
sentences = eval_data['sentence'].values
labels = pd.Categorical(eval_data['Distorted labeling'], categories=label_categories).codes
assert (labels >= 0).all(), 'evaluation sheet has missing labels or labels that do not occur in `data`'

In [None]:
# Wrap the current `sentences` / `labels` arrays in a sliding-window dataset.
eval_dataset = DistortionDataset(sentences, labels, tokenizer, max_len, stride)

In [None]:
# Shuffling brings nothing at evaluation time; a fixed order keeps batches reproducible.
eval_dataloader = DataLoader(eval_dataset, batch_size=8, shuffle=False, collate_fn=collate_fn)

In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Size the classifier from the label set of `data` rather than from whatever
# the shared `labels` variable holds when this cell happens to be executed.
n_classes = len(data['Distorted labeling'].astype('category').cat.categories)
model = BertTop4BiLSTMClassifier(n_classes=n_classes).to(device)

# nn.CrossEntropyLoss expects raw (unnormalized) class scores.
loss_fn = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=2e-5)

Some weights of BertModel were not initialized from the model checkpoint at mental/mental-bert-base-uncased and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
def train_model(model, dataloader, loss_fn, optimizer, epochs=3):
    """Train ``model`` for ``epochs`` passes over ``dataloader``.

    Each batch must be a dict with 'input_ids', 'attention_mask' and 'label'
    tensors; they are moved to the notebook-level ``device`` and passed to the
    model as ``input_ids``, ``attention_mask`` and ``labels``.

    Args:
        model: module called as ``model(input_ids=..., attention_mask=..., labels=...)``.
        dataloader: iterable of batches.
        loss_fn: callable ``loss_fn(outputs, labels)`` returning a scalar tensor.
        optimizer: optimizer over the model's parameters.
        epochs: number of passes over the data.

    Returns:
        List with the mean batch loss of every epoch (NaN for an epoch that
        saw no batches).
    """
    model.train()  # switch the model to training mode
    epoch_losses = []
    for epoch in range(epochs):
        running_loss = 0.0
        batches_seen = 0

        progress_bar = tqdm(dataloader, desc=f'Epoch {epoch+1}/{epochs}', leave=False)
        for batch in progress_bar:
            # Pull the tensors out of the batch
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            # Reset gradients accumulated by the previous step
            optimizer.zero_grad()

            # Forward pass
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)

            # Loss
            loss = loss_fn(outputs, labels)

            # Backpropagation and optimizer step
            loss.backward()
            optimizer.step()

            # Show the mean loss over the batches processed so far. Dividing by
            # len(dataloader) would under-report it until the very last batch.
            running_loss += loss.item()
            batches_seen += 1
            progress_bar.set_postfix({'loss': running_loss / batches_seen})

        epoch_losses.append(running_loss / batches_seen if batches_seen else float('nan'))

    print('Training complete!')
    return epoch_losses

In [None]:
def evaluate_model(model, dataloader):
    """Compute, print and return the classification accuracy of ``model``.

    Each batch must be a dict with 'input_ids', 'attention_mask' and 'label'
    tensors; they are moved to the notebook-level ``device``. The predicted
    class is the argmax over dimension 1 of the model output.

    NOTE(review): the true labels are passed to the model as an input
    (``labels=labels``), so the score is not a label-free prediction accuracy.

    Returns:
        Accuracy as a fraction in [0, 1], or NaN when ``dataloader`` yields no
        examples (instead of raising ZeroDivisionError).
    """
    model.eval()  # switch the model to evaluation mode
    total_correct = 0
    total_examples = 0

    with torch.no_grad():  # no gradients are needed for evaluation
        for batch in dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            # Forward pass
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)

            # Compare predictions with the reference labels
            predicted = outputs.argmax(dim=1)
            total_correct += (predicted == labels).sum().item()
            total_examples += labels.size(0)

    if total_examples == 0:
        print('Accuracy: n/a (no examples evaluated)')
        return float('nan')

    # Accuracy
    accuracy = total_correct / total_examples
    print(f'Accuracy: {accuracy * 100:.2f}%')
    return accuracy

In [None]:
# Dataset preparation
# Build the training split directly from `data`. The shared `sentences` /
# `labels` names are rebound to the evaluation sheet by another cell, so
# relying on them here can silently train on the evaluation data.
train_sentences = data['sentence'].values
train_labels = data['Distorted labeling'].astype('category').cat.codes.values

train_dataset = DistortionDataset(train_sentences, train_labels, tokenizer, max_len, stride)
train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True, collate_fn=collate_fn)


In [None]:
# Run training
train_model(model, train_dataloader, loss_fn, optimizer)
# NOTE(review): this saves the whole module object rather than its state_dict,
# so loading the file requires the same class definition to be available.
torch.save(model, 'bert_bilstm_model_full.pth')



Training complete!


In [None]:
# Ask PyTorch to release cached GPU memory that is no longer in use.
torch.cuda.empty_cache()

In [None]:
# Load the evaluation sheet.
# NOTE(review): an earlier cell already reads `eval_data`; in top-to-bottom
# order this definition comes too late for it.
eval_data = pd.read_excel('/content/processed_distorted_sentences_0924.xlsx')

In [None]:
# Ensure the model is on the correct device (e.g., GPU if available)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# The checkpoint is a pickled full module, not a state_dict:
#  * map_location lets a GPU-saved file load on a CPU-only runtime;
#  * weights_only=False is stated explicitly because unpickling a whole module
#    needs it and the default of torch.load differs between PyTorch releases.
# SECURITY: unpickling executes code from the file -- only load checkpoints
# you created yourself.
model = torch.load('bert_bilstm_model_full.pth', map_location=device, weights_only=False)

# Trailing semicolon keeps the multi-page module repr out of the cell output.
model.to(device);

  model = torch.load('bert_bilstm_model_full.pth')


BertTop4BiLSTMClassifier(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, 

In [None]:
# Score the loaded model on the evaluation loader; the accuracy is printed by the function.
evaluate_model(model, eval_dataloader)

Accuracy: 0.73%
