<a href="https://colab.research.google.com/github/rlaaudrb1104/Ai/blob/PJH/0417_%ED%94%BC%EC%B3%90%EB%84%A3%EA%B8%B0.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Install the third-party packages this notebook imports into the Colab runtime.
# NOTE(review): no versions are pinned, so a fresh runtime may resolve newer
# releases than the ones this notebook was originally run against.
!pip install transformers
!pip install tokenizers
!pip install pandas
!pip install torch
!pip install tqdm
!pip install scikit-learn



In [2]:
# Mount Google Drive at /content/drive so the CSV paths configured in Args
# (all located under /content/drive) can be read.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import RobertaTokenizer
from tqdm.auto import tqdm
import logging

In [4]:
# Logging setup: module-level logger named after the current module.
logger = logging.getLogger(__name__)

In [5]:
# Argument/configuration class
class Args:
    """Static configuration: data file paths, checkpoint names and sequence length."""
    # TextDataset resolves its CSV through `args.<file_type>_data_file`, so these
    # two attributes back file_type="train" and file_type="eval" respectively.
    train_data_file = '/content/drive/MyDrive/Colab Notebooks/final_train.csv'
    eval_data_file = '/content/drive/MyDrive/Colab Notebooks/final_val.csv'
    # While the next line stays commented out there is no `test_data_file`
    # attribute, so TextDataset(..., file_type="test") would raise AttributeError.
    #test_data_file = '/content/drive/MyDrive/Colab Notebooks/final_train.csv'
    # NOTE(review): this path uses "My Drive" while the data paths use "MyDrive";
    # confirm both resolve on the target runtime. Not referenced in the visible cells.
    output_dir = '/content/drive/My Drive/output'
    # Checkpoint name; the visible model-loading cell uses the same literal directly.
    model_name_or_path = 'microsoft/graphcodebert-base'
    # Passed to RobertaTokenizer.from_pretrained when the tokenizer is created.
    tokenizer_name = 'microsoft/graphcodebert-base'
    # Forwarded by TextDataset as the tokenizer's max_length (pad/truncate target).
    block_size = 512

# Shared configuration instance used by the cells below.
args = Args()

In [6]:
# Tokenizer setup: load the pretrained tokenizer named by args.tokenizer_name.
tokenizer = RobertaTokenizer.from_pretrained(args.tokenizer_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [7]:
# InputFeatures class definition
class InputFeatures:
    """Holds the model-ready features of a single sample.

    Attributes:
        input_ids: Token ids of the encoded code snippet.
        attention_mask: Mask aligned with ``input_ids``.
        cwe_type_label: Integer class label of the sample's CWE type.

    The values are stored exactly as passed in; nothing is copied or validated.
    """
    def __init__(self, input_ids, attention_mask, cwe_type_label):
        (
            self.input_ids,
            self.attention_mask,
            self.cwe_type_label,
        ) = (input_ids, attention_mask, cwe_type_label)

In [8]:
# Feature conversion function
def convert_examples_to_features(func, cwe_type_label, tokenizer, max_length):
    """Convert one code snippet into features suitable as model input.

    Args:
        func: Source code of the snippet to encode.
        cwe_type_label: Integer class label stored unchanged on the result.
        tokenizer: Hugging Face tokenizer; it is invoked through ``__call__``.
        max_length: Length every sequence is padded or truncated to.

    Returns:
        InputFeatures whose ``input_ids`` and ``attention_mask`` are the
        flattened tensors returned by the tokenizer.
    """
    # `padding='max_length'` replaces the deprecated `pad_to_max_length=True`,
    # and calling the tokenizer directly replaces the deprecated `encode_plus`.
    encoding = tokenizer(
        func,
        add_special_tokens=True,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )
    # return_tensors='pt' yields a leading batch axis of size 1; flatten drops it.
    return InputFeatures(
        input_ids=encoding['input_ids'].flatten(),
        attention_mask=encoding['attention_mask'].flatten(),
        cwe_type_label=cwe_type_label
    )

In [9]:
# TextDataset class definition
class TextDataset(Dataset):
    """Dataset of tokenized code snippets with integer CWE class labels.

    The CSV at ``args.<file_type>_data_file`` is read once in the constructor
    and every row is tokenized eagerly, so construction time grows with the
    number of rows.

    Args:
        tokenizer: Tokenizer forwarded to ``convert_examples_to_features``.
        args: Object exposing ``<file_type>_data_file`` and ``block_size``.
        cwe_label_map: Mapping from "CWE ID" column values to integer labels.
            Values missing from the map fall back to label 0.
        file_type: Prefix selecting which ``*_data_file`` attribute to read.
    """
    def __init__(self, tokenizer, args, cwe_label_map, file_type="train"):
        file_path = getattr(args, f"{file_type}_data_file")
        df = pd.read_csv(file_path)
        funcs = df["CODE"].tolist()
        cwe_ids = df["CWE ID"].tolist()

        self.examples = [
            convert_examples_to_features(
                func, cwe_label_map.get(cwe_id, 0), tokenizer, args.block_size
            )
            for func, cwe_id in tqdm(zip(funcs, cwe_ids), total=len(funcs))
        ]

    def __len__(self):
        """Return the number of encoded samples."""
        return len(self.examples)

    def __getitem__(self, idx):
        """Return one sample as a dict with input_ids, attention_mask and labels.

        The stored features are already tensors, so ``torch.as_tensor`` is used:
        it returns them as-is (no copy, no copy-construct UserWarning) while
        still converting plain lists or ints. The returned tensors therefore
        share storage with the cached features.
        """
        example = self.examples[idx]
        return {
            'input_ids': torch.as_tensor(example.input_ids),
            'attention_mask': torch.as_tensor(example.attention_mask),
            'labels': torch.as_tensor(example.cwe_type_label, dtype=torch.long)
        }
# Maps values of the "CWE ID" CSV column to integer class labels.
# TextDataset looks labels up with `.get(..., 0)`, so any CWE id missing from
# this map becomes label 0; together with the values below the labels span 0..10.
cwe_label_map = {
    "CWE-20": 2,
    "CWE-119": 1,
    "CWE-78": 3,
    "CWE-122": 4,
    "CWE-121": 5,
    "CWE-415": 6,
    "CWE-399": 7,
    "CWE-190": 8,
    "CWE-125": 9,
    "CWE-416": 10
    # More CWE id -> index mappings can be added here.
}


In [10]:
# Create the dataset and the data loader.
# TextDataset reads args.train_data_file and tokenizes every row in its
# constructor, so this cell does all preprocessing up front.
train_dataset = TextDataset(tokenizer, args, cwe_label_map, file_type='train')
# Shuffled mini-batches of 16 samples for training.
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)

  0%|          | 0/32605 [00:00<?, ?it/s]



In [15]:
from transformers import RobertaForSequenceClassification, RobertaTokenizer

# Training hyperparameters.
batch_size = 16
epoch_num = 7
MAX_LEN = 512
learning_rate = 2e-5

# TextDataset emits label 0 for CWE ids missing from cwe_label_map and the
# mapped values (1..10) otherwise, so the classification head needs
# max(label) + 1 outputs. A head with only 10 outputs would leave label 10
# out of range when the loss is computed.
num_labels = max(cwe_label_map.values()) + 1

model = RobertaForSequenceClassification.from_pretrained("microsoft/graphcodebert-base", num_labels=num_labels)
tokenizer = RobertaTokenizer.from_pretrained("microsoft/graphcodebert-base")


pytorch_model.bin:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at microsoft/graphcodebert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [16]:
from torch.optim import AdamW
from transformers import get_linear_schedule_with_warmup

# Optimizer and scheduler setup.
# transformers.AdamW is deprecated in favour of torch.optim.AdamW and is no
# longer exported by later transformers releases, so the PyTorch optimizer is
# used. eps=1e-6 and weight_decay=0.0 reproduce the defaults of the old
# transformers implementation so the optimisation behaviour stays the same.
optimizer = AdamW(model.parameters(), lr=learning_rate, eps=1e-6, weight_decay=0.0)

# Linear decay to zero over all optimizer steps (one step per batch), no warmup.
total_training_steps = len(train_loader) * epoch_num
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_training_steps)




In [28]:
def train(model, train_dataloader, optimizer, scheduler, device):
    """Run one pass over ``train_dataloader``, updating ``model`` after each batch.

    Args:
        model: Module called with the input ids (and the attention mask when
            the batch provides one).
        train_dataloader: Iterable of dict batches with 'input_ids', 'labels'
            and optionally 'attention_mask'.
        optimizer: Optimizer stepped once per batch.
        scheduler: Learning-rate scheduler stepped once per batch.
        device: Device the batch tensors are moved to.

    NOTE(review): ``compute_loss`` is not defined in the visible cells; it is
    called with the raw model output and the labels, exactly as before.
    Confirm it is defined before this function runs.
    """
    model.train()
    for batch in train_dataloader:
        inputs = batch['input_ids'].to(device)  # input data
        labels = batch['labels'].to(device)     # labels
        attention_mask = batch.get('attention_mask')

        optimizer.zero_grad()

        # Forward the mask so padded positions are not attended to; without it
        # the model treats every padding token as real input.
        if attention_mask is None:
            outputs = model(inputs)
        else:
            outputs = model(inputs, attention_mask=attention_mask.to(device))

        loss = compute_loss(outputs, labels)  # loss computation

        loss.backward()
        optimizer.step()
        scheduler.step()  # advance the learning-rate schedule

In [29]:
from sklearn.metrics import accuracy_score  # 1. accuracy_score 임포트

def evaluate(model, dataloader, device):
    """Return the accuracy of ``model`` over ``dataloader``.

    Args:
        model: Module returning either a logits tensor or an object with a
            ``logits`` attribute.
        dataloader: Iterable of batches. Each batch is either a dict with
            'input_ids', 'labels' and optionally 'attention_mask' (the format
            TextDataset produces) or an ``(inputs, labels)`` pair.
        device: Device the batch tensors are moved to.

    Returns:
        The value of ``accuracy_score(all_labels, all_preds)``.
    """
    model.eval()
    all_preds = []
    all_labels = []
    with torch.no_grad():  # no gradients needed for evaluation
        for batch in dataloader:
            # Dict batches cannot be unpacked as `inputs, labels`; read the
            # fields by key and keep supporting plain (inputs, labels) pairs.
            if isinstance(batch, dict):
                inputs = batch['input_ids']
                labels = batch['labels']
                attention_mask = batch.get('attention_mask')
            else:
                inputs, labels = batch
                attention_mask = None
            inputs = inputs.to(device)
            labels = labels.to(device)

            if attention_mask is None:
                outputs = model(inputs)
            else:
                outputs = model(inputs, attention_mask=attention_mask.to(device))

            # Model output objects expose the scores as `.logits`; a bare
            # tensor is used as-is.
            logits = getattr(outputs, "logits", outputs)
            preds = torch.argmax(logits, dim=1)

            all_preds.extend(preds.tolist())
            all_labels.extend(labels.tolist())

    accuracy = accuracy_score(all_labels, all_preds)
    return accuracy

In [30]:
def test(model, test_dataloader, device):
    model.eval()
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for batch in test_dataloader:
            inputs, labels = batch
            inputs = inputs.to(device)
            labels = labels.to(device)

            outputs = model(inputs)
            preds = torch.argmax(outputs, dim=1)

            all_preds.extend(preds.tolist())
            all_labels.extend(labels.tolist())

    accuracy = accuracy_score(all_labels, all_preds)
    return accuracy


In [31]:
def compute_logit_weight_map(model, dataloader, device):
    """Count, per class, how many samples the model predicts correctly.

    Args:
        model: Module returning either a logits tensor or an object with a
            ``logits`` attribute.
        dataloader: Iterable of batches. Each batch is either a dict with
            'input_ids', 'labels' and optionally 'attention_mask' (the format
            TextDataset produces) or an ``(inputs, labels)`` pair.
        device: Device the batch tensors are moved to.

    Returns:
        Dict mapping a class index to the number of samples for which the
        argmax prediction equals the label. Classes with no correct
        prediction are absent from the dict.
    """
    model.eval()
    logit_weight_map = {}  # class index -> number of correct predictions
    with torch.no_grad():
        for batch in dataloader:
            # Dict batches cannot be unpacked as `inputs, labels`; read the
            # fields by key and keep supporting plain (inputs, labels) pairs.
            if isinstance(batch, dict):
                inputs = batch['input_ids']
                labels = batch['labels']
                attention_mask = batch.get('attention_mask')
            else:
                inputs, labels = batch
                attention_mask = None
            inputs = inputs.to(device)
            labels = labels.to(device)

            if attention_mask is None:
                outputs = model(inputs)
            else:
                outputs = model(inputs, attention_mask=attention_mask.to(device))

            # Model output objects expose the scores as `.logits`; a bare
            # tensor is used as-is.
            logits = getattr(outputs, "logits", outputs)
            preds = torch.argmax(logits, dim=1).tolist()  # index of the highest score

            for pred, label in zip(preds, labels.tolist()):
                if pred == label:  # prediction matches the true label
                    logit_weight_map[pred] = logit_weight_map.get(pred, 0) + 1

    return logit_weight_map


In [32]:
# Put the model in training mode. As the last expression of the cell, the
# call's return value (the module itself) is echoed as the cell output.
# NOTE(review): the saved output shows a `BERT(...)` wrapper that no visible
# cell defines — confirm which model object this ran against.
model.train()

BERT(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(51416, 768, padding_idx=1)
      (position_embeddings): Embedding(1026, 768, padding_idx=1)
      (token_type_embeddings): Embedding(10, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((7

In [33]:
# Put the model in evaluation mode. As the last expression of the cell, the
# call's return value (the module itself) is echoed as the cell output.
# NOTE(review): the saved output shows a `BERT(...)` wrapper that no visible
# cell defines — confirm which model object this ran against.
model.eval()

BERT(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(51416, 768, padding_idx=1)
      (position_embeddings): Embedding(1026, 768, padding_idx=1)
      (token_type_embeddings): Embedding(10, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((7