<a href="https://colab.research.google.com/github/rlaaudrb1104/Ai/blob/PJH/Ensemble_voting_generalization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [34]:
import torch
import torch.nn as nn
from transformers import AutoTokenizer, AutoModelForSequenceClassification

In [35]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [36]:
!pip install datasets
!pip install torch
!pip install tqdm



In [37]:
import numpy as np
from tqdm import tqdm
from datasets import load_dataset
from torch.utils.data import DataLoader
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

In [38]:
# 데이터셋 로드
dataset = load_dataset('csv', data_files='/content/drive/MyDrive/MSR+julite_Dive_final_train/MSR+julite+Div_final_train.csv')['train']

In [39]:
# 모델을 GPU로 이동
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model_codebert.to(device)
model_graphcodebert.to(device)
model_unixcoder.to(device)

# 토크나이저 로드
tokenizer = AutoTokenizer.from_pretrained("microsoft/graphcodebert-base")

In [40]:
# 데이터 토크나이징 및 포맷 설정
def tokenize_function(examples):
    return tokenizer(examples['code'], truncation=True, padding="max_length", max_length=512)
encoded_dataset = dataset.map(tokenize_function, batched=True)
encoded_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'labels'])

In [41]:
# DataLoader 설정
data_loader = DataLoader(encoded_dataset, batch_size=16, drop_last=True)

In [42]:
# 클래스 라벨 정의
class_labels = {
    0: "안전한 코드입니다.",
    1: "CWE-119 취약점\nImproper Restriction of Operations within the Bounds of a Memory Buffer",
    2: "CWE-20 취약점\nImproper Input Validation",
    3: "CWE-125 취약점\nOut-of-bounds Read",
    4: "CWE-787 취약점\nOut-of-bounds Write",
    5: "CWE-415 취약점\nDouble Free",
    6: "CWE-399 취약점\nResource Management Errors",
    7: "CWE-416 취약점\nUse after Free",
    8: "CWE-476 취약점\nNULL Pointer Dereference",
    9: "CWE-190 취약점\nInteger Overflow or Wraparound"
}

In [43]:
# 새로운 앙상블 모델 클래스 정의
class EnsembleModel(nn.Module):
    def __init__(self, models):
        super(EnsembleModel, self).__init__()
        self.models = models

    def forward(self, input_ids, attention_mask):
        logits_sum = None
        for model in self.models:
            model.eval()
            with torch.no_grad():
                outputs = model(input_ids=input_ids, attention_mask=attention_mask)
                logits = outputs.logits
                if logits_sum is None:
                    logits_sum = logits
                else:
                    logits_sum += logits
        logits_avg = logits_sum / len(self.models)
        return logits_avg

In [44]:
# 앙상블 모델 초기화 및 GPU로 이동
ensemble_model = EnsembleModel([model_codebert, model_graphcodebert, model_unixcoder])
ensemble_model.to(device)

EnsembleModel()

In [45]:
# 모델 저장
torch.save(ensemble_model.state_dict(), "/content/drive/MyDrive/models/ensemble_model.pth")

In [46]:
# 저장된 모델 로드 및 예측 함수
def load_ensemble_model(model_paths, device):
    models = []
    for path in model_paths:
        model = AutoModelForSequenceClassification.from_pretrained(path)
        model.to(device)
        models.append(model)
    ensemble_model = EnsembleModel(models)
    ensemble_model.load_state_dict(torch.load("/content/drive/MyDrive/models/ensemble_model.pth"))
    ensemble_model.to(device)
    return ensemble_model

In [47]:
# 테스트 코드
input_code = """
#include <stdio.h>
#include <stdlib.h>
#define BUFSIZE1 512
#define BUFSIZE2 ((BUFSIZE1/2) - 8)

int main(int argc, char **argv) {
    char *buf1R1;
    char *buf2R1;
    char *buf1R2;
    buf1R1 = (char *) malloc(BUFSIZE2);
    buf2R1 = (char *) malloc(BUFSIZE2);
    free(buf1R1);
    free(buf2R1);
    buf1R2 = (char *) malloc(BUFSIZE1);
    strncpy(buf1R2, argv[1], BUFSIZE1-1);
    free(buf2R1);
    free(buf1R2);
}
"""

In [48]:
# 저장된 모델 로드
model_paths = ["/content/drive/MyDrive/models/codebert", "/content/drive/MyDrive/models/graphcodebert", "/content/drive/MyDrive/models/unixcoder"]
ensemble_model = load_ensemble_model(model_paths, device)

In [50]:
# 저장된 모델 로드 및 예측 함수
def load_ensemble_model(model_paths, device):
    models = []
    for path in model_paths:
        model = AutoModelForSequenceClassification.from_pretrained(path)
        model.to(device)
        models.append(model)
    ensemble_model = EnsembleModel(models)
    ensemble_model.load_state_dict(torch.load("/content/drive/MyDrive/models/ensemble_model.pth"))
    ensemble_model.to(device)
    return ensemble_model

In [51]:
# 입력 코드에 대한 예측 수행 함수
def predict_class(input_code, ensemble_model, tokenizer, device):
    # 입력 코드를 토크나이징하고 모델 입력 형식으로 변환
    inputs = tokenizer(input_code, return_tensors="pt", padding="max_length", truncation=True, max_length=512)
    inputs = {k: v.to(device) for k, v in inputs.items()}

    softmax = torch.nn.Softmax(dim=1)
    ensemble_model.eval()
    with torch.no_grad():
        logits = ensemble_model(**inputs)
        probs = softmax(logits)

    predicted_class = torch.argmax(probs, dim=1).cpu().item()
    avg_class_probs = {i: probs[0][i].item() for i in range(probs.size(1))}

    return predicted_class, avg_class_probs


In [57]:
# 테스트 코드
input_code = """
char * copy_input(char *user_supplied_string){
int i, dst_index;
char *dst_buf = (char*)malloc(4*sizeof(char) * MAX_SIZE);
if ( MAX_SIZE <= strlen(user_supplied_string) ){
die("user string too long, die evil hacker!");
}
dst_index = 0;
for ( i = 0; i < strlen(user_supplied_string); i++ ){
if( '&' == user_supplied_string[i] ){
dst_buf[dst_index++] = '&';
dst_buf[dst_index++] = 'a';
dst_buf[dst_index++] = 'm';
dst_buf[dst_index++] = 'p';
dst_buf[dst_index++] = ';';
}
else if ('<' == user_supplied_string[i] ){

/* encode to &lt; */
}
else dst_buf[dst_index++] = user_supplied_string[i];
}
return dst_buf;
}
"""

In [58]:
# 저장된 모델 로드
model_paths = ["/content/drive/MyDrive/models/codebert", "/content/drive/MyDrive/models/graphcodebert", "/content/drive/MyDrive/models/unixcoder"]
ensemble_model = load_ensemble_model(model_paths, device)

In [59]:
# 예측 수행
predicted_label, predicted_probs = predict_class(input_code, ensemble_model, tokenizer, device)
predicted_class_label_text = class_labels[predicted_label]
print(f"The predicted CWE ID is: {predicted_class_label_text}")


The predicted CWE ID is: CWE-119 취약점
Improper Restriction of Operations within the Bounds of a Memory Buffer


In [60]:
# 각 클래스의 예측 확률 출력
for cls, label in class_labels.items():
    if cls in predicted_probs:
        print(f"Probability of {label}: {predicted_probs[cls]:.4f}")

Probability of 안전한 코드입니다.: 0.0048
Probability of CWE-119 취약점
Improper Restriction of Operations within the Bounds of a Memory Buffer: 0.5151
Probability of CWE-20 취약점
Improper Input Validation: 0.1344
Probability of CWE-125 취약점
Out-of-bounds Read: 0.0362
Probability of CWE-787 취약점
Out-of-bounds Write: 0.2634
Probability of CWE-415 취약점
Double Free: 0.0044
Probability of CWE-399 취약점
Resource Management Errors: 0.0041
Probability of CWE-416 취약점
Use after Free: 0.0028
Probability of CWE-476 취약점
NULL Pointer Dereference: 0.0054
Probability of CWE-190 취약점
Integer Overflow or Wraparound: 0.0294
