<a href="https://colab.research.google.com/github/rlaaudrb1104/Ai/blob/PJH/0509_Ensemble_(bagging).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [9]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [10]:
!pip install accelerate -U
!pip install transformers[torch] -U
!pip install shap
!pip install datasets

Collecting accelerate
  Downloading accelerate-0.30.0-py3-none-any.whl (302 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/302.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m [32m297.0/302.4 kB[0m [31m8.6 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.4/302.4 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch>=1.

In [39]:
import numpy as np
import pandas as pd
import re
from sklearn.utils import resample
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from datasets import load_dataset
from sklearn.metrics import f1_score

In [40]:
# 데이터 파일 로드 및 전처리
def preprocess_code(df):
    df = df.drop(columns=["vul"])
    df['code'] = df['code'].apply(lambda x: re.sub(r'/\*.*?\*/', '', x))
    df['code'] = df['code'].apply(lambda x: re.sub(r'//.*?\n', '', x))
    df['code'] = df['code'].apply(lambda x: re.sub(r'#include <.*?>\n', '', x))
    df['code'] = df['code'].apply(lambda x: re.sub(r'#include \".*?\"\n', '', x))
    df['code'] = df['code'].apply(lambda x: re.sub(r'#define .*?\n', '', x))
    df['code'] = df['code'].apply(lambda x: re.sub(r'[\t ]+', ' ', x))
    df['code'] = df['code'].apply(lambda x: re.sub(r'\n\s*\n', '\n', x))
    return df

# 데이터 파일 로드
train_df = pd.read_csv('/content/drive/MyDrive/MSR+julite_Dive_final_train/MSR+julite+Div_final_train.csv')
test_df = pd.read_csv('/content/drive/MyDrive/MSR+julite_Dive_final_train/MSR+julite+Div_final_test.csv')

# 데이터 전처리
train_df = preprocess_code(train_df)
test_df = preprocess_code(test_df)

# 데이터셋 생성 및 저장
train_df.to_csv("/content/preprocessed_train_data.csv", index=False)
test_df.to_csv("/content/preprocessed_test_data.csv", index=False)

# 데이터셋 로드
dataset = load_dataset('csv', data_files={'train': '/content/preprocessed_train_data.csv', 'test': '/content/preprocessed_test_data.csv'})


Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

In [41]:
# 모델들의 경로
model_paths = [
    '/content/drive/MyDrive/models/codebert',
    '/content/drive/MyDrive/models/graphcodebert',
    '/content/drive/MyDrive/models/unixcoder'
]


In [42]:
# 부분 데이터셋을 사용하여 학습된 모델을 사용하여 예측을 수행하고 결과를 결합하여 최종 예측 생성
def bagging_predict(predictions):
    # 각 모델의 예측 결과를 배열로 변환
    predictions_array = np.array(predictions)

    # 각 클래스별로 투표를 진행하여 최종 예측값 결정
    final_prediction = []
    for i in range(predictions_array.shape[1]):
        class_votes = predictions_array[:, i]
        final_prediction.append(np.bincount(class_votes).argmax())

    return np.array(final_prediction)

In [43]:
# 이미 로드된 데이터셋의 train 데이터와 test 데이터를 변수에 할당합니다.
X_train = dataset['train']['code']
y_train = dataset['train']['labels']
X_test = dataset['test']['code']
y_test = dataset['test']['labels']


In [46]:
# 테스트 데이터로부터 예측 수행
predictions = []
for model_path in model_paths:
    model = AutoModelForSequenceClassification.from_pretrained(model_path)
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    # 최대 길이를 직접 설정
    max_length = 512  # 예시로 512로 설정
    # 최대 길이를 기반으로 토큰화하고 패딩 및 자르기 수행
    test_encodings = tokenizer(X_test, padding=True, truncation=True, max_length=max_length, return_tensors='pt')
    prediction = model(**test_encodings).logits.argmax(dim=-1)
    predictions.append(prediction)

In [47]:
# 배깅 예측 수행
final_prediction = bagging_predict(predictions)

In [48]:
# 최종 예측과 실제 레이블 간의 F1 스코어 계산
f1 = f1_score(y_test, final_prediction, average='weighted')
print("Final F1 Score:", f1)

Final F1 Score: 0.8704622062402383


In [50]:
from sklearn.metrics import f1_score

# 클래스별 F1 스코어 계산
class_f1_scores = f1_score(y_test, final_prediction, labels=np.unique(y_test), average=None)

# 클래스별 F1 스코어 출력
for i, score in enumerate(class_f1_scores):
    print(f"Class {i}: F1 Score {score}")

Class 0: F1 Score 0.7826086956521738
Class 1: F1 Score 0.8
Class 2: F1 Score 0.7499999999999999
Class 3: F1 Score 0.888888888888889
Class 4: F1 Score 0.888888888888889
Class 5: F1 Score 0.9523809523809523
Class 6: F1 Score 0.8421052631578948
Class 7: F1 Score 0.9523809523809523
Class 8: F1 Score 0.9473684210526316
Class 9: F1 Score 0.9


In [51]:
import torch

def predict_class(input_code, model, tokenizer):
    # 입력 코드를 토크나이징하고 모델 입력 형식으로 변환
    inputs = tokenizer(input_code, return_tensors="pt", padding=True, truncation=True)

    # 모델에 입력 전달하여 클래스 예측
    with torch.no_grad():
        outputs = model(**inputs)

    # 로짓 값을 소프트맥스 함수를 통과하여 확률값으로 변환
    probabilities = torch.softmax(outputs.logits, dim=1)

    # 가장 높은 확률을 가진 클래스를 예측값으로 선택
    predicted_class = torch.argmax(probabilities, dim=1).item()

    # 각 클래스에 대한 설명
    class_labels = {
        0: "안전한 코드입니다.",
        1: "CWE-119 취약점\nImproper Restriction of Operations within the Bounds of a Memory Buffer",
        2: "CWE-20 취약점\nImproper Input Validation",
        3: "CWE-125 취약점\nOut-of-bounds Read",
        4: "CWE-787 취약점\nOut-of-bounds Write",
        5: "CWE-415 취약점\nDouble Free",
        6: "CWE-399 취약점\nResource Management Errors",
        7: "CWE-416 취약점\nUse after Free",
        8: "CWE-476 취약점\nNULL Pointer Dereference",
        9: "CWE-190 취약점\nInteger Overflow or Wraparound"
    }

    # 예측된 클래스에 해당하는 설명 반환
    predicted_class_label = class_labels[predicted_class]

    return predicted_class_label
