<a href="https://colab.research.google.com/github/rlaaudrb1104/Ai/blob/KMG/stacking.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install datasets
!pip install accelerate
!pip install transformers==4.30
!pip install pytorch



In [3]:
import pandas as pd
import numpy as np
import torch
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from transformers import RobertaTokenizer, RobertaForSequenceClassification
from datasets import load_dataset
import re
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

In [4]:
# 모델과 토크나이저 로드
tokenizer_codebert = RobertaTokenizer.from_pretrained("microsoft/codebert-base")
model_codebert = RobertaForSequenceClassification.from_pretrained("microsoft/codebert-base", num_labels = 11)

tokenizer_codebert_mlm = RobertaTokenizer.from_pretrained("microsoft/codebert-base-mlm")
model_codebert_mlm = RobertaForSequenceClassification.from_pretrained("microsoft/codebert-base-mlm", num_labels = 11)

tokenizer_graphcodebert = RobertaTokenizer.from_pretrained("microsoft/graphcodebert-base")
model_graphcodebert = RobertaForSequenceClassification.from_pretrained("microsoft/graphcodebert-base", num_labels = 11)

tokenizer_unixcoder = RobertaTokenizer.from_pretrained("microsoft/unixcoder-base-nine")
model_unixcoder = RobertaForSequenceClassification.from_pretrained("microsoft/unixcoder-base-nine", num_labels = 11)

batch_size = 16
epoch_num = 7
MAX_LEN = 512
learning_rate = 2e-5

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of the model checkpoint at microsoft/codebert-base were not used when initializing RobertaForSequenceClassification: ['pooler.dense.weight', 'pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializi

In [5]:
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error

df = pd.read_csv("/content/drive/MyDrive/Dataset/MSR/Last.csv")

In [6]:
def preprocess(df,file_name):
  #preprocess_df = df.replace(re.compile('(^import.*|^from.*)',re.MULTILINE),"",regex=True) #import,from 없애기
  preprocess_df = df.replace(re.compile('(#.*)', re.MULTILINE),"",regex=True) #주석 한 줄
  preprocess_df = preprocess_df.replace(re.compile('[\'\"]{3}.*?[\'\"]{3}', re.DOTALL),"",regex=True) #주석 여러줄
  preprocess_df = preprocess_df.replace(re.compile('[\n]{2,}', re.MULTILINE),"\n",regex=True) #다중개행 한번으로
  preprocess_df = preprocess_df.replace(re.compile('[ ]{4}', re.MULTILINE),"\t",regex=True) #tab 변환
  preprocess_df = preprocess_df.replace(re.compile('[ ]{1,3}', re.MULTILINE)," ",regex=True) #공백 여러개 변환
  preprocess_df.to_csv(file_name)

def tokenized(examples):
  return tokenizer_graphcodebert(examples['code'], padding=True, max_length=MAX_LEN,truncation=True, return_token_type_ids=True)

In [7]:
preprocess(df,"preprocess.csv")
dataset = load_dataset("csv",data_files="preprocess.csv")['train']
encoded_dataset = dataset.map(lambda x: tokenizer_graphcodebert(x['code'], padding='max_length', truncation=True, max_length=MAX_LEN), batched=True)

Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/18616 [00:00<?, ? examples/s]

In [8]:
encoded_dataset = encoded_dataset.train_test_split(0.3,seed=100)

In [9]:
X_train = np.array(encoded_dataset['train']['input_ids'])
y_train = np.array(encoded_dataset['train']['labels'])
X_test = np.array(encoded_dataset['test']['input_ids'])
y_test = np.array(encoded_dataset['test']['labels'])

In [10]:
# CV 기반 스태킹 학습을 위한 함수 정의
def get_stacking_base_predictions(model, X_train_n, y_train_n, X_test_n, n_folds):
    skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=42)
    train_fold_pred = np.zeros((len(X_train_n), model.config.num_labels))
    test_pred = np.zeros((len(X_test_n), n_folds, model.config.num_labels))

    for fold, (train_idx, val_idx) in enumerate(skf.split(X_train_n, y_train_n)):
        X_train_fold, X_val_fold = X_train_n[train_idx], X_train_n[val_idx]
        y_train_fold, y_val_fold = y_train_n[train_idx], y_train_n[val_idx]

        train_inputs = torch.tensor(X_train_fold)
        train_labels = torch.tensor(y_train_fold)
        val_inputs = torch.tensor(X_val_fold)
        val_labels = torch.tensor(y_val_fold)
        test_inputs = torch.tensor(X_test_n)

        train_data = TensorDataset(train_inputs, train_labels)
        val_data = TensorDataset(val_inputs, val_labels)
        test_data = TensorDataset(test_inputs)

        train_loader = DataLoader(train_data, batch_size=16, shuffle=True)
        val_loader = DataLoader(val_data, batch_size=16, shuffle=False)
        test_loader = DataLoader(test_data, batch_size=16, shuffle=False)

        optimizer = optim.AdamW(model.parameters(), lr=2e-5)
        criterion = nn.CrossEntropyLoss()
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        model.to(device)

        for epoch in range(3):  # 예시로 3 에폭만 학습
            train_loss, train_f1, val_f1 = train_model(model, train_loader, val_loader, optimizer, criterion, device)

        model.eval()
        with torch.no_grad():
            for i, data in enumerate(test_loader):
                inputs = data[0].to(device)
                outputs = model(inputs)[0]
                test_pred[i * 16:(i + 1) * 16, fold, :] = outputs.cpu().numpy()

    return train_fold_pred, test_pred.mean(axis=1)

In [15]:
from sklearn.metrics import f1_score
from tqdm import tqdm

import torch
from sklearn.metrics import f1_score, accuracy_score
from tqdm import tqdm

def train_model(model, train_loader, val_loader, optimizer, criterion, device):
    model.train()
    running_loss = 0.0
    train_predictions = []
    train_targets = []

    for inputs, labels in tqdm(train_loader, desc="Training", leave=False):
        inputs, labels = inputs.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(inputs)[0]
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item() * inputs.size(0)

        predictions = outputs.argmax(dim=1)
        train_predictions.extend(predictions.tolist())
        train_targets.extend(labels.tolist())

    # 각 클래스별 F1 점수 계산
    train_f1 = f1_score(train_targets, train_predictions, average=None)
    formatted_train_f1 = ", ".join(f"{score:.4f}" for score in train_f1)

    val_predictions = []
    val_targets = []

    for inputs, labels in tqdm(val_loader, desc="Validation", leave=False):
        inputs, labels = inputs.to(device), labels.to(device)
        outputs = model(inputs)[0]
        predictions = outputs.argmax(dim=1)
        val_predictions.extend(predictions.tolist())
        val_targets.extend(labels.tolist())

    val_f1 = f1_score(val_targets, val_predictions, average=None)
    formatted_val_f1 = ", ".join(f"{score:.4f}" for score in val_f1)

    # 각 클래스별 F1 점수를 포맷하여 출력
    print(f"Train Loss: {running_loss / len(train_loader.dataset):.4f}, Train F1 by Class: [{formatted_train_f1}], Val F1 by Class: [{formatted_val_f1}]")

    return running_loss / len(train_loader.dataset), train_f1, val_f1


In [16]:
codebert_train_pred, codebert_test_pred = get_stacking_base_predictions(model_codebert, X_train, y_train, X_test, n_folds=3)
codebert_mlm_train_pred, codebert_mlm_test_pred = get_stacking_base_predictions(model_codebert_mlm, X_train, y_train, X_test, n_folds=3)
graphcodebert_train_pred, graphcodebert_test_pred = get_stacking_base_predictions(model_graphcodebert, X_train, y_train, X_test, n_folds=3)



Train Loss: 0.7279, Train F1 by Class: [0.8749, 0.5660, 0.4079, 0.9337, 0.8757, 0.8617, 0.5352, 0.9213, 0.8626, 0.2408, 0.7493], Val F1 by Class: [0.9072, 0.5965, 0.4477, 0.9533, 0.8791, 0.8954, 0.5401, 0.9489, 0.8747, 0.3451, 0.7477]




Train Loss: 0.5656, Train F1 by Class: [0.9304, 0.6679, 0.5476, 0.9527, 0.8800, 0.8690, 0.6806, 0.9364, 0.8931, 0.4990, 0.7561], Val F1 by Class: [0.9256, 0.5865, 0.5411, 0.9509, 0.8752, 0.8956, 0.6430, 0.9573, 0.8947, 0.5135, 0.7636]




Train Loss: 0.4108, Train F1 by Class: [0.9636, 0.7635, 0.7105, 0.9455, 0.8936, 0.8848, 0.7875, 0.9494, 0.9125, 0.6771, 0.8006], Val F1 by Class: [0.9379, 0.6798, 0.6148, 0.9474, 0.8808, 0.9052, 0.7060, 0.9639, 0.9062, 0.5814, 0.7153]




Train Loss: 0.4136, Train F1 by Class: [0.9602, 0.7703, 0.7287, 0.9467, 0.8847, 0.8857, 0.7957, 0.9540, 0.9220, 0.6971, 0.8076], Val F1 by Class: [0.9734, 0.8256, 0.8253, 0.9633, 0.9099, 0.8803, 0.8761, 0.9405, 0.9410, 0.7610, 0.8548]




Train Loss: 0.2959, Train F1 by Class: [0.9723, 0.8521, 0.8309, 0.9462, 0.8855, 0.8869, 0.8611, 0.9606, 0.9253, 0.8043, 0.8642], Val F1 by Class: [0.9768, 0.8120, 0.8053, 0.9591, 0.8997, 0.8838, 0.8624, 0.9521, 0.9368, 0.7918, 0.8821]




Train Loss: 0.2205, Train F1 by Class: [0.9793, 0.8815, 0.8901, 0.9458, 0.8823, 0.8826, 0.9026, 0.9634, 0.9549, 0.8738, 0.8972], Val F1 by Class: [0.9676, 0.8246, 0.7894, 0.9600, 0.9055, 0.8919, 0.8392, 0.9481, 0.9344, 0.7680, 0.8564]




Train Loss: 0.2457, Train F1 by Class: [0.9750, 0.8671, 0.8475, 0.9568, 0.8968, 0.8924, 0.8775, 0.9639, 0.9488, 0.8427, 0.8925], Val F1 by Class: [0.9929, 0.9360, 0.9330, 0.9432, 0.8711, 0.8759, 0.9562, 0.9685, 0.9828, 0.9098, 0.9398]




Train Loss: 0.1737, Train F1 by Class: [0.9831, 0.9094, 0.9187, 0.9591, 0.8999, 0.8998, 0.9294, 0.9667, 0.9703, 0.8953, 0.9328], Val F1 by Class: [0.9930, 0.8146, 0.8929, 0.9457, 0.9074, 0.8832, 0.9130, 0.9698, 0.9533, 0.9171, 0.9091]




Train Loss: 0.1398, Train F1 by Class: [0.9828, 0.9408, 0.9444, 0.9580, 0.9004, 0.8999, 0.9440, 0.9705, 0.9753, 0.9408, 0.9642], Val F1 by Class: [0.9894, 0.9275, 0.9127, 0.9436, 0.8824, 0.8731, 0.9208, 0.9583, 0.9693, 0.9386, 0.9333]




Train Loss: 1.2883, Train F1 by Class: [0.6292, 0.4124, 0.2847, 0.7821, 0.6128, 0.6422, 0.3333, 0.6920, 0.6363, 0.0115, 0.5426], Val F1 by Class: [0.8700, 0.5207, 0.4402, 0.9533, 0.8857, 0.8613, 0.5150, 0.9309, 0.7574, 0.1419, 0.7217]




Train Loss: 0.6788, Train F1 by Class: [0.9109, 0.5572, 0.4441, 0.9519, 0.8924, 0.8726, 0.5809, 0.9343, 0.8609, 0.2972, 0.7683], Val F1 by Class: [0.9370, 0.6358, 0.5236, 0.9533, 0.8953, 0.9082, 0.6262, 0.9679, 0.8889, 0.4638, 0.7485]




Train Loss: 0.5003, Train F1 by Class: [0.9522, 0.7077, 0.6079, 0.9494, 0.8951, 0.8783, 0.7342, 0.9500, 0.9112, 0.5525, 0.7775], Val F1 by Class: [0.9393, 0.5783, 0.6011, 0.9545, 0.8929, 0.9047, 0.6861, 0.9695, 0.8939, 0.5385, 0.7376]




Train Loss: 0.4346, Train F1 by Class: [0.9605, 0.7302, 0.7032, 0.9517, 0.8876, 0.8913, 0.7852, 0.9610, 0.9119, 0.6600, 0.7968], Val F1 by Class: [0.9715, 0.7762, 0.7819, 0.9635, 0.9098, 0.9007, 0.8389, 0.9605, 0.9516, 0.6404, 0.8649]




Train Loss: 0.3016, Train F1 by Class: [0.9771, 0.8301, 0.8088, 0.9471, 0.8986, 0.8936, 0.8674, 0.9646, 0.9351, 0.7844, 0.8675], Val F1 by Class: [0.9769, 0.7702, 0.7644, 0.9519, 0.8937, 0.8866, 0.8537, 0.9565, 0.9419, 0.7208, 0.8314]




Train Loss: 0.2112, Train F1 by Class: [0.9824, 0.8870, 0.8794, 0.9449, 0.8985, 0.8991, 0.9156, 0.9719, 0.9549, 0.8730, 0.9178], Val F1 by Class: [0.9722, 0.8126, 0.7981, 0.9636, 0.8994, 0.8895, 0.8329, 0.9531, 0.9555, 0.7821, 0.8608]




Train Loss: 0.2632, Train F1 by Class: [0.9757, 0.8460, 0.8595, 0.9562, 0.9048, 0.9000, 0.8727, 0.9638, 0.9549, 0.8199, 0.8827], Val F1 by Class: [0.9916, 0.9405, 0.9471, 0.9481, 0.8944, 0.8946, 0.9634, 0.9730, 0.9848, 0.9412, 0.9750]




Train Loss: 0.1684, Train F1 by Class: [0.9817, 0.9124, 0.9241, 0.9598, 0.8988, 0.9024, 0.9299, 0.9663, 0.9790, 0.9042, 0.9377], Val F1 by Class: [0.9951, 0.9456, 0.9336, 0.9469, 0.9049, 0.8895, 0.9537, 0.9666, 0.9690, 0.9201, 0.9188]




Train Loss: 0.1240, Train F1 by Class: [0.9838, 0.9456, 0.9574, 0.9598, 0.9102, 0.9110, 0.9568, 0.9704, 0.9809, 0.9389, 0.9681], Val F1 by Class: [0.9908, 0.9136, 0.8915, 0.9481, 0.9046, 0.8910, 0.9532, 0.9667, 0.9732, 0.8871, 0.9479]




Train Loss: 1.2980, Train F1 by Class: [0.6639, 0.3906, 0.3120, 0.8454, 0.5773, 0.5341, 0.3905, 0.6789, 0.5072, 0.0076, 0.4398], Val F1 by Class: [0.8756, 0.5597, 0.4388, 0.9347, 0.8900, 0.9061, 0.4552, 0.9524, 0.7745, 0.0588, 0.7462]




Train Loss: 0.6733, Train F1 by Class: [0.9064, 0.5871, 0.4478, 0.9491, 0.8962, 0.8836, 0.5947, 0.9342, 0.8745, 0.2331, 0.7692], Val F1 by Class: [0.9245, 0.6126, 0.5182, 0.9522, 0.8830, 0.9095, 0.6477, 0.9607, 0.8988, 0.4719, 0.7335]




Train Loss: 0.4976, Train F1 by Class: [0.9550, 0.6957, 0.5866, 0.9464, 0.9003, 0.8827, 0.7489, 0.9527, 0.9098, 0.5491, 0.7889], Val F1 by Class: [0.9437, 0.6672, 0.5984, 0.9548, 0.8892, 0.9001, 0.6553, 0.9551, 0.9061, 0.5534, 0.7596]




Train Loss: 0.4514, Train F1 by Class: [0.9612, 0.7201, 0.6582, 0.9460, 0.8888, 0.8968, 0.7624, 0.9572, 0.9123, 0.6362, 0.7973], Val F1 by Class: [0.9742, 0.7419, 0.7027, 0.9611, 0.9182, 0.9032, 0.8364, 0.9633, 0.9234, 0.6334, 0.8264]




Train Loss: 0.3025, Train F1 by Class: [0.9793, 0.8249, 0.7910, 0.9472, 0.9067, 0.9140, 0.8748, 0.9676, 0.9332, 0.7695, 0.8688], Val F1 by Class: [0.9661, 0.7798, 0.7294, 0.9599, 0.9195, 0.8928, 0.7781, 0.9540, 0.9386, 0.7064, 0.8310]




Train Loss: 0.2119, Train F1 by Class: [0.9853, 0.8833, 0.8749, 0.9447, 0.8968, 0.8985, 0.9176, 0.9670, 0.9546, 0.8719, 0.9178], Val F1 by Class: [0.9735, 0.7738, 0.7553, 0.9576, 0.9133, 0.8907, 0.8070, 0.9446, 0.9077, 0.7028, 0.8366]




Train Loss: 0.2573, Train F1 by Class: [0.9765, 0.8506, 0.8389, 0.9587, 0.8998, 0.9040, 0.8683, 0.9625, 0.9549, 0.8268, 0.8863], Val F1 by Class: [0.9923, 0.9510, 0.9327, 0.9481, 0.8915, 0.8945, 0.9603, 0.9652, 0.9709, 0.9447, 0.9559]




Train Loss: 0.1686, Train F1 by Class: [0.9845, 0.9167, 0.9155, 0.9642, 0.9111, 0.9105, 0.9251, 0.9675, 0.9693, 0.8979, 0.9401], Val F1 by Class: [0.9806, 0.9305, 0.9305, 0.9498, 0.8785, 0.8736, 0.9470, 0.9636, 0.9605, 0.9274, 0.9409]




Train Loss: 0.1199, Train F1 by Class: [0.9856, 0.9530, 0.9551, 0.9596, 0.9068, 0.9051, 0.9587, 0.9762, 0.9800, 0.9442, 0.9692], Val F1 by Class: [0.9916, 0.9245, 0.8925, 0.9444, 0.9024, 0.8906, 0.9358, 0.9695, 0.9710, 0.9054, 0.9125]


In [17]:
# 개별 모델의 예측을 합쳐서 메타 피처로 사용
stacked_train_features = np.hstack((codebert_train_pred, codebert_mlm_train_pred, graphcodebert_train_pred))
stacked_test_features = np.hstack((codebert_test_pred, codebert_mlm_test_pred, graphcodebert_test_pred))

In [22]:
stacked_train_features

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [26]:
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_classification
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

# 메타 모델로 사용할 Unixcoder 모델 정의
meta_model = LogisticRegression(multi_class='multinomial', solver='lbfgs', random_state=42)

# 메타 모델 학습
meta_model.fit(stacked_train_features, y_train)

# 메타 모델 예측
meta_pred = meta_model.predict(stacked_test_features)

# 정확도 평가
accuracy = accuracy_score(y_test, meta_pred)
print(f"Accuracy of meta model: {accuracy:.2f}")

# 클래스 별 평가
report = classification_report(y_test, meta_pred, target_names=['Class 0', 'Class 1', 'Class 2', 'Class 3', 'Class 4', 'Class 5', 'Class 6', 'Class 7', 'Class 8', 'Class 9', 'Class 10'])
print(report)

Accuracy of meta model: 0.15
              precision    recall  f1-score   support

     Class 0       0.15      1.00      0.27       865
     Class 1       0.00      0.00      0.00       628
     Class 2       0.00      0.00      0.00       647
     Class 3       0.00      0.00      0.00       554
     Class 4       0.00      0.00      0.00       496
     Class 5       0.00      0.00      0.00       508
     Class 6       0.00      0.00      0.00       507
     Class 7       0.00      0.00      0.00       509
     Class 8       0.00      0.00      0.00       329
     Class 9       0.00      0.00      0.00       303
    Class 10       0.00      0.00      0.00       239

    accuracy                           0.15      5585
   macro avg       0.01      0.09      0.02      5585
weighted avg       0.02      0.15      0.04      5585



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
