In [None]:
from google.colab import drive
import pandas as pd
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# 데이터 로드 및 토큰화


In [2]:
import torch
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import pandas as pd
from torch.utils.data import DataLoader, TensorDataset, RandomSampler, SequentialSampler
import numpy as np

# 데이터 로드
file_path = '/content/drive/MyDrive/bigdata_security/EDA_Dup_Oversampling.csv'
df = pd.read_csv(file_path)
# 입력 문장과 레이블 추출
sentences = df['full_log'].tolist()
labels = df['level'].tolist()

# DistilBERT 토크나이저 로드
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# 문장을 토큰화하고 패딩
tokenized_texts = [tokenizer.encode_plus(
    sent,
    add_special_tokens=True,
    max_length=200,
    truncation=True,
    padding='max_length',
    return_attention_mask=True,
    return_tensors='pt'
) for sent in sentences]

# 입력 IDs와 어텐션 마스크 추출 및 스택
input_ids = torch.cat([item['input_ids'] for item in tokenized_texts], dim=0)
attention_masks = torch.cat([item['attention_mask'] for item in tokenized_texts], dim=0)

# 훈련 데이터와 테스트 데이터로 분리
train_inputs, test_inputs, train_labels, test_labels, train_masks, test_masks = train_test_split(
    input_ids, labels,attention_masks, test_size=0.2, random_state=42
)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

# 모델 학습
## 모델 학습 기본값 = 0.85 나온 모델

In [None]:
import time
from tqdm import tqdm
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler
from transformers import DistilBertForSequenceClassification, DistilBertConfig, AdamW

# 데이터가 리스트일 경우 Tensor로 변환
train_inputs = torch.tensor(train_inputs)
train_masks = torch.tensor(train_masks)
train_labels = torch.tensor(train_labels)

# 데이터로더 생성
dataset = TensorDataset(train_inputs, train_masks, train_labels)
batch_size = 64

# GPU 사용 여부 확인
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# 데이터로더 및 모델을 GPU로 이동
dataloader = DataLoader(
    dataset,
    sampler=RandomSampler(dataset),  # 무작위 샘플링
    batch_size=batch_size
)

# 모델 구성 설정
config = DistilBertConfig.from_pretrained('distilbert-base-uncased', num_labels=7)
config.max_position_embeddings = 200  # max_position_embeddings 값을 200으로 설정

model = DistilBertForSequenceClassification(config=config)
model.to(device)
optimizer = AdamW(model.parameters(), lr=2e-5)

# 학습 함수 정의
def train(model, dataloader, optimizer, device):
    model.train()
    total_loss = 0
    progress_bar = tqdm(dataloader, desc="Training", leave=False)
    for batch in progress_bar:
        batch_inputs, batch_masks, batch_labels = batch
        batch_inputs = batch_inputs.to(device)
        batch_masks = batch_masks.to(device)
        batch_labels = batch_labels.to(device)

        model.zero_grad()
        outputs = model(input_ids=batch_inputs, attention_mask=batch_masks, labels=batch_labels)
        loss = outputs.loss
        total_loss += loss.item()
        loss.backward()
        optimizer.step()
        progress_bar.set_postfix({'avg_loss': total_loss / len(progress_bar)})
    return total_loss / len(dataloader)

# 모델 학습 및 평가
epochs = 10
patience = 3
best_val_loss = 11111
for epoch in range(epochs):
    start_time = time.time()
    avg_train_loss = train(model, dataloader, optimizer, device)
    end_time = time.time()
    epoch_time = end_time - start_time
    print(f'Epoch {epoch+1}, Training loss: {avg_train_loss:.4f}, Time: {epoch_time:.2f} seconds')

    # Early stopping
    if avg_train_loss < best_val_loss:
        best_val_loss = avg_train_loss
        patience_counter = 0
    else:
        patience_counter += 1

    if patience_counter >= patience:
        print(f'Early stopping triggered after {epoch+1} epochs.')
        break


  train_inputs = torch.tensor(train_inputs)
  train_masks = torch.tensor(train_masks)
  train_labels = torch.tensor(train_labels)


Epoch 1, Training loss: 0.0736, Time: 727.23 seconds




Epoch 2, Training loss: 0.0223, Time: 732.63 seconds




Epoch 3, Training loss: 0.0182, Time: 731.85 seconds




Epoch 4, Training loss: 0.0164, Time: 732.35 seconds




Epoch 5, Training loss: 0.0139, Time: 732.22 seconds




Epoch 6, Training loss: 0.0118, Time: 731.84 seconds




Epoch 7, Training loss: 0.0105, Time: 731.61 seconds




Epoch 8, Training loss: 0.0092, Time: 731.10 seconds




Epoch 9, Training loss: 0.0078, Time: 731.55 seconds


                                                                             

Epoch 10, Training loss: 0.0059, Time: 731.99 seconds




## 모델 튜닝 값 추가한 모델 학습 - 0.86나옴

In [3]:
import time
from tqdm import tqdm
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler
from transformers import DistilBertForSequenceClassification, DistilBertConfig, AdamW
from torch.optim.lr_scheduler import ReduceLROnPlateau

# 데이터가 리스트일 경우 Tensor로 변환
train_inputs = torch.tensor(train_inputs)
train_masks = torch.tensor(train_masks)
train_labels = torch.tensor(train_labels)

# 데이터로더 생성
dataset = TensorDataset(train_inputs, train_masks, train_labels)
batch_size = 35  # 설정 파일에서 가져온 batch_size

# GPU 사용 여부 확인
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# 데이터로더 및 모델을 GPU로 이동
dataloader = DataLoader(
    dataset,
    sampler=RandomSampler(dataset),  # 무작위 샘플링
    batch_size=batch_size
)

# 모델 구성 설정
config = DistilBertConfig.from_pretrained('distilbert-base-uncased', num_labels=8)
config.max_position_embeddings = 200  # max_position_embeddings 값을 200으로 설정

model = DistilBertForSequenceClassification(config=config)
model.to(device)

# Optimizer 설정 파일에서 AdamW로 설정
optimizer = AdamW(model.parameters(), lr=1e-5)  # 설정 파일에서 lr 가져옴

# Scheduler 설정 파일에서 ReduceLROnPlateau로 설정
scheduler = ReduceLROnPlateau(optimizer, factor=0.5, patience=3, verbose=True)

# Loss 함수 (여기서는 focal loss를 사용한다고 가정합니다)
class FocalLoss(torch.nn.Module):
    def __init__(self, gamma=2.0):
        super(FocalLoss, self).__init__()
        self.gamma = gamma

    def forward(self, inputs, targets):
        BCE_loss = torch.nn.functional.cross_entropy(inputs, targets, reduction='none')
        pt = torch.exp(-BCE_loss)
        F_loss = (1 - pt) ** self.gamma * BCE_loss
        return F_loss.mean()

criterion = FocalLoss(gamma=2.0)  # 설정 파일에서 gamma 가져옴

# 학습 함수 정의
def train(model, dataloader, optimizer, criterion, device):
    model.train()
    total_loss = 0
    progress_bar = tqdm(dataloader, desc="Training", leave=False)
    for batch in progress_bar:
        batch_inputs, batch_masks, batch_labels = batch
        batch_inputs = batch_inputs.to(device)
        batch_masks = batch_masks.to(device)
        batch_labels = batch_labels.to(device)

        model.zero_grad()
        outputs = model(input_ids=batch_inputs, attention_mask=batch_masks)
        logits = outputs.logits
        loss = criterion(logits, batch_labels)
        total_loss += loss.item()
        loss.backward()
        optimizer.step()
        progress_bar.set_postfix({'avg_loss': total_loss / len(progress_bar)})
    return total_loss / len(dataloader)

# 모델 학습 및 평가
epochs = 10  # 설정 파일에서 가져온 max_epochs
patience = 3  # 설정 파일에서 scheduler의 patience 가져옴
best_val_loss = float('inf')
patience_counter = 0

for epoch in range(epochs):
    start_time = time.time()
    avg_train_loss = train(model, dataloader, optimizer, criterion, device)
    end_time = time.time()
    epoch_time = end_time - start_time
    print(f'Epoch {epoch+1}, Training loss: {avg_train_loss:.4f}, Time: {epoch_time:.2f} seconds')

    # Scheduler step
    scheduler.step(avg_train_loss)

    # Early stopping
    if avg_train_loss < best_val_loss:
        best_val_loss = avg_train_loss
        patience_counter = 0
    else:
        patience_counter += 1

    if patience_counter >= patience:
        print(f'Early stopping triggered after {epoch+1} epochs.')
        break


  train_inputs = torch.tensor(train_inputs)
  train_masks = torch.tensor(train_masks)


Epoch 1, Training loss: 0.0359, Time: 767.93 seconds




Epoch 2, Training loss: 0.0096, Time: 770.80 seconds




Epoch 3, Training loss: 0.0069, Time: 769.56 seconds




Epoch 4, Training loss: 0.0058, Time: 768.28 seconds




Epoch 5, Training loss: 0.0055, Time: 768.78 seconds




Epoch 6, Training loss: 0.0048, Time: 769.39 seconds




Epoch 7, Training loss: 0.0041, Time: 767.12 seconds




Epoch 8, Training loss: 0.0034, Time: 758.63 seconds




Epoch 9, Training loss: 0.0029, Time: 768.68 seconds


                                                                               

Epoch 10, Training loss: 0.0026, Time: 768.59 seconds




## 모델 저장

In [4]:
final_model_save_path = '/content/drive/MyDrive/final_model_save_200_tuned_for7'
model.save_pretrained(final_model_save_path)
print(f'Final model saved to {final_model_save_path}')

Final model saved to /content/drive/MyDrive/final_model_save_200_tuned_for7


## 모델 평가
- 과대적합남

In [17]:
test_inputs = torch.tensor(test_inputs)
test_masks = torch.tensor(test_masks)
test_labels = torch.tensor(test_labels)

dataset = TensorDataset(test_inputs, test_masks, test_labels)
batch_size = 64

dataloader = DataLoader(
    dataset,
    sampler=RandomSampler(dataset), # 무작위 샘플링
    batch_size=batch_size
)

# 평가 함수 정의
def evaluate(model, test_dataloader, device):
    model.eval()
    preds, true_labels = [], []
    for batch in test_dataloader:
        batch_inputs, batch_masks, batch_labels = [b.to(device) for b in batch]
        with torch.no_grad():
            outputs = model(input_ids=batch_inputs, attention_mask=batch_masks)
        logits = outputs.logits
        preds.append(logits.argmax(dim=1).cpu().numpy())
        true_labels.append(batch_labels.cpu().numpy())
    preds = np.concatenate(preds, axis=0)
    true_labels = np.concatenate(true_labels, axis=0)
    accuracy = accuracy_score(true_labels, preds)
    return accuracy

accuracy = evaluate(model, dataloader, device)
print(f'Test Accuracy: {accuracy}')

  test_inputs = torch.tensor(test_inputs)
  test_masks = torch.tensor(test_masks)


Test Accuracy: 0.9905510494188478


# 제출물 생성

In [2]:
from google.colab import drive
import pandas as pd
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [6]:
from transformers import DistilBertForSequenceClassification, DistilBertConfig
from safetensors.torch import load_file
import os
import torch

# 모델 구성 로드
model_save_path = '/content/drive/MyDrive/final_model_save_200_tuned_for7'

# Config 파일을 저장된 경로에서 로드
config_path = os.path.join(model_save_path, 'config.json')
config = DistilBertConfig.from_pretrained(config_path)

# 모델 생성
model_v = DistilBertForSequenceClassification(config)
state_dict = load_file(os.path.join(model_save_path, 'model.safetensors'))

# GPU 사용 여부 확인
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# 가중치 로드
model_v.load_state_dict(state_dict)
model_v.to(device)  # GPU 사용 시
print("Model loaded successfully")


Model loaded successfully


## submission token화 데이터 로드

In [7]:
import torch

# 파일에서 텐서 로드
attention_masks = torch.load('/content/drive/MyDrive/bigdata_security/attendtion_masks.pt')
input_ids = torch.load('/content/drive/MyDrive/bigdata_security/input_ids.pt')

## 저장할 dataframe 생성

In [8]:

predict_val = []
import pandas as pd
# 새로운 데이터프레임 생성
new_df = pd.DataFrame(columns = ['id', 'level'])
# new_df = pd.read_csv('/content/drive/MyDrive/bigdata_security/model_submission/bert_trained_7data.csv')
start_id = 1000000
print(len(new_df))
model_v.config.max_position_embeddings

0


200

## Predict 수행 후 저장

In [None]:
import torch
import numpy as np
import pandas as pd
import time
import torch.nn.functional as F
import random

# Ensure that all tensors and the model are on the same device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_v.to(device)
input_ids = input_ids.to(device)
attention_masks = attention_masks.to(device)
batch_size = 100

model_v.eval()
for i in range(len(new_df), len(input_ids), batch_size):
    start_id = 1000000+i
    if random.randint(0,100) == 0:
      print(f'{i}_th data is evaluating')
      new_df.to_csv('/content/drive/MyDrive/bigdata_security/model_submission/final_model_save_200_tuned_for7.csv', index=False)
    predictions = []

    with torch.no_grad():
        outputs = model_v(input_ids[i:i+batch_size], attention_mask=attention_masks[i:i+batch_size])
        logits = outputs.logits

    # Softmax 함수를 Numpy로 대체하여 확률 계산
    exp_logits = F.softmax(logits, dim=-1)
    # Numpy로 argmax 대체
    max_indices = np.argmax(exp_logits.cpu().numpy(), axis=1)
    max_probs = np.max(exp_logits.cpu().numpy(), axis=1)

    predictions = max_indices
    # predictions = np.where(max_probs < 0.66, 7, max_indices)
    prediction_id = np.arange(start_id, start_id +  len(predictions))

    predictions_df = pd.DataFrame({'id': prediction_id, 'level': predictions})

    new_df = pd.concat([new_df, predictions_df], ignore_index=True)
new_df.to_csv('/content/drive/MyDrive/bigdata_security/model_submission/final_model_save_200_tuned_for7.csv', index=False)


2300_th data is evaluating
4000_th data is evaluating
9300_th data is evaluating
11000_th data is evaluating
32000_th data is evaluating
38100_th data is evaluating
39200_th data is evaluating
43500_th data is evaluating
48700_th data is evaluating
61900_th data is evaluating
79600_th data is evaluating
81000_th data is evaluating
85400_th data is evaluating
105100_th data is evaluating
105200_th data is evaluating
110600_th data is evaluating
114700_th data is evaluating
123700_th data is evaluating
140500_th data is evaluating
145500_th data is evaluating
149400_th data is evaluating
166300_th data is evaluating
174500_th data is evaluating
187300_th data is evaluating
194700_th data is evaluating
229100_th data is evaluating
236000_th data is evaluating
260500_th data is evaluating
273900_th data is evaluating
287400_th data is evaluating
292100_th data is evaluating
295100_th data is evaluating
313100_th data is evaluating
315400_th data is evaluating
316500_th data is evaluating
3

In [None]:
new_df.to_csv('/content/drive/MyDrive/bigdata_security/model_submission/final_model_save_200_tuned_for7Z.csv', index=False)


In [None]:

new_df['level'].value_counts()