In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import torch

import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader

## 데이터 전처리

In [3]:
df = pd.read_csv('dataset.csv')
df.info()
df

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1053 entries, 0 to 1052
Data columns (total 57 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   switch-type                          1053 non-null   object 
 1   max-memory-inhibit                   1053 non-null   bool   
 2   max-sym-array-size                   1053 non-null   int64  
 3   max-depth                            1053 non-null   int64  
 4   watchdog                             1053 non-null   bool   
 5   max-static-fork-pct                  1053 non-null   float64
 6   const-array-opt                      1053 non-null   bool   
 7   zero-seed-extension                  1053 non-null   bool   
 8   smtlib-display-constants             1053 non-null   object 
 10  smtlib-human-readable                1053 non-null   bool   
 11  warn-all-external-symbols            1053 non-null   bool   
 12  use-iterative-deepening-time-s

Unnamed: 0,switch-type,max-memory-inhibit,max-sym-array-size,max-depth,watchdog,max-static-fork-pct,const-array-opt,zero-seed-extension,smtlib-display-constants,warnings-only-to-file,...,cex-cache-superset,verify-each,max-memory,batch-time,max-static-solve-pct,max-static-cpfork-pct,max-static-cpsolve-pct,array-value-symb-ratio,sym-stdin,Coverage
0,internal,False,3980,1142,True,0.159269,True,False,bin,False,...,False,False,1136,4.0,0.392040,0.197067,0.029739,0.896605,4,0
1,llvm,True,6067,2189,True,0.798309,True,True,bin,False,...,False,True,2040,7.0,0.848477,0.372759,0.209427,0.780304,11,1850
2,llvm,False,3104,646,True,0.238140,True,False,dec,True,...,False,True,1098,4.0,0.922272,0.613608,0.533229,0.582163,4,2562
3,internal,False,5791,1579,False,0.641675,False,True,dec,False,...,False,False,1242,6.0,0.464663,0.849170,0.521843,0.401757,4,1296
4,internal,True,2356,537,True,0.893913,False,True,bin,True,...,True,True,711,4.0,0.538513,0.410924,0.766638,0.351363,8,1281
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1048,simple,False,329279,181009,True,0.480900,False,False,bin,True,...,True,False,59068,8.0,0.926000,0.766700,0.604200,0.059100,19,1396
1049,simple,True,199941,1376,False,0.959500,False,False,hex,True,...,True,False,44955,4.0,0.377300,0.849700,0.431100,0.802000,50,1866
1050,simple,False,196349,153606,True,0.008200,True,False,dec,False,...,False,True,46454,4.0,0.844500,0.783000,0.138100,0.579200,29,1939
1051,internal,False,5548,2335,True,0.803200,True,False,dec,False,...,True,True,59204,44.0,0.178000,0.014700,0.957900,0.110700,16,1686


In [6]:
categorical_cols = df.select_dtypes(include=['object']).columns
boolean_cols = df.select_dtypes(include=['bool']).columns
numeric_cols = df.select_dtypes(include=['int64', 'float64']).columns
# 각 열 유형에 맞는 변환기 설정
categorical_transformer = OneHotEncoder(drop='first')  # 첫 번째 범주를 제거하여 더미 트랩 방지
boolean_transformer = OneHotEncoder(drop='if_binary')  # 부울 값을 0과 1로 인코딩
numeric_transformer = StandardScaler()  # 평균 0, 표준편차 1로 정규화

In [5]:
# 전처리 파이프라인 구성
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_cols),    # 수치형 데이터에 대한 정규화
        ('cat', categorical_transformer, categorical_cols),  # 범주형 데이터에 대한 원-핫 인코딩
        ('bool', boolean_transformer, boolean_cols)    # 부울 데이터에 대한 인코딩
    ]
)

# 데이터 전처리 실행
data_preprocessed = preprocessor.fit_transform(df)

# 전처리된 데이터 확인 (예: shape 출력)
print(data_preprocessed.shape)
df

(1053, 60)


Unnamed: 0,switch-type,max-memory-inhibit,max-sym-array-size,max-depth,watchdog,max-static-fork-pct,const-array-opt,zero-seed-extension,smtlib-display-constants,warnings-only-to-file,...,cex-cache-superset,verify-each,max-memory,batch-time,max-static-solve-pct,max-static-cpfork-pct,max-static-cpsolve-pct,array-value-symb-ratio,sym-stdin,Coverage
0,internal,False,3980,1142,True,0.159269,True,False,bin,False,...,False,False,1136,4.0,0.392040,0.197067,0.029739,0.896605,4,0
1,llvm,True,6067,2189,True,0.798309,True,True,bin,False,...,False,True,2040,7.0,0.848477,0.372759,0.209427,0.780304,11,1850
2,llvm,False,3104,646,True,0.238140,True,False,dec,True,...,False,True,1098,4.0,0.922272,0.613608,0.533229,0.582163,4,2562
3,internal,False,5791,1579,False,0.641675,False,True,dec,False,...,False,False,1242,6.0,0.464663,0.849170,0.521843,0.401757,4,1296
4,internal,True,2356,537,True,0.893913,False,True,bin,True,...,True,True,711,4.0,0.538513,0.410924,0.766638,0.351363,8,1281
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1048,simple,False,329279,181009,True,0.480900,False,False,bin,True,...,True,False,59068,8.0,0.926000,0.766700,0.604200,0.059100,19,1396
1049,simple,True,199941,1376,False,0.959500,False,False,hex,True,...,True,False,44955,4.0,0.377300,0.849700,0.431100,0.802000,50,1866
1050,simple,False,196349,153606,True,0.008200,True,False,dec,False,...,False,True,46454,4.0,0.844500,0.783000,0.138100,0.579200,29,1939
1051,internal,False,5548,2335,True,0.803200,True,False,dec,False,...,True,True,59204,44.0,0.178000,0.014700,0.957900,0.110700,16,1686


In [65]:
data_preprocessed

array([[-0.93408178, -0.9644587 , -1.19842303, ...,  0.        ,
         0.        ,  0.        ],
       [-0.91836893, -0.9484951 ,  1.11306457, ...,  1.        ,
         0.        ,  1.        ],
       [-0.94067711, -0.97202121, -0.91313516, ...,  1.        ,
         0.        ,  1.        ],
       ...,
       [ 0.51424841,  1.36015939, -1.7448591 , ...,  1.        ,
         0.        ,  1.        ],
       [-0.92227644, -0.94626904,  1.13075699, ...,  0.        ,
         1.        ,  1.        ],
       [ 1.31570913,  1.21484026, -0.35588227, ...,  0.        ,
         0.        ,  0.        ]])

In [10]:
import numpy as np
from sklearn.model_selection import train_test_split

# 데이터와 타겟을 함께 분할하여 인덱스 유지
X_train_val, X_test, y_train_val, y_test = train_test_split(
    data_preprocessed, df['Coverage'].values, test_size=0.1, random_state=42, shuffle=True)
    

X_train, X_val, y_train, y_val = train_test_split(
    X_train_val, y_train_val, test_size=0.22, random_state=42)  # 0.22 ≈ 2/9

# PyTorch 텐서로 변환
train_tensor_x = torch.Tensor(X_train)
train_tensor_y = torch.Tensor(y_train)
val_tensor_x = torch.Tensor(X_val)
val_tensor_y = torch.Tensor(y_val)
test_tensor_x = torch.Tensor(X_test)
test_tensor_y = torch.Tensor(y_test)

# TensorDataset 생성
train_dataset = TensorDataset(train_tensor_x, train_tensor_y)
val_dataset = TensorDataset(val_tensor_x, val_tensor_y)
test_dataset = TensorDataset(test_tensor_x, test_tensor_y)

# DataLoader 생성
batch_size = 64
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)


## Baseline 모델 구성 및 학습

In [11]:
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error
# 디바이스 설정: Mac에서 pytorch 사용
device = torch.device("mps")


# 타겟 데이터 로그 변환
train_log_y = np.log1p(train_tensor_y.numpy())
val_log_y = np.log1p(val_tensor_y.numpy())
test_log_y = np.log1p(test_tensor_y.numpy())


# TensorDataset 재생성
train_dataset = TensorDataset(train_tensor_x, torch.Tensor(train_log_y))
val_dataset = TensorDataset(val_tensor_x, torch.Tensor(val_log_y))
test_dataset = TensorDataset(test_tensor_x, torch.Tensor(test_log_y))
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

class LSTMModel(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim, num_layers, dropout_prob=0.5):
        super(LSTMModel, self).__init__()
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_dim, hidden_dim, num_layers, batch_first=True, dropout=dropout_prob)
        self.fc = nn.Linear(hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout_prob)

    def forward(self, x):
        x = x.unsqueeze(1)
        h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_dim).to(device)
        c0 = torch.zeros(self.num_layers, x.size(0), self.hidden_dim).to(device)
        out, _ = self.lstm(x, (h0, c0))
        out = self.dropout(out)
        out = self.fc(out[:, -1, :])
        return out

# 모델 하이퍼파라미터
input_dim = 60  # 특성 수
hidden_dim = 256
output_dim = 1  
num_layers = 1
batch_size = 64

def train_model(model, train_loader, val_loader, loss_fn, optimizer, num_epochs=100):
    model.train()
    for epoch in range(num_epochs):
        for inputs, targets in train_loader:
            inputs, targets = inputs.to(device), targets.to(device)
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = loss_fn(outputs.squeeze(), targets)
            loss.backward()
            optimizer.step()
        print(f'Epoch {epoch+1}, Loss: {loss.item()}')

        # 검증 단계
        model.eval()
        with torch.no_grad():
            val_loss = 0
            val_predictions = []
            val_targets = []
            for inputs, targets in val_loader:
                inputs, targets = inputs.to(device), targets.to(device)
                outputs = model(inputs)
                val_loss += loss_fn(outputs.squeeze(), targets).item()
                val_predictions.extend(outputs.squeeze().cpu().numpy())
                val_targets.extend(targets.cpu().numpy())
            val_mae = mean_absolute_error(val_targets, val_predictions)
            val_rmse = np.sqrt(mean_squared_error(val_targets, val_predictions))
            
            print(f'Epoch {epoch+1}, Loss: {loss.item()}, Val Loss: {val_loss / len(val_loader)}, Val MAE: {val_mae}, Val RMSE: {val_rmse}')

# print(f'Validation Loss: {val_loss / len(val_loader)}')

loss_fn = nn.L1Loss()
model = LSTMModel(input_dim, hidden_dim, output_dim, num_layers, dropout_prob=0.5).to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

train_model(model, train_loader, val_loader, loss_fn, optimizer, num_epochs=20)






Epoch 1, Loss: 6.261539459228516
Validation Loss: 5.9163501262664795
Epoch 2, Loss: 5.127919673919678
Validation Loss: 4.78740656375885
Epoch 3, Loss: 3.1129777431488037
Validation Loss: 2.8563780784606934
Epoch 4, Loss: 1.7321490049362183
Validation Loss: 1.62522491812706
Epoch 5, Loss: 1.2450954914093018
Validation Loss: 1.4066870212554932
Epoch 6, Loss: 1.1007578372955322
Validation Loss: 1.3488074094057083
Epoch 7, Loss: 1.0932317972183228
Validation Loss: 1.308042973279953
Epoch 8, Loss: 0.7590690851211548
Validation Loss: 1.2783263176679611
Epoch 9, Loss: 0.8192626237869263
Validation Loss: 1.233202651143074
Epoch 10, Loss: 1.2902768850326538
Validation Loss: 1.194618135690689
Epoch 11, Loss: 1.1738694906234741
Validation Loss: 1.1643218100070953
Epoch 12, Loss: 0.9942105412483215
Validation Loss: 1.1370963156223297
Epoch 13, Loss: 0.6886171698570251
Validation Loss: 1.1042059659957886
Epoch 14, Loss: 0.7228310704231262
Validation Loss: 1.0852984189987183
Epoch 15, Loss: 0.478076

In [86]:
# 테스트 단계
model.eval()
with torch.no_grad():
    test_loss = 0
    for inputs, targets in test_loader:
        inputs, targets = inputs.to(device), targets.to(device)
        outputs = model(inputs)
        test_loss += loss_fn(outputs.squeeze(), targets).item()
    print(f'Test Loss: {test_loss / len(test_loader)}')

Test Loss: 0.7546046674251556
