In [1]:
import pandas as pd
import numpy as np

train_df = pd.read_csv('../../data/train.csv',index_col='ID')
test_df = pd.read_csv('../../data/test.csv',index_col='ID')

train_df = train_df[['사고일시','요일', '기상상태', '도로형태', '노면상태', '사고유형','ECLO']]
test_df = test_df[['사고일시','요일', '기상상태', '도로형태', '노면상태', '사고유형']]

train_df['사고일시'] = pd.to_datetime(train_df['사고일시'], format='%Y-%m-%d %H', errors='raise')
train_df['시간'] = train_df['사고일시'].dt.hour
train_df['월'] = train_df['사고일시'].dt.month
test_df['사고일시'] = pd.to_datetime(test_df['사고일시'], format='%Y-%m-%d %H', errors='raise')
test_df['시간'] = test_df['사고일시'].dt.hour
test_df['월'] = test_df['사고일시'].dt.month
train_df.drop(columns='사고일시',inplace=True)
test_df.drop(columns='사고일시',inplace=True)

train_df = pd.get_dummies(train_df)
test_df = pd.get_dummies(test_df)

X_trn = train_df.drop(columns='ECLO').astype(np.float32)
X_trn.drop(columns='기상상태_안개',inplace=True)
y_trn = train_df['ECLO'].astype(np.float32)

X_tst = test_df.astype(np.float32)

X_trn.shape, y_trn.shape, X_tst.shape

((39609, 34), (39609,), (10963, 34))

In [2]:
import torch
from torch import nn
from torch.utils.data import DataLoader
import torchmetrics
from typing import Optional, List
import numpy as np
import pandas as pd
  
from nn import ANN
from utils import CustomDataset
from torchmetrics import MeanAbsoluteError, MeanSquaredError, MeanSquaredLogError
from tqdm.auto import tqdm

def train_one_epoch(
    model: nn.Module,
    criterion: callable,
    optimizer: torch.optim.Optimizer,
    data_loader: DataLoader,
    device: str
) -> float:
    '''train one epoch

    Args:
        model: model
        criterion: loss
        optimizer: optimizer
        data_loader: data loader
        device: device
    '''
    model.train()
    total_loss = 0.
    for X, y in data_loader:
        X, y = X.to(device), y.to(device)
        output = model(X)
        loss = criterion(output, y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item() * len(y)
    return total_loss / len(data_loader.dataset)

def evaluate(
    model: nn.Module,
    criterion: callable,
    data_loader: DataLoader,
    device: str,
    metric: Optional[torchmetrics.metric.Metric] = None,
    multi_metrics: List[torchmetrics.metric.Metric] = None
) -> float:
    '''evaluate

    Args:
        model: model
        criterions: list of criterion functions
        data_loader: data loader
        device: device
    '''
    model.eval()
    total_loss = 0.

    mae, mse, msle = (
        MeanAbsoluteError().to(device),
        MeanSquaredError().to(device),
        MeanSquaredLogError().to(device)
    )

    with torch.no_grad():
        for X, y in data_loader:
            X, y = X.to(device), y.to(device)
            output = model(X)
            total_loss += criterion(output, y).item() * len(y)

            if metric is not None:
                metric.update(output, y)

            if multi_metrics is not None:
                for metric in multi_metrics:
                    metric.update(output, y)

    if isinstance(total_loss, torch.Tensor):
        return total_loss.item() / len(data_loader.dataset)
    else:
        return total_loss / len(data_loader.dataset)

def kfold_cross_validation(model: nn.Module, criterion:callable, device:str, X_trn:np.array, y_trn:np.array, n_splits:int=5):
  from sklearn.model_selection import KFold
  from torchmetrics import MeanAbsoluteError, MeanSquaredError, MeanSquaredLogError
  # from sklearn.metrics import mean_absolute_error,mean_squared_error,mean_squared_log_error
  from copy import deepcopy
  
  Kf = KFold(n_splits=n_splits, shuffle=True, random_state=2023)
  nets = [deepcopy(model) for i in range(n_splits)]
  scores = {
  'mae': [],
  'mse': [],
  'msle': []
  }
  for i, (trn_idx, val_idx) in enumerate(Kf.split(X_trn, y_trn)):
    X, y = (
        torch.tensor(X_trn.iloc[trn_idx].values).to(device),
        torch.tensor(y_trn.iloc[trn_idx].values).to(device)
    )
    X_val, y_val = (
        torch.tensor(X_trn.iloc[val_idx].values).to(device),
        torch.tensor(y_trn.iloc[val_idx].values).to(device)
    )
    ds = CustomDataset(X, y)
    ds_val = CustomDataset(X_val, y_val)
    dl = DataLoader(ds, batch_size=32, shuffle=True)
    dl_val = DataLoader(ds_val, batch_size=len(ds_val), shuffle=False)

    net = nets[i]()
    net.to(device)  # 모델을 디바이스로 이동

    pbar = tqdm(range(30))
    for j in pbar:
        mae, mse, msle = (
            MeanAbsoluteError().to(device),
            MeanSquaredError().to(device),
            MeanSquaredLogError().to(device)
        )
        criterion = nn.MSELoss(reduction='mean')
        optimizer = torch.optim.Adam(net.parameters(), lr=0.0001)
        loss = train_one_epoch(net, criterion, optimizer, dl, device)
        loss_val = evaluate(net, criterion, dl_val, device, multi_metrics=[mae, mse, msle])
        mae, mse, msle = mae.compute(), mse.compute(), msle.compute()
        pbar.set_postfix(trn_loss=loss, val_loss=loss_val)

    scores["mae"].append(mae.item())
    scores["mse"].append(mse.item())
    scores["msle"].append(msle.item())
  scores_df = pd.DataFrame(scores)
  scores_df = pd.concat([scores_df, scores_df.apply(['mean', 'std'])])
  print(scores_df)
  return scores

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
device = 'cuda' if torch.cuda.is_available() else 'mps' if torch.backends.mps.is_available() else 'cpu'
device

'cuda'

In [4]:
scores = kfold_cross_validation(ANN, nn.MSELoss, device, X_trn, y_trn, n_splits = 5)
scores_df = pd.DataFrame(scores)
scores_df = pd.concat([scores_df, scores_df.apply(['mean', 'std'])])

100%|██████████| 30/30 [01:10<00:00,  2.36s/it, trn_loss=9.88, val_loss=10.6]
100%|██████████| 30/30 [01:10<00:00,  2.34s/it, trn_loss=10.2, val_loss=9.11]
100%|██████████| 30/30 [01:10<00:00,  2.35s/it, trn_loss=9.9, val_loss=10.4] 
100%|██████████| 30/30 [01:09<00:00,  2.33s/it, trn_loss=9.99, val_loss=10.2]
100%|██████████| 30/30 [01:09<00:00,  2.31s/it, trn_loss=10.1, val_loss=9.82]

           mae        mse      msle
0     2.185246  10.572868  0.213920
1     2.126809   9.112566  0.210314
2     2.128574  10.439912  0.209654
3     2.165910  10.153998  0.214322
4     2.122691   9.823783  0.211252
mean  2.145846  10.020625  0.211892
std   0.028070   0.583498  0.002117





In [6]:
scores_df

Unnamed: 0,mae,mse,msle
0,2.185246,10.572868,0.21392
1,2.126809,9.112566,0.210314
2,2.128574,10.439912,0.209654
3,2.16591,10.153998,0.214322
4,2.122691,9.823783,0.211252
mean,2.145846,10.020625,0.211892
std,0.02807,0.583498,0.002117
