In [4]:
import pandas as pd
df_train = pd.read_csv('../../main/data/origin/train.csv')
df_train['군구'] = df_train['시군구'].replace('대구광역시 ','',regex=True)
df_train['사고일시'] = pd.to_datetime(df_train['사고일시'])
df_train['월'] = df_train['사고일시'].dt.month
df_train['시간'] = df_train['사고일시'].dt.hour
df_train.drop(columns=['시군구'], inplace=True)
df_train.drop(columns=['ID','사고유형 - 세부분류', '법규위반', '가해운전자 차종', '가해운전자 성별', '가해운전자 연령', '가해운전자 상해정도',
       '피해운전자 차종', '피해운전자 성별', '피해운전자 연령', '피해운전자 상해정도','사고일시','ECLO'],inplace=True)

In [5]:
df_train.columns

Index(['요일', '기상상태', '도로형태', '노면상태', '사고유형', '사망자수', '중상자수', '경상자수', '부상자수',
       '군구', '월', '시간'],
      dtype='object')

In [5]:
df_train = pd.get_dummies(df_train, columns = ['요일', '기상상태', '도로형태', '노면상태', '사고유형'])

In [7]:
X_trn = df_train.drop(columns=['사망자수', '중상자수', '경상자수', '부상자수'])
y_trn = df_train[['사망자수', '중상자수', '경상자수', '부상자수']]

In [8]:
X_trn.shape, y_trn.shape

((39609, 36), (39609,))

In [10]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader,TensorDataset

device = 'cuda' if torch.cuda.is_available() else 'mps' if torch.backends.mps.is_available() else 'cpu'
device

'cuda'

In [11]:
from tqdm.auto import tqdm
from typing import Optional, List
import torchmetrics
import numpy as np
from torch.utils.data import Dataset
from sklearn.metrics import mean_squared_error as mse

  from .autonotebook import tqdm as notebook_tqdm


In [12]:
class CustomDataset(Dataset):
  def __init__(self, *args:list[np.array]):
    assert all(args[0].shape[0] == arg.shape[0] for arg in args), "Size mismatch."
    self.data = args
  def __getitem__(self, index):
    return tuple(x[index] for x in self.data)
  def __len__(self):
    return self.data[0].shape[0]

In [16]:
import torch.nn as nn
import torch.nn.functional as F
import torch
import pandas as pd

activation_list = {"sigmoid": nn.Sigmoid(), "relu": nn.ReLU(), "tanh": nn.Tanh(), "prelu": nn.PReLU()}

class ANN(nn.Module):
  def __init__(self, input_dim: int=5, hidden_dim: list=[128, 128, 64, 32], activation: str="sigmoid", use_drop:bool = True, drop_ratio: float=0.3):
    super().__init__()
    # self.embedding = nn.Linear(1,10)
    dims = [input_dim] + hidden_dim 
    self.Identity = nn.Identity()
    self.dropout = nn.Dropout(drop_ratio)
    self.activation = activation_list[activation]
    
    model = [[nn.Linear(dims[i], dims[i+1]), self.dropout if use_drop else self.Identity, self.activation] for i in range(len(dims) - 1)]
    output_layer = [nn.Linear(dims[-1], 4), nn.Identity()] # 1 -> 4 하고 1 -> 사망자, data unvalance가 있어도 괜찮  mse를 활용하며 가중치를 사망자, 경상자 마다 바꾸고 학습
    # output_layer = [nn.Linear(dims[-1], 1), nn.Identity()] # Relu는 항상 양수 값일 때 주는 방법
    self.module_list= nn.ModuleList(sum(model, []) + output_layer)
  def forward(self, x):
    # x = torch.concat([x[:,:4], self.embedding(x[:,4:5]),x[:,5:]])
    for layer in self.module_list:
         x = layer(x) # 차원 맞춰서 갯수증가 # 다른방법: linear layer하나 추가하고 다른 피쳐들 가중치를 0 으로 학습 진행 x
    return x

In [20]:

from torch.nn import MSELoss
def train_one_epoch(
  model:nn.Module,
  criterion:callable,
  optimizer:torch.optim.Optimizer,
  data_loader:DataLoader,
  device:str
) -> float:
  '''train one epoch

  Args:
      model: model
      criterion: loss
      optimizer: optimizer
      data_loader: data loader
      device: device
  '''
  model.train()
  total_loss = 0.
  for X, y in data_loader:
    X, y = X.to(device), y.to(device)
    output = model(X)
    mse = MSELoss()
    loss = mse(output[:,0],y[:,0]) * 10 + mse(output[:,1],y[:,1]) * 5 + mse(output[:,2],y[:,2]) * 3 + mse(output[:,3],y[:,3])
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    total_loss += loss.item() * len(y)
  return total_loss/len(data_loader.dataset)

def evaluate(
  model: nn.Module,
  criterion: callable,
  data_loader: DataLoader,
  device: str,
  metric: Optional[torchmetrics.metric.Metric] = None,
  multi_metrics: List[torchmetrics.metric.Metric] = None
) -> float:
  '''evaluate

  Args:
      model: model
      criterions: list of criterion functions
      data_loader: data loader
      device: device
  '''
  model.eval()
  total_loss = 0.
  with torch.no_grad():  # no_grad context로 감싸서 그라디언트 계산 비활성화
    for X, y in data_loader:
      X, y = X.to(device), y.to(device)
      output = model(X)
      loss = mse(output[:,0],y[:,0]) * 10 + mse(output[:,1],y[:,1]) * 5 + mse(output[:,2],y[:,2]) * 3 + mse(output[:,3],y[:,3])
      total_loss += loss.item() * len(y)

      if metric is not None:
        # For a single metric, update it with the total output and total target
        metric.update(output, y)

      if multi_metrics is not None:
        # For multiple metrics, update each metric with the current output and target
        for metric in multi_metrics:
          metric.update(output, y)

    # 평가 지표 계산 시에는 스칼라인 경우에 대한 처리 추가
    if isinstance(total_loss, torch.Tensor): # isinstance(x,y) x의 타입이 y가 맞는지 T,F
        return total_loss.item() / len(data_loader.dataset)
    else:
        return total_loss / len(data_loader.dataset)


def kfold_cross_validation(model: nn.Module, criterion:callable, device:str, X_trn:np.array, y_trn:np.array, n_splits:int=5):
  from sklearn.model_selection import KFold
  from torchmetrics import MeanAbsoluteError, MeanSquaredError, MeanSquaredLogError
  # from sklearn.metrics import mean_absolute_error,mean_squared_error,mean_squared_log_error
  from copy import deepcopy
  
  Kf = KFold(n_splits=n_splits, shuffle=True, random_state=2023)
  nets = [deepcopy(model) for i in range(n_splits)]
  scores = {
  'mae': [],
  'mse': [],
  'msle': []
  }
  
  for i, (trn_idx, val_idx) in enumerate(Kf.split(X_trn, y_trn)):
    X, y = torch.tensor(X_trn.iloc[trn_idx].values.astype(np.float32)), torch.tensor(y_trn.iloc[trn_idx].values.astype(np.float32))
    X_val, y_val = torch.tensor(X_trn.iloc[val_idx].values.astype(np.float32)), torch.tensor(y_trn.iloc[val_idx].values.astype(np.float32))

    ds = CustomDataset(X, y)
    ds_val = CustomDataset(X_val, y_val)
    dl = DataLoader(ds, batch_size=64, shuffle=True)
    dl_val = DataLoader(ds_val, batch_size=len(ds_val), shuffle=False)

    net = nets[i].train()

    pbar = tqdm(range(30))
    for j in pbar:
      mae, mse, msle = MeanAbsoluteError().to(device), MeanSquaredError().to(device), MeanSquaredLogError().to(device)
      optimizer = torch.optim.Adam(net.parameters(), lr=0.0001)
      loss = train_one_epoch(net, criterion, optimizer, dl, device)
      loss_val = evaluate(net, criterion, dl_val, device, multi_metrics=[mae, mse, msle])
      mae, mse, msle = mae.compute(), mse.compute(), msle.compute()
      # Get predictions for validation set
      # y_val_pred = net(X_val).detach().numpy()
      pbar.set_postfix(trn_loss=loss, val_loss=loss_val)  # 진행바 우측에 진행상황 표시
    scores["mae"].append(mae.item())
    scores["mse"].append(mse.item())
    scores["msle"].append(msle.item())

  return scores

In [19]:
device = torch.device(device)
X_trn = df_train.drop(columns=['사망자수', '중상자수', '경상자수', '부상자수'])
y_trn = df_train[['사망자수', '중상자수', '경상자수', '부상자수']]

ds = CustomDataset(X_trn, y_trn)
dl = DataLoader(ds, batch_size=64, shuffle=True)

model = ANN(X_trn.shape[-1], hidden_dim=[64,64], drop_ratio = 0.3).to(device)
print(model)
loss_func = nn.functional.mse_loss

scores = kfold_cross_validation(model, loss_func, device, X_trn, y_trn)
# scores_df = pd.DataFrame(scores)
# scores_df = pd.concat([scores_df, scores_df.apply(['mean', 'std'])])
# scores_df.to_csv("scores.csv", index=False)
# print(scores_df)


KeyError: "['사망자수', '중상자수', '경상자수', '부상자수'] not found in axis"

In [39]:
from sklearn.model_selection import KFold
Kf = KFold(n_splits=5, shuffle=True, random_state=2023)
for i, (trn_idx, val_idx) in enumerate(Kf.split(X_trn, y_trn)):
    X, y = torch.tensor(X_trn.iloc[trn_idx].values.astype(np.float32)), torch.tensor(y_trn.iloc[trn_idx].values.astype(np.float32))
    X_val, y_val = torch.tensor(X_trn.iloc[val_idx].values.astype(np.float32)), torch.tensor(y_trn.iloc[val_idx].values.astype(np.float32))

    ds = CustomDataset(X, y)
    ds_val = CustomDataset(X_val, y_val)
    dl = DataLoader(ds, batch_size=64, shuffle=True)
    dl_val = DataLoader(ds_val, batch_size=len(ds_val), shuffle=False)


    # model = ANN(X_trn.shape[-1], hidden_dim=64, activation='relu', use_dropout=True, drop_ratio = 0.3).to(device)


    # 최종학습
    optimizer = torch.optim.Adam(model.parameters(), lr=.00001)
    pbar = range(30)
    if True:
        pbar = tqdm(pbar)
    for _ in pbar:
        loss = train_one_epoch(model, loss_func, optimizer, dl, device)
    pbar.set_postfix(trn_loss=loss)

    # save pretrained weight
    torch.save(model.state_dict(), "./model.pth")
    # final outuput with testset
    model = ANN(input_dim=X_trn.shape[-1], hidden_dim=[64,64], activation='relu').to(device)
    model.load_state_dict(torch.load("model.pth"))


    result = []
    with torch.inference_mode():
        for X in dl_val:
            X = X[0].to(device)
            output = model(X).squeeze().tolist()
            result.extend(output)

    test_id = y_trn.iloc[val_idx].index.tolist()
    result = pd.DataFrame(result)
    result['ID'] = test_id
    print()
    result.rename(columns={'ID': 'ID',
                           '1': '사망자수',
                           '2': '중상자수', 
                           '3': '경상자수', 
                           '4': '부상자수'},inplace=True)
    result.to_csv("result.csv", index=False)
    df['ECLO'] = (df['사망자수'] * 10) + (df['중상자수'] * 5) + (df['경상자수'] * 3) + (df['부상자수'])

100%|██████████| 30/30 [00:29<00:00,  1.02it/s]





100%|██████████| 30/30 [00:30<00:00,  1.03s/it]





  7%|▋         | 2/30 [00:02<00:39,  1.41s/it]


KeyboardInterrupt: 

<bound method Series.mean of 0       4.739994
1       4.618116
2       4.293276
3       3.858552
4       4.599720
          ...   
7917    4.244061
7918    4.718004
7919    4.738001
7920    4.955145
7921    4.645133
Name: ECLO, Length: 7922, dtype: float64>