In [10]:
import pandas as pd
import numpy as np
import csv

import seaborn as sns
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.optim as optim

from glob import glob
from tqdm.cli import tqdm
import re, os, random

from torch.utils.data import DataLoader, Dataset
import torchvision.transforms as v2
import timm

from PIL import Image

from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [11]:
def seed_everything(seed=42):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy, and PyTorch (CPU, current GPU and
    all GPUs), exports ``PYTHONHASHSEED`` and switches cuDNN to deterministic
    mode with autotuning disabled.

    Args:
        seed: Integer seed shared by all generators. Defaults to 42.
    """
    # NOTE(review): PYTHONHASHSEED is presumably only read at interpreter
    # start-up, so this likely affects child processes only - confirm.
    os.environ['PYTHONHASHSEED'] = str(seed)

    seeders = (
        random.seed,                 # Python built-in random module
        np.random.seed,              # NumPy global generator
        torch.manual_seed,           # PyTorch CPU generator
        torch.cuda.manual_seed,      # PyTorch generator of the current GPU
        torch.cuda.manual_seed_all,  # PyTorch generators of every GPU
    )
    for apply_seed in seeders:
        apply_seed(seed)

    # cuDNN settings: request deterministic algorithms and disable the
    # benchmark (autotuning) mode.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

# Usage example
seed_everything(seed=42)

이미지 경로 포함

### Normalization statistics (train-set mean/std) and test-time transforms

In [12]:
import numpy as np
import albumentations as A
from albumentations.core.transforms_interface import ImageOnlyTransform
from albumentations.pytorch import ToTensorV2

# mean/std were measured on the training data, so reusing them here is not
# test-set leakage.
# NOTE(review): these values look like they are on the 0-255 pixel scale,
# while A.Normalize presumably rescales mean/std by max_pixel_value (255 by
# default). Confirm this matches the training pipeline before changing it;
# the checkpoint was presumably trained with exactly this transform.
mean = 15.359733188267395
std = 58.28444875927197

# Test-time pipeline: resize to 224x224, normalize the three channels with the
# same statistics, then convert to a tensor.
test_transforms = A.Compose([
    A.Resize(height=224, width=224, always_apply=True, p=1.0),
    A.Normalize(mean=[mean] * 3, std=[std] * 3, p=1.0),
    ToTensorV2(),
])

In [13]:
# Read the answer-sample CSV and add a 'data_path' column holding, for each
# row, the image path built from that row's subject_id and date.
test_df = pd.read_csv('datasets/answer_sample.csv').assign(
    data_path=lambda frame: frame.apply(
        lambda row: f"datasets/image_datasets/user{row['subject_id']}_{row['date']}_test.png",
        axis=1,
    )
)

In [14]:
class CustomDataset(Dataset):
    """Image dataset that reads the files listed in ``df['data_path']``.

    Each item is the image at the requested position, converted to RGB and
    passed through ``transforms`` (called as ``transforms(image=array)`` and
    expected to return a mapping with an ``"image"`` entry).

    Missing files are handled best-effort: the next position (wrapping around)
    is used instead and a warning is emitted, because the returned image then
    no longer belongs to the requested row.

    Args:
        df: DataFrame with a ``data_path`` column of image file paths.
        transforms: Callable applied to every loaded image.
    """

    def __init__(self, df, transforms):
        self.path = df['data_path'].values
        self.transform = transforms

    def _load_image(self, path):
        """Read ``path`` as an RGB array, closing the file handle afterwards."""
        with Image.open(path) as handle:
            return np.array(handle.convert('RGB'))

    def __getitem__(self, idx):
        """Return the transformed image at ``idx`` (or the next available one).

        Raises:
            IndexError: If ``idx`` is outside the dataset.
            FileNotFoundError: If none of the listed files exists.
        """
        import warnings

        total = len(self)
        position = idx
        # Try every position at most once; the original unbounded recursion
        # ended in RecursionError when no file existed at all.
        for skipped in range(max(total, 1)):
            try:
                img = self._load_image(self.path[position])
                break
            except FileNotFoundError:
                position = (position + 1) % total
        else:
            raise FileNotFoundError(
                f"None of the {total} image files listed in 'data_path' could be found"
            )

        if skipped:
            # Make the substitution visible: the caller receives an image that
            # belongs to a different row than the one requested.
            warnings.warn(
                f"Image for index {idx} ({self.path[idx]!r}) not found; "
                f"using index {position} ({self.path[position]!r}) instead",
                RuntimeWarning,
                stacklevel=2,
            )

        return self.transform(image=img)["image"]

    def __len__(self):
        return len(self.path)


In [15]:
# Wrap the test rows and the test-time transform pipeline in a Dataset.
test_dataset = CustomDataset(df=test_df, transforms=test_transforms)

In [16]:
# shuffle=False keeps the batches in test_df row order; the predictions are
# later joined back to test_df column-wise.
test_loader = DataLoader(dataset=test_dataset, batch_size=8, shuffle=False)

In [17]:
# Build the list of models used for inference: one seresnext101_32x4d network
# with 7 outputs per listed checkpoint name.
models = []
# Original author's note on the checkpoint below: "5.88 change q3 --> 6.28"
# (meaning not evident from this notebook - TODO confirm).
for fold in ['model_fold_1_f1-0.679_loss-0.588_seresnext101_32x4d']:
    model = timm.create_model('seresnext101_32x4d', pretrained=False, num_classes=7)
    # Load the weights onto the CPU first so the checkpoint does not depend on
    # the GPU index it was saved from; the model is moved to the GPU below.
    # NOTE(review): torch.load unpickles the file - only load trusted
    # checkpoints (consider weights_only=True if the installed torch supports it).
    checkpoint = torch.load(f'models/{fold}.pt', map_location='cpu')
    model.load_state_dict(checkpoint)
    model = model.to('cuda')
    model.eval()  # inference mode for dropout / batch-norm layers
    models.append(model)

def predict_ensemble(models, test_loader, device):
    """Run every model on every batch and average their sigmoid outputs.

    Args:
        models: Iterable of callables (models) applied to each batch.
        test_loader: Iterable yielding batches that support ``.to(device)``.
        device: Device the batches are moved to before inference.

    Returns:
        A list with one NumPy array per sample, holding the per-output
        probabilities averaged over all models (soft voting).
    """
    collected = []
    with torch.no_grad():
        for batch in tqdm(iter(test_loader)):
            batch = batch.to(device)
            # Raw model outputs -> sigmoid -> NumPy, one array per model.
            per_model = [torch.sigmoid(net(batch)).cpu().numpy() for net in models]
            # Soft voting: average across the models, then add the batch rows.
            collected.extend(np.mean(per_model, axis=0))
    return collected

# Ensemble probabilities for the whole test set, one row per sample and one
# column per label.
preds = np.array(
    predict_ensemble(models, test_loader, 'cuda')
)
predictions_df = pd.DataFrame(
    preds,
    columns=['Q1', 'Q2', 'Q3', 'S1', 'S2', 'S3', 'S4'],
)

# Per-label cut-offs used when binarizing the probabilities (all 0.5 here).
averages = dict(
    Q1=0.5,
    Q2=0.5,
    Q3=0.5,
    S1=0.5,
    S2=0.5,
    S3=0.5,
    S4=0.5,
)

def binarize_predictions(predictions_df, averages):
    """Convert per-label scores into 0/1 labels using per-label cut-offs.

    Every column becomes 1 where the score is strictly above its cut-off,
    except ``'Q3'``, which becomes 1 where the score is at or below it.
    NOTE(review): the inverted rule for Q3 is presumably intentional - confirm.

    Args:
        predictions_df: DataFrame of scores, one column per label.
        averages: Mapping from column name to that column's cut-off.

    Returns:
        A new DataFrame of the same shape with integer 0/1 values; the input
        frame is left unchanged.
    """
    result = predictions_df.copy()
    for label, scores in predictions_df.items():
        cutoff = averages[label]
        flags = scores.le(cutoff) if label == 'Q3' else scores.gt(cutoff)
        result[label] = flags.astype(int)
    return result


# Turn the per-label probabilities into 0/1 decisions.
binary_preds = binarize_predictions(predictions_df, averages)

# From here on predictions_df holds the binary labels, not the probabilities.
predictions_df = pd.DataFrame(binary_preds, columns=['Q1', 'Q2', 'Q3', 'S1', 'S2', 'S3', 'S4'])

# pd.concat(axis=1) aligns on the index, so a length mismatch would silently
# produce rows with NaN instead of failing - check it explicitly.
assert len(predictions_df) == len(test_df), (
    f"got {len(predictions_df)} predictions for {len(test_df)} test rows"
)

final_df = pd.concat([test_df[['subject_id', 'date']], predictions_df], axis=1)

# Make sure the output directory exists so a fresh checkout can run this cell.
os.makedirs("result", exist_ok=True)
final_df.to_csv("result/seresnext101_32x4d.csv", index=False)

100% 15/15 [00:02<00:00,  5.31it/s]


In [18]:
# Lift the row-display limit (this changes the option for the whole session)
# and show the complete submission table.
pd.options.display.max_rows = None
final_df

Unnamed: 0,subject_id,date,Q1,Q2,Q3,S1,S2,S3,S4
0,5,2023-11-05,0,1,1,1,0,0,0
1,5,2023-11-06,1,0,1,1,0,1,0
2,5,2023-11-07,1,0,1,0,0,1,0
3,5,2023-11-08,0,1,1,1,0,0,0
4,5,2023-11-09,0,0,1,0,0,0,0
5,5,2023-11-10,1,1,1,1,0,0,0
6,5,2023-11-11,0,0,1,0,0,1,0
7,5,2023-11-12,0,1,1,1,0,0,0
8,5,2023-11-13,1,1,1,1,0,0,0
9,5,2023-11-14,1,1,1,1,0,1,0
