### TitanicDataset 클래스

In [55]:
import os
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader, random_split

### 요구사항 2 ####
import argparse  # argparse 모듈을 임포트

# 나머지 라이브러리 임포트
import wandb
from datetime import datetime
from torch import nn, optim


class TitanicDataset(Dataset):
    def __init__(self, X, y):
        # 입력 데이터 X와 레이블 y를 PyTorch 텐서로 변환
        self.X = torch.FloatTensor(X)
        self.y = torch.LongTensor(y)

    def __len__(self):
        # 데이터셋의 크기를 반환하는 메서드
        return len(self.X)

    def __getitem__(self, idx):
        # 인덱스 idx에 해당하는 데이터를 튜플로 반환
        feature = self.X[idx]
        target = self.y[idx]
        return feature, target  # 튜플 형태로 반환

    def __str__(self):
        # 데이터셋 정보를 문자열로 반환
        return "Data Size: {0}, Input Shape: {1}, Target Shape: {2}".format(
            len(self.X), self.X.shape, self.y.shape
        )


### TitanicTestDataset 클래스

In [56]:
# TitanicTestDataset 클래스는 테스트 데이터를 위한 Dataset 클래스
class TitanicTestDataset(Dataset):
    def __init__(self, X):
        # 입력 데이터 X를 PyTorch 텐서로 변환
        self.X = torch.FloatTensor(X)

    def __len__(self):
        # 데이터셋의 크기를 반환
        return len(self.X)

    def __getitem__(self, idx):
        # 인덱스 idx에 해당하는 데이터를 반환
        feature = self.X[idx]
        return {'input': feature}

    def __str__(self):
        # 데이터셋 정보를 문자열로 반환
        return "Data Size: {0}, Input Shape: {1}".format(
            len(self.X), self.X.shape
        )

In [57]:
# 데이터셋을 전처리하고 결합하는 함수
def get_preprocessed_dataset():
    # 현재 파일 경로를 기준으로 CSV 파일 경로 설정
    CURRENT_FILE_PATH = os.getcwd()
    train_data_path = os.path.join(CURRENT_FILE_PATH, "train.csv")
    test_data_path = os.path.join(CURRENT_FILE_PATH, "test.csv")

    # CSV 파일을 데이터프레임으로 로드
    train_df = pd.read_csv(train_data_path)
    test_df = pd.read_csv(test_data_path)

    # 훈련 데이터와 테스트 데이터를 결합하여 전체 데이터셋 생성
    all_df = pd.concat([train_df, test_df], sort=False)

    # 데이터 전처리 단계를 차례대로 적용
    all_df = get_preprocessed_dataset_1(all_df)
    all_df = get_preprocessed_dataset_2(all_df)
    all_df = get_preprocessed_dataset_3(all_df)
    all_df = get_preprocessed_dataset_4(all_df)
    all_df = get_preprocessed_dataset_5(all_df)
    all_df = get_preprocessed_dataset_6(all_df)

    # 훈련 및 테스트 데이터 분리
    train_X = all_df[~all_df["Survived"].isnull()].drop("Survived", axis=1).reset_index(drop=True)
    train_y = train_df["Survived"]
    test_X = all_df[all_df["Survived"].isnull()].drop("Survived", axis=1).reset_index(drop=True)

    # Dataset 생성
    dataset = TitanicDataset(train_X.values, train_y.values)
    train_dataset, validation_dataset = random_split(dataset, [0.8, 0.2])
    test_dataset = TitanicTestDataset(test_X.values)

    return train_dataset, validation_dataset, test_dataset


### get_preprocessed_dataset_1

In [58]:
# 전처리 단계 1: Pclass별 평균 Fare 값으로 결측치 메우기
def get_preprocessed_dataset_1(all_df):
    Fare_mean = all_df[["Pclass", "Fare"]].groupby("Pclass").mean().reset_index()
    Fare_mean.columns = ["Pclass", "Fare_mean"]
    all_df = pd.merge(all_df, Fare_mean, on="Pclass", how="left")
    all_df.loc[(all_df["Fare"].isnull()), "Fare"] = all_df["Fare_mean"]

    return all_df


### get_preprocessed_dataset_2

In [59]:
# 전처리 단계 2: 이름(Name)을 세 개의 컬럼으로 분리
def get_preprocessed_dataset_2(all_df):
    name_df = all_df["Name"].str.split("[,.]", n=2, expand=True)
    name_df.columns = ["family_name", "honorific", "name"]
    name_df["family_name"] = name_df["family_name"].str.strip()
    name_df["honorific"] = name_df["honorific"].str.strip()
    name_df["name"] = name_df["name"].str.strip()
    all_df = pd.concat([all_df, name_df], axis=1)

    return all_df


### get_preprocessed_dataset_3

In [60]:
# 전처리 단계 3: honorific별 Age 결측치 메우기
def get_preprocessed_dataset_3(all_df):
    honorific_age_mean = all_df[["honorific", "Age"]].groupby("honorific").median().round().reset_index()
    honorific_age_mean.columns = ["honorific", "honorific_age_mean"]
    all_df = pd.merge(all_df, honorific_age_mean, on="honorific", how="left")
    all_df.loc[(all_df["Age"].isnull()), "Age"] = all_df["honorific_age_mean"]
    all_df = all_df.drop(["honorific_age_mean"], axis=1)

    return all_df


### get_preprocessed_dataset_4

In [61]:
# 전처리 단계 4: 가족수 및 혼자 탑승 여부 파생 변수 추가
def get_preprocessed_dataset_4(all_df):
    all_df["family_num"] = all_df["Parch"] + all_df["SibSp"]
    all_df.loc[all_df["family_num"] == 0, "alone"] = 1
    all_df["alone"].fillna(0, inplace=True)

    # 불필요한 컬럼 제거
    all_df = all_df.drop(["PassengerId", "Name", "family_name", "name", "Ticket", "Cabin"], axis=1)

    return all_df


### get_preprocessed_dataset_5

In [62]:
# 전처리 단계 5: honorific 개수를 줄이고, Embarked 결측치 메우기
def get_preprocessed_dataset_5(all_df):
    all_df.loc[~((all_df["honorific"] == "Mr") | 
                 (all_df["honorific"] == "Miss") | 
                 (all_df["honorific"] == "Mrs") | 
                 (all_df["honorific"] == "Master")), "honorific"] = "other"
    all_df["Embarked"].fillna("missing", inplace=True)

    return all_df


### get_preprocessed_dataset_6

In [63]:
# 전처리 단계 6: 카테고리형 변수 레이블 인코딩
def get_preprocessed_dataset_6(all_df):
    category_features = all_df.columns[all_df.dtypes == "object"]
    from sklearn.preprocessing import LabelEncoder
    for category_feature in category_features:
        le = LabelEncoder()
        if all_df[category_feature].dtypes == "object":
            le = le.fit(all_df[category_feature])
            all_df[category_feature] = le.transform(all_df[category_feature])

    return all_df


### 데이터 로딩 함수(요구사항2)

In [64]:
def get_data():
    # Titanic 데이터셋 로드
    train_dataset, validation_dataset, _ = get_preprocessed_dataset()  # 반환된 세 개의 값 중 세 번째는 사용하지 않음

    # Wandb 설정에 따라 배치 크기 지정
    train_data_loader = DataLoader(dataset=train_dataset, batch_size=wandb.config.batch_size, shuffle=True)
    validation_data_loader = DataLoader(dataset=validation_dataset, batch_size=len(validation_dataset))

    return train_data_loader, validation_data_loader


### MyModelMyModel

In [65]:
# from torch import nn

# # MyModel 클래스는 PyTorch의 nn.Module을 상속받아 모델을 정의합니다.
# class MyModel(nn.Module):
#     def __init__(self, n_input, n_output):
#         super().__init__()

#         # 모델은 3개의 선형 계층과 활성화 함수 ReLU로 구성
#         self.model = nn.Sequential(
#             nn.Linear(n_input, 30),
#             nn.ReLU(),
#             nn.Linear(30, 30),
#             nn.ReLU(),
#             nn.Linear(30, n_output),
#         )

#     def forward(self, x):
#         # 입력 데이터를 모델에 전달하여 예측 결과를 반환
#         return self.model(x)


### 모델 수정(요구사항 2)

In [66]:
class MyModel(nn.Module):
    def __init__(self, n_input, n_output, activation_fn):
        super().__init__()

        if activation_fn == "ReLU":
            activation = nn.ReLU()
        elif activation_fn == "ELU":
            activation = nn.ELU()
        elif activation_fn == "LeakyReLU":
            activation = nn.LeakyReLU()
        elif activation_fn == "PReLU":
            activation = nn.PReLU()
        else:
            raise ValueError(f"Unsupported activation function: {activation_fn}")

        self.model = nn.Sequential(
            nn.Linear(n_input, wandb.config.n_hidden_unit_list[0]),
            activation,
            nn.Linear(wandb.config.n_hidden_unit_list[0], wandb.config.n_hidden_unit_list[1]),
            activation,
            nn.Linear(wandb.config.n_hidden_unit_list[1], n_output),
        )

    def forward(self, x):
        return self.model(x)



### 테스트 함수

In [67]:
import torch

# 테스트 데이터를 사용해 모델을 평가하는 함수
def test(test_data_loader):
    print("[TEST]")
    batch = next(iter(test_data_loader))
    print("{0}".format(batch['input'].shape))
    my_model = MyModel(n_input=11, n_output=2)
    my_model.eval()

    with torch.no_grad():
        output_batch = my_model(batch['input'])
        prediction_batch = torch.argmax(output_batch, dim=1)

        # 예측 결과 출력
        for idx, prediction in enumerate(prediction_batch, start=892):
            print(idx, prediction.item())


### 모델 및 Optimizer 설정(요구사항 2)

In [68]:
def get_model_and_optimizer():
    activation_fn = wandb.config.activation_fn  # 설정된 Activation Function을 가져옴
    my_model = MyModel(n_input=11, n_output=2, activation_fn=activation_fn)
    optimizer = optim.SGD(my_model.parameters(), lr=wandb.config.learning_rate)

    return my_model, optimizer



### 훈련 루프(요구사항 2)

In [69]:
def training_loop(model, optimizer, train_data_loader, validation_data_loader):
    n_epochs = wandb.config.epochs
    loss_fn = nn.CrossEntropyLoss()  # Titanic 데이터는 분류 문제이므로 CrossEntropyLoss 사용
    next_print_epoch = 100

    for epoch in range(1, n_epochs + 1):
        loss_train = 0.0
        num_trains = 0
        for input, target in train_data_loader:
            output_train = model(input)
            loss = loss_fn(output_train, target)
            loss_train += loss.item()
            num_trains += 1

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        loss_validation = 0.0
        num_validations = 0
        with torch.no_grad():
            for input, target in validation_data_loader:
                output_validation = model(input)
                loss = loss_fn(output_validation, target)
                loss_validation += loss.item()
                num_validations += 1

        # Wandb에 훈련 로그 기록
        wandb.log({
            "Epoch": epoch,
            "Training loss": loss_train / num_trains,
            "Validation loss": loss_validation / num_validations
        })

        if epoch >= next_print_epoch:
            print(
                f"Epoch {epoch}, "
                f"Training loss {loss_train / num_trains:.4f}, "
                f"Validation loss {loss_validation / num_validations:.4f}"
            )
            next_print_epoch += 100


### 각 Activation Function에 대해 학습을 실행할 함수 정의

In [70]:
def run_experiments_with_multiple_activations():
    # 테스트할 Activation Function 목록
    activation_functions = ["ReLU", "ELU", "LeakyReLU", "PReLU"]

    # argparse를 한 번 설정하여 인자를 읽어옴
    parser = argparse.ArgumentParser()

    parser.add_argument(
        "--wandb", action=argparse.BooleanOptionalAction, default=False, help="True or False"
    )
    parser.add_argument(
        "-b", "--batch_size", type=int, default=16, help="Batch size (int, default: 16)"
    )
    parser.add_argument(
        "-e", "--epochs", type=int, default=1000, help="Number of training epochs (int, default: 1000)"
    )
    parser.add_argument(
        "-a", "--activation_fn", type=str, default="ReLU",
        choices=["ReLU", "ELU", "LeakyReLU", "PReLU"],
        help="Activation function to use (default: ReLU)"
    )

    # 인자 파싱 (args=[]로 설정하여 Jupyter Notebook에서도 작동하도록)
    args = parser.parse_args(args=[])

    # 모든 Activation Function에 대해 반복적으로 학습 수행
    for activation_fn in activation_functions:
        print(f"\nRunning experiment with Activation Function: {activation_fn}\n")
        # 현재 Activation Function을 설정에 반영
        args.activation_fn = activation_fn
        main(args)
        print(f"\nExperiment with {activation_fn} completed!\n")

### Main 함수(요구사항 2)

In [71]:
# def main(args):
#     current_time_str = datetime.now().astimezone().strftime('%Y-%m-%d_%H-%M-%S')

#     # Wandb 설정에서 'batch_size'를 포함했는지 확인
#     config = {
#         'epochs': args.epochs,
#         'batch_size': args.batch_size,
#         'learning_rate': 1e-3,
#         'n_hidden_unit_list': [20, 20],
#         'activation_fn': args.activation_fn  # Activation Function 선택
#     }

#     # Wandb 초기화
#     wandb_run = wandb.init(
#         entity="seongjae6751",
#         mode="online" if args.wandb else "disabled",
#         project="titanic_model_training",
#         notes="Titanic data training experiment",
#         tags=["titanic", "activation_function"],
#         name=current_time_str,
#         config=config  # config 객체를 그대로 사용
#     )
    
#     print(f"Wandb URL: {wandb_run.url}")

#     # Wandb 객체의 초기화 상태 확인
#     if wandb_run is not None:
#         print(f"Wandb Run 초기화 성공: {wandb_run.id}")
#         print(f"Wandb Mode: {wandb_run.settings.mode}")
#         print(f"Wandb URL: {wandb_run.url}")
#     else:
#         print("Wandb 초기화에 실패했습니다.")

#     run_url = wandb.run.url
#     print(f"Wandb URL for {args.activation_fn}: {run_url}")
    
#     train_data_loader, validation_data_loader = get_data()
#     model, optimizer = get_model_and_optimizer()

#     training_loop(
#         model=model,
#         optimizer=optimizer,
#         train_data_loader=train_data_loader,
#         validation_data_loader=validation_data_loader
#     )
#     wandb.finish()


### Main 실행(요구사항2)

In [72]:
# if __name__ == "__main__":
#     # 데이터셋 전처리 및 로드
#     train_dataset, validation_dataset, test_dataset = get_preprocessed_dataset()

#     # 데이터셋 정보 출력
#     print("train_dataset: {0}, validation_dataset.shape: {1}, test_dataset: {2}".format(
#         len(train_dataset), len(validation_dataset), len(test_dataset)
#     ))
#     print("#" * 50, 1)

#     # DataLoader 생성
#     train_data_loader = DataLoader(dataset=train_dataset, batch_size=16, shuffle=True)
#     validation_data_loader = DataLoader(dataset=validation_dataset, batch_size=16, shuffle=True)
#     test_data_loader = DataLoader(dataset=test_dataset, batch_size=len(test_dataset))

#     # 학습 데이터 출력
#     print("[TRAIN]")
#     for idx, batch in enumerate(train_data_loader):
#         print("{0} - {1}: {2}".format(idx, batch['input'].shape, batch['target'].shape))

#     # 검증 데이터 출력
#     print("[VALIDATION]")
#     for idx, batch in enumerate(validation_data_loader):
#         print("{0} - {1}: {2}".format(idx, batch['input'].shape, batch['target'].shape))

#     # 테스트 데이터 예측
#     print("#" * 50, 3)
#     test(test_data_loader)



if __name__ == "__main__":
    # 모든 Activation Function에 대해 실험 수행
    run_experiments_with_multiple_activations()


Running experiment with Activation Function: ReLU

Wandb URL: 
Wandb Run 초기화 성공: t676c3b6
Wandb Mode: 
Wandb URL: 
Wandb URL for ReLU: 


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  all_df["alone"].fillna(0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  all_df["Embarked"].fillna("missing", inplace=True)


Epoch 100, Training loss 0.5847, Validation loss 0.5395
Epoch 200, Training loss 0.5684, Validation loss 0.5494
Epoch 300, Training loss 0.5371, Validation loss 0.5031
Epoch 400, Training loss 0.5207, Validation loss 0.4881
Epoch 500, Training loss 0.5104, Validation loss 0.5109
Epoch 600, Training loss 0.4686, Validation loss 0.5633
Epoch 700, Training loss 0.5190, Validation loss 0.5568
Epoch 800, Training loss 0.4425, Validation loss 0.4095
Epoch 900, Training loss 0.4264, Validation loss 0.4419
Epoch 1000, Training loss 0.4334, Validation loss 0.4823

Experiment with ReLU completed!


Running experiment with Activation Function: ELU

Wandb URL: 
Wandb Run 초기화 성공: mwh53nc5
Wandb Mode: 
Wandb URL: 
Wandb URL for ELU: 


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  all_df["alone"].fillna(0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  all_df["Embarked"].fillna("missing", inplace=True)


Epoch 100, Training loss 0.5612, Validation loss 0.6053
Epoch 200, Training loss 0.5421, Validation loss 0.5771
Epoch 300, Training loss 0.5176, Validation loss 0.6229
Epoch 400, Training loss 0.5059, Validation loss 0.5248
Epoch 500, Training loss 0.4609, Validation loss 0.5084
Epoch 600, Training loss 0.4416, Validation loss 0.5519
Epoch 700, Training loss 0.4434, Validation loss 0.5132
Epoch 800, Training loss 0.4192, Validation loss 0.5258
Epoch 900, Training loss 0.4139, Validation loss 0.6699
Epoch 1000, Training loss 0.4248, Validation loss 0.5694

Experiment with ELU completed!


Running experiment with Activation Function: LeakyReLU

Wandb URL: 
Wandb Run 초기화 성공: 4ekfeytp
Wandb Mode: 
Wandb URL: 
Wandb URL for LeakyReLU: 


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  all_df["alone"].fillna(0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  all_df["Embarked"].fillna("missing", inplace=True)


Epoch 100, Training loss 0.5743, Validation loss 0.5733
Epoch 200, Training loss 0.5588, Validation loss 0.5814
Epoch 300, Training loss 0.5380, Validation loss 0.5430
Epoch 400, Training loss 0.5108, Validation loss 0.4940
Epoch 500, Training loss 0.4728, Validation loss 0.4829
Epoch 600, Training loss 0.4805, Validation loss 0.4372
Epoch 700, Training loss 0.4620, Validation loss 0.4353
Epoch 800, Training loss 0.4413, Validation loss 0.4194
Epoch 900, Training loss 0.4455, Validation loss 0.4803
Epoch 1000, Training loss 0.4473, Validation loss 0.4705

Experiment with LeakyReLU completed!


Running experiment with Activation Function: PReLU

Wandb URL: 
Wandb Run 초기화 성공: 32v6zxrh
Wandb Mode: 
Wandb URL: 
Wandb URL for PReLU: 


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  all_df["alone"].fillna(0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  all_df["Embarked"].fillna("missing", inplace=True)


Epoch 100, Training loss 0.5775, Validation loss 0.5841
Epoch 200, Training loss 0.5717, Validation loss 0.5807
Epoch 300, Training loss 0.5581, Validation loss 0.5750
Epoch 400, Training loss 0.5344, Validation loss 0.5622
Epoch 500, Training loss 0.4973, Validation loss 0.5279
Epoch 600, Training loss 0.4830, Validation loss 0.8545
Epoch 700, Training loss 0.4406, Validation loss 0.4753
Epoch 800, Training loss 0.4242, Validation loss 0.4583
Epoch 900, Training loss 0.4231, Validation loss 0.4490
Epoch 1000, Training loss 0.4247, Validation loss 0.5423

Experiment with PReLU completed!

