In [12]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler

# 데이터 불러오기
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# 데이터 전처리 함수 정의
def preprocess_data(df):
    # 결측치 처리
    df['Age'] = df['Age'].fillna(df['Age'].median())
    df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode()[0])
    df['Fare'] = df['Fare'].fillna(df['Fare'].mean())

    # Age binding (나이 구간화)
    df.loc[df['Age'] <= 10, 'Age_clean'] = 0
    df.loc[(df['Age'] > 10) & (df['Age'] <= 16), 'Age_clean'] = 1
    df.loc[(df['Age'] > 16) & (df['Age'] <= 20), 'Age_clean'] = 2
    df.loc[(df['Age'] > 20) & (df['Age'] <= 26), 'Age_clean'] = 3
    df.loc[(df['Age'] > 26) & (df['Age'] <= 30), 'Age_clean'] = 4
    df.loc[(df['Age'] > 30) & (df['Age'] <= 36), 'Age_clean'] = 5
    df.loc[(df['Age'] > 36) & (df['Age'] <= 40), 'Age_clean'] = 6
    df.loc[(df['Age'] > 40) & (df['Age'] <= 46), 'Age_clean'] = 7
    df.loc[(df['Age'] > 46) & (df['Age'] <= 50), 'Age_clean'] = 8
    df.loc[(df['Age'] > 50) & (df['Age'] <= 60), 'Age_clean'] = 9
    df.loc[df['Age'] > 60, 'Age_clean'] = 10

    # 가족 크기 계산
    df['FamilySize'] = df['SibSp'] + df['Parch'] + 1

    # 이름에서 직함 추출
    df['Title'] = df['Name'].str.extract(' ([A-Za-z]+)\.', expand=False)

    # 직함 그룹화
    rare_titles = ['Lady', 'Countess','Capt', 'Col','Don', 'Dr', 'Major', 'Rev', 'Sir.', 'Jonkheer', 'Dona']
    df.loc[df['Title'].isin(rare_titles), 'Title'] = 'Rare'
    df['Title'] = df['Title'].replace(['Mlle', 'Ms', 'Mme'], ['Miss', 'Miss', 'Mrs'])

    # 직함 매핑
    title_mapping = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Rare": 5}
    df['Title'] = df['Title'].map(title_mapping)
    df['Title'] = df['Title'].fillna(0)

    # Cabin 데이터 전처리
    cabin_mapping = {
        'A': 0, 'B': 1, 'C': 2, 'D': 3,
        'E': 4, 'F': 5, 'G': 6, 'T': 7
    }
    df['Cabin_clean'] = df['Cabin'].str[:1]
    df['Cabin_clean'] = df['Cabin_clean'].map(cabin_mapping)
    df['Cabin_clean'] = df.groupby('Pclass')['Cabin_clean'].transform('median')

    # 범주형 변수 처리
    df['Sex'] = df['Sex'].map({'male': 0, 'female': 1})
    df['Embarked'] = df['Embarked'].map({'S': 0, 'C': 1, 'Q': 2})

    # 필요한 특성 선택
    features = ['Pclass', 'Sex', 'Age_clean', 'SibSp', 'Parch',
                'Fare', 'Embarked', 'FamilySize', 'Title',
                'Cabin_clean']

    return df[features]


# 학습 데이터 전처리
X = preprocess_data(train)
y = train['Survived']

# 학습 데이터와 검증 데이터 분리
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# 데이터 스케일링
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)
X_test = scaler.transform(preprocess_data(test))

# PyTorch 텐서로 변환
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32).view(-1, 1)
X_val_tensor = torch.tensor(X_val, dtype=torch.float32)
y_val_tensor = torch.tensor(y_val.values, dtype=torch.float32).view(-1, 1)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)

# 모델 정의
class TitanicNet(nn.Module):
    def __init__(self, input_size):
        super(TitanicNet, self).__init__()
        self.fc1 = nn.Linear(input_size, 64)
        self.fc2 = nn.Linear(64, 32)
        self.fc3 = nn.Linear(32, 1)
        self.dropout = nn.Dropout(0.5)

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = self.dropout(x)
        x = torch.relu(self.fc2(x))
        x = torch.sigmoid(self.fc3(x))
        return x

# 모델 초기화
model = TitanicNet(input_size=X_train.shape[1])
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# 학습
num_epochs = 500
batch_size = 32
best_val_accuracy = 0

for epoch in range(num_epochs):
    model.train()
    permutation = torch.randperm(X_train_tensor.size()[0])

    # 미니배치 학습
    for i in range(0, X_train_tensor.size()[0], batch_size):
        indices = permutation[i:i + batch_size]
        batch_x, batch_y = X_train_tensor[indices], y_train_tensor[indices]

        optimizer.zero_grad()
        outputs = model(batch_x)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()

    # 검증 데이터 평가
    model.eval()
    with torch.no_grad():
        val_outputs = model(X_val_tensor)
        val_predicted = (val_outputs > 0.5).float()
        val_accuracy = accuracy_score(y_val_tensor, val_predicted)
        print(f"Epoch [{epoch+1}/{num_epochs}], Val Accuracy: {val_accuracy:.4f}")

        if val_accuracy > best_val_accuracy:
            best_val_accuracy = val_accuracy
            torch.save(model.state_dict(), "best_model.pt")

print(f"Best model saved with val_accuracy: {best_val_accuracy:.4f}")

# 테스트 데이터 예측
best_model = TitanicNet(input_size=X_test.shape[1])
best_model.load_state_dict(torch.load("best_model.pt"))
best_model.eval()

with torch.no_grad():
    test_outputs = best_model(X_test_tensor)
    test_predicted = (test_outputs > 0.5).float().squeeze()

# 제출 파일 생성
submission = pd.DataFrame({
    'PassengerId': test['PassengerId'],
    'Survived': test_predicted.int().numpy()
})
submission.to_csv('submission07.csv', index=False)
print('\n제출 파일이 생성되었습니다.')

Epoch [1/500], Val Accuracy: 0.6927
Epoch [2/500], Val Accuracy: 0.7263
Epoch [3/500], Val Accuracy: 0.7430
Epoch [4/500], Val Accuracy: 0.7598
Epoch [5/500], Val Accuracy: 0.7654
Epoch [6/500], Val Accuracy: 0.7765
Epoch [7/500], Val Accuracy: 0.7877
Epoch [8/500], Val Accuracy: 0.7877
Epoch [9/500], Val Accuracy: 0.7877
Epoch [10/500], Val Accuracy: 0.7933
Epoch [11/500], Val Accuracy: 0.7877
Epoch [12/500], Val Accuracy: 0.7877
Epoch [13/500], Val Accuracy: 0.7877
Epoch [14/500], Val Accuracy: 0.7877
Epoch [15/500], Val Accuracy: 0.7877
Epoch [16/500], Val Accuracy: 0.7877
Epoch [17/500], Val Accuracy: 0.7877
Epoch [18/500], Val Accuracy: 0.7877
Epoch [19/500], Val Accuracy: 0.7933
Epoch [20/500], Val Accuracy: 0.7877
Epoch [21/500], Val Accuracy: 0.7933
Epoch [22/500], Val Accuracy: 0.7989
Epoch [23/500], Val Accuracy: 0.7933
Epoch [24/500], Val Accuracy: 0.7877
Epoch [25/500], Val Accuracy: 0.7877
Epoch [26/500], Val Accuracy: 0.7933
Epoch [27/500], Val Accuracy: 0.7933
Epoch [28/

  best_model.load_state_dict(torch.load("best_model.pt"))
