# 필요한 라이브러리 임포트

In [31]:
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# feature encode(더미 생성후 원-핫 인코딩 방식)

In [42]:
# 데이터 불러오기
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# 결측치 처리
train['Age'] = train['Age'].fillna(train['Age'].median()) # mean, median
train['Embarked'] = train['Embarked'].fillna(train['Embarked'].mode()[0])
train['Fare'] = train['Fare'].fillna(train['Fare'].mean())

# 가족 크기 계산
train['FamilySize'] = train['SibSp'] + train['Parch'] + 1

# 이름에서 직함 추출
train['Title'] = train['Name'].str.extract(' ([A-Za-z]+)\.', expand=False)

# 직함 그룹화
Other_titles = ['Lady', 'Countess','Capt', 'Col','Don', 'Dr', 'Major', 'Rev', 'Sir.', 'Jonkheer', 'Dona']
train.loc[train['Title'].isin(Other_titles), 'Title'] = 'Other'
train['Title'] = train['Title'].replace('Mlle', 'Miss')
train['Title'] = train['Title'].replace('Ms', 'Miss')
train['Title'] = train['Title'].replace('Mme', 'Mrs')

# 직함 매핑
title_mapping = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Other": 5}
train['Title'] = train['Title'].map(title_mapping)
train['Title'] = train['Title'].fillna(0)

# 객실 정보 처리
train['HasCabin'] = train['Cabin'].apply(lambda x: 0 if type(x) == float else 1)

# 범주형 변수 처리
train['Sex'] = train['Sex'].map({'male': 0, 'female': 1})
train['Embarked'] = train['Embarked'].map({'S': 0, 'C': 1, 'Q': 2})

#-------------------------------------------------------------------------------------------------------

# 원-핫 인코딩을 위한 더미 변수 생성
embarked_dummies = pd.get_dummies(train['Embarked'], prefix='Emb')
pclass_dummies = pd.get_dummies(train['Pclass'], prefix='Pclass')

# 최종 특성 선택
features = ['Sex', 'Age', 'Fare', 'FamilySize', 'Title', 'HasCabin', 'SibSp', 'Parch']

# 제외한 특성 -
#-------------------------------------------------------------------------------------------------------

# 최종 데이터프레임 생성
X = pd.concat([
    train[features],
    embarked_dummies,
    pclass_dummies
], axis=1)

# 타겟 변수
y = train['Survived']

print("전처리된 특성 개수:", X.shape)
print("전처리된 특성 목록:", list(X.columns))

전처리된 특성 개수: (891, 14)
전처리된 특성 목록: ['Sex', 'Age', 'Fare', 'FamilySize', 'Title', 'HasCabin', 'SibSp', 'Parch', 'Emb_0', 'Emb_1', 'Emb_2', 'Pclass_1', 'Pclass_2', 'Pclass_3']


# feature encode(라벨 인코딩 방식)

In [35]:
# 데이터 불러오기
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# 결측치 처리
train['Age'] = train['Age'].fillna(train['Age'].mean())
train['Embarked'] = train['Embarked'].fillna(train['Embarked'].mode()[0])
train['Fare'] = train['Fare'].fillna(train['Fare'].mean())

# 가족 크기 계산
train['FamilySize'] = train['SibSp'] + train['Parch'] + 1

# 이름에서 직함 추출
train['Title'] = train['Name'].str.extract(' ([A-Za-z]+)\.', expand=False)

# 직함 그룹화
Other_titles = ['Lady', 'Countess','Capt', 'Col','Don', 'Dr', 'Major', 'Rev', 'Sir.', 'Jonkheer', 'Dona']
train.loc[train['Title'].isin(Other_titles), 'Title'] = 'Other'
train['Title'] = train['Title'].replace('Mlle', 'Miss')
train['Title'] = train['Title'].replace('Ms', 'Miss')
train['Title'] = train['Title'].replace('Mme', 'Mrs')

# 직함 매핑
title_mapping = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Other": 5}
train['Title'] = train['Title'].map(title_mapping)
train['Title'] = train['Title'].fillna(0)

# 객실 정보 처리
train['HasCabin'] = train['Cabin'].apply(lambda x: 0 if type(x) == float else 1)

# Label 인코딩 사용
label_encoder = LabelEncoder()
train['Embarked_encoded'] = label_encoder.fit_transform(train['Embarked'])
train['Pclass_encoded'] = label_encoder.fit_transform(train['Pclass'].astype(str))

# 최종 특성 선택
features = ['Sex', 'Age', 'Fare', 'FamilySize', 'Title', 'HasCabin',
           'SibSp', 'Parch', 'Embarked_encoded', 'Pclass_encoded']

# 최종 데이터프레임 생성
X = pd.concat([
    train[features]
], axis=1)

# 타겟 변수
y = train['Survived']

print("전처리된 특성 개수:", X.shape)
print("전처리된 특성 목록:", list(X.columns))

전처리된 특성 개수: (891, 10)
전처리된 특성 목록: ['Sex', 'Age', 'Fare', 'FamilySize', 'Title', 'HasCabin', 'SibSp', 'Parch', 'Embarked_encoded', 'Pclass_encoded']


# 데이터 스케일링 및 훈련 모델 학습 과정

In [43]:
# 데이터 스케일링
scaler = StandardScaler()
X = scaler.fit_transform(X)

# 데이터 분할
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# PyTorch 텐서로 변환
X_train = torch.FloatTensor(X_train)
y_train = torch.FloatTensor(y_train.values)
X_val = torch.FloatTensor(X_val)
y_val = torch.FloatTensor(y_val.values)

# 모델 정의
class TitanicNet(nn.Module):
    def __init__(self, input_size):
        super(TitanicNet, self).__init__()
        self.fc1 = nn.Linear(input_size, 64)
        self.fc2 = nn.Linear(64, 32)
        self.fc3 = nn.Linear(32, 1)
        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        x = self.relu(self.fc1(x))
        x = self.relu(self.fc2(x))
        x = self.sigmoid(self.fc3(x))
        return x

# 모델 초기화
model = TitanicNet(X_train.shape[1])

# 손실 함수와 옵티마이저 정의
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# 학습
num_epochs = 5000
batch_size = 32

for epoch in range(num_epochs):
    for i in range(0, len(X_train), batch_size):
        batch_X = X_train[i:i+batch_size]
        batch_y = y_train[i:i+batch_size]

        outputs = model(batch_X)
        loss = criterion(outputs, batch_y.unsqueeze(1))

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    if (epoch + 1) % 10 == 0:
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

# 검증
model.eval()
with torch.no_grad():
    val_outputs = model(X_val)
    val_predictions = (val_outputs > 0.5).float()
    accuracy = (val_predictions == y_val.unsqueeze(1)).float().mean()
    print(f'Validation Accuracy: {accuracy.item():.4f}')

Epoch [10/5000], Loss: 0.1851
Epoch [20/5000], Loss: 0.1732
Epoch [30/5000], Loss: 0.1714
Epoch [40/5000], Loss: 0.1706
Epoch [50/5000], Loss: 0.1678
Epoch [60/5000], Loss: 0.1685
Epoch [70/5000], Loss: 0.1682
Epoch [80/5000], Loss: 0.1683
Epoch [90/5000], Loss: 0.1672
Epoch [100/5000], Loss: 0.1611
Epoch [110/5000], Loss: 0.1559
Epoch [120/5000], Loss: 0.1484
Epoch [130/5000], Loss: 0.1418
Epoch [140/5000], Loss: 0.1371
Epoch [150/5000], Loss: 0.1304
Epoch [160/5000], Loss: 0.1246
Epoch [170/5000], Loss: 0.1205
Epoch [180/5000], Loss: 0.1171
Epoch [190/5000], Loss: 0.1158
Epoch [200/5000], Loss: 0.1138
Epoch [210/5000], Loss: 0.1111
Epoch [220/5000], Loss: 0.1088
Epoch [230/5000], Loss: 0.1052
Epoch [240/5000], Loss: 0.1041
Epoch [250/5000], Loss: 0.1041
Epoch [260/5000], Loss: 0.1017
Epoch [270/5000], Loss: 0.1002
Epoch [280/5000], Loss: 0.0991
Epoch [290/5000], Loss: 0.0994
Epoch [300/5000], Loss: 0.0979
Epoch [310/5000], Loss: 0.0997
Epoch [320/5000], Loss: 0.0983
Epoch [330/5000],

# 똑같이 test data 전처리 후 submission으로 저장

In [44]:
# 테스트 데이터 전처리 (학습 데이터와 동일한 방식으로)
test['Age'] = test['Age'].fillna(test['Age'].median())
test['Fare'] = test['Fare'].fillna(test['Fare'].mean())
test['Embarked'] = test['Embarked'].fillna(test['Embarked'].mode()[0])

# 가족 크기 계산
test['FamilySize'] = test['SibSp'] + test['Parch'] + 1

# 이름에서 직함 추출
test['Title'] = test['Name'].str.extract(' ([A-Za-z]+)\.', expand=False)

# 직함 그룹화
test.loc[test['Title'].isin(Other_titles), 'Title'] = 'Other'
test['Title'] = test['Title'].replace('Mlle', 'Miss')
test['Title'] = test['Title'].replace('Ms', 'Miss')
test['Title'] = test['Title'].replace('Mme', 'Mrs')

# 직함 매핑
test['Title'] = test['Title'].map(title_mapping)
test['Title'] = test['Title'].fillna(0)

# 객실 정보 처리
test['HasCabin'] = test['Cabin'].apply(lambda x: 0 if type(x) == float else 1)

# 범주형 변수 처리
test['Sex'] = test['Sex'].map({'male': 0, 'female': 1})
test['Embarked'] = test['Embarked'].map({'S': 0, 'C': 1, 'Q': 2})

# 더미 변수 생성
test_embarked_dummies = pd.get_dummies(test['Embarked'], prefix='Emb')
test_pclass_dummies = pd.get_dummies(test['Pclass'], prefix='Pclass')

# 최종 테스트 데이터프레임 생성
X_test = pd.concat([
    test[features],
    test_embarked_dummies,
    test_pclass_dummies
], axis=1)

# 스케일링 적용
X_test = scaler.transform(X_test)

# PyTorch 텐서로 변환
X_test = torch.FloatTensor(X_test)

# 예측 수행
model.eval()
with torch.no_grad():
    test_outputs = model(X_test)
    test_predictions = (test_outputs > 0.5).float()

# 제출 파일 생성
submission = pd.DataFrame({
    'PassengerId': test['PassengerId'],
    'Survived': test_predictions.numpy().flatten().astype(int)
})

# CSV 파일로 저장
submission.to_csv('submission03.csv', index=False)
print("제출 파일 생성: submission.csv")

제출 파일 생성: submission.csv
