<a href="https://colab.research.google.com/github/rockfiller/titanic_analysis/blob/main/%E9%90%B5%E9%81%94%E5%B0%BC%E8%99%9F%E5%AD%98%E6%B4%BB%E9%A0%90%E6%B8%AC_(4).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [8]:
import numpy as np
import pandas as pd
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
import tensorflow as tf
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import xgboost as xgb

def preprocess_dataset(df):
    """
    Preprocess the Titanic dataset: drop unused columns, impute missing values,
    one-hot encode categoricals and add two engineered features.

    Parameters:
        df (DataFrame): Original dataset. Must contain the columns 'PassengerId',
            'Name', 'Ticket', 'Cabin', 'Pclass', 'Age', 'Embarked', 'Sex',
            'SibSp' and 'Parch'. Any other column is passed through unchanged.

    Returns:
        DataFrame: A new frame (the input is not modified) in which
            - 'PassengerId', 'Name', 'Ticket' and 'Cabin' are removed,
            - 'Age' is imputed (per-Pclass mean, then overall mean),
            - 'Sex' and 'Embarked' are replaced by 0/1 indicator columns,
            - 'Fsize' (SibSp + Parch + 1) and 'Kid' (1 if Age < 12) are added.

    Raises:
        KeyError: If one of the required columns is missing.
    """
    # Remove identifier / free-text columns that are not used as features.
    # drop() returns a new frame, so the caller's DataFrame stays untouched.
    df = df.drop(columns=['PassengerId', 'Name', 'Ticket', 'Cabin'])

    # Fill missing ages with the mean age of the passenger's Pclass group.
    class_mean_age = df.groupby('Pclass')['Age'].transform('mean')
    age = df['Age'].fillna(class_mean_age)
    # A class in which no age is recorded has a NaN mean, which would leave the
    # gaps in place; fall back to the overall mean for those rows.
    df['Age'] = age.fillna(df['Age'].mean())

    # Fill missing 'Embarked' values with the most frequent value (mode).
    # mode() is empty when the whole column is missing; skip the fill then
    # instead of failing on the lookup.
    embarked_mode = df['Embarked'].mode()
    if not embarked_mode.empty:
        df['Embarked'] = df['Embarked'].fillna(embarked_mode.iloc[0])

    # One-hot encode 'Sex' and 'Embarked' as integer indicator columns.
    df = pd.get_dummies(df, columns=['Sex', 'Embarked'], dtype=int)

    # Fsize: family size on board, counting the passenger themselves.
    df['Fsize'] = df['SibSp'] + df['Parch'] + 1

    # Kid: 1 for passengers younger than 12, else 0.
    df['Kid'] = (df['Age'] < 12).astype(int)
    return df

# Fetch the raw train.csv and run it through the shared preprocessing step.
df = pd.read_csv('https://raw.githubusercontent.com/dsindy/kaggle-titanic/master/data/train.csv')
df_train = preprocess_dataset(df)

# 'Survived' is the target; every remaining column is used as a feature.
columns_y = ['Survived']
columns_X = df_train.columns.drop(columns_y)

# Both selections stay DataFrames (train_y is a single-column frame).
train_y = df_train[columns_y]
train_X = df_train[columns_X]

# Candidate scikit-learn classifiers, keyed by the label used in the printed report.
# Every estimator that accepts random_state gets the same fixed seed so that
# repeated runs of the search produce the same scores.
# The max_depth / n_estimators values passed to the tree-based constructors are
# only starting values: the matching param_grids entries override them.
models = {
    'Logistic Regression': LogisticRegression(random_state=0, max_iter=3000),
    'Decision Tree': DecisionTreeClassifier(max_depth=3, random_state=0),
    'Random Forest': RandomForestClassifier(max_depth=3, n_estimators=100, random_state=0),
    'KNN': KNeighborsClassifier(),
    'SVM': SVC(random_state=0),
}

# Hyper-parameter grid per model; keys must match those of `models`.
param_grids = {
    'Logistic Regression': {
        'C': [0.01, 0.1, 1, 10, 100],
        'solver': ['lbfgs', 'liblinear', 'saga'],
    },
    'Decision Tree': {
        'min_samples_split': [2, 5, 10],
        'max_depth': [None, 10, 20, 30],
    },
    'Random Forest': {
        'n_estimators': [10, 50, 100, 200],
        'max_depth': [None, 10, 20, 30],
    },
    'KNN': {
        'n_neighbors': [3, 5, 7, 9],
        'weights': ['uniform', 'distance'],
    },
    'SVM': {
        'C': [0.1, 1, 10, 100],
        'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    },
}

# Exhaustive 5-fold cross-validated search for each model; report the best
# mean accuracy and the parameters that achieved it.
for model_name, model in models.items():
    try:
        grid_search = GridSearchCV(
            estimator=model,
            param_grid=param_grids[model_name],
            cv=5,
            scoring='accuracy',
        )
        # fit() expects a 1-D target, train_y is a single-column DataFrame.
        grid_search.fit(train_X, train_y.values.ravel())
        best_model = grid_search.best_estimator_
        best_score = grid_search.best_score_
        best_params = grid_search.best_params_
        print(f"{model_name} Best Score: {best_score:.4f} Best Parameters: {best_params}")
    except Exception as e:
        # Deliberately broad: a failure in one model must not abort the
        # comparison of the remaining ones. The error is reported, not hidden.
        print(f"Error occurred for {model_name}: {e}")

# PyTorch Model
class SimpleNN(nn.Module):
    """Feed-forward binary classifier with a single hidden layer.

    Architecture: Linear(input_dim, hidden_dim) -> ReLU -> Linear(hidden_dim, 1)
    -> sigmoid. Because the sigmoid is applied inside forward(), the output is
    a probability and pairs with nn.BCELoss (not BCEWithLogitsLoss).

    Parameters:
        input_dim (int, optional): Number of input features. When omitted, it
            is read from the module-level ``train_X`` (``train_X.shape[1]``),
            which keeps the original ``SimpleNN()`` call working.
        hidden_dim (int): Width of the hidden layer. Defaults to 100.
    """

    def __init__(self, input_dim=None, hidden_dim=100):
        super().__init__()
        if input_dim is None:
            # Backward-compatible default: size the first layer from the
            # training frame defined at module level.
            input_dim = train_X.shape[1]
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, 1)

    def forward(self, x):
        """Return sigmoid(fc2(relu(fc1(x)))); the last dimension has size 1."""
        x = torch.relu(self.fc1(x))
        x = torch.sigmoid(self.fc2(x))
        return x

# Seed PyTorch's RNG so that weight initialisation and batch shuffling, and
# therefore the reported score, are the same on every run.
torch.manual_seed(0)

# Wrap the feature / target frames as float32 tensors. y keeps its
# single-column 2-D layout so it lines up with the model's (batch, 1) output.
X_tensor = torch.tensor(train_X.values, dtype=torch.float32)
y_tensor = torch.tensor(train_y.values, dtype=torch.float32)
dataset = TensorDataset(X_tensor, y_tensor)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

pytorch_model = SimpleNN()
# The model already applies a sigmoid, so plain binary cross-entropy is used.
criterion = nn.BCELoss()
optimizer = optim.Adam(pytorch_model.parameters(), lr=0.001)

# 100 passes over the training data in mini-batches of 32.
for epoch in range(100):
    for batch_X, batch_y in dataloader:
        optimizer.zero_grad()
        outputs = pytorch_model(batch_X)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()

# NOTE: the score below is accuracy on the very data the model was trained on
# (X_tensor / train_y), thresholded at 0.5. It is not a cross-validated score
# and is therefore not directly comparable with the GridSearchCV results.
pytorch_model.eval()
with torch.no_grad():
    predicted = pytorch_model(X_tensor).numpy().flatten() > 0.5
    pytorch_best_score = (predicted == train_y.values.flatten()).mean()
print(f"PyTorch Best Score: {pytorch_best_score:.4f}")

# XGBoost Model
# dtrain is a native-API DMatrix. The scikit-learn style search below is fed
# train_X / train_y directly and does not reference it.
dtrain = xgb.DMatrix(train_X, label=train_y)

xgb_model = xgb.XGBClassifier()

# 3 x 3 x 3 = 27 parameter combinations, each scored by 5-fold CV accuracy.
param_grid = dict(
    max_depth=[3, 4, 5],
    learning_rate=[0.01, 0.1, 0.2],
    n_estimators=[100, 200, 300],
)

grid_search = GridSearchCV(xgb_model, param_grid, scoring='accuracy', cv=5)
# The target is flattened to 1-D because train_y is a single-column DataFrame.
grid_search.fit(train_X, train_y.values.ravel())

xgb_best_params = grid_search.best_params_
xgb_best_score = grid_search.best_score_
print(f"XGBoost Best Score: {xgb_best_score:.4f} Best Parameters: {xgb_best_params}")




Logistic Regression Best Score: 0.8092 Best Parameters: {'C': 10, 'solver': 'liblinear'}
Decision Tree Best Score: 0.8137 Best Parameters: {'max_depth': 10, 'min_samples_split': 10}
Random Forest Best Score: 0.8272 Best Parameters: {'max_depth': 10, 'n_estimators': 200}
KNN Best Score: 0.7161 Best Parameters: {'n_neighbors': 9, 'weights': 'distance'}
SVM Best Score: 0.8036 Best Parameters: {'C': 100, 'kernel': 'linear'}
PyTorch Best Score: 0.8249
XGBoost Best Score: 0.8362 Best Parameters: {'learning_rate': 0.01, 'max_depth': 5, 'n_estimators': 300}
