In [8]:
import pandas as pd

# Absolute paths to the Titanic CSV files inside the "titanic" folder on the Desktop.
train_path = "/Users/sum/Desktop/titanic/train.csv"
test_path = "/Users/sum/Desktop/titanic/test.csv"

# Read both splits, training file first and test file second.
train_df, test_df = (pd.read_csv(csv_path) for csv_path in (train_path, test_path))

# Preview the first rows of each frame to confirm the files were read correctly.
print("Train CSV:", train_df.head(), sep="\n")
print("\nTest CSV:", test_df.head(), sep="\n")


Train CSV:
   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0            373450   8.0500   NaN   

In [28]:
#!/usr/bin/env python3

import os
import sys
import logging
import subprocess
from datetime import datetime

import numpy as np
import pandas as pd

from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, StratifiedKFold

# Root logger setup: emit INFO and above as "<timestamp> [LEVEL] message" lines.
logging.basicConfig(
    format="%(asctime)s [%(levelname)s] %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
    level=logging.INFO,
)

def load_data(train_path: str, test_path: str):
    """Read the training and test CSV files into DataFrames.

    Args:
        train_path: Path of the training CSV file.
        test_path: Path of the test CSV file.

    Returns:
        A ``(train_df, test_df)`` tuple of DataFrames.

    Any exception raised while reading (or while logging the shapes) is
    logged as an error and the process exits with status 1 via ``sys.exit``.
    """
    try:
        # Training file is read first, then the test file.
        train_frame, test_frame = (
            pd.read_csv(csv_path) for csv_path in (train_path, test_path)
        )
        logging.info(f"Train data shape: {train_frame.shape}")
        logging.info(f"Test data shape: {test_frame.shape}")
    except Exception as exc:
        logging.error(f"Error loading data: {exc}")
        sys.exit(1)
    return train_frame, test_frame

# Integer code for each honorific extracted from ``Name``. Every title that is
# not Mr/Miss/Mrs shares code 3; unlisted or missing titles also become 3.
_TITLE_CODES = {
    'Mr': 0, 'Miss': 1, 'Mrs': 2, 'Master': 3,
    'Dr': 3, 'Rev': 3, 'Col': 3, 'Major': 3,
    'Mlle': 3, 'Countess': 3, 'Ms': 3, 'Lady': 3,
    'Jonkheer': 3, 'Don': 3, 'Dona': 3, 'Mme': 3,
    'Capt': 3, 'Sir': 3
}

# Inclusive upper edges of the bins; values above the last edge get the final bin.
_AGE_BIN_EDGES = (16, 32, 48, 64)       # -> codes 0..4
_FARE_BIN_EDGES = (7.91, 14.454, 31)    # -> codes 0..3

# Columns removed once the engineered features have been derived from them.
_UNUSED_COLUMNS = ['Name', 'Ticket', 'Cabin', 'SibSp', 'Parch']


def _bin_by_upper_edges(values: pd.Series, upper_edges) -> pd.Series:
    """Map each value to the index of the first edge it does not exceed.

    Intervals are right-closed: ``value <= upper_edges[0]`` gives 0,
    ``upper_edges[0] < value <= upper_edges[1]`` gives 1, and so on; values
    above the last edge get ``len(upper_edges)``. NaN stays NaN. The result is
    float so the column keeps the dtype the original float inputs produced.
    """
    bins = [-np.inf, *upper_edges, np.inf]
    return pd.cut(values, bins=bins, labels=False).astype(float)


def preprocess_data(df: pd.DataFrame, is_train: bool = True) -> pd.DataFrame:
    """Clean a Titanic DataFrame and derive the model features.

    Steps, in order:
      1. ``Title``: extracted from ``Name`` and encoded as an integer code.
      2. ``Sex`` and ``Embarked``: encoded as integers (missing ``Embarked``
         is treated as ``'S'``).
      3. Missing ``Age`` is filled with the median of the row's ``Title``
         group and missing ``Fare`` with the median of its ``Pclass`` group,
         each falling back to the overall column median.
      4. ``FamilySize`` (``SibSp + Parch + 1``) and ``IsAlone`` are added.
      5. ``Age`` and ``Fare`` are replaced by their bin codes.
      6. ``Name``, ``Ticket``, ``Cabin``, ``SibSp`` and ``Parch`` are dropped
         (those that are present), plus ``PassengerId`` when ``is_train``.

    Medians are computed from ``df`` itself, so training and test frames are
    imputed independently.

    Args:
        df: Raw frame. Must contain ``Name``, ``Sex``, ``Embarked``, ``Age``,
            ``Fare``, ``Pclass``, ``SibSp`` and ``Parch``. It is not modified.
        is_train: When True, ``PassengerId`` is removed from the result.

    Returns:
        A new DataFrame with the processed columns.

    Raises:
        KeyError: If one of the required columns listed above is missing.
    """
    df = df.copy()

    # --- 1. Extract the honorific (the word followed by a dot) and encode it ---
    titles = df['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)
    df['Title'] = titles.map(_TITLE_CODES).fillna(3)

    # --- 2. Encode Sex and Embarked ---
    df['Sex'] = df['Sex'].map({'male': 0, 'female': 1})
    df['Embarked'] = df['Embarked'].fillna('S').map({'S': 0, 'C': 1, 'Q': 2})

    # --- 3. Impute Age and Fare ---
    # Group median first; the global median covers groups with no known value.
    df['Age'] = df.groupby('Title')['Age'].transform(lambda x: x.fillna(x.median()))
    df['Age'] = df['Age'].fillna(df['Age'].median())

    df['Fare'] = df.groupby('Pclass')['Fare'].transform(lambda x: x.fillna(x.median()))
    df['Fare'] = df['Fare'].fillna(df['Fare'].median())

    # --- 4. Family features ---
    df['FamilySize'] = df['SibSp'] + df['Parch'] + 1
    df['IsAlone'] = (df['FamilySize'] == 1).astype(int)

    # --- 5. Bin Age and Fare ---
    # One pass over fixed edges instead of successive in-place overwrites,
    # whose correctness depended on the bin labels never landing in a later range.
    df['Age'] = _bin_by_upper_edges(df['Age'], _AGE_BIN_EDGES)
    df['Fare'] = _bin_by_upper_edges(df['Fare'], _FARE_BIN_EDGES)

    # --- 6. Drop columns the model does not use ---
    # errors='ignore' mirrors the existing "only if present" handling of
    # PassengerId: Ticket/Cabin are never read, so they need not exist.
    df = df.drop(columns=_UNUSED_COLUMNS, errors='ignore')
    if is_train:
        df = df.drop(columns='PassengerId', errors='ignore')

    return df

def build_voting_model():
    """Assemble the soft-voting ensemble used for prediction.

    The ensemble combines three estimators, each seeded with
    ``random_state=42``:
      - ``rf``: RandomForestClassifier
      - ``gb``: GradientBoostingClassifier
      - ``lr``: LogisticRegression (liblinear solver)

    Returns:
        An unfitted ``VotingClassifier``; tune the hyper-parameters below as needed.
    """
    seed = 42
    members = [
        (
            'rf',
            RandomForestClassifier(
                n_estimators=200,
                max_depth=6,
                min_samples_split=4,
                min_samples_leaf=2,
                random_state=seed,
            ),
        ),
        (
            'gb',
            GradientBoostingClassifier(
                n_estimators=100,
                max_depth=3,
                learning_rate=0.1,
                random_state=seed,
            ),
        ),
        (
            'lr',
            LogisticRegression(C=1.0, solver='liblinear', random_state=seed),
        ),
    ]
    # Soft voting: the ensemble votes using the members' predicted probabilities.
    return VotingClassifier(estimators=members, voting='soft')

def main(data_dir=None):
    """Run the full Titanic pipeline: load, preprocess, validate, fit, predict, save.

    Args:
        data_dir: Directory that contains ``train.csv`` and ``test.csv`` and
            that receives the submission file. Defaults to
            ``~/Desktop/titanic`` (expanded for the current user), so inputs
            and output resolve to the same folder instead of mixing a
            user-specific absolute path with an expanded one.

    Side effects:
        - Logs progress, the per-fold cross-validation scores and their mean.
        - Writes ``Submission_<YYYYmmdd_HHMMSS>.csv`` into ``data_dir``.
        - On macOS, asks the OS to open the written file (best effort).

    Exits the process with status 1 if the submission file cannot be written
    (``load_data`` does the same when the inputs cannot be read).
    """
    # --- File Paths ---
    if data_dir is None:
        data_dir = os.path.expanduser("~/Desktop/titanic")
    train_path = os.path.join(data_dir, "train.csv")
    test_path = os.path.join(data_dir, "test.csv")

    logging.info("Loading data...")
    train_df, test_df = load_data(train_path, test_path)

    # --- Preprocess Data ---
    logging.info("Preprocessing training data...")
    train_processed = preprocess_data(train_df, is_train=True)
    logging.info("Preprocessing test data...")
    test_processed = preprocess_data(test_df, is_train=False)

    # --- Prepare Training Data ---
    X_train = train_processed.drop("Survived", axis=1)
    y_train = train_processed["Survived"]
    logging.info(f"Train features shape: {X_train.shape}, Target shape: {y_train.shape}")

    # --- Build Model ---
    model = build_voting_model()

    # --- Cross-validation ---
    logging.info("Performing cross-validation...")
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    cv_scores = cross_val_score(model, X_train, y_train, cv=skf, scoring='accuracy')
    logging.info(f"CV scores: {cv_scores}")
    logging.info(f"Mean CV accuracy: {cv_scores.mean() * 100:.2f}%")

    # --- Fit Model on Full Training ---
    logging.info("Fitting model on full training data...")
    model.fit(X_train, y_train)

    # --- Predict on Test Data ---
    # The test frame keeps PassengerId for the submission; it is not a feature.
    X_test = test_processed.drop("PassengerId", axis=1)
    predictions = model.predict(X_test)
    logging.info("Predictions generated on test data.")

    # --- Create Submission File ---
    submission = pd.DataFrame({
        "PassengerId": test_processed["PassengerId"],
        "Survived": predictions
    })

    os.makedirs(data_dir, exist_ok=True)
    # Timestamped name so repeated runs never overwrite an earlier submission.
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    submission_filename = f"Submission_{timestamp}.csv"
    submission_filepath = os.path.join(data_dir, submission_filename)

    try:
        submission.to_csv(submission_filepath, index=False)
        logging.info(f"Submission file saved to: {submission_filepath}")
    except Exception as e:
        logging.error(f"Error saving submission file: {e}")
        sys.exit(1)

    # --- Attempt to open submission file on macOS ---
    # `open` is a macOS command; elsewhere skip quietly instead of logging an error.
    if sys.platform != "darwin":
        logging.info("Skipping auto-open of the submission file (macOS only).")
        return

    try:
        subprocess.run(["open", submission_filepath], check=True)
        logging.info("Submission file opened successfully.")
    except Exception as e:
        # Best effort only: the file is already saved, so do not abort.
        logging.error(f"Error opening submission file: {e}")

# Run the pipeline only when this code is executed directly rather than imported.
if __name__ == "__main__":
    main()



2025-04-12 17:20:02 [INFO] Loading data...
2025-04-12 17:20:02 [INFO] Train data shape: (891, 12)
2025-04-12 17:20:02 [INFO] Test data shape: (418, 11)
2025-04-12 17:20:02 [INFO] Preprocessing training data...
2025-04-12 17:20:02 [INFO] Preprocessing test data...
2025-04-12 17:20:02 [INFO] Train features shape: (891, 8), Target shape: (891,)
2025-04-12 17:20:02 [INFO] Performing cross-validation...
2025-04-12 17:20:03 [INFO] CV scores: [0.82681564 0.82022472 0.83146067 0.84269663 0.84269663]
2025-04-12 17:20:03 [INFO] Mean CV accuracy: 83.28%
2025-04-12 17:20:03 [INFO] Fitting model on full training data...
2025-04-12 17:20:04 [INFO] Predictions generated on test data.
2025-04-12 17:20:04 [INFO] Submission file saved to: /Users/sum/Desktop/titanic/Submission_20250412_172004.csv
2025-04-12 17:20:04 [INFO] Submission file opened successfully.
