In [1]:
# Titanic Top Solution (Clean Version)

Brendan45774 notebook を参考に、特徴量エンジニアリングと複数モデルの比較を行い、Public LB 0.82前後を目指します。

SyntaxError: invalid character '、' (U+3001) (4072854192.py, line 3)

In [2]:
import warnings

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import KFold, cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

warnings.filterwarnings("ignore")
sns.set_style("whitegrid")
plt.rcParams["figure.figsize"] = (10, 6)

In [None]:
## データ読み込み

`train.csv` と `test.csv` を読み込み、共通前処理のために結合します。理のために結合します。

In [3]:
train_path = "/workspace/input/train.csv"
test_path = "/workspace/input/test.csv"

train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)

print(f"train shape: {train_df.shape}")
print(f"test shape: {test_df.shape}")

full_df = pd.concat([train_df, test_df], axis=0, sort=False).reset_index(drop=True)
print(f"full shape: {full_df.shape}")

train shape: (891, 12)
test shape: (418, 11)
full shape: (1309, 12)


In [None]:
## 欠損確認

列ごとの欠損数を集計し、対応方針を決めます。

In [4]:
missing = full_df.isnull().sum().sort_values(ascending=False)
missing[missing > 0]

Cabin       1014
Survived     418
Age          263
Embarked       2
Fare           1
dtype: int64

In [None]:
### Title抽出とレア統合

名前から敬称を抜き出し、レアな敬称を代表カテゴリへまとめます。

In [5]:
full_df["Title"] = full_df["Name"].str.extract(r",\s*([^\.]*)\.")
title_map = {
    "Mlle": "Miss", "Ms": "Miss", "Mme": "Mrs",
    "Dr": "Officer", "Rev": "Officer", "Major": "Officer", "Col": "Officer", "Capt": "Officer",
    "Sir": "Royal", "Lady": "Royal", "Countess": "Royal", "Don": "Royal", "Jonkheer": "Royal", "Dona": "Royal"
}
full_df["Title"] = full_df["Title"].replace(title_map).fillna("Unknown")
full_df["Title"].value_counts()

Title
Mr              757
Miss            264
Mrs             198
Master           61
Officer          23
Royal             5
the Countess      1
Name: count, dtype: int64

In [None]:
### Cabinデッキ

Cabinの先頭文字をデッキ情報として抽出。欠損は `Unknown`。

In [6]:
def extract_deck(cabin):
    if pd.isna(cabin):
        return "Unknown"
    return cabin[0]

full_df["CabinDeck"] = full_df["Cabin"].apply(extract_deck)
full_df["CabinDeck"].value_counts()

CabinDeck
Unknown    1014
C            94
B            65
D            46
E            41
A            22
F            21
G             5
T             1
Name: count, dtype: int64

In [None]:
### 家族関連特徴

家族人数、単身フラグ、カテゴリ化を作成。

In [7]:
full_df["FamilySize"] = full_df["SibSp"] + full_df["Parch"] + 1
full_df["IsAlone"] = (full_df["FamilySize"] == 1).astype(int)

def family_category(size):
    if size == 1:
        return "Single"
    elif 2 <= size <= 4:
        return "Small"
    else:
        return "Large"

full_df["FamilyCategory"] = full_df["FamilySize"].apply(family_category)

In [None]:
### Ticketグループ

同じチケットを持つ人数を数え、3区分にビニング。

In [8]:
ticket_counts = full_df["Ticket"].value_counts()

def ticket_category(ticket):
    size = ticket_counts[ticket]
    if size == 1:
        return "Solo"
    elif 2 <= size <= 3:
        return "Small"
    else:
        return "Large"

full_df["TicketGroupCategory"] = full_df["Ticket"].apply(ticket_category)

In [None]:
### AgeとFareの補完・ビニング

In [9]:
# Embarked 最頻値
full_df["Embarked"].fillna(full_df["Embarked"].mode()[0], inplace=True)

# Fare (test側の欠損をPclass×Embarked中央値で補完)
fare_medians = full_df.groupby(["Pclass", "Embarked"])["Fare"].median()
full_df["Fare"] = full_df.apply(
    lambda row: fare_medians.loc[row["Pclass"], row["Embarked"]] if pd.isna(row["Fare"]) else row["Fare"],
    axis=1
)

# Age (Title×Pclass中央値)
age_medians = full_df.groupby(["Title", "Pclass"])["Age"].median()
full_df["Age"] = full_df.apply(
    lambda row: age_medians.loc[row["Title"], row["Pclass"]] if pd.isna(row["Age"]) else row["Age"],
    axis=1
)

# ビニング
full_df["AgeBin"] = pd.qcut(full_df["Age"], 4, labels=False)
full_df["FareBin"] = pd.qcut(full_df["Fare"], 4, labels=False)

In [None]:
## train/test再分割 & One-Hot

前処理後の `full_df` をtrain/testに戻し、カテゴリ列をOne-Hot化。

In [10]:
train_processed = full_df.iloc[:len(train_df)].copy()
test_processed = full_df.iloc[len(train_df):].copy()

drop_cols = ["PassengerId", "Survived", "Name", "Ticket", "Cabin"]
categorical_cols = [
    "Sex", "Embarked", "CabinDeck", "Title",
    "AgeBin", "FareBin", "FamilyCategory", "TicketGroupCategory"
]

X = train_processed.drop(columns=drop_cols)
X_test = test_processed.drop(columns=drop_cols)

X = pd.get_dummies(X, columns=categorical_cols, drop_first=True)
X_test = pd.get_dummies(X_test, columns=categorical_cols, drop_first=True)
X_test = X_test.reindex(columns=X.columns, fill_value=0)

y = train_df["Survived"]

print(X.shape, X_test.shape)

(891, 34) (418, 34)


In [None]:
## モデル比較（5-fold CV）

In [11]:
models = {
    "LogisticRegression": LogisticRegression(max_iter=1000),
    "RandomForest": RandomForestClassifier(n_estimators=500, max_depth=5, random_state=42),
    "GradientBoosting": GradientBoostingClassifier(random_state=42)
}

cv = KFold(n_splits=5, shuffle=True, random_state=42)

for name, model in models.items():
    scores = cross_val_score(model, X, y, cv=cv, scoring="accuracy")
    print(f"{name}: {scores.mean():.4f} ± {scores.std():.4f}")

LogisticRegression: 0.8238 ± 0.0272
RandomForest: 0.8350 ± 0.0220
GradientBoosting: 0.8204 ± 0.0252


In [None]:
## 最終モデル学習と提出ファイル作成

In [12]:
best_model = RandomForestClassifier(n_estimators=500, max_depth=5, random_state=42)
best_model.fit(X, y)

test_pred = best_model.predict(X_test)

submission = pd.DataFrame({
    "PassengerId": test_df["PassengerId"],
    "Survived": test_pred
})
submission.to_csv("submission_topsolution.csv", index=False)

In [None]:
##提出コマンドをdockerターミナルで打つ
kaggle competitions submit -c titanic -f submission_topsolution.csv -m "Top solution clean notebook"