In [None]:
#必要ライブラリのインポート

In [2]:
import warnings

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import KFold, cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

warnings.filterwarnings("ignore")
sns.set_style("whitegrid")
plt.rcParams["figure.figsize"] = (10, 6)

In [None]:
#データ読み込みとtrainとtestデータの結合(train/testで同じ処理を二度書かずに済む)

In [3]:
train_path = "/workspace/input/train.csv"
test_path = "/workspace/input/test.csv"

train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)

print(f"train shape: {train_df.shape}")
print(f"test shape: {test_df.shape}")

full_df = pd.concat([train_df, test_df], axis=0, sort=False).reset_index(drop=True)
print(f"full shape: {full_df.shape}")

train shape: (891, 12)
test shape: (418, 11)
full shape: (1309, 12)


In [None]:
#結合データのチェック

In [4]:
full_df.head(20)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0.0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1.0,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0.0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0.0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0.0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0.0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
8,9,1.0,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1.0,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


In [5]:
full_df.tail(10)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
1299,1300,,3,"Riordan, Miss. Johanna Hannah""""",female,,0,0,334915,7.7208,,Q
1300,1301,,3,"Peacock, Miss. Treasteall",female,3.0,1,1,SOTON/O.Q. 3101315,13.775,,S
1301,1302,,3,"Naughton, Miss. Hannah",female,,0,0,365237,7.75,,Q
1302,1303,,1,"Minahan, Mrs. William Edward (Lillian E Thorpe)",female,37.0,1,0,19928,90.0,C78,Q
1303,1304,,3,"Henriksson, Miss. Jenny Lovisa",female,28.0,0,0,347086,7.775,,S
1304,1305,,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.05,,S
1305,1306,,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9,C105,C
1306,1307,,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.25,,S
1307,1308,,3,"Ware, Mr. Frederick",male,,0,0,359309,8.05,,S
1308,1309,,3,"Peter, Master. Michael J",male,,1,1,2668,22.3583,,C


In [None]:
#欠損状況を把握する

In [6]:
missing = full_df.isnull().sum().sort_values(ascending=False)
missing = missing[missing > 0]
missing

Cabin       1014
Survived     418
Age          263
Embarked       2
Fare           1
dtype: int64

In [7]:
#メモ
##Cabin:8割欠損
##Age: 中程度の欠損
##Embarked: わずかな欠損
##Fare: test側に1件欠損
##「どんな値を埋めるとデータの意味が保てるか」を考えるのが重要。

#数値かカテゴリか
 #数値（Age, Fareなど）は「平均/中央値」「類似グループの中央値」「回帰による推定」などが候補。外れ値に弱い平均より中央値を優先することが多いです。
  #カテゴリ（Embarked, Cabinなど）は「最頻値」「似た行の代表カテゴリ」「新カテゴリ（Unknown）」といった置換が一般的。

#欠損割合と原因
 #欠損が少ない（Embarkedの2件など）なら単純に最頻値で埋めても影響が小さい。
 #多い場合（Cabinはほぼ欠損）には、元の情報を正確に再現できないので、代わりに「ない/不明」というカテゴリを設けるほうが安全。

#モデルへの影響
 #行う補完が予測に偏りを与えないか。たとえばAgeを全員同じ値で埋めると、年齢による差が失われる。タイトルやPclassでグループ化した中央値で埋めれば、元の分布に近い情報を保ちやすい。
  #Fareの欠損が1件のときは、同じPclass×Embarkedの中央値で埋めると料金帯が整合する。

#特徴量エンジニアリングの計画
 #後で使う予定の派生列（FamilySize, Cabinデッキなど）を考えながら、埋め方を決める。例えばCabin欠損を Unknown としておくと、CabinDeck で「デッキ情報なし」というカテゴリを使える。
    
#再現性とシンプルさ
 #Kaggleでは複雑な補完もあり得るが、まずはロジックが明快で再現しやすい方法（中央値、最頻値、Unknownカテゴリ）を優先し、必要に応じて高度な補完（回帰、KNN、MICEなど）に進むと良いです。

In [7]:
def extract_deck(cabin):
    if pd.isna(cabin):
        return "Unknown"
    # Cabinは “C123” “F4” “E46” のように先頭がデッキ文字なので最初の文字だけ取得
    return cabin[0]

full_df["CabinDeck"] = full_df["Cabin"].apply(extract_deck)
full_df["CabinDeck"].value_counts()

CabinDeck
Unknown    1014
C            94
B            65
D            46
E            41
A            22
F            21
G             5
T             1
Name: count, dtype: int64

In [8]:
   full_df["Title"] = full_df["Name"].str.extract(r",\s*([^\.]*)\.")
   title_map = {
       "Mlle": "Miss", "Ms": "Miss", "Mme": "Mrs",
       "Lady": "Royal", "Countess": "Royal", "Dona": "Royal",
       "Dr": "Officer", "Rev": "Officer", "Major": "Officer", "Col": "Officer",
       "Capt": "Officer", "Sir": "Royal", "Don": "Royal", "Jonkheer": "Royal"
   }
   full_df["Title"] = full_df["Title"].replace(title_map)
   full_df["Title"] = full_df["Title"].fillna("Unknown")
   full_df["Title"].value_counts()

Title
Mr              757
Miss            264
Mrs             198
Master           61
Officer          23
Royal             5
the Countess      1
Name: count, dtype: int64

In [9]:
   age_medians = full_df.groupby(["Title", "Pclass"])["Age"].median()

   def fill_age(row):
       if pd.isna(row["Age"]):
           return age_medians.loc[row["Title"], row["Pclass"]]
       return row["Age"]

   full_df["Age"] = full_df.apply(fill_age, axis=1)

In [10]:
   full_df["Embarked"].fillna(full_df["Embarked"].mode()[0], inplace=True)

In [11]:
   fare_medians = full_df.groupby(["Pclass", "Embarked"])["Fare"].median()

   def fill_fare(row):
       if pd.isna(row["Fare"]):
            return fare_medians.loc[row["Pclass"], row["Embarked"]]
       return row["Fare"]

   full_df["Fare"] = full_df.apply(fill_fare, axis=1)

In [12]:
full_df["Embarked"].isnull().sum()

0

In [13]:
full_df["Fare"].isnull().sum() 

0

In [14]:
#特徴量エンジニアリングに入る
##家族関連とチケット関連の特徴を作る

In [15]:
#FamilySize は同乗家族の合計人数（自分を含めて +1）。
#IsAlone は1人旅なら1、そうでなければ0。

In [16]:
full_df["FamilySize"] = full_df["SibSp"] + full_df["Parch"] + 1
full_df["IsAlone"] = (full_df["FamilySize"] == 1).astype(int)

In [17]:
#同じチケットを持つ人数を数え、4人以上は「4+」扱いなどにまとめます（Notebookでも似たようなビニングをしています）

In [18]:
ticket_counts = full_df["Ticket"].value_counts()
full_df["TicketGroup"] = full_df["Ticket"].map(ticket_counts)

# 大人数グループをまとめる場合は適宜切り分け
full_df["TicketGroup"] = full_df["TicketGroup"].apply(
    lambda x: 4 if x >= 4 else x
)

In [19]:
#Age/Fareのビニング

In [20]:
   full_df["AgeBin"] = pd.qcut(full_df["Age"], 4, labels=False)
   full_df["FareBin"] = pd.qcut(full_df["Fare"], 4, labels=False)

In [21]:
#カテゴリーのエンコード

In [22]:
categorical_cols = [
    "Sex", "Embarked", "CabinDeck", "Title",
    "AgeBin", "FareBin", "TicketGroup", "IsAlone"
]

full_encoded = pd.get_dummies(full_df, columns=categorical_cols, drop_first=True)

In [23]:
#train/testへ分割し直し

In [24]:
   train_processed = full_df.iloc[:len(train_df)]
   test_processed = full_df.iloc[len(train_df):]
   X = train_processed.drop(["Survived", "PassengerId", "Name", "Ticket", "Cabin"], axis=1)
   y = train_processed["Survived"]
   X_test = test_processed[X.columns]

In [25]:
#特徴量と目的変数の準備

In [21]:
   # モデルに使わない列を除外
   drop_cols = ["PassengerId", "Survived", "Name", "Ticket", "Cabin"]
   X = full_df.iloc[:len(train_df)].drop(columns=drop_cols)
   y = train_df["Survived"]
   X_test = full_df.iloc[len(train_df):].drop(columns=drop_cols)

In [23]:
   categorical_cols = ["Sex", "Embarked", "CabinDeck", "Title",
                       "AgeBin", "FareBin", "TicketGroup", "IsAlone"]
   X = pd.get_dummies(X, columns=categorical_cols, drop_first=True)
   X_test = pd.get_dummies(X_test, columns=categorical_cols, drop_first=True)

   # train/testで列数がずれる場合は揃える
   X_test = X_test.reindex(columns=X.columns, fill_value=0)

In [None]:
#3種類のモデルで5分割交差検証を行い、平均精度とばらつきを比較

In [24]:
   models = {
       "LogisticRegression": LogisticRegression(max_iter=1000),
       "RandomForest": RandomForestClassifier(n_estimators=500, max_depth=5, random_state=42),
       "GradientBoosting": GradientBoostingClassifier(random_state=42)
   }

   cv = KFold(n_splits=5, shuffle=True, random_state=42)

   for name, model in models.items():
       scores = cross_val_score(model, X, y, cv=cv, scoring="accuracy")
       print(f"{name}: {scores.mean():.4f} ± {scores.std():.4f}")

LogisticRegression: 0.8249 ± 0.0296
RandomForest: 0.8316 ± 0.0225
GradientBoosting: 0.8294 ± 0.0235


In [None]:
#交差検証で良かったモデルを全データで学習し、テストデータの予測を submission_v2.csv にまとめる

In [25]:
   best_model = RandomForestClassifier(n_estimators=500, max_depth=5, random_state=42)
   best_model.fit(X, y)
   test_pred = best_model.predict(X_test)

   submission = pd.DataFrame({
       "PassengerId": test_df["PassengerId"],
       "Survived": test_pred
   })
   submission.to_csv("submission_v2.csv", index=False)

In [26]:
def family_category(size):
    if size == 1:
        return "Single"
    elif 2 <= size <= 4:
        return "Small"
    else:
        return "Large"

full_df["FamilyCategory"] = full_df["FamilySize"].apply(family_category)

In [27]:
ticket_counts = full_df["Ticket"].value_counts()
full_df["TicketGroupSize"] = full_df["Ticket"].map(ticket_counts)

def ticket_group(size):
    if size == 1:
        return "Solo"
    elif 2 <= size <= 3:
        return "Small"
    else:
        return "Large"

full_df["TicketGroupCategory"] = full_df["TicketGroupSize"].apply(ticket_group)

In [28]:
rare_titles = ["Dr", "Rev", "Major", "Col", "Sir", "Lady", "Countess", "Capt", "Don", "Jonkheer", "Dona", "Mme", "Mlle", "Ms"]
full_df["Title"] = full_df["Title"].replace(
    {
        "Mlle": "Miss", "Ms": "Miss", "Mme": "Mrs",
        "Dr": "Officer", "Rev": "Officer", "Major": "Officer",
        "Col": "Officer", "Capt": "Officer",
        "Sir": "Royal", "Lady": "Royal", "Countess": "Royal",
        "Don": "Royal", "Jonkheer": "Royal", "Dona": "Royal"
    }
)

In [29]:
   train_processed = full_df.iloc[:len(train_df)].copy()
   test_processed = full_df.iloc[len(train_df):].copy()

In [30]:
   drop_cols = ["PassengerId", "Survived", "Name", "Ticket", "Cabin"]
   X = train_processed.drop(columns=drop_cols)
   y = train_df["Survived"]
   X_test = test_processed.drop(columns=drop_cols)

In [31]:
   "AgeBin" in full_df.columns, "FareBin" in full_df.columns
   full_df[["Age", "AgeBin", "Fare", "FareBin"]].head()

Unnamed: 0,Age,AgeBin,Fare,FareBin
0,22.0,1,7.25,0
1,38.0,3,71.2833,3
2,26.0,1,7.925,1
3,35.0,2,53.1,3
4,35.0,2,8.05,1


In [32]:
   full_df["AgeBin"] = pd.qcut(full_df["Age"], 4, labels=False)
   full_df["FareBin"] = pd.qcut(full_df["Fare"], 4, labels=False)

In [33]:
   "AgeBin" in full_df.columns, "FareBin" in full_df.columns
   full_df[["Age", "AgeBin", "Fare", "FareBin"]].head()

Unnamed: 0,Age,AgeBin,Fare,FareBin
0,22.0,1,7.25,0
1,38.0,3,71.2833,3
2,26.0,1,7.925,1
3,35.0,2,53.1,3
4,35.0,2,8.05,1


In [34]:
   train_processed = full_df.iloc[:len(train_df)].copy()
   test_processed = full_df.iloc[len(train_df):].copy()
   X = train_processed.drop(columns=drop_cols)
   X_test = test_processed.drop(columns=drop_cols)

In [36]:
# train側にある列を基準に、test側を揃える
X_test = X_test.reindex(columns=X.columns, fill_value=0)

print(X.shape, X_test.shape)  # 両方とも (891, 36) / (418, 36) になるはずです

(891, 36) (418, 36)


In [37]:
   categorical_cols = [
       "Sex", "Embarked", "CabinDeck", "Title",
       "AgeBin", "FareBin", "TicketGroupCategory", "FamilyCategory"
   ]

   X = pd.get_dummies(X, columns=categorical_cols, drop_first=True)
   X_test = pd.get_dummies(test_processed.drop(columns=drop_cols), columns=categorical_cols, drop_first=True)

   # 列数確認
   print(X.shape, X_test.shape)

KeyError: "None of [Index(['Sex', 'Embarked', 'CabinDeck', 'Title', 'AgeBin', 'FareBin',\n       'TicketGroupCategory', 'FamilyCategory'],\n      dtype='object')] are in the [columns]"

In [14]:
   models = {
       "LogisticRegression": LogisticRegression(max_iter=1000),
       "RandomForest": RandomForestClassifier(n_estimators=500, max_depth=5, random_state=42),
       "GradientBoosting": GradientBoostingClassifier(random_state=42)
   }

   cv = KFold(n_splits=5, shuffle=True, random_state=42)

   for name, model in models.items():
       scores = cross_val_score(model, X, y, cv=cv, scoring="accuracy")
       print(f"{name}: {scores.mean():.4f} ± {scores.std():.4f}")

NameError: name 'X' is not defined