In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.preprocessing import OneHotEncoder

from sklearn.model_selection import train_test_split

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(root, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/spaceship-titanic/sample_submission.csv
/kaggle/input/spaceship-titanic/train.csv
/kaggle/input/spaceship-titanic/test.csv


# データの読込

In [2]:
# Load the labelled training data.
train = pd.read_csv(filepath_or_buffer="/kaggle/input/spaceship-titanic/train.csv")

# Keep an independent copy of the target column; 'Transported' is dropped
# from `train` later, when the feature matrix is built.
train_y = train.loc[:, 'Transported'].copy()
train_y.head(n=5)

0    False
1     True
2    False
3    False
4     True
Name: Transported, dtype: bool

In [3]:
# Load the test data (the rows to predict for the submission).
test = pd.read_csv(filepath_or_buffer="/kaggle/input/spaceship-titanic/test.csv")
test.head(n=5)

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name
0,0013_01,Earth,True,G/3/S,TRAPPIST-1e,27.0,False,0.0,0.0,0.0,0.0,0.0,Nelly Carsoning
1,0018_01,Earth,False,F/4/S,TRAPPIST-1e,19.0,False,0.0,9.0,0.0,2823.0,0.0,Lerome Peckers
2,0019_01,Europa,True,C/0/S,55 Cancri e,31.0,False,0.0,0.0,0.0,0.0,0.0,Sabih Unhearfus
3,0021_01,Europa,False,C/1/S,TRAPPIST-1e,38.0,False,0.0,6652.0,0.0,181.0,585.0,Meratz Caltilter
4,0023_01,Earth,False,F/5/S,TRAPPIST-1e,20.0,False,10.0,0.0,635.0,0.0,0.0,Brence Harperez


In [4]:
# Inspect the training data: summary statistics of the numeric columns.
train.describe()

# Numeric columns: Age, RoomService, FoodCourt, ShoppingMall, Spa, VRDeck
# Categorical columns: HomePlanet, CryoSleep, Cabin, Destination, VIP, Name

Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck
count,8514.0,8512.0,8510.0,8485.0,8510.0,8505.0
mean,28.82793,224.687617,458.077203,173.729169,311.138778,304.854791
std,14.489021,666.717663,1611.48924,604.696458,1136.705535,1145.717189
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,19.0,0.0,0.0,0.0,0.0,0.0
50%,27.0,0.0,0.0,0.0,0.0,0.0
75%,38.0,47.0,76.0,27.0,59.0,46.0
max,79.0,14327.0,29813.0,23492.0,22408.0,24133.0


# 前処理

* 数値データの欠損値は平均値で補完する
* カテゴリ変数はOneHotEncoderで数値化する

In [5]:
# Fill missing values of every numeric feature with the column mean.
# Each frame is imputed with its own mean (train with the train mean, test
# with the test mean), and the value used is printed for reference.
# NOTE(review): imputing test with the train mean would keep both frames on
# the same statistic; left unchanged here to preserve the existing results.
for c in ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']:
    for label, frame in (("Train", train), ("Test", test)):
        value = frame[c].mean()
        print("{}-{}_value:{}".format(label, c, value))
        # Assign the filled column back instead of calling
        # `frame[c].fillna(..., inplace=True)`: the in-place call operates on
        # a temporary Series and does not update the frame under pandas
        # Copy-on-Write, which would silently leave the NaNs in place.
        frame[c] = frame[c].fillna(value)


Train-Age_value:28.82793046746535
Test-Age_value:28.65814620162446
Train-RoomService_value:224.687617481203
Test-RoomService_value:219.26626936829558
Train-FoodCourt_value:458.07720329024676
Test-FoodCourt_value:439.4842963318149
Train-ShoppingMall_value:173.72916912197996
Test-ShoppingMall_value:177.29552524527398
Train-Spa_value:311.1387779083431
Test-Spa_value:303.05244252873564
Train-VRDeck_value:304.8547912992357
Test-VRDeck_value:310.71003097450557


In [6]:
train.describe()
# After filling the missing values with the mean, the standard deviation
# changed: the imputed rows sit exactly at the mean, so the spread shrinks.

Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck
count,8693.0,8693.0,8693.0,8693.0,8693.0,8693.0
mean,28.82793,224.687617,458.077203,173.729169,311.138778,304.854791
std,14.339054,659.739364,1594.434978,597.41744,1124.675871,1133.259049
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,20.0,0.0,0.0,0.0,0.0,0.0
50%,27.0,0.0,0.0,0.0,0.0,0.0
75%,37.0,78.0,118.0,45.0,89.0,71.0
max,79.0,14327.0,29813.0,23492.0,22408.0,24133.0


In [7]:
# Split the Cabin string into two categorical features, for both frames:
#   Cabin1 = 1st character + 3rd character of Cabin (e.g. 'G/3/S' -> 'G3');
#            only that single character after the first '/' is kept.
#   Side   = last character of Cabin (e.g. 'G/3/S' -> 'S').
# Missing cabins become the empty string '' in both features.
for frame in (train, test):
    # Build the filled Series explicitly rather than calling
    # `frame[col].fillna('', inplace=True)`: that chained in-place call does
    # not update the frame under pandas Copy-on-Write, and the slicing below
    # would then fail on float NaN values.
    cabin = frame['Cabin'].fillna('')
    frame['Cabin1'] = cabin.str[0:1] + cabin.str[2:3]
    frame['Side'] = cabin.str[-1:]


In [8]:
# Remove the columns that are not used as model features. The target
# 'Transported' is dropped from `train` here; it was copied into `train_y`
# when the data was loaded.
train = train.drop(columns=['PassengerId', 'Cabin', 'Name', 'Transported'])

test = test.drop(columns=['PassengerId', 'Cabin', 'Name'])
test.head(n=5)

Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Cabin1,Side
0,Earth,True,TRAPPIST-1e,27.0,False,0.0,0.0,0.0,0.0,0.0,G3,S
1,Earth,False,TRAPPIST-1e,19.0,False,0.0,9.0,0.0,2823.0,0.0,F4,S
2,Europa,True,55 Cancri e,31.0,False,0.0,0.0,0.0,0.0,0.0,C0,S
3,Europa,False,TRAPPIST-1e,38.0,False,0.0,6652.0,0.0,181.0,585.0,C1,S
4,Earth,False,TRAPPIST-1e,20.0,False,10.0,0.0,635.0,0.0,0.0,F5,S


In [9]:
# One-hot encode HomePlanet, CryoSleep, Destination, VIP, Cabin1 and Side.
# Each encoder is fitted on the training column only and then applied to both
# frames; handle_unknown='ignore' makes categories that appear only in the
# test data encode as all zeros instead of raising.
categorical_columns = ['HomePlanet', 'CryoSleep', 'Destination', 'VIP', 'Cabin1', 'Side']

# Start from the non-categorical columns and collect one encoded block per
# categorical column; everything is concatenated once at the end instead of
# re-concatenating the growing frame on every iteration.
train_parts = [train.drop(columns=categorical_columns)]
test_parts = [test.drop(columns=categorical_columns)]

for c in categorical_columns:
    encoder = OneHotEncoder(handle_unknown='ignore')
    encoder.fit(train[[c]])
    feature_names = encoder.get_feature_names_out()

    # Reuse the source frame's index: pd.concat(axis=1) aligns on the index,
    # so a fresh RangeIndex would misalign rows (and introduce NaNs) whenever
    # the frame does not carry the default 0..n-1 index.
    encoded_train_df = pd.DataFrame(
        encoder.transform(train[[c]]).toarray().astype('int64'),
        columns=feature_names,
        index=train.index,
    )
    encoded_test_df = pd.DataFrame(
        encoder.transform(test[[c]]).toarray().astype('int64'),
        columns=feature_names,
        index=test.index,
    )

    train_parts.append(encoded_train_df)
    test_parts.append(encoded_test_df)

train = pd.concat(train_parts, axis=1)
test = pd.concat(test_parts, axis=1)

test.head()

Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,HomePlanet_Earth,HomePlanet_Europa,HomePlanet_Mars,HomePlanet_nan,...,Cabin1_G7,Cabin1_G8,Cabin1_G9,Cabin1_T0,Cabin1_T1,Cabin1_T2,Cabin1_T3,Side_,Side_P,Side_S
0,27.0,0.0,0.0,0.0,0.0,0.0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,19.0,0.0,9.0,0.0,2823.0,0.0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,31.0,0.0,0.0,0.0,0.0,0.0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1
3,38.0,0.0,6652.0,0.0,181.0,585.0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1
4,20.0,10.0,0.0,635.0,0.0,0.0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1


# モデル作成

* StandardScalerによる標準化
* XGBClassifierによる推論

In [10]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
import xgboost as xgb

# Gradient-boosted tree classifier for the binary 'Transported' target.
# "logloss" is the binary log-loss metric; the previous "mlogloss" is the
# multiclass variant and does not match a two-class problem. The metric is
# only evaluated when an eval_set is supplied, so fitting is unaffected.
model = xgb.XGBClassifier(eval_metric="logloss", use_label_encoder=False)

# Standardise the features, then classify. Bundling both steps in a Pipeline
# means the scaler is fitted only on whatever data `pipe.fit` receives.
pipe = Pipeline(steps=[("standard_scaler", StandardScaler()),
                       ("Classifier", model)])

# Render the pipeline as an HTML diagram when it is displayed.
from sklearn import set_config
set_config(display='diagram')
pipe

In [11]:
# Split the labelled data into a training part and a held-out part
# (train_test_split's default proportions, fixed seed for reproducibility).
train_X, test_X, train_Y, test_Y = train_test_split(train, train_y, random_state=42)

# Fit the scaler + classifier on the training part only.
pipe.fit(train_X, train_Y.to_numpy())

# Accuracy on the held-out part. score() runs the prediction itself, so the
# separate predict() call whose result was discarded has been removed.
pipe.score(test_X, test_Y)

0.7939282428702852

# 交差検証
from sklearn.model_selection import cross_val_score

# Mean accuracy of the pipeline over 5 cross-validation folds on the full
# labelled data.
cv_target = train_y.values.ravel()
scores = cross_val_score(estimator=pipe, X=train, y=cv_target, scoring='accuracy', cv=5)
scores.mean()

# 提出

In [12]:
# Refit on ALL labelled rows before predicting the test data. Until now
# `pipe` has only been fitted on the train_test_split training part
# (cross_val_score fits clones and leaves `pipe` untouched), so the
# submission would otherwise ignore the held-out labelled rows.
pipe.fit(train, train_y.to_numpy())

# Predict the test data.
pred = pipe.predict(test)

# Build the submission file from the sample submission template.
# NOTE(review): assumes the sample_submission rows are in the same order as
# test.csv, since PassengerId was dropped from `test` — confirm.
submission = pd.read_csv("/kaggle/input/spaceship-titanic/sample_submission.csv")
submission['Transported'] = pred.astype(bool)

submission.to_csv('submission.csv', index=False)