- null特徴量を作成

In [None]:
# Install sweetviz into the notebook environment (no version is pinned).
# NOTE(review): the only sweetviz usage visible in this notebook is commented out — confirm this install is still needed.
!pip install sweetviz

In [None]:
import numpy as np
import pandas as pd
import warnings

# Suppress every Python warning for the remainder of the session.
warnings.filterwarnings('ignore')

# Folder that holds the competition CSV files.
input_dir = '../input/spaceship-titanic'

In [None]:
# Load the training, test, and sample-submission CSV files from input_dir.
train = pd.read_csv(f'{input_dir}/train.csv')
test = pd.read_csv(f'{input_dir}/test.csv')
sample_submission = pd.read_csv(f'{input_dir}/sample_submission.csv')

In [None]:
# Show a 0/1 frame marking which cells of train are missing (1 = missing).
train.isna().astype(int)

In [None]:
# Build features that are expected to help the model.
def make_feature(df):
    """Return a new frame with engineered features appended to ``df``.

    The input frame is not modified. Added columns, in order:

    * ``<col>_nan`` -- 0/1 missing-value flag for every input column except
      ``PassengerId`` and (when present) ``Transported``.
    * ``Cabin_1``, ``Cabin_2``, ``Cabin_3`` -- the three slash-separated parts
      of ``Cabin``; the middle part is converted to float.
    * ``Room_0``, ``Food_0``, ``Shopping_0``, ``Spa_0``, ``VR_0`` -- 1 when the
      corresponding spend column is 0 (missing spend is filled with 0 first).
    * ``0_Place_num`` -- sum of the five zero-spend flags.
    * ``usedMoneySum`` -- sum of the five spend columns.
    * ``group_num`` -- number of rows in ``df`` sharing the part of
      ``PassengerId`` before the underscore.

    Missing values are filled in place of the originals: spend columns,
    ``CryoSleep`` and ``VIP`` with 0, ``Age`` with the mean ``Age`` of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``PassengerId``, ``Cabin``, ``CryoSleep``, ``Age``,
        ``VIP`` and the five spend columns; ``Transported`` is optional.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same rows and index as ``df``.
    """
    spend_cols = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
    zero_flag_cols = ['Room_0', 'Food_0', 'Shopping_0', 'Spa_0', 'VR_0']

    # Missing-value indicators. Every indicator column name collides with an
    # existing column, so join() appends the '_nan' suffix to all of them.
    if 'Transported' in df.columns:
        excluded = ['Transported', 'PassengerId']
    else:
        excluded = ['PassengerId']
    null_flags = df.drop(columns=excluded).isnull().astype(int)
    df = df.join(null_flags, rsuffix='_nan')

    # Cabin has the form X/000/X; pull the three parts out with one regex pass.
    # Raw string: '\d' inside a normal string literal is an invalid escape.
    cabin_parts = df['Cabin'].str.extract(r'(.+)/(\d+)/(.+)')
    df['Cabin_1'] = cabin_parts[0]
    df['Cabin_2'] = cabin_parts[1].astype(float)
    df['Cabin_3'] = cabin_parts[2]

    # Fill missing values.
    df[spend_cols] = df[spend_cols].fillna(0)
    df['CryoSleep'] = df['CryoSleep'].fillna(0)
    df['Age'] = df['Age'].fillna(df['Age'].mean())
    df['VIP'] = df['VIP'].fillna(0)

    # Zero-vs-nonzero spend flags. Missing spend was filled with 0 above, so
    # it is flagged as zero here without a separate null check.
    for spend_col, flag_col in zip(spend_cols, zero_flag_cols):
        df[flag_col] = np.where(df[spend_col] == 0, 1, 0)

    # Number of amenities with zero spend (the author's note hypothesises that
    # this count is informative).
    df['0_Place_num'] = df[zero_flag_cols].sum(axis=1)

    # Total amount spent across the five amenities.
    df['usedMoneySum'] = df[spend_cols].sum(axis=1)

    # Rows sharing the PassengerId prefix before '_' are assumed to travel
    # together; record how many rows share each prefix.
    group_id = df['PassengerId'].str.extract(r'(\d+)_\d+', expand=False).astype(int)
    df['group_num'] = group_id.map(group_id.value_counts())

    return df

# Apply the feature engineering to both frames. The results are bound back to
# the same names, so re-running this cell feeds already-processed frames into
# make_feature again.
train = make_feature(train)
test = make_feature(test)

In [None]:
# Final preprocessing that turns the engineered frames into numeric model input.
def _fit_label_encoders(frames, categorical_cols):
    """Fit one LabelEncoder per column on the combined values of all ``frames``."""
    from sklearn.preprocessing import LabelEncoder
    return {
        col: LabelEncoder().fit(pd.concat([frame[col] for frame in frames], ignore_index=True))
        for col in categorical_cols
    }


def make_df_for_analytics(df, categorical_cols, encoders=None):
    """Return a copy of ``df`` prepared for modelling.

    Drops ``Cabin``, ``PassengerId`` and ``Name``, label-encodes the columns
    in ``categorical_cols``, casts ``CryoSleep`` and ``VIP`` to float and,
    when present, ``Transported`` to int.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the columns named above.
    categorical_cols : list of str
        Columns to replace with integer codes.
    encoders : dict, optional
        Mapping column name -> already-fitted ``LabelEncoder``. A column found
        in this mapping is encoded with ``transform`` so that several frames
        share one category -> code mapping. A column not in the mapping (or
        every column when ``encoders`` is None) is encoded with a fresh
        ``fit_transform`` on this frame alone, which is the original behaviour.

    Returns
    -------
    pandas.DataFrame
        A new frame; ``df`` itself is not modified.
    """
    from sklearn.preprocessing import LabelEncoder
    # Drop the raw identifier/text columns (Cabin was already split into parts).
    df = df.drop(['Cabin', 'PassengerId', 'Name'], axis=1)

    # Convert categorical columns to integer codes.
    for col in categorical_cols:
        if encoders is not None and col in encoders:
            df[col] = encoders[col].transform(df[col])
        else:
            df[col] = LabelEncoder().fit_transform(df[col])

    # Convert the True/False columns to 1/0.
    df['CryoSleep'] = df['CryoSleep'].astype(float)
    df['VIP'] = df['VIP'].astype(float)
    if 'Transported' in df.columns:
        df['Transported'] = df['Transported'].astype(int)

    return df


# Fit the encoders on train and test together: fitting each frame separately
# can assign different integer codes to the same category whenever the two
# frames do not contain exactly the same set of categories.
categorical_cols = ['HomePlanet', 'Destination', 'Cabin_1', 'Cabin_3']
label_encoders = _fit_label_encoders([train, test], categorical_cols)
train = make_df_for_analytics(train, categorical_cols=categorical_cols, encoders=label_encoders)
test = make_df_for_analytics(test, categorical_cols=categorical_cols, encoders=label_encoders)

In [None]:
# lightGBMを学習させる関数を定義
import optuna.integration.lightgbm as lgb
from sklearn.metrics import accuracy_score

def fit_lgbm(X_train, y_train, X_valid, y_valid, params_lgb, verbose=-1, threshold=0.5):
    """Train one model with ``lgb.train`` and score it on the validation split.

    Parameters
    ----------
    X_train, y_train
        Training features and labels.
    X_valid, y_valid
        Validation features and labels; passed to training as a validation
        set and used to compute the returned accuracy.
    params_lgb : dict
        Parameters forwarded unchanged to ``lgb.train``.
    verbose
        Forwarded as ``verbose_eval``.
    threshold : float, default 0.5
        A predicted value at or above this is labelled 1, otherwise 0.

    Returns
    -------
    tuple
        ``(model_lgb, score_lgb, lgb_results)``: the trained model, the
        validation accuracy, and the dict that was passed as ``evals_result``.
    """
    lgb_train = lgb.Dataset(X_train, y_train)
    lgb_valid = lgb.Dataset(X_valid, y_valid)
    lgb_results = {}
    # NOTE(review): num_boost_round is fixed at 100 here; a 'num_boost_round'
    # key inside params_lgb may take precedence — confirm against the
    # LightGBM version in use.
    model_lgb = lgb.train(
        params_lgb,
        lgb_train,
        valid_sets=[lgb_train, lgb_valid],
        valid_names=['train', 'valid'],
        num_boost_round=100,
        early_stopping_rounds=50,
        evals_result=lgb_results,
        verbose_eval=verbose,
    )
    pred_lgb_prob = model_lgb.predict(X_valid, num_iteration=model_lgb.best_iteration)
    # Turn predicted values into 0/1 labels; the cut-off is configurable.
    pred_lgb = np.where(pred_lgb_prob >= threshold, 1, 0)
    # accuracy_score is documented as (y_true, y_pred).
    score_lgb = accuracy_score(y_valid, pred_lgb)

    return model_lgb, score_lgb, lgb_results

# Training parameters handed to fit_lgbm.
params_lgb = dict(
    task='train',                # run the training task
    boosting_type='gbdt',        # use GBDT boosting
    objective='binary',          # binary classification objective
    metric='binary_logloss',     # evaluation metric: binary log loss
    learning_rate=0.1,           # learning rate
    seed=10,
    num_boost_round=1000,
)

In [None]:
# import sweetviz
# sv_report = sweetviz.compare([train, 'train'], [test, 'test'], 'Transported')
# sv_report.show_html('sv_report.html')

In [None]:
# Separate the target column from the predictor columns.
y = train['Transported']
X = train.drop(columns='Transported')

## trainデータを、trainとvalidに分割（hold_outによるやり方）
# from sklearn.model_selection import train_test_split

# X_train, X_valid, y_train, y_valid = train_test_split(
#     X, y, test_size=0.2, random_state=0
# )

# trainデータを、trainとvalidに分割（KFoldによるやり方）
from sklearn.model_selection import KFold
# 5-fold split with shuffling and a fixed random_state.
kf = KFold(n_splits=5, shuffle=True, random_state=0)

# Per-fold accumulators filled by the cross-validation loop.
scores, results, models = [], [], []

best_params = {}
history = []
cv_result_opt = []

# Cross-validation: train one model per fold and keep its artefacts.
for i, (tr_idx, va_idx) in enumerate(kf.split(X)):
    # Slice this fold's training and validation rows.
    X_train, y_train = X.iloc[tr_idx], y.iloc[tr_idx]
    X_valid, y_valid = X.iloc[va_idx], y.iloc[va_idx]

    print(f"fold:{i} start")
    # Train and evaluate the model for this fold.
    model_lgb, score_lgb, lgb_results = fit_lgbm(
        X_train, y_train, X_valid, y_valid, params_lgb
    )

    # Record the model, its score, its evaluation history and its parameters.
    models.append(model_lgb)
    scores.append(score_lgb)
    results.append(lgb_results)
    best_params[f'fold{i}'] = model_lgb.params

In [None]:
# Display the validation accuracy recorded for each fold.
scores

In [None]:
# Output: display the sample submission frame.
sample_submission

In [None]:
# Ensembling: accumulate the test predictions of every fold model into pred.
for i, model in enumerate(models):
    pred = model.predict(test) if i == 0 else pred + model.predict(test)
    

In [None]:
# Display the summed predictions divided by the number of models (pred itself is not reassigned here).
pred/len(models)

In [None]:
# Without ensembling (kept for reference):
# pred = model_lgb.predict(test)
# sample_submission['Transported'] = np.where(pred>=0.5, 1, 0)
# sample_submission['Transported'] = sample_submission['Transported'].astype(bool)
# sample_submission

# With ensembling:
# add up pred (the probability of class 1) over all models, then divide by
# the number of models to obtain the mean.
for i, model in enumerate(models):
    pred = model.predict(test) if i == 0 else pred + model.predict(test)
pred = pred / len(models)

# Label a row True when its mean predicted probability is at least 0.5.
sample_submission['Transported'] = pred >= 0.5
sample_submission

In [None]:
# Write the submission file without the index column; the 'utf_8_sig' codec prefixes the file with a UTF-8 BOM.
# NOTE(review): confirm that whatever consumes submission.csv accepts a leading BOM.
sample_submission.to_csv('submission.csv', encoding='utf_8_sig', index=False)