In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import category_encoders as ce
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, learning_curve
from sklearn.metrics import classification_report, roc_curve, auc, precision_recall_curve, plot_roc_curve, plot_confusion_matrix

In [None]:
# Load the data.
# The competition name is taken from the dataset folder under /kaggle/input.
# os.listdir() returns entries in arbitrary order, so sort them to make the
# choice deterministic when more than one dataset is attached.
INPUT_ROOT = '/kaggle/input'
compe_name = sorted(os.listdir(INPUT_ROOT))[0]
print("Competition name: " + compe_name)

# Read every CSV from the same competition directory so that train, test and
# the sample submission can never come from different datasets.
input_dir = os.path.join(INPUT_ROOT, compe_name)
train = pd.read_csv(os.path.join(input_dir, 'train.csv'))
test = pd.read_csv(os.path.join(input_dir, 'test.csv'))
print("Train: ", train.shape)
print("Test: ", test.shape)

# Take the index and objective column names from the sample submission file.
sub_sample = pd.read_csv(os.path.join(input_dir, 'sample_submission.csv'))
sub_idxcol = sub_sample.columns[0]
sub_objcol = sub_sample.columns[1]

print("###############################################################")
print("Variables: ", train.columns)
print("Submission format: ")
print(sub_sample)
print("Index column name: " + sub_idxcol)
print("Objective column name: " + sub_objcol)

In [None]:
# DataFrame.info() writes its summary to stdout itself and returns None,
# so wrapping it in print() only adds a stray "None" line to the output.
train.info()
test.info()

In [None]:
# Overview of the data.
# Reference: Qiita - impressions from a first Kaggle attempt and the details of an 81% Titanic score
# (https://qiita.com/FukuharaYohei/items/c87f61aee2a24466d5d4#35-pclass%E3%81%A8sex%E3%81%AE%E3%83%A9%E3%83%99%E3%83%AB%E3%82%A8%E3%83%B3%E3%82%B3%E3%83%BC%E3%83%87%E3%82%A3%E3%83%B3%E3%82%B0)
# Survived (dead or alive): maps the raw 0/1 target value to the label shown in plots.
DICT_SURVIVED = {0: '0: Dead', 1: '1: Survived'}

def arrange_bar(ax, sr):
    """Tidy a bar chart and write each bar's value above it.

    Rotates the existing x tick labels by 30 degrees, draws a dotted grid on
    the y axis, and places one centred text label per entry of ``sr``.

    Args:
        ax: Axes object that already holds the bar chart.
        sr: Iterable of bar values in bar order; the value at position ``i``
            is written at coordinates ``(i, value)``.
    """
    current_labels = ax.get_xticklabels()
    ax.set_xticklabels(labels=current_labels, rotation=30, horizontalalignment="center")
    ax.grid(axis='y', linestyle='dotted')
    for position, value in enumerate(sr):
        ax.text(position, value, value, horizontalalignment='center')

# Count the passengers per outcome and swap the raw 0/1 index for readable labels.
survived_counts = train['Survived'].value_counts()
sr_survived = survived_counts.rename(DICT_SURVIVED)

# Left panel: share of each outcome (pie). Right panel: absolute counts (bar).
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(8, 3))
fig.subplots_adjust(wspace=0.5, hspace=0.5)
pie_ax, bar_ax = axes
sr_survived.plot.pie(ax=pie_ax, autopct="%1.1f%%")
sr_survived.plot.bar(ax=bar_ax)

# Rotate the tick labels, add the grid and write the count above each bar.
arrange_bar(bar_ax, sr_survived)

plt.show()

In [None]:
# Reference: Qiita - impressions from a first Kaggle attempt and the details of an 81% Titanic score
# (https://qiita.com/FukuharaYohei/items/c87f61aee2a24466d5d4#35-pclass%E3%81%A8sex%E3%81%AE%E3%83%A9%E3%83%99%E3%83%AB%E3%82%A8%E3%83%B3%E3%82%B3%E3%83%BC%E3%83%87%E3%82%A3%E3%83%B3%E3%82%B0)
# Helper functions for the plots below.
def arrange_stack_bar(ax):
    """Rotate the x tick labels of ``ax`` by 30 degrees and add a dotted y-axis grid.

    Args:
        ax: Axes object that already holds the bar chart.
    """
    tick_labels = ax.get_xticklabels()
    ax.set_xticklabels(labels=tick_labels, rotation=30, horizontalalignment="center")
    ax.grid(linestyle='dotted', axis='y')

def output_bars(df, column, index=None):
    """Plot a 2x2 overview of how the categories of ``column`` relate to survival.

    Panels:
        [0, 0] pie chart with the share of each category of ``column``
        [0, 1] stacked bar chart of counts per category split by Survived,
               with the category total written above each bar
        [1, 0] grouped bar chart of the same counts, every bar annotated
               with its height
        [1, 1] stacked bar chart of the Survived ratio inside each category

    Args:
        df: DataFrame containing ``column`` and a ``Survived`` column.
        column: Name of the categorical column to analyse.
        index: Optional mapping from raw category value to display label.
            ``None`` or an empty mapping keeps the raw category values.
    """
    has_labels = index is not None and len(index) > 0

    def _with_labels(obj):
        """Rename the category index with the display labels, if any were given."""
        return obj.rename(index=index) if has_labels else obj

    fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(12, 8))
    fig.subplots_adjust(wspace=0.5, hspace=0.5)

    survived_by_category = df.groupby([column])["Survived"]

    # fill_value=0: a category/outcome combination that never occurs is a count
    # (or ratio) of zero rather than a missing value, so bar heights, the
    # annotations and the per-category totals stay finite.
    df_vc = _with_labels(
        survived_by_category.value_counts(sort=False).unstack(fill_value=0)
    ).rename(columns=DICT_SURVIVED)
    df_ratio = _with_labels(
        survived_by_category.value_counts(sort=False, normalize=True).unstack(fill_value=0)
    ).rename(columns=DICT_SURVIVED)

    _with_labels(df[column].value_counts()).plot.pie(ax=axes[0, 0], autopct="%1.1f%%")
    df_ratio.plot.bar(ax=axes[1, 1], stacked=True)
    df_vc.plot.bar(ax=axes[1, 0])

    # Write the height above every bar of the grouped chart.
    # https://matplotlib.org/3.1.1/gallery/lines_bars_and_markers/barchart.html#sphx-glr-gallery-lines-bars-and-markers-barchart-py
    for rect in axes[1, 0].patches:
        height = rect.get_height()
        axes[1, 0].annotate('{:.0f}'.format(height),
                            xy=(rect.get_x() + rect.get_width() / 2, height),
                            xytext=(0, 3),  # 3 points vertical offset
                            textcoords="offset points",
                            ha='center', va='bottom')

    df_vc.plot.bar(ax=axes[0, 1], stacked=True)

    for bar_ax in (axes[0, 1], axes[1, 0], axes[1, 1]):
        arrange_stack_bar(bar_ax)

    # Data labels: total count of each category above its stacked bar.
    for position, (_, row) in enumerate(df_vc.iterrows()):
        total = row.sum()
        axes[0, 1].text(position, total, total, horizontalalignment='center')

    plt.show()

In [None]:
# Reference: Qiita - impressions from a first Kaggle attempt and the details of an 81% Titanic score
# (https://qiita.com/FukuharaYohei/items/c87f61aee2a24466d5d4#35-pclass%E3%81%A8sex%E3%81%AE%E3%83%A9%E3%83%99%E3%83%AB%E3%82%A8%E3%83%B3%E3%82%B3%E3%83%BC%E3%83%87%E3%82%A3%E3%83%B3%E3%82%B0)
# Plot Pclass (ticket class) with readable class labels.
DICT_PCLASS = {
    1: '1: 1st(Upper)',
    2: '2: 2nd(Middle)',
    3: '3: 3rd(Lower)',
}
output_bars(df=train, column='Pclass', index=DICT_PCLASS)

In [None]:
# Reference: Qiita - impressions from a first Kaggle attempt and the details of an 81% Titanic score
# (https://qiita.com/FukuharaYohei/items/c87f61aee2a24466d5d4#35-pclass%E3%81%A8sex%E3%81%AE%E3%83%A9%E3%83%99%E3%83%AB%E3%82%A8%E3%83%B3%E3%82%B3%E3%83%BC%E3%83%87%E3%82%A3%E3%83%B3%E3%82%B0)
# Plot Sex; the raw category values are used as labels.
output_bars(df=train, column='Sex')

In [None]:
# Reference: Qiita - impressions from a first Kaggle attempt and the details of an 81% Titanic score
# (https://qiita.com/FukuharaYohei/items/c87f61aee2a24466d5d4#35-pclass%E3%81%A8sex%E3%81%AE%E3%83%A9%E3%83%99%E3%83%AB%E3%82%A8%E3%83%B3%E3%82%B3%E3%83%BC%E3%83%87%E3%82%A3%E3%83%B3%E3%82%B0)
# Plot Embarked (port of embarkation) with the port names as labels.
DICT_EMBARK = {
    'C': 'Cherbourg',
    'Q': 'Queenstown',
    'S': 'Southampton',
}
output_bars(df=train, column='Embarked', index=DICT_EMBARK)

In [None]:
# Reference: Qiita - impressions from a first Kaggle attempt and the details of an 81% Titanic score
# (https://qiita.com/FukuharaYohei/items/c87f61aee2a24466d5d4#35-pclass%E3%81%A8sex%E3%81%AE%E3%83%A9%E3%83%99%E3%83%AB%E3%82%A8%E3%83%B3%E3%82%B3%E3%83%BC%E3%83%87%E3%82%A3%E3%83%B3%E3%82%B0)
# Note on missing Age values (translated from the original note): missing values are excluded.
def _plot_survival_hists(data, column, bins, ax_all, ax_split):
    """Draw an overall histogram of ``column`` and one overlaid per Survived value."""
    data[column].hist(ax=ax_all, bins=bins)
    data.groupby('Survived')[column].plot.hist(
        ax=ax_split, bins=bins, alpha=0.5, grid=True, legend=True)
    # Replace the raw legend texts with the DICT_SURVIVED labels;
    # int(float(...)) accepts both '0' and '0.0' style texts.
    ax_split.legend(labels=[DICT_SURVIVED[int(float(text.get_text()))]
                            for text in ax_split.get_legend().get_texts()])


def output_box_hist(column, bins=20, query=None, df=None):
    """Plot box plots and histograms of a numeric column, overall and by Survived.

    Rows of the figure:
        0: box plot of ``column`` (left) and box plot grouped by Survived (right)
        1: histogram of ``column`` (left) and overlaid per-Survived histograms (right)
        2: only when ``query`` is given - the same histograms as row 1,
           restricted to the rows selected by ``query``

    Args:
        column: Name of the numeric column to plot.
        bins: Number of histogram bins.
        query: Optional ``DataFrame.query`` expression used for the third row.
        df: DataFrame to plot; defaults to the module-level ``train`` frame.
    """
    data = train if df is None else df
    n_rows = 2 if query is None else 3
    # 4 inches of height per row: (12, 8) without a query, (12, 12) with one.
    fig, axes = plt.subplots(nrows=n_rows, ncols=2, figsize=(12, 4 * n_rows))
    fig.subplots_adjust(wspace=0.5, hspace=0.5)

    data.boxplot(ax=axes[0, 0], column=[column])
    data.boxplot(ax=axes[0, 1], column=[column], by='Survived')
    axes[0, 1].set_xticklabels([DICT_SURVIVED[int(float(xticklabel.get_text()))]
                                for xticklabel in axes[0, 1].get_xticklabels()])

    _plot_survival_hists(data, column, bins, axes[1, 0], axes[1, 1])

    if query is not None:
        # Evaluate the filter once and reuse it for both panels of the third row.
        _plot_survival_hists(data.query(query), column, bins, axes[2, 0], axes[2, 1])

    plt.show()

# Box plots and histograms of Age with the default 20 bins and no filtered row.
output_box_hist(column='Age')

In [None]:
# Reference: Qiita - impressions from a first Kaggle attempt and the details of an 81% Titanic score
# (https://qiita.com/FukuharaYohei/items/c87f61aee2a24466d5d4#35-pclass%E3%81%A8sex%E3%81%AE%E3%83%A9%E3%83%99%E3%83%AB%E3%82%A8%E3%83%B3%E3%82%B3%E3%83%BC%E3%83%87%E3%82%A3%E3%83%B3%E3%82%B0)
# Plot SibSp (number of siblings and spouses aboard).
output_bars(df=train, column='SibSp')

In [None]:
# Reference: Qiita - impressions from a first Kaggle attempt and the details of an 81% Titanic score
# (https://qiita.com/FukuharaYohei/items/c87f61aee2a24466d5d4#35-pclass%E3%81%A8sex%E3%81%AE%E3%83%A9%E3%83%99%E3%83%AB%E3%82%A8%E3%83%B3%E3%82%B3%E3%83%BC%E3%83%87%E3%82%A3%E3%83%B3%E3%82%B0)
# Plot Parch (number of parents and children aboard).
output_bars(df=train, column='Parch')

In [None]:
# Reference: Qiita - impressions from a first Kaggle attempt and the details of an 81% Titanic score
# (https://qiita.com/FukuharaYohei/items/c87f61aee2a24466d5d4#35-pclass%E3%81%A8sex%E3%81%AE%E3%83%A9%E3%83%99%E3%83%AB%E3%82%A8%E3%83%B3%E3%82%B3%E3%83%BC%E3%83%87%E3%82%A3%E3%83%B3%E3%82%B0)
# Plot Fare; the third row of the figure only shows fares below 200.
output_box_hist(column='Fare', bins=20, query='Fare < 200')

In [None]:
# Reference: Qiita - impressions from a first Kaggle attempt and the details of an 81% Titanic score
# (https://qiita.com/FukuharaYohei/items/c87f61aee2a24466d5d4#35-pclass%E3%81%A8sex%E3%81%AE%E3%83%A9%E3%83%99%E3%83%AB%E3%82%A8%E3%83%B3%E3%82%B3%E3%83%BC%E3%83%87%E3%82%A3%E3%83%B3%E3%82%B0)
# Correlation matrix of the numeric columns, coloured with one gradient over the
# whole table (axis=None). Kept as the last expression so the notebook renders it.
corr_columns = ["Survived", "Pclass", "Age", "SibSp", "Parch", "Fare"]
train[corr_columns].corr().style.background_gradient(axis=None)

In [None]:
# Prepare the training and test feature frames.
obj_var = 'Survived'  # name of the target column in train.csv

# Features: every training column except the target; the target is kept separately.
train_y = train[obj_var]
train_x = train.drop(columns=[obj_var])
# Work on a copy so later preprocessing does not modify the loaded test frame.
test_x = test.copy()

In [None]:
# Available columns:
# 'PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp','Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'

# Columns that are never used as features.
drop_vars = ['PassengerId']

# Columns excluded for other reasons (alternative selection kept for reference).
# drop_vars += ['Name', 'Sex', 'Age', 'Ticket', 'Fare', 'Cabin', 'Embarked']
drop_vars.extend(['Name', 'Ticket', 'Cabin'])

# Remove the unused columns from both feature frames, in place.
for frame in (train_x, test_x):
    frame.drop(columns=drop_vars, inplace=True)
print(train_x)

In [None]:
# Fill in missing values.
# Columns with missing values: Age, Ticket, Fare, Cabin, Embarked
# Reference: [2] Handling missing values with pandas
# (https://qiita.com/0NE_shoT_/items/8db6d909e8b48adcb203)

# Age: replace a missing value with the mean Age of the passenger's Pclass.
# The means are computed from the training data only.
Pcl1 = train_x.loc[train_x['Pclass'] == 1, 'Age'].mean()
Pcl2 = train_x.loc[train_x['Pclass'] == 2, 'Age'].mean()
Pcl3 = train_x.loc[train_x['Pclass'] == 3, 'Age'].mean()

# Apply the same training-set means to the test data as well, so that train and
# test go through identical preprocessing instead of test keeping its NaN values.
for frame in (train_x, test_x):
    for pclass, mean_age in {1: Pcl1, 2: Pcl2, 3: Pcl3}.items():
        frame.loc[frame['Age'].isnull() & (frame['Pclass'] == pclass), 'Age'] = mean_age

In [None]:
# Fare: replace a missing value with the median Fare of the passenger's Pclass.
# The medians are computed from the training data only.
Fare1 = train_x.loc[train_x['Pclass'] == 1, 'Fare'].median()
Fare2 = train_x.loc[train_x['Pclass'] == 2, 'Fare'].median()
Fare3 = train_x.loc[train_x['Pclass'] == 3, 'Fare'].median()

# Apply the same training-set medians to the test data as well, so that train and
# test go through identical preprocessing instead of test keeping its NaN values.
for frame in (train_x, test_x):
    for pclass, median_fare in {1: Fare1, 2: Fare2, 3: Fare3}.items():
        frame.loc[frame['Fare'].isnull() & (frame['Pclass'] == pclass), 'Fare'] = median_fare

In [None]:
# Embarked: fill missing values with the most frequent port of the training data.
# - mode() returns a Series (there may be ties), so take its first entry; passing
#   the Series itself to fillna() would align on the index instead of filling
#   every missing row.
# - fillna() returns a new Series, so the result has to be assigned back.
embarked_mode = train_x['Embarked'].mode().iloc[0]
train_x['Embarked'] = train_x['Embarked'].fillna(embarked_mode)
# Use the training-set mode for the test data too, to keep preprocessing identical.
test_x['Embarked'] = test_x['Embarked'].fillna(embarked_mode)

In [None]:
# -----------------------------------
# カテゴリー変数処理
# -----------------------------------


In [None]:
# cat_cols = ['Embarked','Sex']
# # -----------------------------------
# # one-hot encoding
# # -----------------------------------
# # データの読み込み
# # train_x, test_x = load_data()
# # -----------------------------------

# # 学習データとテストデータを結合してget_dummiesによるone-hot encodingを行う
# all_x = pd.concat([train_x, test_x])
# all_x = pd.get_dummies(all_x, columns=cat_cols)

# # 学習データとテストデータに再分割
# train_x = all_x.iloc[:train_x.shape[0], :].reset_index(drop=True)
# test_x = all_x.iloc[train_x.shape[0]:, :].reset_index(drop=True)

In [None]:
# # -----------------------------------
# # カテゴリー変数
# # -----------------------------------
# from sklearn.preprocessing import LabelEncoder

# for c in ['Pclass']:
#     # 学習データに基づいてどう変換するかを定める
#     le = LabelEncoder()
#     le.fit(train_x[c].fillna('NA'))

#     # 学習データ、テストデータを変換する
#     train_x[c] = le.transform(train_x[c].fillna('NA'))
#     test_x[c] = le.transform(test_x[c].fillna('NA'))
    

In [None]:
# -----------------------------------
# Target encoding
# -----------------------------------
from sklearn.model_selection import KFold

cat_cols = ['Pclass', 'Embarked', 'Sex']

# The fold assignment depends only on the number of rows and the fixed seed,
# so the same four folds are used for every column.
kf = KFold(n_splits=4, shuffle=True, random_state=72)
fold_indices = list(kf.split(train_x))

for col in cat_cols:
    encoding_frame = pd.DataFrame({col: train_x[col], 'target': train_y})

    # Test data: encode with the per-category target mean over the whole training set.
    full_means = encoding_frame.groupby(col)['target'].mean()
    test_x[col] = test_x[col].map(full_means)

    # Training data: encode each fold with means computed from the other folds
    # (out-of-fold), so a row never contributes to its own encoded value.
    encoded = np.full(train_x.shape[0], np.nan)
    for fit_idx, apply_idx in fold_indices:
        fold_means = encoding_frame.iloc[fit_idx].groupby(col)['target'].mean()
        encoded[apply_idx] = train_x[col].iloc[apply_idx].map(fold_means)

    # Replace the original column with its encoded values.
    train_x[col] = encoded

In [None]:
# -----------------------------------
# Standardisation
# -----------------------------------
from sklearn.preprocessing import StandardScaler

num_cols = ['Age', 'Fare']

# Learn the scaling parameters from the training data only.
scaler = StandardScaler().fit(train_x[num_cols])

# Replace the columns of both frames with their standardised values.
for frame in (train_x, test_x):
    frame[num_cols] = scaler.transform(frame[num_cols])


In [None]:
# DataFrame.info() writes its summary to stdout itself and returns None,
# so wrapping it in print() only adds a stray "None" line to the output.
train_x.info()
test_x.info()

In [None]:
# # PCA
# from sklearn.decomposition import PCA

# # データは標準化などのスケールを揃える前処理が行われているものとする

# # 学習データに基づいてPCAによる変換を定義
# pca = PCA(n_components=5)
# pca.fit(train_x)

# # 変換の適用
# train_x = pca.transform(train_x)
# test_x = pca.transform(test_x)

In [None]:
# Split the training data into a training part and a validation part.
from sklearn.model_selection import KFold

kf = KFold(n_splits=4, shuffle=True, random_state=71)
# Only the first of the four folds is used as the hold-out split.
tr_idx, va_idx = next(kf.split(train_x))
tr_x, tr_y = train_x.iloc[tr_idx], train_y.iloc[tr_idx]
va_x, va_y = train_x.iloc[va_idx], train_y.iloc[va_idx]

In [None]:
import xgboost as xgb
from sklearn.metrics import log_loss

# Convert the features and the target into XGBoost's data structure.
dtrain = xgb.DMatrix(tr_x, label=tr_y)
dvalid = xgb.DMatrix(va_x, label=va_y)
dtest = xgb.DMatrix(test_x)

# Hyperparameters.
# 'silent' was removed from XGBoost in favour of 'verbosity' (0 = silent);
# passing 'silent' only triggers an "unused parameter" warning.
params = {'objective': 'binary:logistic', 'verbosity': 0, 'random_state': 71}
num_round = 50

# Train the model.
# The validation data is passed as well so the score can be monitored per
# boosting round; the watchlist holds both the training and validation sets.
watchlist = [(dtrain, 'train'), (dvalid, 'eval')]
model = xgb.train(params, dtrain, num_round, evals=watchlist)

# Score on the validation data.
va_pred = model.predict(dvalid)
score = log_loss(va_y, va_pred)
print(f'logloss: {score:.4f}')

# Predict on the test data (probability of class 1, not a binary label).
pred = model.predict(dtest)

In [None]:
# Train an XGBoost classifier on the full training data.
from xgboost import XGBClassifier

model = XGBClassifier(random_state=1)
# Kept as the last expression so the notebook displays the fitted model.
model.fit(X=train_x, y=train_y)

In [None]:
# Predict with the trained model.
from sklearn.metrics import accuracy_score

# Class probabilities for the training and test features.
ptrn = model.predict_proba(train_x)
ptst = model.predict_proba(test_x)

# Predicted label = index of the most probable class in each row.
ltrn = ptrn.argmax(axis=1)
ltst = ptst.argmax(axis=1)

for shown in (ptrn, ltrn, train_y):
    print(shown)

# Accuracy on the training data itself (not a hold-out estimate).
acctrn = accuracy_score(train_y, ltrn)
print("XGBoost training accuracy: %f" % acctrn)

In [None]:
# Create the submission file.
csvname = 'submission_xgboost.csv'
print(csvname)

# Same index column as the sample submission, paired with the predicted test labels.
submission_columns = {
    sub_idxcol: sub_sample[sub_idxcol],
    sub_objcol: ltst,
}
submission = pd.DataFrame(submission_columns)
submission.to_csv(csvname, index=False)
print(submission)