# ベースラインの作成

In [3]:
import numpy as np
import pandas as pd
import os
import pickle
import gc
# import pandas_profiling as pdg # 分布確認
# from ydata_profiling import ProfileReport # 分布確認
import matplotlib.pyplot as plt # 可視化
# 前処理
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder, OneHotEncoder
# モデリング
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix

import warnings
warnings.filterwarnings('ignore')


In [4]:
# matplotlibで日本語表示したい場合はコレをinstall
! pip install japanize-matplotlib
import japanize_matplotlib
%matplotlib inline

Collecting japanize-matplotlib
  Downloading japanize-matplotlib-1.1.3.tar.gz (4.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.1/4.1 MB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: japanize-matplotlib
  Building wheel for japanize-matplotlib (setup.py) ... [?25l[?25hdone
  Created wheel for japanize-matplotlib: filename=japanize_matplotlib-1.1.3-py3-none-any.whl size=4120257 sha256=44278922928f3a70a8ce7afc9f0e36ecdcb7f4ca5ea24f29568ce332d76ce491
  Stored in directory: /root/.cache/pip/wheels/61/7a/6b/df1f79be9c59862525070e157e62b08eab8ece27c1b68fbb94
Successfully built japanize-matplotlib
Installing collected packages: japanize-matplotlib
Successfully installed japanize-matplotlib-1.1.3


In [5]:
df_train = pd.read_csv('/kaggle/input/titanic/train.csv')
df_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [6]:
print(df_train.shape)
print('レコード数：',len(df_train))
print('カラム数： ', len(df_train.columns))

(891, 12)
レコード数： 891
カラム数：  12


In [7]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


#### データ型の変換

In [8]:
print('-'*10, 'object型に変換', '-'*10)
df_train['Pclass'] = df_train['Pclass'].astype(object)
print(df_train[['Pclass']].info())

print('-'*10, 'もとの型に変換', '-'*10)
df_train['Pclass'] = df_train['Pclass'].astype(int)
print(df_train[['Pclass']].info())




---------- object型に変換 ----------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Pclass  891 non-null    object
dtypes: object(1)
memory usage: 7.1+ KB
None
---------- もとの型に変換 ----------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   Pclass  891 non-null    int64
dtypes: int64(1)
memory usage: 7.1 KB
None


#### 欠損値の確認

In [9]:
df_train.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

#### データセットの作成
目的変数と説明変数を作成

例）
目的変数：Survived
説明変数：Pclass, Fare(ともに欠損値がなく、数値データだから)



In [10]:
X_train, y_train, id_train = df_train[['Pclass', 'Fare']], df_train[['Survived']], df_train[['PassengerId']]

print(X_train.shape, y_train.shape, id_train.shape)

(891, 2) (891, 1) (891, 1)


#### バリデーション設計
作成するモデルの精度を手元のデータで判断すること
#### ■ホールドアウト検証
-> trainデータを「学習データ」と「検証データ」に分割する
【メリット】
- モデル学習一回のみ

【デメリット】
- 学習に使わないデータが生じる
- 検証データの選び方によって評価値が偏る可能性がある

In [16]:
# va:validationの略
X_tr, X_va, y_tr, y_va = train_test_split(X_train,
                                          y_train,
                                          test_size=0.2,
                                          shuffle=True,
                                          stratify=y_train,
                                          random_state=42)

print(X_tr.shape, y_tr.shape)
print(X_va.shape, y_va.shape)
print(f"y_train: {y_train['Survived'].mean():.3f}")
print(f"y_tr: {y_tr['Survived'].mean():.3f}")
print(f"y_va: {y_va['Survived'].mean():.3f}")

(712, 2) (712, 1)
(179, 2) (179, 1)
y_train: 0.384
y_tr: 0.383
y_va: 0.385


#### クロスバリデーション(交差検証)
-> trainデータから、複数の「学習データ」と「検証データ」の組を作成する
【メリット】
- すべてのデータを学習に利用できる
- 検証データの選び型による偏りを抑制できる
【デメリット】
- モデル学習を複数回行う必要がある

In [19]:
# 5fold クロスバリデーション
n_splits = 5
cv = list(StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42).split(X_train, y_train))
print(cv)

for nfold in np.arange(n_splits):
    print('-'*20, nfold, '-'*20)
    idx_tr, idx_va = cv[nfold][0], cv[nfold][1]
    X_tr, y_tr = X_train.loc[idx_tr, :], y_train.loc[idx_tr, :]
    X_va, y_va = X_train.loc[idx_va, :], y_train.loc[idx_va, :]
    print(X_tr.shape, y_tr.shape)
    print(X_va.shape, y_va.shape)
    print(f"y_train: {y_train['Survived'].mean():.3f}")
    print(f"y_tr: {y_tr['Survived'].mean():.3f}")
    print(f"y_va: {y_va['Survived'].mean():.3f}")

[(array([  0,   1,   2,   3,   5,   7,   8,   9,  10,  11,  12,  14,  15,
        16,  17,  18,  20,  21,  22,  23,  25,  27,  29,  32,  33,  35,
        36,  37,  38,  39,  40,  41,  42,  44,  46,  47,  48,  49,  50,
        51,  52,  54,  56,  57,  58,  59,  61,  62,  65,  67,  69,  70,
        71,  72,  73,  74,  76,  77,  78,  80,  81,  82,  83,  84,  85,
        86,  87,  88,  89,  91,  92,  93,  95,  96,  97,  98, 100, 101,
       102, 104, 106, 107, 109, 110, 111, 112, 115, 116, 117, 119, 120,
       121, 122, 123, 124, 125, 127, 129, 130, 132, 133, 135, 136, 137,
       138, 139, 140, 142, 144, 145, 149, 151, 152, 154, 156, 157, 158,
       159, 160, 161, 162, 163, 165, 166, 167, 168, 169, 171, 172, 175,
       178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190,
       191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 204,
       205, 206, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218,
       219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 