In [1]:
!pwd
%matplotlib inline
%reload_ext autoreload
%autoreload 2
import sys
sys.executable

/81908/jupyter_notebook/Dockerfiles/pycaret/xfeat/xfeat_titanic


'/usr/local/bin/python3.7'

# タイタニックのデータを使って xfeat で特徴量作成 + lightGBM で学習をしてみた
- https://megane-man666.hatenablog.com/entry/xfeat
- https://github.com/KJMAN678/xfeat_titanic/blob/master/titanic_xfeat.ipynb

<br>

## xfeat: pnfが作った特徴量エンジニアリングのライブラリ
- データフレームから特徴量を作成するための各種エンコーダーを実装しています。cuDF を使うことでエンコーダーによっては 10~30 倍の高速化が可能となります。
- Code: https://t.co/IbqRET9YA2 
- Slides: https://t.co/8CY0IdCuJM pic.twitter.com/xM5HxRMQtj

<br>

## xfeat触ってみた感想
- カテゴリ列のラベルエンコーディング や 列同士を足し算、掛け算して新しい列作るとかがsklearnのpipelineのようにできるのはいい
- cuDFのGPU使った高速化は試せてない
- GBDTやOptuna用のAPIもあるが試せてない

In [2]:
import pandas as pd
import numpy as np
import os

from IPython.display import display

import xfeat
from xfeat import SelectCategorical, LabelEncoder, Pipeline, ConcatCombination, SelectNumerical, ArithmeticCombinations,  LambdaEncoder

import pickle

from sklearn.model_selection import train_test_split

# lightGBM
import lightgbm as lgb

# 平均平方根誤差 (MSE)
from sklearn.metrics import mean_squared_error

import seaborn as sns
df = sns.load_dataset("titanic")
print(df.shape)
df_train, df_test = train_test_split(df, test_size=0.3)
print(df_train.shape, df_test.shape)
df.head()

(891, 15)
(623, 15) (268, 15)


Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [3]:
# reset_index()しないとto_feather()エラーになる
df_train = df_train.reset_index()
df_test = df_test.reset_index()

# データフレームをfeather形式に変換
# feather形式は、C++で実装されており、読み込みが非常に高速なファイルらしい
path = os.getcwd() + "/"
xfeat.utils.compress_df(df_train).to_feather(path + "train" + ".ftr")
xfeat.utils.compress_df(df_test).to_feather(path + "test" + ".ftr")

In [4]:
# feather形式のデータを読込
dtrain = pd.read_feather("./train.ftr")
dtest = pd.read_feather("./test.ftr")

display(dtrain.head(3))
display(dtrain.tail(3))
display(dtrain.dtypes)
display(dtest.head(3))
display(dtest.tail(3))
display(dtest.dtypes)

Unnamed: 0,index,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,171,0,3,male,4.0,4,1,29.125,Q,Third,child,False,,Queenstown,no,False
1,793,0,1,male,,0,0,30.695801,C,First,man,True,,Cherbourg,no,True
2,292,0,2,male,36.0,0,0,12.875,C,Second,man,True,D,Cherbourg,no,True


Unnamed: 0,index,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
620,194,1,1,female,44.0,0,0,27.7208,C,First,woman,False,B,Cherbourg,yes,True
621,495,0,3,male,,0,0,14.4583,C,Third,man,True,,Cherbourg,no,True
622,882,0,3,female,22.0,0,0,10.5167,S,Third,woman,False,,Southampton,no,True


index             int16
survived           int8
pclass             int8
sex              object
age             float32
sibsp              int8
parch              int8
fare            float32
embarked         object
class          category
who              object
adult_male         bool
deck           category
embark_town      object
alive            object
alone              bool
dtype: object

Unnamed: 0,index,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,835,1,1,female,39.0,1,1,83.158302,C,First,woman,False,E,Cherbourg,yes,False
1,343,0,2,male,25.0,0,0,13.0,S,Second,man,True,,Southampton,no,True
2,707,1,1,male,42.0,0,0,26.2875,S,First,man,True,E,Southampton,yes,True


Unnamed: 0,index,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
265,805,0,3,male,31.0,0,0,7.775,S,Third,man,True,,Southampton,no,True
266,697,1,3,female,,0,0,7.7333,Q,Third,woman,False,,Queenstown,yes,True
267,574,0,3,male,16.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


index             int16
survived           int8
pclass             int8
sex              object
age             float32
sibsp              int8
parch              int8
fare            float32
embarked         object
class          category
who              object
adult_male         bool
deck           category
embark_town      object
alive            object
alone              bool
dtype: object

## xfeat で実装されているラベルエンコーディング etc...

In [5]:
# カテゴリデータ（object）のみ表示　SelectCategorical().fit_transform()
display(SelectCategorical().fit_transform(dtrain).head())
display(SelectCategorical().fit_transform(dtest).head())

Unnamed: 0,sex,embarked,class,who,deck,embark_town,alive
0,male,Q,Third,child,,Queenstown,no
1,male,C,First,man,,Cherbourg,no
2,male,C,Second,man,D,Cherbourg,no
3,male,C,Third,child,,Cherbourg,no
4,female,S,Second,woman,,Southampton,yes


Unnamed: 0,sex,embarked,class,who,deck,embark_town,alive
0,female,C,First,woman,E,Cherbourg,yes
1,male,S,Second,man,,Southampton,no
2,male,S,First,man,E,Southampton,yes
3,male,Q,First,man,C,Queenstown,no
4,male,S,Second,child,,Southampton,yes


In [6]:
# ラベルエンコーディング

# Takes categorical columns from the data frame and performs label encoding on them.
# The converted data is stored in the column with suffix defined in `output_suffix`.
# By defining `output_suffix=""`, it is possible to store the result in the same column.

# データフレームからカテゴリ型の列を取り出し、ラベルエンコーディングを行います。
# 変換されたデータは `output_suffix` で定義されたサフィックスを持つ列に格納されます。
# output_suffix=""`を定義することで、同じカラムに結果を格納することができます。

encoder1 = Pipeline([
    # 除外するカラムをexclude_colsに指定
    SelectCategorical(exclude_cols=["alive"]),
    # ラベルエンコーディング
    LabelEncoder(output_suffix=""),
])

_dtrain = encoder1.fit_transform(dtrain).head()
_dtest = encoder1.fit_transform(dtest).head()
display(_dtrain)
display(_dtest)

Unnamed: 0,sex,embarked,class,who,deck,embark_town
0,0,0,0,0,-1,0
1,0,1,1,1,-1,1
2,0,1,2,1,0,1
3,0,1,0,0,-1,1
4,1,2,2,2,-1,2


Unnamed: 0,sex,embarked,class,who,deck,embark_town
0,0,0,0,0,0,0
1,1,1,1,1,-1,1
2,1,1,0,1,0,1
3,1,2,0,1,1,2
4,1,1,1,2,-1,1


In [7]:
encoder2 = Pipeline([
    # SelectCategorical() カテゴリカルデータ(object)のみをラベルエンコーディング
    # 除外するカラムをexclude_colsに指定
    SelectCategorical(exclude_cols=["alive"]),
    LabelEncoder(output_suffix=""),

    # If there are many categorical columns,
    # users can specify the columns to be combined with `input_cols` kwargs.
    # `r=2` specifies the number of columns to combine the columns.
    
    # カテゴリ列が多数存在する場合.
    # ユーザーは `input_cols` kwargs で結合するカラムを指定することができます。
    # `r=2` は結合するカラムの数を指定します。
    # ConcatCombination() 複数のカテゴリカルデータを組み合わせてラベルエンコーディング
    # 4つの項目から2つを選ぶ場合は　4C2 = 6　6通りが出力される
    ConcatCombination(drop_origin=True, output_suffix="", r=2),
])

display(encoder2.fit_transform(dtrain).head())
display(encoder2.fit_transform(dtest).head())

Unnamed: 0,sexembarked,sexclass,sexwho,sexdeck,sexembark_town,embarkedclass,embarkedwho,embarkeddeck,embarkedembark_town,classwho,classdeck,classembark_town,whodeck,whoembark_town,deckembark_town
0,0,0,0,-1,0,0,0,-1,0,0,-1,0,-1,0,-1
1,1,1,1,-1,1,2,2,0,2,2,0,2,0,2,0
2,1,2,1,0,1,3,2,1,2,3,2,3,1,2,1
3,1,0,0,-1,1,1,1,0,2,0,-1,1,-1,1,0
4,3,3,3,0,3,4,4,1,4,4,1,4,1,4,1


Unnamed: 0,sexembarked,sexclass,sexwho,sexdeck,sexembark_town,embarkedclass,embarkedwho,embarkeddeck,embarkedembark_town,classwho,classdeck,classembark_town,whodeck,whoembark_town,deckembark_town
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,2,2,2,0,2,2,2,0,2,2,0,2,0,2,0
2,2,1,2,1,2,1,2,1,2,1,0,1,1,2,1
3,3,1,2,2,3,2,3,3,4,1,1,2,2,3,3
4,2,2,3,0,2,2,3,0,2,3,0,2,1,3,0


In [8]:
# SelectNumerical extracts only the column of numerical data from the input dataframe.
# 入力データフレームから数値データの列のみを抽出します

# exclude_cols で出力しないカラムを指定

encoder3_1 = Pipeline([
    # 除外するカラムをexclude_colsに指定
    SelectNumerical(exclude_cols=["index", "survived"]),
    LabelEncoder(output_suffix=""),
])

encoder3_2 = Pipeline([
    # 除外するカラムをexclude_colsに指定
    SelectNumerical(exclude_cols=["index"]),
    LabelEncoder(output_suffix=""),
])

display(encoder3_1.fit_transform(dtrain).head())
# test.csvは Survived がないのでencoder3_1を適用するとエラーが発生する
display(encoder3_2.fit_transform(dtest).head())

Unnamed: 0,pclass,age,sibsp,parch,fare,adult_male,alone
0,0,0,0,0,0,0,0
1,1,-1,1,1,1,1,1
2,2,1,1,1,2,1,1
3,0,2,1,1,3,0,1
4,2,3,1,0,4,0,0


Unnamed: 0,survived,pclass,age,sibsp,parch,fare,adult_male,alone
0,0,0,0,0,0,0,0,0
1,1,1,1,1,1,1,1,1
2,0,0,2,1,1,2,1,1
3,1,0,3,2,1,3,1,0
4,0,1,4,0,0,4,0,0


In [9]:
# xfeat.ArithmeticCombinations　creates new columns by applying arithmetic combinations.
# 算術的な組み合わせを適用して新しい列を作成します。

encoder4_1 = Pipeline([
    SelectNumerical(exclude_cols=["index", "survived"]),
    # ArithmeticCombinations() 複数の数値データを加算して新しい項目を作る
    ArithmeticCombinations(
        drop_origin=True,
        operator="+",
        r=2,
        output_suffix="",
    ),
])

encoder4_2 = Pipeline([
    SelectNumerical(exclude_cols=["index"]),
    ArithmeticCombinations(
        drop_origin=True,
        operator="+",
        r=2,
        output_suffix="",
    ),
])

display(encoder4_1.fit_transform(dtrain).head())
display(encoder4_2.fit_transform(dtest).head())


evaluating in Python space because the '+' operator is not supported by numexpr for the bool dtype, use '|' instead



Unnamed: 0,pclassage,pclasssibsp,pclassparch,pclassfare,pclassadult_male,pclassalone,agesibsp,ageparch,agefare,ageadult_male,...,sibspparch,sibspfare,sibspadult_male,sibspalone,parchfare,parchadult_male,parchalone,fareadult_male,farealone,adult_malealone
0,7.0,7,4,32.125,3,3,8.0,5.0,33.125,4.0,...,5,33.125,4,4,30.125,1,1,29.125,29.125,False
1,,1,1,31.695801,2,2,,,,,...,0,30.695801,1,1,30.695801,1,1,31.695801,31.695801,True
2,38.0,2,2,14.875,3,3,36.0,36.0,48.875,37.0,...,0,12.875,1,1,12.875,1,1,13.875,13.875,True
3,14.0,3,3,21.7875,3,4,11.0,11.0,29.7875,11.0,...,0,18.7875,0,1,18.7875,0,1,18.7875,19.7875,True
4,36.0,2,3,25.0,2,2,34.0,35.0,57.0,34.0,...,1,23.0,0,0,24.0,1,1,23.0,23.0,False



evaluating in Python space because the '+' operator is not supported by numexpr for the bool dtype, use '|' instead



Unnamed: 0,survivedpclass,survivedage,survivedsibsp,survivedparch,survivedfare,survivedadult_male,survivedalone,pclassage,pclasssibsp,pclassparch,...,sibspparch,sibspfare,sibspadult_male,sibspalone,parchfare,parchadult_male,parchalone,fareadult_male,farealone,adult_malealone
0,2,40.0,2,2,84.158302,1,1,40.0,2,2,...,2,84.158302,1,1,84.158302,1,1,83.158302,83.158302,False
1,2,25.0,0,0,13.0,1,1,27.0,2,2,...,0,13.0,1,1,13.0,1,1,14.0,14.0,True
2,2,43.0,1,1,27.2875,2,2,43.0,1,1,...,0,26.2875,1,1,26.2875,1,1,27.2875,27.2875,True
3,1,44.0,2,0,90.0,1,0,45.0,3,1,...,2,92.0,3,2,90.0,1,0,91.0,90.0,True
4,3,1.83,2,2,19.75,1,1,2.83,3,3,...,2,19.75,1,1,19.75,1,1,18.75,18.75,False


In [10]:
# Serialize/Deserialize
# The parameters of the encoder can be serialized/deserialized by pickle.
# エンコーダのパラメータをPickleでシリアライズ/デシリアライズすることができます。

encoder7 = Pipeline([
    SelectCategorical(exclude_cols=["alive"]),
    LabelEncoder(output_suffix=""),
])

dtrain_encoded = encoder7.fit_transform(dtrain)

# 書き込み
with open("label_train.pkl", "wb") as f:
    pickle.dump(encoder7, f)
    
dtrain_encoded.head()

Unnamed: 0,sex,embarked,class,who,deck,embark_town
0,0,0,0,0,-1,0
1,0,1,1,1,-1,1
2,0,1,2,1,0,1
3,0,1,0,0,-1,1
4,1,2,2,2,-1,2


In [11]:
# 読込み
with open("label_train.pkl", "rb") as f:
    encoder7 = pickle.load(f)

encoder7.transform(dtrain).head()

Unnamed: 0,sex,embarked,class,who,deck,embark_town
0,0,0,0,0,-1,0
1,0,1,1,1,-1,1
2,0,1,2,1,0,1
3,0,1,0,0,-1,1
4,1,2,2,2,-1,2


In [12]:
encoder8 = Pipeline([
    SelectCategorical(exclude_cols=["alive"]),
    LabelEncoder(output_suffix=""),
])

dtest_encoded2 = encoder8.fit_transform(dtest)

# 書き込み
with open("label_test.pkl", "wb") as f:
    pickle.dump(encoder8, f)
    
dtest_encoded2.head()

Unnamed: 0,sex,embarked,class,who,deck,embark_town
0,0,0,0,0,0,0
1,1,1,1,1,-1,1
2,1,1,0,1,0,1
3,1,2,0,1,1,2
4,1,1,1,2,-1,1


In [13]:
# 読込み
with open("label_test.pkl", "rb") as f:
    encoder8 = pickle.load(f)

encoder8.transform(dtest).head()

Unnamed: 0,sex,embarked,class,who,deck,embark_town
0,0,0,0,0,0,0
1,1,1,1,1,-1,1
2,1,1,0,1,0,1
3,1,2,0,1,1,2
4,1,1,1,2,-1,1


In [14]:
# SelectCategorical
dtrain_en1 = encoder1.fit_transform(dtrain)
dtest_en1 = encoder1.fit_transform(dtest)

# SelectCategorical + ConcatCombination
dtrain_en2 = encoder2.fit_transform(dtrain)
dtest_en2 = encoder2.fit_transform(dtest)

# SelectNumerical
dtrain_en3_1 = encoder3_1.fit_transform(dtrain)
dtest_en3_2 = encoder3_2.fit_transform(dtest)

# SelectNumerical + ArithmeticCombinations
dtrain_en4_1 = encoder4_1.fit_transform(dtrain)
dtest_en4_2 = encoder4_2.fit_transform(dtest)

display(dtrain_en1.head())
display(dtest_en1.head())

display(dtrain_en2.head())
display(dtest_en2.head())

display(dtrain_en3_1.head())
display(dtest_en3_2.head())

display(dtrain_en4_1.head())
display(dtest_en4_2.head())


evaluating in Python space because the '+' operator is not supported by numexpr for the bool dtype, use '|' instead



Unnamed: 0,sex,embarked,class,who,deck,embark_town
0,0,0,0,0,-1,0
1,0,1,1,1,-1,1
2,0,1,2,1,0,1
3,0,1,0,0,-1,1
4,1,2,2,2,-1,2


Unnamed: 0,sex,embarked,class,who,deck,embark_town
0,0,0,0,0,0,0
1,1,1,1,1,-1,1
2,1,1,0,1,0,1
3,1,2,0,1,1,2
4,1,1,1,2,-1,1


Unnamed: 0,sexembarked,sexclass,sexwho,sexdeck,sexembark_town,embarkedclass,embarkedwho,embarkeddeck,embarkedembark_town,classwho,classdeck,classembark_town,whodeck,whoembark_town,deckembark_town
0,0,0,0,-1,0,0,0,-1,0,0,-1,0,-1,0,-1
1,1,1,1,-1,1,2,2,0,2,2,0,2,0,2,0
2,1,2,1,0,1,3,2,1,2,3,2,3,1,2,1
3,1,0,0,-1,1,1,1,0,2,0,-1,1,-1,1,0
4,3,3,3,0,3,4,4,1,4,4,1,4,1,4,1


Unnamed: 0,sexembarked,sexclass,sexwho,sexdeck,sexembark_town,embarkedclass,embarkedwho,embarkeddeck,embarkedembark_town,classwho,classdeck,classembark_town,whodeck,whoembark_town,deckembark_town
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,2,2,2,0,2,2,2,0,2,2,0,2,0,2,0
2,2,1,2,1,2,1,2,1,2,1,0,1,1,2,1
3,3,1,2,2,3,2,3,3,4,1,1,2,2,3,3
4,2,2,3,0,2,2,3,0,2,3,0,2,1,3,0


Unnamed: 0,pclass,age,sibsp,parch,fare,adult_male,alone
0,0,0,0,0,0,0,0
1,1,-1,1,1,1,1,1
2,2,1,1,1,2,1,1
3,0,2,1,1,3,0,1
4,2,3,1,0,4,0,0


Unnamed: 0,survived,pclass,age,sibsp,parch,fare,adult_male,alone
0,0,0,0,0,0,0,0,0
1,1,1,1,1,1,1,1,1
2,0,0,2,1,1,2,1,1
3,1,0,3,2,1,3,1,0
4,0,1,4,0,0,4,0,0


Unnamed: 0,pclassage,pclasssibsp,pclassparch,pclassfare,pclassadult_male,pclassalone,agesibsp,ageparch,agefare,ageadult_male,...,sibspparch,sibspfare,sibspadult_male,sibspalone,parchfare,parchadult_male,parchalone,fareadult_male,farealone,adult_malealone
0,7.0,7,4,32.125,3,3,8.0,5.0,33.125,4.0,...,5,33.125,4,4,30.125,1,1,29.125,29.125,False
1,,1,1,31.695801,2,2,,,,,...,0,30.695801,1,1,30.695801,1,1,31.695801,31.695801,True
2,38.0,2,2,14.875,3,3,36.0,36.0,48.875,37.0,...,0,12.875,1,1,12.875,1,1,13.875,13.875,True
3,14.0,3,3,21.7875,3,4,11.0,11.0,29.7875,11.0,...,0,18.7875,0,1,18.7875,0,1,18.7875,19.7875,True
4,36.0,2,3,25.0,2,2,34.0,35.0,57.0,34.0,...,1,23.0,0,0,24.0,1,1,23.0,23.0,False


Unnamed: 0,survivedpclass,survivedage,survivedsibsp,survivedparch,survivedfare,survivedadult_male,survivedalone,pclassage,pclasssibsp,pclassparch,...,sibspparch,sibspfare,sibspadult_male,sibspalone,parchfare,parchadult_male,parchalone,fareadult_male,farealone,adult_malealone
0,2,40.0,2,2,84.158302,1,1,40.0,2,2,...,2,84.158302,1,1,84.158302,1,1,83.158302,83.158302,False
1,2,25.0,0,0,13.0,1,1,27.0,2,2,...,0,13.0,1,1,13.0,1,1,14.0,14.0,True
2,2,43.0,1,1,27.2875,2,2,43.0,1,1,...,0,26.2875,1,1,26.2875,1,1,27.2875,27.2875,True
3,1,44.0,2,0,90.0,1,0,45.0,3,1,...,2,92.0,3,2,90.0,1,0,91.0,90.0,True
4,3,1.83,2,2,19.75,1,1,2.83,3,3,...,2,19.75,1,1,19.75,1,1,18.75,18.75,False


## データ結合

In [15]:
train = pd.concat([dtrain_en1, dtrain_en2], axis=1)
train = pd.concat([train, dtrain_en3_1], axis=1)
train = pd.concat([train, dtrain_en4_1], axis=1)
train = pd.concat([train, dtrain["survived"]], axis=1)

test = pd.concat([dtest_en1, dtest_en2], axis=1)
test = pd.concat([test, dtest_en3_2], axis=1)
test = pd.concat([test, dtest_en4_2], axis=1)

display(train.head())
display(train.shape)

display(test.head())
display(test.shape)

Unnamed: 0,sex,embarked,class,who,deck,embark_town,sexembarked,sexclass,sexwho,sexdeck,...,sibspfare,sibspadult_male,sibspalone,parchfare,parchadult_male,parchalone,fareadult_male,farealone,adult_malealone,survived
0,0,0,0,0,-1,0,0,0,0,-1,...,33.125,4,4,30.125,1,1,29.125,29.125,False,0
1,0,1,1,1,-1,1,1,1,1,-1,...,30.695801,1,1,30.695801,1,1,31.695801,31.695801,True,0
2,0,1,2,1,0,1,1,2,1,0,...,12.875,1,1,12.875,1,1,13.875,13.875,True,0
3,0,1,0,0,-1,1,1,0,0,-1,...,18.7875,0,1,18.7875,0,1,18.7875,19.7875,True,0
4,1,2,2,2,-1,2,3,3,3,0,...,23.0,0,0,24.0,1,1,23.0,23.0,False,1


(623, 50)

Unnamed: 0,sex,embarked,class,who,deck,embark_town,sexembarked,sexclass,sexwho,sexdeck,...,sibspparch,sibspfare,sibspadult_male,sibspalone,parchfare,parchadult_male,parchalone,fareadult_male,farealone,adult_malealone
0,0,0,0,0,0,0,0,0,0,0,...,2,84.158302,1,1,84.158302,1,1,83.158302,83.158302,False
1,1,1,1,1,-1,1,2,2,2,0,...,0,13.0,1,1,13.0,1,1,14.0,14.0,True
2,1,1,0,1,0,1,2,1,2,1,...,0,26.2875,1,1,26.2875,1,1,27.2875,27.2875,True
3,1,2,0,1,1,2,3,1,2,2,...,2,92.0,3,2,90.0,1,0,91.0,90.0,True
4,1,1,1,2,-1,1,2,2,3,0,...,2,19.75,1,1,19.75,1,1,18.75,18.75,False


(268, 57)

## lightGBM

In [16]:
features = test.columns.to_list()
features = list(set(features) - 
                set(['survivedadult_male', 
                 'survivedalone', 
                 'survivedage', 
                 'survivedsibsp', 
                 'survivedpclass', 
                 'survivedparch', 
                 'survivedfare'])
               )
print(features)

['classdeck', 'parch', 'embarkedembark_town', 'agealone', 'fare', 'pclassparch', 'sex', 'alone', 'class', 'sibsp', 'sibspparch', 'embarked', 'sexembarked', 'sibspadult_male', 'ageadult_male', 'agefare', 'whoembark_town', 'pclassadult_male', 'pclassage', 'sexwho', 'whodeck', 'fareadult_male', 'embark_town', 'classwho', 'embarkedclass', 'parchfare', 'sexclass', 'age', 'sibspalone', 'parchalone', 'adult_malealone', 'ageparch', 'farealone', 'pclasssibsp', 'embarkeddeck', 'deckembark_town', 'classembark_town', 'sibspfare', 'pclassalone', 'deck', 'adult_male', 'embarkedwho', 'pclassfare', 'who', 'pclass', 'sexdeck', 'parchadult_male', 'survived', 'agesibsp', 'sexembark_town']


In [17]:
params = {
    'boosting_type': 'gbdt',
    'metric': 'rmse',
    'objective': 'binary', # 二項分類
    'n_jobs': -1,
    'seed': 236,
    'learning_rate': 0.1,
    'bagging_fraction': 0.75,
    'bagging_freq': 10, 
    'colsample_bytree': 0.75
}

# ホールドアウト法
X_train, X_test, y_train, y_test = train_test_split(train[features], 
                                                    train["survived"], 
                                                    random_state=42)

# lightGBM 用のデータに変形
lgb_train = lgb.Dataset(X_train, y_train)
lgb_val = lgb.Dataset(X_test, y_test)

# モデルの作成
model_lgb = lgb.train(params, 
                      lgb_train, 
                      num_boost_round=2500, 
                      early_stopping_rounds = 50, 
                      valid_sets = [lgb_train, lgb_val], 
                      verbose_eval = 1)

[1]	training's rmse: 0.461017	valid_1's rmse: 0.457756
Training until validation scores don't improve for 50 rounds
[2]	training's rmse: 0.414909	valid_1's rmse: 0.41201
[3]	training's rmse: 0.373839	valid_1's rmse: 0.3713
[4]	training's rmse: 0.357723	valid_1's rmse: 0.358182
[5]	training's rmse: 0.322665	valid_1's rmse: 0.323367
[6]	training's rmse: 0.291238	valid_1's rmse: 0.292186
[7]	training's rmse: 0.26294	valid_1's rmse: 0.26397
[8]	training's rmse: 0.237477	valid_1's rmse: 0.238557
[9]	training's rmse: 0.214536	valid_1's rmse: 0.215639
[10]	training's rmse: 0.206102	valid_1's rmse: 0.209299
[11]	training's rmse: 0.199046	valid_1's rmse: 0.204439
[12]	training's rmse: 0.179907	valid_1's rmse: 0.185031
[13]	training's rmse: 0.174771	valid_1's rmse: 0.182043
[14]	training's rmse: 0.158061	valid_1's rmse: 0.165008
[15]	training's rmse: 0.153374	valid_1's rmse: 0.162853
[16]	training's rmse: 0.150016	valid_1's rmse: 0.162534
[17]	training's rmse: 0.135746	valid_1's rmse: 0.147522
[

In [18]:
# validationデータで予測値を作成
val_pred = model_lgb.predict(X_test)

# 誤差率を算定
val_score = np.sqrt(mean_squared_error(val_pred, y_test))
print(f'rmse score は {val_score}')

rmse score は 2.6046483934300115e-05


In [19]:
# テストデータで予測値を作成
pred = model_lgb.predict(test[features])
pred

array([1.60771327e-05, 9.99982371e-01, 7.01886230e-06, 9.99985481e-01,
       9.78737590e-05, 9.99984293e-01, 9.99993613e-01, 3.52908219e-05,
       8.10040288e-06, 9.99965793e-01, 3.10404288e-05, 9.99984841e-01,
       9.99986031e-01, 9.99991451e-01, 9.99977607e-01, 9.99989814e-01,
       8.03505178e-06, 3.46670784e-05, 9.99991886e-01, 1.89960207e-05,
       9.99977525e-01, 9.99973385e-01, 9.99972882e-01, 2.24881549e-05,
       9.99988233e-01, 9.99978617e-01, 1.88669296e-05, 2.46972242e-05,
       9.99979002e-01, 9.99972859e-01, 9.99992405e-01, 9.99967532e-01,
       9.99977690e-01, 2.61181018e-05, 9.99986304e-01, 1.50452447e-05,
       1.87552039e-05, 9.99986730e-01, 1.50181224e-05, 9.99981629e-01,
       2.76451805e-05, 9.99988158e-01, 1.31866164e-05, 2.50474579e-05,
       9.99979436e-01, 2.92654776e-05, 9.99973276e-01, 2.79465547e-05,
       9.99990719e-01, 9.99969619e-01, 9.99983591e-01, 3.94123261e-05,
       9.99984372e-01, 9.99979897e-01, 1.44592768e-05, 9.99975062e-01,
      

In [21]:
## gender_submission の Survived カラムを予測値に入れ替える
#gs["Survived"] = pred
#
## 0.6以上の場合は 1、0.6未満の場合は 0 に置換
#gs["Survived"].mask(gs["Survived"] >= 0.6, 1, inplace=True)
#gs["Survived"].mask(gs["Survived"] < 0.6, 0, inplace=True)
#
## csvへの出力
#gs.to_csv(path + "submission.csv", index=False)

## https://github.com/pfnet-research/xfeat

### Feature Engineering
- 変換してfeatherファイルに保存

In [22]:
import pandas as pd
from xfeat import Pipeline, SelectNumerical, ArithmeticCombinations

dtrain = pd.read_feather("./train.ftr")

# 2-order Arithmetic combinations.
Pipeline(
    [
        SelectNumerical(),
        ArithmeticCombinations(
            exclude_cols=["survived"], 
            drop_origin=True, 
            operator="+", 
            r=2,
        ),
    ]
).fit_transform(dtrain).reset_index(
    drop=True
).to_feather(
    "feature_arithmetic_combi2.ftr"
)


evaluating in Python space because the '+' operator is not supported by numexpr for the bool dtype, use '|' instead



### Target Encoding with cuDF/CuPy
- Target Encodingめっちゃ簡単にできる。すごい

In [26]:
from sklearn.model_selection import KFold
from xfeat import TargetEncoder

dtrain = pd.read_feather("./train.ftr")

fold = KFold(n_splits=5, shuffle=False)
encoder = TargetEncoder(input_cols=["embarked", "age", "pclass"], 
                        target_col="survived",
                        fold=fold)

# df = cudf.from_pandas(df)  # if cuDF is available.
df_encoded = encoder.fit_transform(dtrain)
df_encoded


Passing attributes to check_is_fitted is deprecated and will be removed in 0.23. The attributes argument is ignored.



Unnamed: 0,index,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone,embarked_te,age_te,pclass_te
0,171,0,3,male,4.0,4,1,29.125000,Q,Third,child,False,,Queenstown,no,False,0.363636,0.500000,0.237410
1,793,0,1,male,,0,0,30.695801,C,First,man,True,,Cherbourg,no,True,0.543478,0.287129,0.652174
2,292,0,2,male,36.0,0,0,12.875000,C,Second,man,True,D,Cherbourg,no,True,0.543478,0.444444,0.457143
3,731,0,3,male,11.0,0,0,18.787500,C,Third,child,False,,Cherbourg,no,True,0.543478,0.500000,0.237410
4,98,1,2,female,34.0,0,1,23.000000,S,Second,woman,False,,Southampton,yes,False,0.337950,0.333333,0.457143
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
618,510,1,3,male,29.0,0,0,7.750000,Q,Third,man,True,,Queenstown,yes,True,0.413043,0.400000,0.242857
619,319,1,1,female,40.0,1,1,134.500000,C,First,woman,False,E,Cherbourg,yes,False,0.500000,0.142857,0.620690
620,194,1,1,female,44.0,0,0,27.720800,C,First,woman,False,B,Cherbourg,yes,True,0.500000,0.333333,0.620690
621,495,0,3,male,,0,0,14.458300,C,Third,man,True,,Cherbourg,no,True,0.500000,0.310680,0.242857


### Groupby features with cuDF
- xfeat.aggregation 与えられたキーでテーブルの行をグループ化した後に値を集計するヘルパー関数

In [34]:
from xfeat import aggregation

dtrain = pd.read_feather("./train.ftr")

# df = cudf.from_pandas(df)  # if cuDF is available.
df_agg = aggregation(dtrain,
                     group_key="embarked",
                     group_values=["fare", "age"],
                     agg_methods=["sum", "min", "max"]
                     )#.to_pandas()
df_agg[0]

Unnamed: 0,index,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,...,deck,embark_town,alive,alone,agg_sum_fare_grpby_embarked,agg_sum_age_grpby_embarked,agg_min_fare_grpby_embarked,agg_min_age_grpby_embarked,agg_max_fare_grpby_embarked,agg_max_age_grpby_embarked
0,171,0,3,male,4.0,4,1,29.125000,Q,Third,...,,Queenstown,no,False,671.995972,584.000000,6.7500,2.00,90.000000,70.5
1,793,0,1,male,,0,0,30.695801,C,First,...,,Cherbourg,no,True,6452.462891,2520.419922,4.0125,0.42,512.329224,60.0
2,292,0,2,male,36.0,0,0,12.875000,C,Second,...,D,Cherbourg,no,True,6452.462891,2520.419922,4.0125,0.42,512.329224,60.0
3,731,0,3,male,11.0,0,0,18.787500,C,Third,...,,Cherbourg,no,True,6452.462891,2520.419922,4.0125,0.42,512.329224,60.0
4,98,1,2,female,34.0,0,1,23.000000,S,Second,...,,Southampton,yes,False,12341.853516,11251.419922,0.0000,0.67,263.000000,74.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
618,510,1,3,male,29.0,0,0,7.750000,Q,Third,...,,Queenstown,yes,True,671.995972,584.000000,6.7500,2.00,90.000000,70.5
619,319,1,1,female,40.0,1,1,134.500000,C,First,...,E,Cherbourg,yes,False,6452.462891,2520.419922,4.0125,0.42,512.329224,60.0
620,194,1,1,female,44.0,0,0,27.720800,C,First,...,B,Cherbourg,yes,True,6452.462891,2520.419922,4.0125,0.42,512.329224,60.0
621,495,0,3,male,,0,0,14.458300,C,Third,...,,Cherbourg,no,True,6452.462891,2520.419922,4.0125,0.42,512.329224,60.0


### Feature Selection with GBDT feature importance
- GBDT で feature importance計算して、閾値以上の特徴量のみ返す

In [47]:
from xfeat import GBDTFeatureSelector

dtrain = pd.read_feather("./train.ftr")
cols = dtrain.columns.to_list()
print(cols)
cols.remove("survived")

dtrain_cate = Pipeline([
    SelectCategorical(),
    LabelEncoder(output_suffix=""),
]).fit_transform(dtrain)

dtrain_num = Pipeline([
    SelectNumerical(),
    LabelEncoder(output_suffix=""),
]).fit_transform(dtrain)

dtrain = pd.concat([dtrain_cate, dtrain_num], axis=1)
display(dtrain)

params = {
    'boosting_type': 'gbdt',
    'metric': 'rmse',
    "objective": "binary",
    'n_jobs': -1,
    'seed': 236,
    'learning_rate': 0.1,
    'bagging_fraction': 0.75,
    'bagging_freq': 10, 
    'colsample_bytree': 0.75
}

fit_kwargs = {
    "num_boost_round": 10,
}

selector = GBDTFeatureSelector(
    input_cols=cols,
    target_col="survived",
    threshold=0.5,
    lgbm_params=params,
    lgbm_fit_kwargs=fit_kwargs,
)
df_selected = selector.fit_transform(dtrain)
print("Selected columns:", selector._selected_cols)

['index', 'survived', 'pclass', 'sex', 'age', 'sibsp', 'parch', 'fare', 'embarked', 'class', 'who', 'adult_male', 'deck', 'embark_town', 'alive', 'alone']


Unnamed: 0,sex,embarked,class,who,deck,embark_town,alive,index,survived,pclass,age,sibsp,parch,fare,adult_male,alone
0,0,0,0,0,-1,0,0,0,0,0,0,0,0,0,0,0
1,0,1,1,1,-1,1,0,1,0,1,-1,1,1,1,1,1
2,0,1,2,1,0,1,0,2,0,2,1,1,1,2,1,1
3,0,1,0,0,-1,1,0,3,0,0,2,1,1,3,0,1
4,1,2,2,2,-1,2,1,4,1,2,3,1,0,4,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
618,0,0,0,1,-1,0,1,618,1,0,20,1,1,14,1,1
619,1,1,1,2,5,1,1,619,1,1,44,3,0,212,0,0
620,1,1,1,2,2,1,1,620,1,1,58,1,1,142,0,1
621,0,1,0,1,-1,1,0,621,0,0,-1,1,1,184,1,1


Selected columns: ['index', 'fare', 'age', 'adult_male', 'deck', 'pclass', 'alive']


### Feature Selection with Optuna
- 選ばれた特徴量に対してハイパーパラメーターの最良の値を検索して目的を最大化できる

In [49]:
import optuna
from xfeat import GBDTFeatureExplorer
from functools import partial

dtrain = pd.read_feather("./train.ftr")
input_cols = dtrain.columns.to_list()
print(input_cols)
input_cols.remove("survived")

dtrain_cate = Pipeline([
    SelectCategorical(),
    LabelEncoder(output_suffix=""),
]).fit_transform(dtrain)

dtrain_num = Pipeline([
    SelectNumerical(),
    LabelEncoder(output_suffix=""),
]).fit_transform(dtrain)

dtrain = pd.concat([dtrain_cate, dtrain_num], axis=1)
display(dtrain)

LGBM_PARAMS = {
    'boosting_type': 'gbdt',
    'metric': 'rmse',
    "objective": "binary",
    'n_jobs': -1,
    'seed': 236,
    'learning_rate': 0.1,
    'bagging_fraction': 0.75,
    'bagging_freq': 10, 
    'colsample_bytree': 0.75
}

fit_params = {
    "num_boost_round": 10,
    "early_stopping_rounds": 50, 
    "verbose_eval": 1
}

def objective(df, selector, trial):
    selector.set_trial(trial)
    selector.fit(df)
    input_cols = selector.get_selected_cols()

    # Evaluate with selected columns
    train_set = lgb.Dataset(df[input_cols], 
                            label=df["survived"])
    scores = lgb.cv(LGBM_PARAMS, 
                    train_set, 
                    num_boost_round=100, 
                    stratified=False, seed=1)
    rmsle_score = scores["rmse-mean"][-1]
    return rmsle_score


selector = GBDTFeatureExplorer(
    input_cols=input_cols,
    target_col="survived",
    fit_once=True,
    threshold_range=(0.6, 1.0),
    lgbm_params=LGBM_PARAMS,
    lgbm_fit_kwargs=fit_params,
)

study = optuna.create_study(direction="minimize")
study.optimize(partial(objective, 
                       dtrain, 
                       selector), 
               n_trials=20)

selector.from_trial(study.best_trial)
print("Selected columns:", selector.get_selected_cols())

['index', 'survived', 'pclass', 'sex', 'age', 'sibsp', 'parch', 'fare', 'embarked', 'class', 'who', 'adult_male', 'deck', 'embark_town', 'alive', 'alone']


Unnamed: 0,sex,embarked,class,who,deck,embark_town,alive,index,survived,pclass,age,sibsp,parch,fare,adult_male,alone
0,0,0,0,0,-1,0,0,0,0,0,0,0,0,0,0,0
1,0,1,1,1,-1,1,0,1,0,1,-1,1,1,1,1,1
2,0,1,2,1,0,1,0,2,0,2,1,1,1,2,1,1
3,0,1,0,0,-1,1,0,3,0,0,2,1,1,3,0,1
4,1,2,2,2,-1,2,1,4,1,2,3,1,0,4,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
618,0,0,0,1,-1,0,1,618,1,0,20,1,1,14,1,1
619,1,1,1,2,5,1,1,619,1,1,44,3,0,212,0,0
620,1,1,1,2,2,1,1,620,1,1,58,1,1,142,0,1
621,0,1,0,1,-1,1,0,621,0,0,-1,1,1,184,1,1


[W 2020-07-19 15:17:31,106] Setting status of trial#0 as TrialState.FAIL because of the following error: ValueError('For early stopping, at least one dataset and eval metric is required for evaluation')
Traceback (most recent call last):
  File "/usr/local/lib/python3.7/site-packages/optuna/study.py", line 734, in _run_trial
    result = func(trial)
  File "<ipython-input-49-524ba17dc17c>", line 43, in objective
    selector.fit(df)
  File "/usr/local/lib/python3.7/site-packages/xfeat/optuna_selector/_gbdt_feature_explorer.py", line 108, in fit
    super().fit(input_df)
  File "/usr/local/lib/python3.7/site-packages/xfeat/selector/_gbdt_selector.py", line 77, in fit
    self._lgbm_params, train_data, **self._lgbm_fit_kwargs
  File "/usr/local/lib/python3.7/site-packages/lightgbm/engine.py", line 264, in train
    evaluation_result_list=evaluation_result_list))
  File "/usr/local/lib/python3.7/site-packages/lightgbm/callback.py", line 221, in _callback
    _init(env)
  File "/usr/local/

ValueError: For early stopping, at least one dataset and eval metric is required for evaluation

https://github.com/pfnet-research/xfeat/blob/master/examples/feature_selection_with_gbdt_and_optuna.py

In [50]:
"""
This example uses UCI ML California Housing dataset, which is a
regression dataset including 20k samples.

    Dua, D. and Graff, C. (2019). UCI Machine Learning Repository
    [http://archive.ics.uci.edu/ml]. Irvine, CA: University of California, School of
    Information and Computer Science.
"""
from functools import partial

from sklearn.datasets import fetch_california_housing
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import lightgbm as lgb
import optuna

from xfeat import ArithmeticCombinations, Pipeline
from xfeat import GBDTFeatureExplorer


def main():
    data = fetch_california_housing()
    df = pd.DataFrame(
        data=data.data,
        columns=data.feature_names)

    print("Before adding interaction features:")
    evaluate_dataframe(df, data.target)

    print("After adding interaction features:")
    df = feature_engineering(df)
    evaluate_dataframe(df, data.target)

    print("After applying GBDTFeatureSelector:")
    df = feature_selection(df, data.target)
    evaluate_dataframe(df, data.target)


def feature_engineering(df):
    cols = df.columns.tolist()

    encoder = Pipeline([
        ArithmeticCombinations(input_cols=cols,
                               drop_origin=False,
                               operator="+",
                               r=2,
                               output_suffix="_plus"),
        ArithmeticCombinations(input_cols=cols,
                               drop_origin=False,
                               operator="*",
                               r=2,
                               output_suffix="_mul"),
        ArithmeticCombinations(input_cols=cols,
                               drop_origin=False,
                               operator="-",
                               r=2,
                               output_suffix="_minus"),
        ArithmeticCombinations(input_cols=cols,
                               drop_origin=False,
                               operator="+",
                               r=3,
                               output_suffix="_plus"),
    ])
    return encoder.fit_transform(df)


def objective(df, selector, trial):
    selector.set_trial(trial)
    selector.fit(df)
    input_cols = selector.get_selected_cols()

    params = {
        "objective": "regression",
        "metric": "rmse",
        "learning_rate": 0.1,
        "verbosity": -1,
    }

    # Evaluate with selected columns
    train_set = lgb.Dataset(df[input_cols], label=df["target"])
    scores = lgb.cv(params, train_set, num_boost_round=100, stratified=False, seed=1)
    rmsle_score = scores["rmse-mean"][-1]
    return rmsle_score


def feature_selection(df, y):
    input_cols = df.columns.tolist()
    n_before_selection = len(input_cols)

    df["target"] = np.log1p(y)
    df_train, _ = train_test_split(df, test_size=0.5, random_state=1)

    params = {
        "objective": "regression",
        "metric": "rmse",
        "learning_rate": 0.1,
        "verbosity": -1,
    }
    fit_params = {
        "num_boost_round": 100,
    }
    selector = GBDTFeatureExplorer(input_cols=input_cols,
                                   target_col="target",
                                   fit_once=True,
                                   threshold_range=(0.6, 1.0),
                                   lgbm_params=params,
                                   lgbm_fit_kwargs=fit_params)

    study = optuna.create_study(direction="minimize")
    study.optimize(partial(objective, df_train, selector), n_trials=20)

    selector.from_trial(study.best_trial)
    selected_cols = selector.get_selected_cols()
    print(f" - {n_before_selection - len(selected_cols)} features are removed.")

    return df[selected_cols]


def evaluate_dataframe(df, y):
    X_train, X_test, y_train, y_test = train_test_split(df.values, y,
                                                        test_size=0.5,
                                                        random_state=1)
    y_train = np.log1p(y_train)

    params = {
        "objective": "regression",
        "metric": "rmse",
        "learning_rate": 0.1,
        "verbosity": -1,
    }
    train_set = lgb.Dataset(X_train, label=y_train)
    scores = lgb.cv(params, train_set, num_boost_round=100, stratified=False, seed=1)
    rmsle_score = scores["rmse-mean"][-1]
    print(f" - CV RMSEL: {rmsle_score:.6f}")

    booster = lgb.train(params, train_set, num_boost_round=100)
    y_pred = booster.predict(X_test)
    test_rmsle_score = rmse(np.log1p(y_test), y_pred)
    print(f" - test RMSEL: {test_rmsle_score:.6f}")


def rmse(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))


if __name__ == "__main__":
    main()

Downloading Cal. housing from https://ndownloader.figshare.com/files/5976036 to /root/scikit_learn_data


Before adding interaction features:
 - CV RMSEL: 0.143823
 - test RMSEL: 0.140610
After adding interaction features:
 - CV RMSEL: 0.140239
 - test RMSEL: 0.137046
After applying GBDTFeatureSelector:


[I 2020-07-19 15:19:17,248] Finished trial#0 with value: 0.14004333870727037 with parameters: {'GBDTFeatureSelector.threshold': 0.729281394099013}. Best is trial#0 with value: 0.14004333870727037.
[I 2020-07-19 15:19:25,665] Finished trial#1 with value: 0.13979228856367798 with parameters: {'GBDTFeatureSelector.threshold': 0.7688554216403491}. Best is trial#1 with value: 0.13979228856367798.
[I 2020-07-19 15:19:35,179] Finished trial#2 with value: 0.13996029128482462 with parameters: {'GBDTFeatureSelector.threshold': 0.8790890553169068}. Best is trial#1 with value: 0.13979228856367798.
[I 2020-07-19 15:19:44,527] Finished trial#3 with value: 0.13946556227894677 with parameters: {'GBDTFeatureSelector.threshold': 0.8543602184016633}. Best is trial#3 with value: 0.13946556227894677.
[I 2020-07-19 15:19:54,570] Finished trial#4 with value: 0.13989178190123577 with parameters: {'GBDTFeatureSelector.threshold': 0.9238391696986176}. Best is trial#3 with value: 0.13946556227894677.
[I 2020-07-

 - 47 features are removed.
 - CV RMSEL: 0.140274
 - test RMSEL: 0.137342
