<a href="https://colab.research.google.com/github/steelpipe75/kagglebook-for-colab/blob/master/ch06/ch06-04-filter.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# ---------------------------------
# データ等の準備
# ----------------------------------
import numpy as np
import pandas as pd

In [2]:
import importlib
import sys
import subprocess

# Google Colab 上で実行しているかどうかを判断するフラグ
ON_COLAB = "google.colab" in sys.modules
print(f"ON_COLAB: {ON_COLAB}")

if ON_COLAB:
    USE_GIT = True # Gitを使う
    # USE_GIT = False # Gitを使わない

    print(f"USE_GIT: {USE_GIT}")
    if USE_GIT:
        !git clone https://github.com/ghmagazine/kagglebook.git
    else:
        # Google Drive にマウントする
        drive = importlib.import_module("google.colab.drive")
        drive.mount("/content/drive/")

        import os
        colab_dir = "/content/drive/MyDrive/kagglebook/" # データ置き場

ON_COLAB: True
USE_GIT: True
fatal: destination path 'kagglebook' already exists and is not an empty directory.


train_xは学習データ、train_yは目的変数、test_xはテストデータ
pandasのDataFrame, Seriesで保持します。（numpyのarrayで保持することもあります）

In [3]:
if ON_COLAB:
    if USE_GIT:
        train = pd.read_csv('/content/kagglebook/input/sample-data/train_preprocessed_onehot.csv')
    else:
        train = pd.read_csv(os.path.join(colab_dir, 'input/sample-data/train_preprocessed_onehot.csv'))
else:
    train = pd.read_csv('../input/sample-data/train_preprocessed_onehot.csv')
train_x = train.drop(['target'], axis=1)
train_y = train['target']
if ON_COLAB:
    if USE_GIT:
        test_x = pd.read_csv('/content/kagglebook/input/sample-data/test_preprocessed_onehot.csv')
    else:
        test_x = pd.read_csv(os.path.join(colab_dir, 'input/sample-data/test_preprocessed_onehot.csv'))
else:
    test_x = pd.read_csv('../input/sample-data/test_preprocessed_onehot.csv')

In [4]:
# ---------------------------------
# argsortによるインデックスのソート
# ---------------------------------
# argsortを使うことで、配列の値が小さい順／大きい順にインデックスをソートできる
ary = np.array([10, 20, 30, 0])
idx = ary.argsort()
print(idx)  # 昇順 - [3 0 1 2]
print(idx[::-1])  # 降順 - [2 1 0 3]

[3 0 1 2]
[2 1 0 3]


In [5]:
print(ary[idx[::-1][:3]])  # ベスト3を出力 - [30, 20, 10]

[30 20 10]


In [6]:
# ---------------------------------
# 相関係数
# ---------------------------------
import scipy.stats as st

In [7]:
# 相関係数
corrs = []
for c in train_x.columns:
    corr = np.corrcoef(train_x[c], train_y)[0, 1]
    corrs.append(corr)
corrs = np.array(corrs)

In [8]:
# スピアマンの順位相関係数
corrs_sp = []
for c in train_x.columns:
    corr_sp = st.spearmanr(train_x[c], train_y).correlation
    corrs_sp.append(corr_sp)
corrs_sp = np.array(corrs_sp)

In [9]:
# 重要度の上位を出力する（上位5個まで）
# np.argsortを使うことで、値の順序のとおりに並べたインデックスを取得できる
idx = np.argsort(np.abs(corrs))[::-1]
top_cols, top_importances = train_x.columns.values[idx][:5], corrs[idx][:5]
print(top_cols, top_importances)

['medical_info_a1' 'medical_keyword_5' 'medical_keyword_4'
 'medical_keyword_3' 'age'] [0.21805214 0.21368557 0.18109642 0.16723961 0.15155308]


In [10]:
idx2 = np.argsort(np.abs(corrs_sp))[::-1]
top_cols2, top_importances2 = train_x.columns.values[idx][:5], corrs_sp[idx][:5]
print(top_cols2, top_importances2)

['medical_info_a1' 'medical_keyword_5' 'medical_keyword_4'
 'medical_keyword_3' 'age'] [0.22182331 0.21368557 0.18109642 0.16723961 0.15170291]


In [11]:
# ---------------------------------
# カイ二乗統計量
# ---------------------------------
from sklearn.feature_selection import chi2
from sklearn.preprocessing import MinMaxScaler

In [12]:
# カイ二乗統計量
x = MinMaxScaler().fit_transform(train_x)
c2, _ = chi2(x, train_y)

In [13]:
# 重要度の上位を出力する（上位5個まで）
idx = np.argsort(c2)[::-1]
top_cols, top_importances = train_x.columns.values[idx][:5], corrs[idx][:5]
print(top_cols, top_importances)

['medical_keyword_5' 'medical_keyword_4' 'medical_keyword_3' 'product_9'
 'medical_keyword_2'] [0.21368557 0.18109642 0.16723961 0.11706115 0.1184609 ]


In [14]:
# ---------------------------------
# 相互情報量
# ---------------------------------
from sklearn.feature_selection import mutual_info_classif

In [15]:
# 相互情報量
mi = mutual_info_classif(train_x, train_y)

In [16]:
# 重要度の上位を出力する（上位5個まで）
idx = np.argsort(mi)[::-1]
top_cols, top_importances = train_x.columns.values[idx][:5], corrs[idx][:5]
print(top_cols, top_importances)

['medical_info_a1' 'weight' 'age' 'medical_keyword_5' 'medical_keyword_4'] [0.21805214 0.00437808 0.15155308 0.21368557 0.18109642]
