# Python 機械学習クックブック

In [15]:
import numpy as np
import pandas as pd
from sklearn import preprocessing

## 特徴量のスケール変換

In [5]:
# Build a single-column feature matrix (5 samples x 1 feature).
feature = np.array([-500, 100, 0, 200, 800]).reshape(-1, 1)

# Min-max scaler configured to rescale the column into the range [0, 1].
minmax_scale = preprocessing.MinMaxScaler(feature_range=(0, 1))

# Fit the scaler on the data, then rescale that same data.
scaled_feature = minmax_scale.fit(feature).transform(feature)
scaled_feature

array([[0.        ],
       [0.46153846],
       [0.38461538],
       [0.53846154],
       [1.        ]])

## 特徴量の標準化

In [6]:
# Build a single-column feature matrix (5 samples x 1 feature).
feature = np.array([-500, 100, 0, 200, 800]).reshape(-1, 1)

# Standardizer: rescales the column to zero mean and unit variance.
standard_scale = preprocessing.StandardScaler()

# Fit the standardizer on the data, then standardize that same data.
scaled_feature = standard_scale.fit(feature).transform(feature)
scaled_feature

array([[-1.48804762],
       [-0.04800154],
       [-0.28800922],
       [ 0.19200614],
       [ 1.63205223]])

## 特徴量の正規化

In [10]:
# Build a two-column feature matrix (5 observations x 2 features).
feature = np.array(
    [
        [-5, 0],
        [1, 2],
        [0, 0],
        [2, 3],
        [8, 3],
    ]
)

# Row-wise normalizer: each observation is rescaled to unit L2 norm.
normalizer = preprocessing.Normalizer(norm="l2")

# Normalize every observation (row) independently; an all-zero row stays zero.
scaled_feature = normalizer.fit(feature).transform(feature)
scaled_feature

array([[-1.        ,  0.        ],
       [ 0.4472136 ,  0.89442719],
       [ 0.        ,  0.        ],
       [ 0.5547002 ,  0.83205029],
       [ 0.93632918,  0.35112344]])

## 特徴量の離散化

In [27]:
# Build a single-column feature matrix (5 samples x 1 feature).
feature = np.array([-500, 100, 0, 200, 800]).reshape(-1, 1)

# Discretize: replace each value with the index of the bin it falls into.
# With right=False (the default) the bins are
#   x < 0 -> 0,  0 <= x < 200 -> 1,  200 <= x < 500 -> 2,  x >= 500 -> 3.
scaled_feature = np.digitize(feature, bins=[0, 200, 500], right=False)
scaled_feature

array([[0],
       [1],
       [1],
       [2],
       [3]])

## 外れ値の検出

In [11]:
from sklearn.covariance import EllipticEnvelope
from sklearn.datasets import make_blobs

In [13]:
# Generate 10 two-dimensional points from a single blob (seeded data).
feature, _ = make_blobs(n_samples=10, n_features=2, centers=1, random_state=1)

# Overwrite the first observation with extreme values so that it is an
# obvious outlier.
feature[0, :] = 10000

# Build the detector. `contamination` is the assumed proportion of outliers.
# `random_state` pins the randomized robust-covariance fit so that
# re-running the notebook reproduces the same labels.
outlier_detector = EllipticEnvelope(contamination=0.1, random_state=0)

# Fit the detector on the feature matrix.
outlier_detector.fit(feature)

# Predict a label per observation: -1 marks an outlier, 1 an inlier.
outlier_detector.predict(feature)

array([-1,  1,  1,  1,  1,  1,  1,  1,  1,  1])

## 外れ値の取り扱い

In [16]:
# Toy housing table; the last row is a deliberately extreme observation.
houses = pd.DataFrame(
    {
        "price": [533433, 392233, 284402, 4312100],
        "Bathrooms": [2, 3.5, 2, 116],
        "Square_Feet": [1500, 2500, 1500, 48000],
    }
)

Unnamed: 0,price,Bathrooms,Square_Feet
0,533433,2.0,1500
1,392233,3.5,2500
2,284402,2.0,1500


In [17]:
# Handling outliers, option 1: drop them.
# Keep only the rows with fewer than 20 bathrooms (20 is an ad-hoc cutoff).
houses.loc[houses["Bathrooms"].lt(20)]

Unnamed: 0,price,Bathrooms,Square_Feet
0,533433,2.0,1500
1,392233,3.5,2500
2,284402,2.0,1500


In [20]:
# Handling outliers, option 2: mark them and keep the mark as a feature.
# A row gets 0 when it has fewer than 20 bathrooms, and 1 otherwise.
houses["Outlier"] = (~houses["Bathrooms"].lt(20)).astype(int)

houses

Unnamed: 0,price,Bathrooms,Square_Feet,Outlier
0,533433,2.0,1500,0
1,392233,3.5,2500,0
2,284402,2.0,1500,0
3,4312100,116.0,48000,1


In [23]:
# Handling outliers, option 3: transform the feature to dampen their effect.
# np.log is applied to the whole column in one vectorized call rather than
# element by element in a Python loop.
houses["Log_Of_Square_Feet"] = np.log(houses["Square_Feet"])

houses

Unnamed: 0,price,Bathrooms,Square_Feet,Outlier,Log_Of_Square_Feet
0,533433,2.0,1500,0,7.31322
1,392233,3.5,2500,0,7.824046
2,284402,2.0,1500,0,7.31322
3,4312100,116.0,48000,1,10.778956


## クラスタリングによる観測値のグループ分け

In [28]:
from sklearn.cluster import KMeans

In [34]:
# Generate 50 two-dimensional points drawn from 3 blobs (seeded data).
feature, _ = make_blobs(n_samples=50, n_features=2, centers=3, random_state=1)

# k-means clusterer with 3 clusters and a fixed seed.
clusterer = KMeans(n_clusters=3, random_state=0)

# Fit the clusterer on the feature matrix.
clusterer.fit(feature)

# Put the features in a table and attach the predicted cluster id of each
# observation as a new "group" column.
df = pd.DataFrame(feature, columns=["feature_1", "feature_2"])
df["group"] = clusterer.predict(feature)

df

Unnamed: 0,feature_1,feature_2,group
0,-9.877554,-3.336145,0
1,-7.28721,-8.353986,2
2,-6.943061,-7.023744,2
3,-7.440167,-8.791959,2
4,-6.641388,-8.075888,2
5,-0.794152,2.104951,1
6,-2.760179,5.551214,1
7,-9.946905,-4.590344,0
8,-0.52579,3.306599,1
9,-1.981977,4.022436,1


## 欠損値の補完

KNN(k近傍法)による補完は、良い結果になることが多い。
他に、平均値・中央値・最頻値で置き換える方法もある。

In [38]:
from fancyimpute import KNN

In [48]:
# Generate 50 two-dimensional points (seeded data).
feature, _ = make_blobs(n_samples=50, n_features=2, random_state=1)

# Standardize the features before imputing.
standard_scale = preprocessing.StandardScaler()
scaled_feature = standard_scale.fit(feature).transform(feature)

# Remember the real value of the first entry, then blank it out so that
# there is a known ground truth to compare the imputed value against.
true_value = scaled_feature[0, 0]
scaled_feature[0, 0] = np.nan

# Fill in the missing entry of the feature matrix with the KNN imputer (k=5).
feature_knn_imputed = KNN(k=5, verbose=0).fit_transform(scaled_feature)

# Show the true value next to the imputed one.
print("真の値:", true_value)
print("補完した値:", feature_knn_imputed[0, 0])

真の値: -1.0482940820583997
補完した値: -1.1087829413395274
