# Python機械学習クックブック

In [1]:
from sklearn import datasets
from sklearn.neighbors import NearestNeighbors
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import RadiusNeighborsClassifier

from sklearn.preprocessing import StandardScaler

from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.model_selection import GridSearchCV

## 観測値の近傍の発見

In [3]:
# Load the iris dataset; only the feature matrix is needed for a neighbor search.
iris = datasets.load_iris()
features = iris.data

# Learn per-feature scaling statistics, then project the data into
# standardized space.
standardizer = StandardScaler().fit(features)
features_standardized = standardizer.transform(features)

# Build an (unsupervised) neighbor index that returns the two closest rows.
nearest_neighbors = NearestNeighbors(n_neighbors=2)
nearest_neighbors.fit(features_standardized)

# Query point; it is used as-is (not passed through the scaler).
# kneighbors expects a 2-D batch, hence the wrapping list.
new_observation = [1, 1, 1, 1]
distances, indices = nearest_neighbors.kneighbors([new_observation])

# Show the standardized rows of the neighbors that were found.
print(features_standardized[indices])

# Show the distance from the query point to each of those neighbors.
print(distances)

[[[1.03800476 0.55861082 1.10378283 1.18556721]
  [0.79566902 0.32841405 0.76275827 1.05393502]]]
[[0.49140089 0.74294782]]


## k-最近傍法クラス分類器の作成

In [5]:
# Load the iris dataset: feature matrix plus class labels.
iris = datasets.load_iris()
features, target = iris.data, iris.target

# Learn per-feature scaling statistics, then project the data into
# standardized space.
standardizer = StandardScaler().fit(features)
features_standardized = standardizer.transform(features)

# k-nearest-neighbors classifier voting over the 5 closest training rows;
# n_jobs=-1 lets scikit-learn use every available core.
knn = KNeighborsClassifier(n_neighbors=5, n_jobs=-1)
knn.fit(features_standardized, target)

# Two query points, used as-is (not passed through the scaler).
new_observation = [[0.75, 0.75, 0.75, 0.75], [1, 1, 1, 1]]

# Predicted class label for each query point.
print(knn.predict(new_observation))

# Per-class probability (fraction of neighbor votes) for each query point.
print(knn.predict_proba(new_observation))

[1 2]
[[0.  0.6 0.4]
 [0.  0.  1. ]]


## 最適な近傍サイズの特定

In [7]:
# Load the iris dataset: feature matrix plus class labels.
iris = datasets.load_iris()
features = iris.data
target = iris.target

# Unfitted pipeline steps. GridSearchCV clones the pipeline and fits it on
# each training fold, so nothing needs to be fitted here.
standardizer = StandardScaler()
knn = KNeighborsClassifier(n_neighbors=5, n_jobs=-1)

# Standardization lives inside the pipeline so that the scaling statistics
# are learned from the training fold only.
pipe = Pipeline([("standardizer", standardizer), ("knn", knn)])

# Candidate neighborhood sizes to evaluate.
search_space = [{"knn__n_neighbors": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]}]

# 5-fold grid search over the candidates.
# The search is fit on the RAW features: the pipeline already standardizes,
# so passing pre-standardized data would scale twice and leak statistics of
# the validation folds into preprocessing.
classifier = GridSearchCV(pipe, search_space, cv=5, verbose=0).fit(features, target)

# Show the neighborhood size that achieved the best cross-validated score.
print(classifier.best_params_["knn__n_neighbors"])


6


## 半径を用いた最近傍クラス分類器の作成

In [8]:
# Load the iris dataset: feature matrix plus class labels.
iris = datasets.load_iris()
features = iris.data
target = iris.target

# Create the standardizer.
standardizer = StandardScaler()

# Standardize the features.
features_standardized = standardizer.fit_transform(features)

# Radius-based neighbors classifier: every training row within `radius` of
# the query votes on the class (instead of a fixed number of neighbors).
# n_jobs=-1 lets scikit-learn use every available core.
rnn = RadiusNeighborsClassifier(radius=0.5, n_jobs=-1).fit(
    features_standardized, target
)

# Query point, used as-is (not passed through the scaler).
# NOTE: with the default outlier_label=None, predict() raises if a query has
# no training row inside the radius; set outlier_label to handle such points.
new_observations = [[1, 1, 1, 1]]

# Predict the class of the query point.
print(rnn.predict(new_observations))

[2]
