In [None]:
from sklearn.datasets  import load_iris
from sklearn.neighbors import KNeighborsClassifier

# Load the Iris data set and pull out the feature matrix and the label vector.
iris = load_iris()
X = iris.data
y = iris.target

# k-nearest-neighbours classifier created with its default parameters.
C = KNeighborsClassifier()



## k-nearest neighbors algorithm

https://en.wikipedia.org/wiki/K-nearest_neighbors_algorithm

![](https://goo.gl/Br5fke)

-   Example of k-NN classification: the test sample (green circle) should be classified 
    either to the first class (blue squares) or to the second class (red triangles). 

-   If k = 3 (solid line circle), it is assigned to the second class, 
    because there are 2 triangles and only 1 square inside the inner circle. 

-   If k = 5 (dashed line circle), it is assigned to the first class 
    (3 squares vs. 2 triangles inside the outer circle).

已知 data 分為 藍色四邊形(第一類) 與 紅色三角形(第二類)，
每個data 有二維座標(x,y) 故可畫在平面上，散佈如上。
今有未知 綠色data，其亦有 二維座標(x,y)。

如何決定其所屬類別？
1. 以未知點為中心，尋找最近之 k 點已知類別的 data，
2. 由 它們投票，取最多數決定所屬類別。

以上例而言，
- 若 k==1， 則 判定 未知點屬於 紅色三角形
- 若 k==3， 則 判定 未知點屬於 紅色三角形
- 若 k==5， 則 判定 未知點屬於 藍色四邊形

In [None]:
C
# n_neighbors=5 can be changed; the simplest case is n_neighbors == 1

In [None]:
# Echo the feature matrix, the label vector and the classifier together as one tuple.
X, y, C  # data, label, classifier

In [None]:
# Build a 1-nearest-neighbour classifier and train it on the whole data set.
# fit() returns the estimator itself, so the chained call leaves C fitted.
C = KNeighborsClassifier(n_neighbors=1).fit(X, y)

# Recognition: classify the very same samples the classifier was trained on.
C.predict(X)


In [None]:
# Check correctness: element-wise comparison of the predictions with the true labels
# (True wherever the prediction matches the label).
C.predict(X)==y

## 以上，就做出一個最簡單的分類器了， pattern recognition 的核心全貌便已浮現。


一般而言，我們會把 資料分為 訓練集 與 測試集，
分別用來 訓練與測試所設計的分類器。

用來測試的資料不能出現於訓練集中，
否則便算是作弊！
所做的分類器之辨識率就不足採信。

簡單地把資料分為偶數集(序號從0開始，間隔為2)、奇數集(序號從1開始，間隔為2)如下：
偶數集當訓練集，奇數集當測試集。


In [None]:
# Even-indexed samples (0, 2, 4, ...) form the training set,
# odd-indexed samples (1, 3, 5, ...) form the test set.
X0, y0 = X[::2], y[::2]
X1, y1 = X[1::2], y[1::2]

In [None]:
# Train on the even-indexed half only.
C.fit(X0, y0)

# Recognise both halves in one go:
#   z0 -> training set (inside test), z1 -> test set (outside test)
z0, z1 = C.predict(X0), C.predict(X1)

z0, z1

In [None]:
# Inside-test recognition: (almost) 100% correct (although that is not actually guaranteed)
z0==y0

In [None]:
# Outside-test recognition: there are some errors. (Errors are normal; how to lower the error rate is the central concern of all pattern recognition research!)
z1==y1

一般而言，如何分割 data 成 訓練集 與 測試集，也有一些常見的經驗法則， scikit learn 提供了以下的方法：


In [None]:
# train_test_split lives in sklearn.model_selection; the old sklearn.cross_validation
# module was deprecated in scikit-learn 0.18 and removed in 0.20, so importing from it
# fails on current releases. The later cells of this notebook already use model_selection.
from sklearn.model_selection import train_test_split

# Split the data into two halves (train_size=0.5).
train_X, test_X, train_y, test_y = train_test_split(
    X, y,
    train_size=0.5,
    random_state=123,
    stratify=y,
)
print("Labels for training and testing data")
print(train_y)
print(test_y)
# The bare string below is presumably the printed output of an earlier run, kept for
# reference; it is not executed as code.
'''
[1 1 1 0 0 2 1 1 1 0 1 0 2 0 0 2 0 2 1 1 0 0 2 1 2 1 0 1 1 1 2 1 2 2 0 0 2
 2 0 0 2 2 2 2 0 2 0 2 1 1 0 2 2 0 2 1 2 1 2 1 1 0 0 1 2 0 0 2 2 1 0 1 0 0
 1]
[0 2 1 0 2 0 1 2 0 0 2 1 2 0 1 2 2 2 2 2 1 2 1 1 2 2 0 0 1 0 0 2 0 1 0 0 1
 1 2 2 0 1 0 1 1 2 0 1 1 1 0 2 2 2 1 0 0 1 1 0 2 1 0 2 0 2 1 1 2 0 2 1 0 0
 1]
'''
# random_state=123 pins the randomisation, which is helpful while developing the code.
# stratify= y keeps the class distribution balanced, which helps the classifier's performance.

In [None]:
import numpy as np
# Count how many training labels fall into each class value.
np.bincount(train_y)

In [None]:
# Train on the training half of the stratified split.
C.fit(train_X, train_y)

# Recognise both parts in one go:
#   train_z -> training set (inside test), test_z -> test set (outside test)
train_z, test_z = C.predict(train_X), C.predict(test_X)

train_z, test_z

In [None]:
# Element-wise check of the training-set predictions against the true training labels.
train_z==train_y

In [None]:
# Element-wise check of the test-set predictions against the true test labels.
test_z==test_y

# 計算錯誤率

In [None]:
# Training error rate: number of mismatched predictions divided by the sample count.
int((train_z != train_y).sum()) / len(train_z)

In [None]:
# Test error rate: number of mismatched predictions divided by the sample count.
int((test_z != test_y).sum()) / len(test_z)

# 從頭來過，改變分類器的參數。
看看有沒有可能提升辨識效能，降低錯誤率！

In [None]:
# Configure the classifier (try n_neighbors = 1 --> 2 --> 3) and train it.
C = KNeighborsClassifier(n_neighbors=2)
C.fit(train_X, train_y)

# Recognise the training set (inside test) and the test set (outside test).
train_z = C.predict(train_X)
test_z = C.predict(test_X)

# Error rate = mismatched predictions / number of samples.
err0 = int((train_z != train_y).sum()) / len(train_z)
err1 = int((test_z != test_y).sum()) / len(test_z)

print('err0= {}, err1= {}'.format(err0,err1))

In [None]:
# Repeat the experiment for k = 1 .. 9 and collect (k, err0, err1) for later analysis.
aL = []
for k in range(1, 10):
    # Configure and train a k-nearest-neighbour classifier.
    C = KNeighborsClassifier(n_neighbors=k)
    C.fit(train_X, train_y)

    # Recognise the training set (inside test) and the test set (outside test).
    train_z = C.predict(train_X)
    test_z = C.predict(test_X)

    # Error rate = mismatched predictions / number of samples.
    err0 = int((train_z != train_y).sum()) / len(train_z)
    err1 = int((test_z != test_y).sum()) / len(test_z)

    print('k={}, err0= {}, err1= {}'.format(k, err0, err1))

    # Keep the measurement so it can be analysed further.
    aL.append((k, err0, err1))
    

In [None]:
# Show the collected (k, err0, err1) tuples.
aL


# 作圖觀察 k 與 錯誤率 的 關係

In [None]:
%pylab inline
import pylab as pl

# 作圖觀察 k 與 錯誤率 的 關係

A= np.array(aL)

pl.plot(A[:,0], A[:,1], c='r', marker='o')
pl.plot(A[:,0], A[:,2], c='b', marker='o')

pl.xlabel('k')
pl.ylabel('err')
pl.grid('on')

#pl.show()



# 討論
## 上圖顯示，在 本實驗之設定下， 在 k= [1..9] 之中，k=6, k=8 使得 err1 最低。
##  err0 (inside test) 最低 未必 導致 err1(outside test) 最低。


# 以下跟隨教科書 ch03，

瀏覽 幾個 分類器，快速走一遍。
每個分類器的原理雖各不相同，我們可以先把他們當黑盒子看待，先做出辨識率。

然後逐一在 wikipedia 上找到相關文章，把原理做一番研究。


## perceptron 


In [5]:
from sklearn import datasets
import numpy as np

# Reload Iris, keeping only feature columns 2 and 3 as inputs.
iris = datasets.load_iris()
X, y = iris.data[:, [2, 3]], iris.target

# Show which distinct class labels occur in y.
print('Class labels:', np.unique(y))


Class labels: [0 1 2]


In [4]:
# Added version check for recent scikit-learn 0.18 checks
# NOTE: everything between the triple quotes below is a bare string literal, so this
# version check is disabled and none of the imports inside it are executed.
'''
from distutils.version import LooseVersion as Version
from sklearn import __version__ as sklearn_version
Version(sklearn_version)
if Version(sklearn_version) < '0.18':
    from sklearn.cross_validation import train_test_split
else:
    from sklearn.model_selection import train_test_split
'''

In [14]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the samples as the test set; random_state=0 fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
)

In [33]:
from sklearn.preprocessing import StandardScaler

# Normalising the data beforehand helps recognition performance.
# Set STANDARDIZE to False to feed the raw features to the classifiers instead.
# This explicit flag replaces the earlier pair of throw-away assignments plus the
# "#'''" comment toggle.
STANDARDIZE = True

if STANDARDIZE:
    # The scaler is fitted on the training data only and then applied to both
    # the training and the test features.
    sc = StandardScaler()
    sc.fit(X_train)
    X_train_std = sc.transform(X_train)
    X_test_std = sc.transform(X_test)
else:
    X_train_std = X_train
    X_test_std = X_test




In [34]:
from sklearn.linear_model import Perceptron

# `n_iter` was deprecated in scikit-learn 0.19 and removed in 0.21, so passing it
# raises TypeError on current releases; `max_iter` is its replacement.
# tol=None switches off the tolerance-based stopping rule, so all 40 epochs run as
# n_iter=40 did. (The recorded output of this cell predates the rename.)
ppn = Perceptron(max_iter=40, tol=None, eta0=0.1, random_state=0)
ppn.fit(X_train_std, y_train)


Perceptron(alpha=0.0001, class_weight=None, eta0=0.1, fit_intercept=True,
      n_iter=40, n_jobs=1, penalty=None, random_state=0, shuffle=True,
      verbose=0, warm_start=False)

In [35]:
# Shape of the test-label array.
y_test.shape

(45,)

In [36]:
# Predict the test set with the perceptron and count the wrong predictions.
y_pred = ppn.predict(X_test_std)
print('Misclassified samples: %d' % (y_pred != y_test).sum())


Misclassified samples: 4


In [31]:
from sklearn.metrics import accuracy_score
# Fraction of test samples whose predicted label equals the true label.
acc = accuracy_score(y_true=y_test, y_pred=y_pred)
print('Accuracy: %.2f' % acc)


Accuracy: 0.91


## logistic regression

In [37]:
from sklearn.linear_model import LogisticRegression

# Logistic-regression classifier trained on the prepared training features.
lr = LogisticRegression(
    C=1000.0,
    random_state=0,
)
lr.fit(X_train_std, y_train)


LogisticRegression(C=1000.0, class_weight=None, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=0,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)

In [38]:
# Class probabilities for the first test sample; the [:1] slice keeps the
# 2-D (one row) shape that predict_proba expects.
lr.predict_proba(X_test_std[:1])


array([[  2.05743774e-11,   6.31620264e-02,   9.36837974e-01]])

In [40]:
# Predict the test-set labels with the fitted logistic-regression model.
y_pred= lr.predict(X_test_std)

In [41]:
# Fraction of test samples whose predicted label equals the true label.
acc = accuracy_score(y_true=y_test, y_pred=y_pred)
print('Accuracy: %.2f' % acc)

Accuracy: 0.98


## SVM

In [46]:
from sklearn.svm import SVC

# Linear-kernel SVM: train (fit returns the estimator itself), predict, score.
svm = SVC(kernel='linear', C=1.0, random_state=0).fit(X_train_std, y_train)

y_pred = svm.predict(X_test_std)
acc = accuracy_score(y_test, y_pred)
print('Accuracy: %.2f' % acc)


Accuracy: 0.98


In [45]:
# RBF-kernel SVM: train (fit returns the estimator itself), predict, score.
svm = SVC(
    kernel='rbf',
    gamma=0.2,
    C=1.0,
    random_state=0,
).fit(X_train_std, y_train)

y_pred = svm.predict(X_test_std)
acc = accuracy_score(y_test, y_pred)
print('Accuracy: %.2f' % acc)


Accuracy: 0.98


## decision tree


In [49]:
from sklearn.tree import DecisionTreeClassifier

# Entropy-criterion decision tree limited to depth 3: train, predict, score.
tree = DecisionTreeClassifier(
    criterion='entropy',
    max_depth=3,
    random_state=0,
).fit(X_train_std, y_train)

y_pred = tree.predict(X_test_std)
acc = accuracy_score(y_test, y_pred)
print('Accuracy: %.2f' % acc)

Accuracy: 0.98


## random forests

In [51]:
from sklearn.ensemble import RandomForestClassifier

# Random forest of 10 entropy-criterion trees, built with 2 parallel jobs:
# train (fit returns the estimator itself), predict, score.
forest = RandomForestClassifier(
    criterion='entropy',
    n_estimators=10,
    random_state=1,
    n_jobs=2,
).fit(X_train_std, y_train)

y_pred = forest.predict(X_test_std)
acc = accuracy_score(y_test, y_pred)
print('Accuracy: %.2f' % acc)

Accuracy: 0.96


## knn

In [58]:
from sklearn.neighbors import KNeighborsClassifier

# 4-nearest-neighbour classifier using the Minkowski metric with p=2:
# train (fit returns the estimator itself), predict, score.
knn = KNeighborsClassifier(
    n_neighbors=4,
    p=2,
    metric='minkowski',
).fit(X_train_std, y_train)

y_pred = knn.predict(X_test_std)
acc = accuracy_score(y_test, y_pred)
print('Accuracy: %.2f' % acc)

Accuracy: 1.00
