# 머신러닝 알고리즘

## sklearn 설계 원칙
* 기본 클래스
    - 추정기 (Estimator)
        - fit()
    - 변환기 (Transformer)
        - transform()
    - 예측기 (Predictor)
        - predict()

## 최근접 이웃(k-NN) 알고리즘을 이용한 분류

In [6]:
# Data preparation: load the iris dataset and hold out 20% of it for testing.
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

iris = load_iris()
X, y = iris.data, iris.target

# random_state is fixed so the train/test split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [18]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# Both scalers learn their statistics from the training split only; the test
# split is merely transformed with those learned statistics.

# Standardization (z-score scaling).
zs = StandardScaler()
X_train_zs = zs.fit_transform(X_train)
X_test_zs = zs.transform(X_test)

# Min-max scaling.
mm = MinMaxScaler()
X_train_mm = mm.fit_transform(X_train)
X_test_mm = mm.transform(X_test)

In [19]:
from sklearn.neighbors import KNeighborsClassifier

# Train a 3-nearest-neighbours classifier on the standardized training data
# (fit() returns the estimator itself, so construction and training chain).
knn_clf = KNeighborsClassifier(n_neighbors=3).fit(X_train_zs, y_train)

# Evaluate accuracy on the standardized test data.
knn_clf.score(X_test_zs, y_test)

1.0

In [15]:
# Predict class labels for the standardized test features with the k-NN model.
y_pred = knn_clf.predict(X_test_zs) # prediction (inference)
# Bare expression so the notebook displays the predicted labels.
y_pred

array([1, 0, 2, 1, 1, 0, 1, 2, 1, 1, 2, 0, 0, 0, 0, 1, 2, 1, 1, 2, 0, 2,
       0, 2, 2, 2, 2, 2, 0, 0])

In [16]:
# Display the true test labels for a side-by-side comparison with y_pred.
y_test

array([1, 0, 2, 1, 1, 0, 1, 2, 1, 1, 2, 0, 0, 0, 0, 1, 2, 1, 1, 2, 0, 2,
       0, 2, 2, 2, 2, 2, 0, 0])

In [17]:
# Accuracy of the k-NN classifier on the standardized test split.
knn_clf.score(X_test_zs, y_test)

1.0

In [20]:
from sklearn.neighbors import KNeighborsClassifier

# Same 3-nearest-neighbours model, this time trained on the min-max scaled
# training data. Note that this rebinds knn_clf to the new model.
knn_clf = KNeighborsClassifier(n_neighbors=3).fit(X_train_mm, y_train)

# Evaluate accuracy on the min-max scaled test data.
knn_clf.score(X_test_mm, y_test)

1.0

## 나이브 베이즈
* 수치형 특성 전처리
    * 누락값: 처리 권장
    * 이상치: 처리하지 않아도 됨
    * 특성 스케일링: 하지 않아도 됨
* 범주형 특성 전처리
    * 원-핫 인코딩


In [21]:
from sklearn.naive_bayes import (
    BernoulliNB,
    ComplementNB,
    GaussianNB,
    MultinomialNB,
)

# Gaussian naive Bayes trained on the raw (unscaled) training features.
gnb_clf = GaussianNB().fit(X_train, y_train)

# Accuracy on the raw test split.
gnb_clf.score(X_test, y_test)

1.0

In [22]:
from sklearn.metrics import accuracy_score

# Compute the same accuracy explicitly: predict first, then compare the
# predictions against the true labels.
y_pred = gnb_clf.predict(X_test)
accuracy_score(y_true=y_test, y_pred=y_pred)

1.0

## 결정트리

In [24]:
from sklearn.tree import DecisionTreeClassifier

# Shallow decision tree (at most two levels of splits) on the raw features.
# random_state is pinned because scikit-learn randomly permutes the candidate
# features at every split, even with the default "best" splitter; when several
# splits are equally good, the chosen one can otherwise change between runs.
# The seed matches the one used for train_test_split.
# NOTE: outputs recorded in this notebook before the seed was fixed may differ
# from a fresh run.
tree_clf = DecisionTreeClassifier(max_depth=2, random_state=42)
tree_clf.fit(X_train, y_train)

# Accuracy on the held-out test split.
tree_clf.score(X_test, y_test)

0.9666666666666667

In [25]:
from sklearn.tree import export_graphviz

# Dump the fitted tree as Graphviz source into iris_tree.dot, labelling nodes
# with the iris feature names and class names; node boxes are rounded and
# colour-filled.
export_graphviz(
    tree_clf,
    out_file='iris_tree.dot',
    feature_names=iris.feature_names,
    class_names=iris.target_names,
    rounded=True,
    filled=True,
)

In [26]:
# Show the generated Graphviz source. The file is read from Python rather than
# through the Windows-only `type` shell command, so the cell works on any OS
# and also outside IPython, where a leading `!` is a syntax error.
from pathlib import Path

print(Path('iris_tree.dot').read_text(encoding='utf-8'))

digraph Tree {
node [shape=box, style="filled, rounded", color="black", fontname="helvetica"] ;
edge [fontname="helvetica"] ;
0 [label="petal width (cm) <= 0.8\ngini = 0.667\nsamples = 120\nvalue = [40, 41, 39]\nclass = versicolor", fillcolor="#fdfffd"] ;
1 [label="gini = 0.0\nsamples = 40\nvalue = [40, 0, 0]\nclass = setosa", fillcolor="#e58139"] ;
0 -> 1 [labeldistance=2.5, labelangle=45, headlabel="True"] ;
2 [label="petal length (cm) <= 4.75\ngini = 0.5\nsamples = 80\nvalue = [0, 41, 39]\nclass = versicolor", fillcolor="#f5fef9"] ;
0 -> 2 [labeldistance=2.5, labelangle=-45, headlabel="False"] ;
3 [label="gini = 0.053\nsamples = 37\nvalue = [0, 36, 1]\nclass = versicolor", fillcolor="#3ee684"] ;
2 -> 3 ;
4 [label="gini = 0.206\nsamples = 43\nvalue = [0, 5, 38]\nclass = virginica", fillcolor="#9253e8"] ;
2 -> 4 ;
}
