# Iris

---

## 붓꽃 분류

- 지도학습
- 꽃잎(petal)과 꽃받침(sepal)의 폭과 길이 값만 보고 붓꽃의 종류를 분류하는 문제
- 붓 꽃의 종류(품종): setosa, versicolor, virginica

---

In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline 

# Pandas option
#pd.set_option('max_rows', 30)
#pd.set_option('max_columns', 15)

# 시각화 스타일 설정
# plt.style.use('fivethirtyeight') 
# sns.set_style('whitegrid')

#import warnings
#warnings.filterwarnings('ignore')

from matplotlib.pylab import rcParams
rcParams['figure.figsize'] = 12, 8
#rcParams['axes.grid'] = True

In [2]:
# Machine Learning Library

from sklearn.neighbors import KNeighborsClassifier

from sklearn.model_selection import train_test_split

# accuracy measure
from sklearn import metrics

# Load Iris data set
from sklearn.datasets import load_iris

### Load data set

In [3]:
ds_iris = load_iris()
ds_iris

 'data': array([[5.1, 3.5, 1.4, 0.2],
        [4.9, 3. , 1.4, 0.2],
        [4.7, 3.2, 1.3, 0.2],
        [4.6, 3.1, 1.5, 0.2],
        [5. , 3.6, 1.4, 0.2],
        [5.4, 3.9, 1.7, 0.4],
        [4.6, 3.4, 1.4, 0.3],
        [5. , 3.4, 1.5, 0.2],
        [4.4, 2.9, 1.4, 0.2],
        [4.9, 3.1, 1.5, 0.1],
        [5.4, 3.7, 1.5, 0.2],
        [4.8, 3.4, 1.6, 0.2],
        [4.8, 3. , 1.4, 0.1],
        [4.3, 3. , 1.1, 0.1],
        [5.8, 4. , 1.2, 0.2],
        [5.7, 4.4, 1.5, 0.4],
        [5.4, 3.9, 1.3, 0.4],
        [5.1, 3.5, 1.4, 0.3],
        [5.7, 3.8, 1.7, 0.3],
        [5.1, 3.8, 1.5, 0.3],
        [5.4, 3.4, 1.7, 0.2],
        [5.1, 3.7, 1.5, 0.4],
        [4.6, 3.6, 1. , 0.2],
        [5.1, 3.3, 1.7, 0.5],
        [4.8, 3.4, 1.9, 0.2],
        [5. , 3. , 1.6, 0.2],
        [5. , 3.4, 1.6, 0.4],
        [5.2, 3.5, 1.5, 0.2],
        [5.2, 3.4, 1.4, 0.2],
        [4.7, 3.2, 1.6, 0.2],
        [4.8, 3.1, 1.6, 0.2],
        [5.4, 3.4, 1.5, 0.4],
        [5.2, 4.1, 1.5, 0.1],
  

In [4]:
type(ds_iris)

sklearn.utils.Bunch

In [5]:
ds_iris.keys()

dict_keys(['target_names', 'feature_names', 'target', 'filename', 'DESCR', 'data'])

### Iris data set

- DESCR(description): 데이터셋에 대한 설명
- data
  - data: 꽃잎, 꽃받침의 너비 및 길이 데이터
  - feature_names: 데이터의 이름
- target
  - target: 품종 데이터( 0, 1, 2 )
  - target_names: 품종의 이름


In [6]:
print(ds_iris['DESCR'])

.. _iris_dataset:

Iris plants dataset
--------------------

**Data Set Characteristics:**

    :Number of Instances: 150 (50 in each of three classes)
    :Number of Attributes: 4 numeric, predictive attributes and the class
    :Attribute Information:
        - sepal length in cm
        - sepal width in cm
        - petal length in cm
        - petal width in cm
        - class:
                - Iris-Setosa
                - Iris-Versicolour
                - Iris-Virginica
                
    :Summary Statistics:

                    Min  Max   Mean    SD   Class Correlation
    sepal length:   4.3  7.9   5.84   0.83    0.7826
    sepal width:    2.0  4.4   3.05   0.43   -0.4194
    petal length:   1.0  6.9   3.76   1.76    0.9490  (high!)
    petal width:    0.1  2.5   1.20   0.76    0.9565  (high!)

    :Missing Attribute Values: None
    :Class Distribution: 33.3% for each of 3 classes.
    :Creator: R.A. Fisher
    :Donor: Michael Marshall (MARSHALL%PLU@io.arc.nasa.gov)
    :

In [7]:
ds_iris['target']

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [8]:
ds_iris['data']

array([[5.1, 3.5, 1.4, 0.2],
       [4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2],
       [4.6, 3.1, 1.5, 0.2],
       [5. , 3.6, 1.4, 0.2],
       [5.4, 3.9, 1.7, 0.4],
       [4.6, 3.4, 1.4, 0.3],
       [5. , 3.4, 1.5, 0.2],
       [4.4, 2.9, 1.4, 0.2],
       [4.9, 3.1, 1.5, 0.1],
       [5.4, 3.7, 1.5, 0.2],
       [4.8, 3.4, 1.6, 0.2],
       [4.8, 3. , 1.4, 0.1],
       [4.3, 3. , 1.1, 0.1],
       [5.8, 4. , 1.2, 0.2],
       [5.7, 4.4, 1.5, 0.4],
       [5.4, 3.9, 1.3, 0.4],
       [5.1, 3.5, 1.4, 0.3],
       [5.7, 3.8, 1.7, 0.3],
       [5.1, 3.8, 1.5, 0.3],
       [5.4, 3.4, 1.7, 0.2],
       [5.1, 3.7, 1.5, 0.4],
       [4.6, 3.6, 1. , 0.2],
       [5.1, 3.3, 1.7, 0.5],
       [4.8, 3.4, 1.9, 0.2],
       [5. , 3. , 1.6, 0.2],
       [5. , 3.4, 1.6, 0.4],
       [5.2, 3.5, 1.5, 0.2],
       [5.2, 3.4, 1.4, 0.2],
       [4.7, 3.2, 1.6, 0.2],
       [4.8, 3.1, 1.6, 0.2],
       [5.4, 3.4, 1.5, 0.4],
       [5.2, 4.1, 1.5, 0.1],
       [5.5, 4.2, 1.4, 0.2],
       [4.9, 3

In [9]:
ds_iris['target_names']

array(['setosa', 'versicolor', 'virginica'], dtype='<U10')

In [10]:
ds_iris['feature_names']

['sepal length (cm)',
 'sepal width (cm)',
 'petal length (cm)',
 'petal width (cm)']

In [11]:
ds_iris['data'].shape

(150, 4)

In [17]:
ds_iris['target'].shape

(150,)

### 데이터 분포

In [18]:
df_iris = pd.DataFrame(ds_iris['data'], columns=ds_iris.feature_names)

pd.plotting.scatter_matrix(df_iris, c=ds_iris['target'], figsize=(12, 12), marker='o',
                           hist_kwds={'bins': 20}, s=60, alpha=.8)
plt.show()

AttributeError: module 'pandas' has no attribute 'plotting'

### 머신러닝: 품종 예측

#### 데이터 분리: 학습데이터 + 테스트 데이터

In [19]:
X_train, X_test, y_train, y_test = train_test_split(
    ds_iris['data'], ds_iris['target'], random_state=123)

In [20]:
print('X_train shape:', X_train.shape)
print('y_train shape:', y_train.shape)

X_train shape: (112, 4)
y_train shape: (112,)


In [21]:
print('X_test shape:', X_test.shape)
print('y_test shape:', y_test.shape)

X_test shape: (38, 4)
y_test shape: (38,)


#### k-NN(k-최근접 이웃 알고리즘)

In [33]:
# 모델 생성
knn = KNeighborsClassifier(n_neighbors=3)

In [34]:
# 모델 학습
knn.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=3, p=2,
           weights='uniform')

In [35]:
# 결과 예측
prediction = knn.predict(X_test)
prediction

array([2, 2, 2, 1, 0, 2, 1, 0, 0, 1, 2, 0, 1, 2, 2, 2, 0, 0, 1, 0, 0, 2,
       0, 2, 0, 0, 0, 2, 2, 0, 2, 1, 0, 0, 1, 1, 2, 0])

In [36]:
# 정확도 확인
metrics.accuracy_score(prediction, y_test)

#np.mean(prediction == y_test)

#knn.score(X_test, y_test)

0.9473684210526315

In [37]:
# Score - precision, recall, f1-score
print(metrics.classification_report(y_test, prediction))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        16
           1       0.88      0.88      0.88         8
           2       0.93      0.93      0.93        14

   micro avg       0.95      0.95      0.95        38
   macro avg       0.93      0.93      0.93        38
weighted avg       0.95      0.95      0.95        38



In [38]:
# Confusion Matrix
pd.crosstab(prediction, y_test, margins=True)

col_0,0,1,2,All
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,16,0,0,16
1,0,7,1,8
2,0,1,13,14
All,16,8,14,38


#### 새로운 데이터 품종 예측

In [39]:
# 데이터 생성
X_new = np.array([[5, 2.9, 1, 0.2], [4,2,3,4], [3,5,4,1]])
X_new

array([[5. , 2.9, 1. , 0.2],
       [4. , 2. , 3. , 4. ],
       [3. , 5. , 4. , 1. ]])

In [40]:
# 품종 예측(0, 1, 2)
pred = knn.predict(X_new)
pred

array([0, 1, 0])

In [41]:
# 품종 이름 출력
ds_iris['target_names'][pred]

array(['setosa', 'versicolor', 'setosa'], dtype='<U10')

### Summary

 - 데이터 준비(훈련데이터, 테스트데이터 나누기)
 - 모델 생성
 - 학습 (fit)
 - 결과검증( 예측, 정확도 확인)

In [31]:
# 데이터 준비
X_train, X_test, y_train, y_test = train_test_split(
    ds_iris['data'], ds_iris['target'], random_state=123)

# 모델 생성
knn = KNeighborsClassifier(n_neighbors=3)

# 모델 학습
knn.fit(X_train, y_train)

# 정확도 확인
knn.score(X_test, y_test)


0.9473684210526315

In [32]:
# end of file