# Classification

---

In [2]:
import numpy as np
import pandas as pd

#### 데이터 로드

In [2]:
df = pd.read_csv('data/fish.csv')
df.head()

Unnamed: 0,Type,Length,Depth,Kg
0,tuna,55.46,13.24093,8.631268
1,tuna,72.07,17.671346,19.288996
2,tuna,62.84,15.635367,13.140561
3,tuna,72.79,18.440841,19.669098
4,tuna,76.63,19.137488,23.021797


#### 길이의 제곱 컬럼 생성

In [3]:
df['L2'] = df['Length'] * df['Length']
df.head()

Unnamed: 0,Type,Length,Depth,Kg,L2
0,tuna,55.46,13.24093,8.631268,3075.8116
1,tuna,72.07,17.671346,19.288996,5194.0849
2,tuna,62.84,15.635367,13.140561,3948.8656
3,tuna,72.79,18.440841,19.669098,5298.3841
4,tuna,76.63,19.137488,23.021797,5872.1569


#### 길이와 무게 비율 컬럼 생성

In [4]:
df['LKgRatio'] = df['Kg'] / df['Length']
df.head()

Unnamed: 0,Type,Length,Depth,Kg,L2,LKgRatio
0,tuna,55.46,13.24093,8.631268,3075.8116,0.155631
1,tuna,72.07,17.671346,19.288996,5194.0849,0.267643
2,tuna,62.84,15.635367,13.140561,3948.8656,0.209111
3,tuna,72.79,18.440841,19.669098,5298.3841,0.270217
4,tuna,76.63,19.137488,23.021797,5872.1569,0.300428


#### 컬럼 추가(isTuna) - 참치: 1, 나머지: 0

In [5]:
df['isTuna'] = df['Type'].apply(lambda x: 1 if x == 'tuna' else 0)

#### 컬럼 추가( TypeNum) - 참치: 0, 연어: 1, 고등어:2

In [6]:
df['TypeNum'] = df['Type'].apply(lambda x: 0 if x == 'tuna' else 1 if x == 'salmon' else 2)
df

Unnamed: 0,Type,Length,Depth,Kg,L2,LKgRatio,isTuna,TypeNum
0,tuna,55.46,13.240930,8.631268,3075.8116,0.155631,1,0
1,tuna,72.07,17.671346,19.288996,5194.0849,0.267643,1,0
2,tuna,62.84,15.635367,13.140561,3948.8656,0.209111,1,0
3,tuna,72.79,18.440841,19.669098,5298.3841,0.270217,1,0
4,tuna,76.63,19.137488,23.021797,5872.1569,0.300428,1,0
5,tuna,61.31,14.870117,12.012898,3758.9161,0.195937,1,0
6,tuna,73.56,18.334414,21.763799,5411.0736,0.295865,1,0
7,tuna,65.88,16.436831,14.398289,4340.1744,0.218553,1,0
8,tuna,63.03,15.390727,13.246244,3972.7809,0.210158,1,0
9,tuna,71.92,17.844251,19.424579,5172.4864,0.270086,1,0


# 1. Logistic Regression

In [7]:
X = df.loc[:,['Length','Depth']]
Y = df['isTuna']

In [8]:
# 데이터 분리: 학습 데이터 + 테스트 데이터
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.3 ,random_state=123)

In [9]:
from sklearn.linear_model import LogisticRegression
# 모델 생성
model = LogisticRegression(solver='lbfgs')

In [10]:
model

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False)

In [11]:
# 모델 학습
model.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False)

In [12]:
# 결과 예측
prediction1 = model.predict(X_test)
prediction1

array([1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1,
       1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1,
       0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0,
       0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0,
       0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0,
       1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0,

In [13]:
prediction1[0]

1

In [14]:
prediction1[100]

0

In [15]:
pd_predic = pd.DataFrame(prediction1)
pd_predic.head()

Unnamed: 0,0
0,1
1,0
2,0
3,0
4,1


In [16]:
# 정확도 확인
from sklearn.metrics import accuracy_score
print(accuracy_score(prediction1, y_test))

0.8622222222222222


In [17]:
from sklearn import metrics

In [18]:
# Score - precision, recall, f1-score
print(metrics.classification_report(y_test, prediction1))

              precision    recall  f1-score   support

           0       0.90      0.89      0.90       304
           1       0.78      0.80      0.79       146

   micro avg       0.86      0.86      0.86       450
   macro avg       0.84      0.85      0.84       450
weighted avg       0.86      0.86      0.86       450



In [19]:
# Confusion Matrix 빈도수 만들기
pd.crosstab(prediction1, y_test, margins=True)

isTuna,0,1,All
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,271,29,300
1,33,117,150
All,304,146,450


---

# 2. Support Vector Machine

In [20]:
df.head()

Unnamed: 0,Type,Length,Depth,Kg,L2,LKgRatio,isTuna,TypeNum
0,tuna,55.46,13.24093,8.631268,3075.8116,0.155631,1,0
1,tuna,72.07,17.671346,19.288996,5194.0849,0.267643,1,0
2,tuna,62.84,15.635367,13.140561,3948.8656,0.209111,1,0
3,tuna,72.79,18.440841,19.669098,5298.3841,0.270217,1,0
4,tuna,76.63,19.137488,23.021797,5872.1569,0.300428,1,0


In [21]:
X = df.loc[:,['Length','Depth']]
Y = df['TypeNum']

In [22]:
# 데이터 분리: 학습 데이터 + 테스트 데이터
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.3 ,random_state=123)

In [23]:
from sklearn.svm import SVC
# 모델 생성
model = SVC(kernel='linear', C=0.1, gamma=0.1)

In [24]:
# 모델 학습
model.fit(X_train, y_train)

SVC(C=0.1, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=0.1, kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [25]:
# 결과 예측
prediction2 = model.predict(X_test)
prediction2

array([0, 2, 1, 2, 0, 0, 2, 1, 1, 1, 1, 2, 2, 0, 0, 0, 2, 0, 1, 1, 2, 0,
       2, 0, 0, 1, 2, 1, 2, 1, 2, 2, 0, 1, 1, 0, 0, 1, 2, 0, 0, 0, 0, 0,
       2, 2, 1, 2, 2, 0, 1, 0, 1, 0, 0, 0, 0, 2, 2, 1, 1, 0, 1, 1, 2, 0,
       0, 2, 0, 1, 2, 2, 0, 1, 2, 1, 0, 2, 1, 1, 0, 2, 1, 0, 2, 2, 0, 1,
       2, 1, 2, 2, 0, 0, 1, 2, 0, 1, 0, 1, 2, 2, 0, 1, 2, 2, 1, 1, 0, 0,
       1, 2, 1, 2, 1, 1, 2, 1, 0, 0, 1, 1, 2, 1, 0, 1, 1, 1, 2, 0, 2, 0,
       2, 1, 1, 0, 2, 1, 1, 0, 2, 2, 2, 1, 1, 1, 0, 0, 2, 1, 2, 1, 2, 2,
       2, 0, 1, 2, 1, 1, 2, 1, 1, 0, 0, 1, 0, 0, 2, 0, 2, 1, 0, 1, 1, 2,
       1, 2, 1, 2, 0, 0, 0, 1, 1, 1, 0, 2, 1, 2, 0, 1, 0, 1, 1, 2, 0, 2,
       1, 2, 0, 1, 1, 0, 2, 2, 2, 2, 1, 1, 2, 2, 2, 1, 2, 1, 2, 0, 1, 1,
       2, 1, 0, 0, 1, 2, 0, 1, 0, 2, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 2,
       1, 2, 1, 0, 0, 2, 2, 2, 0, 2, 2, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 2,
       0, 2, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 2, 1, 2, 1, 1, 1, 0, 0, 1,
       2, 1, 1, 0, 2, 1, 2, 0, 0, 0, 0, 2, 0, 2, 1,

In [26]:
# 정확도 확인
from sklearn.metrics import accuracy_score
print(accuracy_score(prediction2, y_test))

0.8622222222222222


In [27]:
# Score - precision, recall, f1-score
print(metrics.classification_report(y_test, prediction2))

              precision    recall  f1-score   support

           0       0.78      0.81      0.79       146
           1       0.82      0.79      0.81       163
           2       1.00      1.00      1.00       141

   micro avg       0.86      0.86      0.86       450
   macro avg       0.87      0.87      0.87       450
weighted avg       0.86      0.86      0.86       450



## 2.1 Support Vector Machine - PCA

In [28]:
df.head()

Unnamed: 0,Type,Length,Depth,Kg,L2,LKgRatio,isTuna,TypeNum
0,tuna,55.46,13.24093,8.631268,3075.8116,0.155631,1,0
1,tuna,72.07,17.671346,19.288996,5194.0849,0.267643,1,0
2,tuna,62.84,15.635367,13.140561,3948.8656,0.209111,1,0
3,tuna,72.79,18.440841,19.669098,5298.3841,0.270217,1,0
4,tuna,76.63,19.137488,23.021797,5872.1569,0.300428,1,0


In [29]:
X = df.loc[:,['Length','Depth','Kg','L2','LKgRatio']]
Y = df['TypeNum']

In [30]:
# 데이터 분리: 학습 데이터 + 테스트 데이터
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.3 ,random_state=123)
X_train.head()

Unnamed: 0,Length,Depth,Kg,L2,LKgRatio
891,62.69,15.79747,7.021431,3930.0361,0.112002
1385,39.07,7.885989,1.343569,1526.4649,0.034389
1186,42.02,8.244949,1.960338,1765.6804,0.046652
489,68.43,16.813894,15.929744,4682.6649,0.232789
1236,43.48,8.976522,1.708537,1890.5104,0.039295


In [65]:
from sklearn.decomposition import PCA
# PCA 모델 생성
pca = PCA(n_components=2)

# PCA Transform
X_train = pca.fit_transform(X_train)


In [69]:
X_train

array([[ 5.79067539e+02, -4.40272644e+00],
       [-1.82463222e+03,  1.41413213e+00],
       [-1.58539940e+03,  6.90576262e-01],
       ...,
       [-1.92679811e+03,  1.82125134e+00],
       [-1.78926438e+03,  1.13508363e+00],
       [-9.73507159e+02, -1.78337838e+00]])

In [66]:
X_train.shape

(1050, 2)

In [67]:
X_test = pca.transform(X_test)

In [70]:
X_test.shape

(450, 2)

In [35]:
from sklearn.svm import SVC
# 모델 생성
model = SVC(kernel='linear', C=0.1, gamma=0.1)

# 모델 학습
#model.fit(X_train, y_train.values.ravel())
model.fit(X_train, y_train)

# 결과 예측
prediction2 = model.predict(X_test)
prediction2

from sklearn.metrics import accuracy_score
# 정확도 확인
print(accuracy_score(prediction2, y_test))

0.9844444444444445


---

# 3. Decision Tree

In [36]:
X = df.loc[:,['Length','Depth','Kg','L2','LKgRatio']]
Y = df['TypeNum']

In [37]:
# 데이터 분리: 학습 데이터 + 테스트 데이터
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.3 ,random_state=123)

In [38]:
from sklearn.tree import DecisionTreeClassifier
# 모델 생성
model = DecisionTreeClassifier()

In [43]:
# 모델 학습
model.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [44]:
# 결과 예측
prediction3 = model.predict(X_test)
prediction3

array([1, 2, 1, 2, 0, 0, 2, 1, 1, 1, 1, 2, 2, 0, 0, 0, 2, 0, 1, 1, 2, 0,
       2, 0, 1, 0, 2, 1, 2, 1, 2, 2, 1, 0, 1, 0, 0, 1, 2, 1, 0, 0, 0, 1,
       2, 2, 1, 2, 2, 0, 0, 1, 1, 0, 1, 0, 0, 2, 2, 1, 1, 0, 1, 1, 2, 0,
       1, 2, 0, 1, 2, 2, 0, 1, 2, 1, 0, 2, 1, 0, 1, 2, 0, 0, 2, 2, 0, 1,
       2, 1, 2, 2, 0, 0, 1, 2, 0, 1, 1, 1, 2, 2, 0, 1, 2, 2, 1, 1, 0, 1,
       1, 2, 0, 2, 1, 1, 2, 1, 0, 0, 0, 0, 2, 1, 0, 0, 0, 1, 2, 1, 2, 0,
       2, 1, 1, 0, 2, 1, 1, 0, 2, 2, 2, 1, 1, 1, 0, 0, 2, 0, 2, 1, 2, 2,
       2, 1, 1, 2, 1, 1, 2, 1, 1, 0, 0, 1, 0, 0, 2, 1, 2, 1, 0, 1, 1, 2,
       1, 2, 1, 2, 0, 1, 0, 1, 1, 1, 1, 2, 1, 2, 0, 0, 0, 1, 1, 2, 1, 2,
       1, 2, 0, 1, 1, 0, 2, 2, 2, 2, 1, 1, 2, 2, 2, 1, 2, 1, 2, 0, 1, 1,
       2, 0, 0, 0, 0, 2, 0, 1, 0, 2, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 2,
       1, 2, 1, 0, 0, 2, 2, 2, 0, 2, 2, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 2,
       0, 2, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 2, 1, 2, 0, 1, 1, 0, 0, 1,
       2, 0, 1, 0, 2, 1, 2, 0, 0, 0, 0, 2, 0, 2, 1,

In [45]:
model.predict_proba(X_test)

array([[0., 1., 0.],
       [0., 0., 1.],
       [0., 1., 0.],
       ...,
       [1., 0., 0.],
       [0., 0., 1.],
       [1., 0., 0.]])

In [46]:
from sklearn.metrics import accuracy_score
print(accuracy_score(prediction3, y_test))

1.0


---

# 4. Random Forest

In [47]:
from sklearn.ensemble import RandomForestClassifier
# 모델 생성
model = RandomForestClassifier(n_estimators=340)

In [55]:
RandomForestClassifier?

In [48]:
# 모델 학습
model.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=340, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [49]:
# 결과 예측
prediction4 = model.predict(X_test)
prediction4

array([1, 2, 1, 2, 0, 0, 2, 1, 1, 1, 1, 2, 2, 0, 0, 0, 2, 0, 1, 1, 2, 0,
       2, 0, 1, 0, 2, 1, 2, 1, 2, 2, 1, 0, 1, 0, 0, 1, 2, 1, 0, 0, 0, 1,
       2, 2, 1, 2, 2, 0, 0, 1, 1, 0, 1, 0, 0, 2, 2, 1, 1, 0, 1, 1, 2, 0,
       1, 2, 0, 1, 2, 2, 0, 1, 2, 1, 0, 2, 1, 0, 1, 2, 0, 0, 2, 2, 0, 1,
       2, 1, 2, 2, 0, 0, 1, 2, 0, 1, 1, 1, 2, 2, 0, 1, 2, 2, 1, 1, 0, 1,
       1, 2, 0, 2, 1, 1, 2, 1, 0, 0, 0, 0, 2, 1, 0, 0, 0, 1, 2, 1, 2, 0,
       2, 1, 1, 0, 2, 1, 1, 0, 2, 2, 2, 1, 1, 1, 0, 0, 2, 0, 2, 1, 2, 2,
       2, 1, 1, 2, 1, 1, 2, 1, 1, 0, 0, 1, 0, 0, 2, 1, 2, 1, 0, 1, 1, 2,
       1, 2, 1, 2, 0, 1, 0, 1, 1, 1, 1, 2, 1, 2, 0, 0, 0, 1, 1, 2, 1, 2,
       1, 2, 0, 1, 1, 0, 2, 2, 2, 2, 1, 1, 2, 2, 2, 1, 2, 1, 2, 0, 1, 1,
       2, 0, 0, 0, 0, 2, 0, 1, 0, 2, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 2,
       1, 2, 1, 0, 0, 2, 2, 2, 0, 2, 2, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 2,
       0, 2, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 2, 1, 2, 0, 1, 1, 0, 0, 1,
       2, 0, 1, 0, 2, 1, 2, 0, 0, 0, 0, 2, 0, 2, 1,

In [50]:
# 정확도 확인
print(accuracy_score(prediction4, y_test))

0.9977777777777778


---

# 4. KMeans Clustering

In [3]:
df = pd.read_csv("data/iris.csv")
df.head()

Unnamed: 0,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width,Species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [4]:
df['Species'].value_counts()

setosa        50
versicolor    50
virginica     50
Name: Species, dtype: int64

In [5]:
X = df.iloc[:,0:4]

In [6]:
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters=3, random_state=0).fit(X)
kmeans.labels_

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 2, 2, 2, 2, 0, 2, 2, 2,
       2, 2, 2, 0, 0, 2, 2, 2, 2, 0, 2, 0, 2, 0, 2, 2, 0, 0, 2, 2, 2, 2,
       2, 0, 2, 2, 2, 2, 0, 2, 2, 2, 0, 2, 2, 2, 0, 2, 2, 0])

In [8]:
dir(kmeans)

['__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_check_fit_data',
 '_check_test_data',
 '_estimator_type',
 '_get_param_names',
 '_transform',
 'algorithm',
 'cluster_centers_',
 'copy_x',
 'fit',
 'fit_predict',
 'fit_transform',
 'get_params',
 'inertia_',
 'init',
 'labels_',
 'max_iter',
 'n_clusters',
 'n_init',
 'n_iter_',
 'n_jobs',
 'precompute_distances',
 'predict',
 'random_state',
 'score',
 'set_params',
 'tol',
 'transform',
 'verbose']

In [59]:
kmeans.predict(X)

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 2, 2, 2, 2, 0, 2, 2, 2,
       2, 2, 2, 0, 0, 2, 2, 2, 2, 0, 2, 0, 2, 0, 2, 2, 0, 0, 2, 2, 2, 2,
       2, 0, 2, 2, 2, 2, 0, 2, 2, 2, 0, 2, 2, 2, 0, 2, 2, 0])

In [86]:
kmeans.cluster_centers_

array([[5.9016129 , 2.7483871 , 4.39354839, 1.43387097],
       [5.006     , 3.428     , 1.462     , 0.246     ],
       [6.85      , 3.07368421, 5.74210526, 2.07105263]])

# 5. Nearest Neighbors

In [63]:
from sklearn.neighbors import NearestNeighbors
neigh = NearestNeighbors(n_neighbors=2, algorithm='kd_tree').fit(X)

In [78]:
X.head()

Unnamed: 0,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [81]:
X.iloc[39]

Sepal.Length    5.1
Sepal.Width     3.4
Petal.Length    1.5
Petal.Width     0.2
Name: 39, dtype: float64

In [112]:
A=neigh.kneighbors(X, 5, return_distance=True)

In [98]:
A[100]

array([100, 136, 144, 104, 143], dtype=int64)

In [115]:
A[0][100]

array([0.        , 0.42426407, 0.5       , 0.50990195, 0.55677644])

array([[0.        , 0.1       , 0.14142136, 0.14142136, 0.14142136],
       [0.        , 0.14142136, 0.14142136, 0.14142136, 0.17320508],
       [0.        , 0.14142136, 0.24494897, 0.26457513, 0.26457513],
       [0.        , 0.14142136, 0.17320508, 0.2236068 , 0.24494897],
       [0.        , 0.14142136, 0.14142136, 0.17320508, 0.17320508],
       [0.        , 0.33166248, 0.34641016, 0.36055513, 0.37416574],
       [0.        , 0.2236068 , 0.26457513, 0.3       , 0.31622777],
       [0.        , 0.1       , 0.14142136, 0.17320508, 0.2       ],
       [0.        , 0.14142136, 0.3       , 0.31622777, 0.34641016],
       [0.        , 0.1       , 0.17320508, 0.17320508, 0.17320508],
       [0.        , 0.1       , 0.28284271, 0.3       , 0.33166248],
       [0.        , 0.2236068 , 0.2236068 , 0.28284271, 0.3       ],
       [0.        , 0.14142136, 0.17320508, 0.2       , 0.2       ],
       [0.        , 0.24494897, 0.31622777, 0.34641016, 0.47958315],
       [0.        , 0.41231056, 0.

In [110]:
A

(array([[0.        , 0.1       , 0.14142136, 0.14142136, 0.14142136],
        [0.        , 0.14142136, 0.14142136, 0.14142136, 0.17320508],
        [0.        , 0.14142136, 0.24494897, 0.26457513, 0.26457513],
        [0.        , 0.14142136, 0.17320508, 0.2236068 , 0.24494897],
        [0.        , 0.14142136, 0.14142136, 0.17320508, 0.17320508],
        [0.        , 0.33166248, 0.34641016, 0.36055513, 0.37416574],
        [0.        , 0.2236068 , 0.26457513, 0.3       , 0.31622777],
        [0.        , 0.1       , 0.14142136, 0.17320508, 0.2       ],
        [0.        , 0.14142136, 0.3       , 0.31622777, 0.34641016],
        [0.        , 0.1       , 0.17320508, 0.17320508, 0.17320508],
        [0.        , 0.1       , 0.28284271, 0.3       , 0.33166248],
        [0.        , 0.2236068 , 0.2236068 , 0.28284271, 0.3       ],
        [0.        , 0.14142136, 0.17320508, 0.2       , 0.2       ],
        [0.        , 0.24494897, 0.31622777, 0.34641016, 0.47958315],
        [0.        ,

In [None]:
# end of file