# K-최근접 이웃 분류 확률

In [3]:
import pandas as pd

# Read the fish measurement table from the remote CSV and preview its first rows.
FISH_CSV_URL = 'https://bit.ly/fish_csv_data'
fish = pd.read_csv(FISH_CSV_URL)
fish.head()

Unnamed: 0,Species,Weight,Length,Diagonal,Height,Width
0,Bream,242.0,25.4,30.0,11.52,4.02
1,Bream,290.0,26.3,31.2,12.48,4.3056
2,Bream,340.0,26.5,31.1,12.3778,4.6961
3,Bream,363.0,29.0,33.5,12.73,4.4555
4,Bream,430.0,29.0,34.0,12.444,5.134


In [4]:
# Input features: the five measurement columns, converted to a NumPy array.
feature_columns = ['Weight', 'Length', 'Diagonal', 'Height', 'Width']
fish_input = fish[feature_columns].to_numpy()

In [6]:
# Target labels: the 'Species' column, converted to a NumPy array.
fish_target = fish.loc[:, 'Species'].to_numpy()

In [7]:
# Split inputs and targets into training and test sets.
# random_state is fixed so the same split is produced on every run.
from sklearn.model_selection import train_test_split

train_input, test_input, train_target, test_target = train_test_split(
    fish_input,
    fish_target,
    random_state=42,
)

In [9]:
# Standardization preprocessing.
# The scaler is fitted on the training inputs only; that same fitted scaler is
# then applied to both the training and the test inputs.
from sklearn.preprocessing import StandardScaler

ss = StandardScaler().fit(train_input)
train_scaled = ss.transform(train_input)
test_scaled = ss.transform(test_input)

In [11]:
# Build a k-nearest-neighbors classifier (k = 3) on the standardized training data.
from sklearn.neighbors import KNeighborsClassifier

kn = KNeighborsClassifier(n_neighbors=3)
kn.fit(train_scaled, train_target)

# Print the score on the training set first, then on the test set.
for features, labels in ((train_scaled, train_target), (test_scaled, test_target)):
    print(kn.score(features, labels))

0.8907563025210085
0.85


In [24]:
# Print predicted class probabilities.
import numpy as np

# For the first 10 test samples and then the first 10 training samples, print:
#   1. the class order used for the probability columns,
#   2. the per-class probabilities rounded to 4 decimals,
#   3. the true labels of those samples.
for sample_inputs, sample_labels in ((test_scaled, test_target), (train_scaled, train_target)):
    proba = kn.predict_proba(sample_inputs[:10])
    print(kn.classes_)
    print(np.round(proba, decimals=4))
    print(sample_labels[:10])

['Bream' 'Parkki' 'Perch' 'Pike' 'Roach' 'Smelt' 'Whitefish']
[[0.     0.     1.     0.     0.     0.     0.    ]
 [0.     0.     0.     0.     0.     1.     0.    ]
 [0.     0.     0.     1.     0.     0.     0.    ]
 [0.     0.     0.6667 0.     0.3333 0.     0.    ]
 [0.     0.     0.6667 0.     0.3333 0.     0.    ]
 [1.     0.     0.     0.     0.     0.     0.    ]
 [0.     0.     0.     0.     0.     1.     0.    ]
 [0.     0.     0.3333 0.     0.6667 0.     0.    ]
 [0.     0.     0.6667 0.     0.3333 0.     0.    ]
 [0.     0.     0.     1.     0.     0.     0.    ]]
['Perch' 'Smelt' 'Pike' 'Whitefish' 'Perch' 'Bream' 'Smelt' 'Roach'
 'Perch' 'Pike']
['Bream' 'Parkki' 'Perch' 'Pike' 'Roach' 'Smelt' 'Whitefish']
[[1.     0.     0.     0.     0.     0.     0.    ]
 [0.     0.     0.     1.     0.     0.     0.    ]
 [0.     0.     0.     0.     0.     1.     0.    ]
 [0.     0.     0.6667 0.     0.3333 0.     0.    ]
 [0.     1.     0.     0.     0.     0.     0.    ]
 [0.     0

In [22]:
# Find the nearest training neighbors of the fourth test sample and print their labels.
# test_scaled[3:4] is used rather than test_scaled[3] because the slice stays a
# 2-D array (a single row), whereas test_scaled[3] would be a 1-D array.
fourth_sample = test_scaled[3:4]
indexes = kn.kneighbors(fourth_sample, return_distance=False)
print(train_target[indexes])

[['Roach' 'Perch' 'Perch']]


# 로지스틱 회귀(이진 분류)

In [28]:
# Keep only the Bream and Smelt samples from the training set.
is_bream = train_target == 'Bream'
is_smelt = train_target == 'Smelt'
bream_smelt_indexes = is_bream | is_smelt  # boolean mask over the training rows
train_bream_smelt = train_scaled[bream_smelt_indexes]
target_bream_smelt = train_target[bream_smelt_indexes]

In [30]:
from sklearn.linear_model import LogisticRegression

# Fit a logistic-regression classifier (default settings) on the Bream/Smelt subset.
# The fit call is left as the final expression so the cell displays the estimator.
lr = LogisticRegression()
lr.fit(X=train_bream_smelt, y=target_bream_smelt)

LogisticRegression()

In [32]:
# Predicted class labels for the first five Bream/Smelt training samples.
first_five = train_bream_smelt[:5]
print(lr.predict(first_five))

['Bream' 'Smelt' 'Bream' 'Bream' 'Bream']


In [34]:
# Print the class order, then the per-class probabilities of the first five
# samples; each probability column lines up with the class at the same position.
print(lr.classes_)
sample_proba = lr.predict_proba(train_bream_smelt[:5])
print(sample_proba)

['Bream' 'Smelt']
[[0.99759855 0.00240145]
 [0.02735183 0.97264817]
 [0.99486072 0.00513928]
 [0.98584202 0.01415798]
 [0.99767269 0.00232731]]


In [37]:
# Show the learned coefficients and the intercept of the fitted model.
coefficients, intercept = lr.coef_, lr.intercept_
print(coefficients, intercept)

[[-0.4037798  -0.57620209 -0.66280298 -1.01290277 -0.73168947]] [-2.16155132]


In [40]:
from scipy.special import expit

# Take the model's decision-function values for the first five samples, pass
# them through the logistic sigmoid (expit), and print the result as percentages.
decisions = lr.decision_function(train_bream_smelt[:5])
percentages = expit(decisions) * 100
print(percentages)

[ 0.24014519 97.26481722  0.51392835  1.41579793  0.23273111]
