# K-Nearest Neighbors


## 1.환경준비

### (1) Import

In [2]:
#라이브러리들을 불러오자.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split

### (2) data loading

In [3]:
# Location of the Boston housing CSV; it is read straight from the URL into a DataFrame.
path = 'https://raw.githubusercontent.com/DA4BAM/dataset/master/boston.csv'
data = pd.read_csv(path)

|	변수	|	설명	|
|	----	|	----	|
|	**medv**	|	**타운별 집값(중위수), target**	|
|	crim	|	범죄율	|
|	zn	|	25,000 평방피트를 초과한 거주지역 비율	|
|	indus	|	비소매상업지역 면적 비율, 편의시설(관공서, 주요 시설)	|
|	chas	|	찰스강변 위치(범주 : 강변1, 아니면 0)	|
|	nox	|	일산화질소 농도	|
|	rm	|	주택당 방 수	|
|	age	|	1940년 이전에 건축된 주택의 비율	|
|	dis	|	직업센터의 거리	|
|	rad	|	방사형 고속도로까지의 거리	|
|	tax	|	재산세율	|
|	ptratio	|	학생/교사 비율	|
|	black	|	인구 중 흑인 비율	|
|	lstat	|	인구 중 하위 계층 비율	|


## 2.데이터 이해

### (1) 둘러보기

In [4]:
# Preview the first few rows (head() shows the top rows only; use tail() for the bottom).
data.head()

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,black,lstat,medv
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,5.33,36.2


In [5]:
# Inspect each column's dtype and non-null count.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 14 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   crim     506 non-null    float64
 1   zn       506 non-null    float64
 2   indus    506 non-null    float64
 3   chas     506 non-null    int64  
 4   nox      506 non-null    float64
 5   rm       506 non-null    float64
 6   age      506 non-null    float64
 7   dis      506 non-null    float64
 8   rad      506 non-null    int64  
 9   tax      506 non-null    int64  
 10  ptratio  506 non-null    float64
 11  black    506 non-null    float64
 12  lstat    506 non-null    float64
 13  medv     506 non-null    float64
dtypes: float64(11), int64(3)
memory usage: 55.5 KB


## 3.데이터 준비

### (1) 데이터 정리

In [6]:
# Look at the first rows again before cleaning up columns.
data.head()

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,black,lstat,medv
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,5.33,36.2


In [7]:
# Remove the 'black' column from the frame in place.
data.drop(columns=['black'], inplace=True)

### (2) 데이터분할1 : x, y 나누기

In [8]:
# Separate the label column (y) from the remaining feature columns (x).
target = 'medv'
y = data[target]
x = data.drop(columns=[target])

### (3) NA 조치

### (4) 가변수화

### (5) 데이터분할2 : train : validation 나누기

In [9]:
# Hold out 30% of the rows for validation; the fixed seed keeps the split reproducible.
x_train, x_val, y_train, y_val = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=2022,
)

### (6) Scaling
KNN 알고리즘을 적용하기 위해서는 스케일링을 해야 합니다.

In [10]:
from sklearn.preprocessing import MinMaxScaler

# Min-max scaling: learn the per-column range from the training split only,
# then apply that same mapping to both splits.
scaler = MinMaxScaler()
scaler.fit(x_train)
x_train_s1 = scaler.transform(x_train)
x_val_s1 = scaler.transform(x_val)

In [11]:
from sklearn.preprocessing import StandardScaler

# Standardization: the mean/std come from the training split only and are
# reused unchanged for the validation split.
scaler2 = StandardScaler().fit(x_train)
x_train_s2 = scaler2.transform(x_train)
x_val_s2 = scaler2.transform(x_val)

In [12]:
# Wrap the scaled arrays back into DataFrames so describe() can be used on them.
# The validation arrays get the same column labels as the training ones: a model
# fitted on a DataFrame and then asked to predict on a bare ndarray makes
# scikit-learn warn that "X does not have valid feature names".
feature_names = list(x)
x_train_s1 = pd.DataFrame(x_train_s1, columns=feature_names)
x_train_s2 = pd.DataFrame(x_train_s2, columns=feature_names)
x_val_s1 = pd.DataFrame(x_val_s1, columns=feature_names)
x_val_s2 = pd.DataFrame(x_val_s2, columns=feature_names)

In [13]:
# Summary statistics of all feature columns before the train/validation split.
x.describe()

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,lstat
count,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0
mean,3.613524,11.363636,11.136779,0.06917,0.554695,6.284634,68.574901,3.795043,9.549407,408.237154,18.455534,12.653063
std,8.601545,23.322453,6.860353,0.253994,0.115878,0.702617,28.148861,2.10571,8.707259,168.537116,2.164946,7.141062
min,0.00632,0.0,0.46,0.0,0.385,3.561,2.9,1.1296,1.0,187.0,12.6,1.73
25%,0.082045,0.0,5.19,0.0,0.449,5.8855,45.025,2.100175,4.0,279.0,17.4,6.95
50%,0.25651,0.0,9.69,0.0,0.538,6.2085,77.5,3.20745,5.0,330.0,19.05,11.36
75%,3.677083,12.5,18.1,0.0,0.624,6.6235,94.075,5.188425,24.0,666.0,20.2,16.955
max,88.9762,100.0,27.74,1.0,0.871,8.78,100.0,12.1265,24.0,711.0,22.0,37.97


In [14]:
# Summary statistics of the unscaled training features.
x_train.describe()

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,lstat
count,354.0,354.0,354.0,354.0,354.0,354.0,354.0,354.0,354.0,354.0,354.0,354.0
mean,3.204705,11.478814,10.756073,0.070621,0.54853,6.309133,67.485311,3.900144,9.039548,397.79096,18.373164,12.129209
std,8.554879,22.969625,6.894126,0.256554,0.115102,0.689033,27.894307,2.113877,8.41344,163.905474,2.191546,6.739158
min,0.00632,0.0,0.46,0.0,0.389,3.863,6.0,1.1296,1.0,188.0,12.6,1.73
25%,0.071848,0.0,4.935,0.0,0.448,5.8895,42.95,2.16825,4.0,277.0,17.0,6.735
50%,0.219655,0.0,8.35,0.0,0.524,6.209,74.85,3.3618,5.0,329.5,18.85,10.925
75%,2.58828,20.0,18.1,0.0,0.614,6.61875,93.875,5.226975,8.0,437.0,20.2,15.82
max,88.9762,100.0,27.74,1.0,0.871,8.725,100.0,10.7103,24.0,711.0,22.0,37.97


In [15]:
# Summary statistics of the min-max-scaled training features.
x_train_s1.describe()

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,lstat
count,354.0,354.0,354.0,354.0,354.0,354.0,354.0,354.0,354.0,354.0,354.0,354.0
mean,0.035949,0.114788,0.377422,0.070621,0.330976,0.503112,0.654099,0.28918,0.349546,0.40113,0.614166,0.286954
std,0.096155,0.229696,0.252717,0.256554,0.238801,0.141718,0.296748,0.220639,0.365802,0.313395,0.233143,0.185959
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.000737,0.0,0.16404,0.0,0.122407,0.416804,0.393085,0.108411,0.130435,0.170172,0.468085,0.138107
50%,0.002398,0.0,0.289223,0.0,0.280083,0.482517,0.732447,0.232989,0.173913,0.270554,0.664894,0.253725
75%,0.029021,0.2,0.646628,0.0,0.466805,0.566794,0.93484,0.42767,0.304348,0.476099,0.808511,0.388797
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [16]:
# Summary statistics of the standardized training features.
x_train_s2.describe()

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,lstat
count,354.0,354.0,354.0,354.0,354.0,354.0,354.0,354.0,354.0,354.0,354.0,354.0
mean,-1.3799380000000001e-17,7.02514e-17,8.969598e-17,-1.3328950000000002e-17,-3.048409e-16,-6.443371e-16,-1.79392e-16,6.711518000000001e-17,-1.1290400000000001e-17,-5.0179570000000006e-17,3.070363e-16,2.195356e-16
std,1.001415,1.001415,1.001415,1.001415,1.001415,1.001415,1.001415,1.001415,1.001415,1.001415,1.001415,1.001415
min,-0.374396,-0.5004462,-1.49557,-0.2756589,-1.387952,-3.55512,-2.207344,-1.312501,-0.9569127,-1.281763,-2.638016,-1.545286
25%,-0.3667255,-0.5004462,-0.8455478,-0.2756589,-0.8746373,-0.6098789,-0.8808263,-0.8204568,-0.5998357,-0.7379981,-0.6274599,-0.8015607
50%,-0.3494235,-0.5004462,-0.3494974,-0.2756589,-0.2134189,-0.1455293,0.2643949,-0.2550316,-0.48081,-0.4172382,0.2178877,-0.1789413
75%,-0.07215742,0.3715014,1.066752,-0.2756589,0.5696029,0.4499863,0.9473991,0.6285651,-0.1237329,0.239556,0.834763,0.5484387
max,10.04022,3.859292,2.467023,3.627671,2.805565,3.511133,1.167289,3.226202,1.780678,1.913617,1.657263,3.839852


## 4.모델링 : KNN

### (1) import

In [17]:
# 모델링용
from sklearn.neighbors import KNeighborsRegressor    

# 회귀모델 평가용
from sklearn.metrics import * 

### (2) 모델선언

In [18]:
model = KNeighborsRegressor() # k (n_neighbors) defaults to 5

In [19]:
# (rows, columns) of the min-max-scaled training features.
x_train_s1.shape

(354, 12)

In [20]:
# (rows, columns) of the min-max-scaled validation features.
x_val_s1.shape

(152, 12)

### (3) 모델링(학습)

In [21]:
# Fit the KNN regressor on the min-max-scaled training features.
model.fit(x_train_s1,y_train)

KNeighborsRegressor()

### (4) 검증 : 예측

In [22]:
# Predict the target for the min-max-scaled validation features and display the result.
pred = model.predict(x_val_s1)
pred

array([19.5 , 13.88, 18.88, 30.2 , 14.92, 14.24, 24.06, 16.22,  8.72,
       19.08, 23.02, 22.82, 19.56, 16.2 , 19.66, 29.46, 10.56, 13.82,
       17.6 , 20.44, 21.8 , 19.68, 11.84, 34.54, 32.24, 43.84, 24.66,
       18.74, 19.34, 26.08, 23.  , 23.06, 22.02, 19.48, 11.2 , 13.14,
       14.34, 19.7 , 14.66, 32.44, 10.46, 11.9 , 18.38,  8.94, 14.24,
       29.42, 19.78, 12.5 , 14.76, 17.98, 22.4 , 11.18, 27.28, 29.24,
       24.02, 19.2 , 24.2 , 25.58, 25.46, 20.  , 14.8 , 14.02, 14.24,
       27.62, 21.8 , 18.52, 26.14, 20.16, 26.6 , 17.62, 13.3 , 23.26,
       11.32, 23.02, 24.98, 15.88, 15.3 , 10.28, 11.04, 24.86, 15.88,
       21.34, 18.6 , 33.82, 15.56, 11.82, 24.06, 21.46, 13.14, 34.8 ,
       21.34, 23.16, 23.16, 14.24, 14.1 , 24.18, 24.6 , 15.86, 13.38,
       22.64, 29.1 , 13.38, 40.92, 22.84, 21.42, 16.64, 41.16, 23.2 ,
       15.3 , 30.08, 15.7 , 12.48, 10.02, 21.2 , 24.72, 24.1 , 22.4 ,
       15.86, 18.92, 30.86, 31.34, 21.92, 24.66, 21.1 , 21.5 , 22.66,
        9.76, 21.66,

### (5) 검증 : 평가

In [23]:
# RMSE: square root of the mean squared error.
# np.sqrt(...) replaces the `squared=False` argument, which scikit-learn
# deprecated in 1.4 and removed in 1.6 (where it raises TypeError).
np.sqrt(mean_squared_error(y_val, pred))

4.227564933050456

In [24]:
# MAE: mean absolute error between the validation targets and the predictions.
mean_absolute_error(y_val,pred)

3.035

In [25]:
# MAPE: mean absolute percentage error (average relative error).
mean_absolute_percentage_error(y_val,pred)

0.16339021011462934

In [26]:
# 1 - MAPE: used here as an accuracy-like score.
1 - mean_absolute_percentage_error(y_val,pred)

0.8366097898853706

## 5.Hyper Parameter

https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsRegressor.html

* n_neighbors : k 의 갯수. k가 달라지면 예측결과도 달라지고, 성능도 달라집니다!
* metric : 거리계산 방식.
    * euclidean : 유클리디안 거리 :  sqrt(a^2 + b^2)
    * manhattan : 맨하탄거리 : |a| + |b|

In [27]:
# model1: 10 neighbours with Euclidean distance, trained on the min-max-scaled features.
model1 = KNeighborsRegressor(metric='euclidean', n_neighbors=10).fit(x_train_s1, y_train)
pred1 = model1.predict(x_val_s1)

In [28]:
# model2: 10 neighbours with Manhattan distance, trained on the min-max-scaled features.
model2 = KNeighborsRegressor(metric='manhattan', n_neighbors=10).fit(x_train_s1, y_train)
pred2 = model2.predict(x_val_s1)

In [29]:
# Compare the RMSE of the two distance metrics.
# RMSE is computed as np.sqrt(MSE) because the `squared=False` argument was
# deprecated in scikit-learn 1.4 and removed in 1.6.
print(f'metric = euclidean : rmse {np.sqrt(mean_squared_error(y_val, pred1))}')
print(f'metric = manhattan : rmse {np.sqrt(mean_squared_error(y_val, pred2))}')

metric = euclidean : rmse 4.841955074901916
metric = manhattan : rmse 4.420933680260671


In [30]:
# Compare the mean absolute error of the two distance metrics.
mae_euclidean = mean_absolute_error(y_val, pred1)
mae_manhattan = mean_absolute_error(y_val, pred2)
print(f'metric = euclidean : mae {mae_euclidean}')
print(f'metric = manhattan : mae {mae_manhattan}')

metric = euclidean : mae 3.462763157894737
metric = manhattan : mae 3.2408552631578953


In [31]:
# Compare the mean absolute percentage error of the two distance metrics.
mape_euclidean = mean_absolute_percentage_error(y_val, pred1)
mape_manhattan = mean_absolute_percentage_error(y_val, pred2)
print(f'metric = euclidean : mape {mape_euclidean}')
print(f'metric = manhattan : mape {mape_manhattan}')

metric = euclidean : mape 0.18505949742155064
metric = manhattan : mape 0.17819962243785456


## 6.연습문제
* 다음의 조건을 조정하며 모델을 생성하고 성능을 비교해 봅시다.
* 조건
    * 스케일링 데이터 : 하이퍼파라미터는 default로 두고, 스케일링 데이터만 달리하며 비교해 봅시다.
        * 정규화 : x_train_s1, x_val_s1
        * 표준화 : x_train_s2, x_val_s2
    * k : 
        * k 값을 1에서 50까지 1씩 증가시켜가며 
        * 성능 rmse, mae, mape를 구하고 최적의 k 값을 찾아 봅시다.

### (1) 스케일링 데이터 비교.

In [32]:
# Standardization (z-score) -> x_train_s / x_val_s.
# StandardScaler standardizes, it does not normalize, so this cell is the
# "standardization" variant of the exercise.
# The scaler is fitted on the raw training split instead of the already
# min-max-scaled x_train_s1: z-scores are unaffected by a prior min-max rescale,
# so the values are the same, and passing DataFrames to both fit and transform
# avoids scikit-learn's feature-name mismatch warning.
# NOTE: this rebinds `scaler`, which previously held the MinMaxScaler.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
x_train_s = scaler.fit_transform(x_train)
x_val_s = scaler.transform(x_val)

In [33]:
# Normalization (min-max scaling to [0, 1]) -> x_train_m / x_val_m.
# MinMaxScaler normalizes, it does not standardize, so this cell is the
# "normalization" variant of the exercise.
# The scaler is fitted on the raw training split instead of the already
# standardized x_train_s2: min-max scaling is unaffected by a prior
# standardization, so the values are the same, and passing DataFrames to both
# fit and transform avoids scikit-learn's feature-name mismatch warning.
# NOTE: this rebinds `scaler2`, which previously held the StandardScaler.
from sklearn.preprocessing import MinMaxScaler
scaler2 = MinMaxScaler()
x_train_m = scaler2.fit_transform(x_train)
x_val_m = scaler2.transform(x_val)

In [35]:
# Sweep k = 1..50 with Euclidean distance on the StandardScaler output
# (x_train_s / x_val_s) and print k, RMSE, MAE and MAPE for every k.
for i in range(1, 51):
    model1 = KNeighborsRegressor(n_neighbors=i, metric='euclidean')
    model1.fit(x_train_s, y_train)
    pred1 = model1.predict(x_val_s)
    print(i)
    # RMSE = sqrt(MSE); `squared=False` was deprecated in scikit-learn 1.4 and removed in 1.6.
    print(np.sqrt(mean_squared_error(y_val, pred1)))
    print(mean_absolute_error(y_val, pred1))
    print(mean_absolute_percentage_error(y_val, pred1))
    print()

1
4.63010742173895
3.1039473684210526
0.18074016312196256

2
4.0323058082912
2.9444078947368424
0.1763761571857222

3
4.044070997635691
2.8313596491228066
0.17443802020335664

4
3.92634908092278
2.8565789473684204
0.17252783257960538

5
4.191138835309815
2.971710526315789
0.17378681288843553

6
4.233684929797602
2.9827850877192983
0.17141783812490544

7
4.3158879848467295
3.0534774436090224
0.1726398410302335

8
4.369443487425216
3.100904605263158
0.17451352578626841

9
4.292414581208583
3.081432748538012
0.1746013857619247

10
4.330149277237205
3.119407894736842
0.17609772232587223

11
4.2654264205275325
3.1316985645933015
0.17736639012692398

12
4.273866755425192
3.1201206140350877
0.17772604142268134

13
4.3180719186907925
3.144331983805668
0.18003479652485227

14
4.314213474411378
3.1380169172932333
0.1812596215967306

15
4.3525015997054615
3.161666666666667
0.18334006880079667

16
4.359811010312424
3.1608141447368414
0.18470359658480634

17
4.34232587752105
3.1542569659442723
0.18

In [37]:
# Sweep k = 1..50 with Manhattan distance on the StandardScaler output
# (x_train_s / x_val_s) and print k, RMSE, MAE and MAPE for every k.
for i in range(1, 51):
    model1 = KNeighborsRegressor(n_neighbors=i, metric='manhattan')
    model1.fit(x_train_s, y_train)
    pred1 = model1.predict(x_val_s)
    print(i)
    # RMSE = sqrt(MSE); `squared=False` was deprecated in scikit-learn 1.4 and removed in 1.6.
    print(np.sqrt(mean_squared_error(y_val, pred1)))
    print(mean_absolute_error(y_val, pred1))
    print(mean_absolute_percentage_error(y_val, pred1))
    print()

1
5.039625873749799
3.2927631578947367
0.18304369355511377

2
4.128276210924387
3.0276315789473687
0.1781161507748806

3
3.925124313597339
2.8651315789473686
0.1685252655580158

4
3.9497584869537476
2.838815789473684
0.16671307583290576

5
3.856097700225477
2.7005263157894737
0.15809548476908003

6
4.014418230338013
2.843530701754386
0.16356632260462067

7
4.04208426683479
2.9443609022556396
0.166707200893791

8
4.095895379989067
2.970230263157895
0.16564593594465185

9
4.116872346117865
2.9896929824561407
0.1680983411689843

10
4.174859279065583
3.047368421052632
0.17242277646921053

11
4.239932873790536
3.058433014354067
0.17405476018832022

12
4.251442125037557
3.0286732456140353
0.17265019743441967

13
4.2737075317159805
3.0304149797570847
0.17151680283963294

14
4.297557844141009
3.048825187969925
0.17284363679991538

15
4.376671243032843
3.088815789473684
0.1750714682756619

16
4.378641958049638
3.075534539473684
0.1752125931727578

17
4.475015121213982
3.1391640866873063
0.17763

In [40]:
# Sweep k = 1..50 with Euclidean distance on the MinMaxScaler output
# (x_train_m / x_val_m) and print k, RMSE, MAE and MAPE for every k.
for i in range(1, 51):
    model2 = KNeighborsRegressor(n_neighbors=i, metric='euclidean')
    model2.fit(x_train_m, y_train)
    pred2 = model2.predict(x_val_m)
    print(i)
    # RMSE = sqrt(MSE); `squared=False` was deprecated in scikit-learn 1.4 and removed in 1.6.
    print(np.sqrt(mean_squared_error(y_val, pred2)))
    print(mean_absolute_error(y_val, pred2))
    print(mean_absolute_percentage_error(y_val, pred2))
    print()

1
5.1797746852033715
3.4690789473684207
0.18525312449754464

2
4.281481697533669
3.0657894736842106
0.1644239146778844

3
3.899292370950407
2.8510964912280703
0.16187632686472572

4
4.104204960191822
2.901644736842105
0.15867625614103983

5
4.227564933050456
3.035
0.16339021011462934

6
4.433874126098064
3.1846491228070177
0.17040828142291925

7
4.48123878789268
3.27312030075188
0.1753946599415002

8
4.637504211588384
3.3605263157894742
0.17904467400192423

9
4.806153581421751
3.459137426900585
0.18363394496559407

10
4.841955074901916
3.462763157894737
0.18505949742155064

11
4.765400562228352
3.4440789473684212
0.18541955614486316

12
4.832377259115352
3.4795504385964913
0.1884393125851166

13
4.83889051476573
3.491902834008097
0.19000245813746786

14
4.789771243604228
3.4640037593984965
0.1890517723748572

15
4.731427433191335
3.4026754385964915
0.1877927358078493

16
4.741629896008722
3.3965460526315794
0.1888731480497419

17
4.8393650520207805
3.452089783281734
0.19149517044080236

In [41]:
# Sweep k = 1..50 with Manhattan distance on the MinMaxScaler output
# (x_train_m / x_val_m) and print k, RMSE, MAE and MAPE for every k.
for i in range(1, 51):
    model2 = KNeighborsRegressor(n_neighbors=i, metric='manhattan')
    model2.fit(x_train_m, y_train)
    pred2 = model2.predict(x_val_m)
    print(i)
    # RMSE = sqrt(MSE); `squared=False` was deprecated in scikit-learn 1.4 and removed in 1.6.
    print(np.sqrt(mean_squared_error(y_val, pred2)))
    print(mean_absolute_error(y_val, pred2))
    print(mean_absolute_percentage_error(y_val, pred2))
    print()

1
5.106626242339199
3.369736842105263
0.1816740693126233

2
4.00083667236584
2.9200657894736843
0.16010740643994345

3
4.02615081074681
2.8958333333333335
0.15436311279135345

4
3.962025747810808
2.792434210526316
0.1506956239997825

5
4.035930402091483
2.855394736842106
0.15260936931135793

6
4.183659428472912
3.0031798245614034
0.16303887605509967

7
4.221406943699924
3.07218045112782
0.16822166013863515

8
4.341930232217853
3.1263157894736846
0.1706481064758998

9
4.360413025664652
3.1671052631578953
0.1734508274903896

10
4.420933680260671
3.2408552631578953
0.17819962243785456

11
4.4940013664730545
3.2294258373205746
0.1770682246373557

12
4.532984216433226
3.225219298245614
0.1767781825639071

13
4.563963083927649
3.2232287449392714
0.17802645631874534

14
4.60748501527141
3.2015507518796995
0.1777593429992249

15
4.642175048539913
3.2147807017543863
0.1772621762426233

16
4.68740252091626
3.2569901315789473
0.17921664760191677

17
4.773908691814097
3.3037538699690403
0.18199465

### (2) k 값