# Regression dengan KNN (K Nearest Neighbours)

In [2]:
import pandas as pd

# Toy census sample, one entry per person: height ('tinggi'),
# sex ('jk': 'pria' = male, 'wanita' = female) and weight ('berat').
# NOTE(review): units are presumably centimetres and kilograms -- confirm.
sensus = dict(
    tinggi=[158, 170, 183, 191, 155, 163, 180, 158, 178],
    jk=['pria'] * 4 + ['wanita'] * 5,
    berat=[64, 86, 84, 80, 49, 59, 67, 54, 67],
)

sensus_df = pd.DataFrame(data=sensus)
sensus_df  # bare last expression so the notebook renders the frame

Unnamed: 0,tinggi,jk,berat
0,158,pria,64
1,170,pria,86
2,183,pria,84
3,191,pria,80
4,155,wanita,49
5,163,wanita,59
6,180,wanita,67
7,158,wanita,54
8,178,wanita,67


In [3]:
import numpy as np

# Features are the height and sex columns; the target is the weight column.
# The feature array mixes integers and strings at this point (the sex
# labels are encoded numerically in a later cell).
x_train = np.array(sensus_df.loc[:, ['tinggi', 'jk']])
y_train = np.array(sensus_df.loc[:, 'berat'])

print(f'x train: {x_train}', f'y train: {y_train}', sep='\n')

x train: [[158 'pria']
 [170 'pria']
 [183 'pria']
 [191 'pria']
 [155 'wanita']
 [163 'wanita']
 [180 'wanita']
 [158 'wanita']
 [178 'wanita']]
y train: [64 86 84 80 49 59 67 54 67]


In [4]:
# Swap rows and columns so that each feature becomes one row
# (row 0 = 'tinggi', row 1 = 'jk'); this makes it easy to pick out the
# whole sex column as x_train_transposed[1].
x_train_transposed = x_train.T

print(
    f'x train: {x_train}',
    f'x train transposed: {x_train_transposed}',
    sep='\n',
)

x train: [[158 'pria']
 [170 'pria']
 [183 'pria']
 [191 'pria']
 [155 'wanita']
 [163 'wanita']
 [180 'wanita']
 [158 'wanita']
 [178 'wanita']]
x train transposed: [[158 170 183 191 155 163 180 158 178]
 ['pria' 'pria' 'pria' 'pria' 'wanita' 'wanita' 'wanita' 'wanita'
  'wanita']]


In [5]:
from sklearn.preprocessing import LabelBinarizer

# Encode the sex labels (row 1 of the transposed features) as 0/1.
# With this data the binarizer maps 'pria' -> 0 and 'wanita' -> 1 and
# returns one value per row, i.e. a column of shape (n_samples, 1).
lb = LabelBinarizer()
lb.fit(x_train_transposed[1])
jk_binarised = lb.transform(x_train_transposed[1])

print(
    f'jk: {x_train_transposed[1]}',
    f'jk_binarised: {jk_binarised}',
    sep='\n',
)

jk: ['pria' 'pria' 'pria' 'pria' 'wanita' 'wanita' 'wanita' 'wanita' 'wanita']
jk_binarised: [[0]
 [0]
 [0]
 [0]
 [1]
 [1]
 [1]
 [1]
 [1]]


In [6]:
# Collapse the single-column binarizer output into a flat 1-D array
# (a fresh copy, as flatten() would give), then display it.
jk_binarised = jk_binarised.reshape(-1).copy()
jk_binarised

array([0, 0, 0, 0, 1, 1, 1, 1, 1])

In [7]:
# Replace the string sex labels (row 1) with their 0/1 encoding.
x_train_transposed[1] = jk_binarised

# Transpose back to one row per sample and bind the result to `x_train`
# explicitly. Do not rely on `x_train_transposed` being a view of the old
# `x_train` array (the in-place write above happens to propagate through
# that view, but the training data should not depend on that aliasing).
x_train = x_train_transposed.transpose()

print(f'x_train_transposed: {x_train_transposed}')
print(f'x_train: {x_train}')

x_train_transposed: [[158 170 183 191 155 163 180 158 178]
 [0 0 0 0 1 1 1 1 1]]
x_train: [[158 0]
 [170 0]
 [183 0]
 [191 0]
 [155 1]
 [163 1]
 [180 1]
 [158 1]
 [178 1]]


In [8]:
# Train the KNN regression model on the encoded training data.

from sklearn.neighbors import KNeighborsRegressor as KNN

k = 3  # value passed as n_neighbors to the regressor
model = KNN(n_neighbors=k)
model.fit(X=x_train, y=y_train)

In [9]:
# A single query sample, shaped (1, 2) like one row of the training
# features: [height, encoded sex].
x_new = np.array([155, 1]).reshape(1, -1)
x_new

array([[155,   1]])

In [10]:
# Predict the target for the single query sample and display the result.
y_pred = model.predict(X=x_new)
y_pred

array([55.66666667])

In [11]:
# Held-out samples for evaluation: [height, encoded sex] rows and the
# known target value for each row.
x_test = np.array([
    [168, 0],
    [180, 0],
    [160, 1],
    [169, 1],
])
y_test = np.array([65, 96, 52, 67])

print(f'x test: {x_test}', f'y test: {y_test}', sep='\n')

x test: [[168   0]
 [180   0]
 [160   1]
 [169   1]]
y test: [65 96 52 67]


In [12]:
# Predict the targets for all test samples and display them.
y_pred = model.predict(X=x_test)
y_pred

array([69.66666667, 72.66666667, 59.        , 70.66666667])

In [13]:
# R-squared (coefficient of determination) of the test-set predictions.

from sklearn.metrics import r2_score

r_squared = r2_score(y_true=y_test, y_pred=y_pred)

print(f'r squared: {r_squared}')

r squared: 0.39200515796260493


In [15]:
# Mean absolute error (MAE), also called mean absolute deviation (MAD),
# of the test-set predictions.

from sklearn.metrics import mean_absolute_error

MAE = mean_absolute_error(y_true=y_test, y_pred=y_pred)

print(f'MAE: {MAE}')

MAE: 9.666666666666668


In [16]:
# Mean squared error (MSE), also called mean squared deviation (MSD),
# of the test-set predictions.

from sklearn.metrics import mean_squared_error

MSE = mean_squared_error(y_true=y_test, y_pred=y_pred)

print(f'MSE: {MSE}')

MSE: 157.16666666666663


In [17]:
# The feature-scaling problem: with a large-valued first feature, the
# Euclidean distance is dominated by that feature and the 0/1 second
# feature barely matters.
# NOTE: this rebinds `x_train` and `x_new`, replacing the arrays used in
# the cells above.

from scipy.spatial.distance import euclidean

x_train = np.array([
    [1700, 0],
    [1600, 1],
])
x_new = np.array([[1640, 0]])

# Distance from the query point to each training row.
[euclidean(x_new[0], row) for row in x_train]

[60.0, 40.01249804748511]

In [18]:
# Same two points with the first feature expressed in values 1000x
# smaller: now the 0/1 second feature dominates the distance and the
# nearest neighbour flips compared with the previous cell.
x_train = np.array([
    [1.7, 0],
    [1.6, 1],
])
x_new = np.array([[1.64, 0]])

# Distance from the query point to each training row.
[euclidean(x_new[0], row) for row in x_train]

[0.06000000000000005, 1.0007996802557442]

In [19]:
# Apply a standard scaler (standard score / z-score) to the features.
# The scaler instance is shared by the cells that follow.

from sklearn.preprocessing import StandardScaler

ss = StandardScaler()

In [21]:
# Standardise the large-valued version of the two training points, then
# measure distances in the scaled space.
x_train = np.array([
    [1700, 0],
    [1600, 1],
])
x_train_scaled = ss.fit(x_train).transform(x_train)
print(f'x_train_scaled: {x_train_scaled}')

# The query point is transformed with the statistics fitted on x_train.
x_new = np.array([[1640, 0]])
x_new_scaled = ss.transform(x_new)
print(f'x_new_scaled: {x_new_scaled}')

# 'jarak' = distance from the scaled query point to each scaled training row.
jarak = [euclidean(x_new_scaled[0], row) for row in x_train_scaled]
print(f'jarak: {jarak}')

x_train_scaled: [[ 1. -1.]
 [-1.  1.]]
x_new_scaled: [[-0.2 -1. ]]
jarak: [1.2, 2.1540659228538015]


In [22]:
# Repeat the scaled-distance experiment with the small-valued version of
# the same points; after standardisation the distances match the previous
# cell up to floating-point rounding.
x_train = np.array([
    [1.7, 0],
    [1.6, 1],
])
x_train_scaled = ss.fit(x_train).transform(x_train)
print(f'x_train_scaled: {x_train_scaled}')

# The query point is transformed with the statistics fitted on x_train.
x_new = np.array([[1.64, 0]])
x_new_scaled = ss.transform(x_new)
print(f'x_new_scaled: {x_new_scaled}')

# 'jarak' = distance from the scaled query point to each scaled training row.
jarak = [euclidean(x_new_scaled[0], row) for row in x_train_scaled]
print(f'jarak: {jarak}')

x_train_scaled: [[ 1. -1.]
 [-1.  1.]]
x_new_scaled: [[-0.2 -1. ]]
jarak: [1.2000000000000026, 2.1540659228538006]


In [23]:
# Apply feature scaling to KNN: training and test sets entered by hand as
# [height, encoded sex] rows with their target values.
# NOTE(review): the last training row uses height 170, whereas the `sensus`
# data at the top of the notebook lists 178 for that sample -- confirm which
# value is intended before comparing metrics with the unscaled model.
x_train = np.array([
    [158, 0],
    [170, 0],
    [183, 0],
    [191, 0],
    [155, 1],
    [163, 1],
    [180, 1],
    [158, 1],
    [170, 1],
])
y_train = np.array([64, 86, 84, 80, 49, 59, 67, 54, 67])

x_test = np.array([
    [168, 0],
    [180, 0],
    [160, 1],
    [169, 1],
])
y_test = np.array([65, 96, 52, 67])

In [24]:
# Fit the scaler on the training features only, then reuse the fitted
# statistics to transform the test features.
x_train_scaled = ss.fit(x_train).transform(x_train)
x_test_scaled = ss.transform(x_test)

print(
    f'x train scaled: {x_train_scaled}',
    f'x test scaled: {x_test_scaled}',
    sep='\n',
)

x train scaled: [[-0.9908706  -1.11803399]
 [ 0.01869567 -1.11803399]
 [ 1.11239246 -1.11803399]
 [ 1.78543664 -1.11803399]
 [-1.24326216  0.89442719]
 [-0.57021798  0.89442719]
 [ 0.86000089  0.89442719]
 [-0.9908706   0.89442719]
 [ 0.01869567  0.89442719]]
x test scaled: [[-0.14956537 -1.11803399]
 [ 0.86000089 -1.11803399]
 [-0.82260955  0.89442719]
 [-0.06543485  0.89442719]]


In [26]:
# Train and evaluate the model on the scaled features.
# `model` is the KNN estimator created in an earlier cell; it is fitted
# again here, this time on the scaled training data.
y_pred = model.fit(x_train_scaled, y_train).predict(x_test_scaled)

MAE = mean_absolute_error(y_true=y_test, y_pred=y_pred)
MSE = mean_squared_error(y_true=y_test, y_pred=y_pred)

print(f'MAE: {MAE}', f'MSE: {MSE}', sep='\n')

MAE: 7.583333333333336
MSE: 85.13888888888893
