In [16]:
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Seed NumPy's global RNG so the synthetic dataset is identical on every run.
np.random.seed(0)

n = 100  # number of synthetic rows

# Four features on very different scales. The draw order below is significant:
# every call consumes the global RNG, so reordering would change the data.
salary = np.random.normal(loc=120_000, scale=50_000, size=n)    # huge scale
kids = np.random.randint(low=0, high=6, size=n)                 # tiny scale (0..5)
experience = np.random.randint(low=1, high=21, size=n)          # small scale (1..20)
credit_score = np.random.normal(loc=680, scale=70, size=n)      # medium scale

# Feature matrix; column order is salary, kids, experience, credit_score.
X = np.column_stack((salary, kids, experience, credit_score))

# The label is driven by a linear combination of *all four* features
# (not just salary); the class is 1 where the combination is positive.
score = (
    1.5e-5 * salary
    + 0.4 * experience
    + 0.005 * credit_score
    - 0.8 * kids
    - 8
)

y = np.asarray(score > 0, dtype=int)

# Hold out 30% of the rows for testing; random_state pins the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Later cells call predict(X_test) and compare the result against y_test.

# Print min/max of every column so the scale mismatch is visible.
print("Feature ranges:")
for column_index, label in enumerate(("salary:", "kids:", "experience:", "credit_score:")):
    column = X[:, column_index]
    print(label, column.min(), column.max())

Feature ranges:
salary: -7649.490791703938 233487.7311993804
kids: 0.0 5.0
experience: 1.0 20.0
credit_score: 433.9662316628645 897.1071408867656


In [4]:
# Baseline: k-NN (k=5) trained directly on the raw, unscaled features.
knn_no_scale = KNeighborsClassifier(n_neighbors=5)
knn_no_scale.fit(X_train, y_train)

y_pred = knn_no_scale.predict(X_test)
print('No scale:')
# accuracy_score is declared as (y_true, y_pred): ground truth goes first.
# Accuracy itself is symmetric, but keeping the documented order stays
# correct if this is later swapped for an asymmetric metric.
print(f"Accuracy: {accuracy_score(y_test, y_pred) * 100:.2f}%")

No scale:
Accuracy: 50.00%


In [7]:
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

scalar = StandardScaler()  # StandardScaler!!! -------------------------------------

# Learn the scaling statistics from the training split only ...
x_train_scaled = scalar.fit_transform(X_train)

knn_scaled = KNeighborsClassifier(n_neighbors=5)
knn_scaled.fit(x_train_scaled, y_train)

# ... and reuse them (transform, not fit_transform) on the test split.
X_test_scaled = scalar.transform(X_test)
y_pred_for_scaled = knn_scaled.predict(X_test_scaled)
print(x_train_scaled[:3])  # first three scaled training rows
print('Scale:')
# accuracy_score is declared as (y_true, y_pred): ground truth goes first.
print(f"Accuracy: {accuracy_score(y_test, y_pred_for_scaled) * 100:.2f}%")


[[ 1.28175687 -1.43564757  1.12670697 -0.17589024]
 [ 0.61585324  1.46884173  1.30042831 -1.26490577]
 [ 1.71627343 -1.43564757  0.43182161  0.35674893]]
Scale:
Accuracy: 93.33%


In [9]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score

scalar = MinMaxScaler()  # MinMaxScaler !! ----------------------------------------------

# Learn each feature's min/max from the training split only ...
x_train_scaled = scalar.fit_transform(X_train)

knn_scaled = KNeighborsClassifier(n_neighbors=5)
knn_scaled.fit(x_train_scaled, y_train)

# ... and reuse them (transform, not fit_transform) on the test split.
X_test_scaled = scalar.transform(X_test)
y_pred_for_scaled = knn_scaled.predict(X_test_scaled)
print(x_train_scaled[:3])  # first three scaled training rows
print('Scale:')
# accuracy_score is declared as (y_true, y_pred): ground truth goes first.
print(f"Accuracy: {accuracy_score(y_test, y_pred_for_scaled) * 100:.2f}%")


[[0.83090932 0.         0.89473684 0.49228496]
 [0.69057778 1.         0.94736842 0.32096117]
 [0.92247869 0.         0.68421053 0.57607969]]
Scale:
Accuracy: 93.33%


In [10]:
from sklearn.pipeline import Pipeline
# Chain scaling and k-NN into a single estimator: fit() fits the scaler and
# the classifier on the training data, predict() applies the same scaling
# before classifying.
knn_scaled_pipeline = Pipeline([
    ("scaler", MinMaxScaler()),
    ("knn", KNeighborsClassifier(n_neighbors=5))])

knn_scaled_pipeline.fit(X_train, y_train)
y_pred = knn_scaled_pipeline.predict(X_test)

print('Pipeline Scale:')
# accuracy_score is declared as (y_true, y_pred): ground truth goes first.
print(f"Accuracy: {accuracy_score(y_test, y_pred) * 100:.2f}%")


Pipeline Scale:
Accuracy: 93.33%


In [11]:
import joblib 

# Serialize the pipeline object to disk; the returned list of written
# file names is the cell's displayed output.
joblib.dump(value=knn_scaled_pipeline, filename='knn_scaled_pipeline.joblib')

['knn_scaled_pipeline.joblib']

In [14]:
import joblib 

# Read the serialized pipeline back from disk.
# NOTE: joblib.load unpickles the file's contents, so only load files you trust.
loaded_model = joblib.load(filename="knn_scaled_pipeline.joblib")

In [15]:
import numpy as np

# One hand-written row; values follow the training column order
# (salary, kids, experience, credit_score).
applicant = [120_000, 2, 10, 720]
sample = np.array(applicant).reshape(1, -1)  # predict() expects a 2-D array: (1, 4)

prediction = loaded_model.predict(sample)
predicted_label = prediction[0]
print('Prediction [0/1]:', predicted_label)

Prediction [0/1]: 0


In [23]:
### Cross validation

# Rebuild the synthetic dataset from the same seed so this cell does not
# depend on variables left over from earlier cells.
np.random.seed(0)

n = 100  # number of synthetic rows

# Four features on very different scales. The draw order below is significant:
# every call consumes the global RNG, so reordering would change the data.
salary = np.random.normal(loc=120_000, scale=50_000, size=n)    # huge scale
kids = np.random.randint(low=0, high=6, size=n)                 # tiny scale (0..5)
experience = np.random.randint(low=1, high=21, size=n)          # small scale (1..20)
credit_score = np.random.normal(loc=680, scale=70, size=n)      # medium scale

# Feature matrix; column order is salary, kids, experience, credit_score.
X = np.column_stack((salary, kids, experience, credit_score))

# The label is driven by a linear combination of *all four* features
# (not just salary); the class is 1 where the combination is positive.
score = (
    1.5e-5 * salary
    + 0.4 * experience
    + 0.005 * credit_score
    - 0.8 * kids
    - 8
)

y = np.asarray(score > 0, dtype=int)

from sklearn.model_selection import cross_val_score

k = 3

# Unscaled baseline: plain k-NN on the raw features.
model = KNeighborsClassifier(n_neighbors=k)

# Scaled variant: scaler + k-NN wrapped in a Pipeline, so cross_val_score
# re-fits MinMaxScaler on the training part of every fold. Scaling all of X
# once up front would let each held-out fold influence the min/max used for
# training (data leakage) and bias the cross-validated score.
scaled_model = Pipeline([
    ("scaler", MinMaxScaler()),
    ("knn", KNeighborsClassifier(n_neighbors=k)),
])

scores_scaled = cross_val_score(scaled_model, X, y, cv=10, scoring='accuracy')  # 10-fold CV
scores_non_scaled = cross_val_score(model, X, y, cv=10, scoring='accuracy')  # 10-fold CV

print('mean score non-scaled', scores_non_scaled.mean())
# Previously printed `scores.mean()`; no `scores` variable is defined in the
# visible notebook, so that either raised NameError on a fresh kernel or
# reported a stale value. The scaled result lives in `scores_scaled`.
print('mean score scaled    ', scores_scaled.mean())

mean score non-scaled 0.5
mean score scaled     0.96
