In [2]:
import pandas as pd 
import numpy as np

### I. With Binary Encoding: 

In [248]:
age = np.random.randint(20,80,10)
income = np.random.randint(20000,220000,10)
gender = np.random.randint(0,2,10)
# 0 means female 1 means male #

In [249]:
pd.DataFrame({'Age':age,'Income':income, 'Gender':gender})

Unnamed: 0,Age,Income,Gender
0,52,41390,1
1,69,192944,0
2,39,150332,1
3,79,31149,0
4,33,203711,1
5,51,67934,0
6,36,189924,1
7,24,148507,0
8,23,181548,1
9,70,32974,0


In [250]:
np.random.seed = 42
age = np.random.randint(20,80,10)
np.random.seed = 42
income = np.random.randint(20000,220000,10)
np.random.seed = 42
gender = np.random.randint(0,2,10)
# 1 means female 0 means male #

In [251]:
pd.DataFrame({'Age':age,'Income':income, 'Gender':gender})

Unnamed: 0,Age,Income,Gender
0,22,214452,0
1,61,123477,0
2,66,169886,0
3,27,98801,0
4,48,154661,1
5,25,186562,1
6,21,207074,1
7,20,213925,0
8,42,122532,1
9,43,54034,1


In [252]:
new_entry_age = np.random.randint(20,80,1)
new_entry_income = np.random.randint(20000,220000,1)
new_entry_gender = np.random.randint(0,2,1)

In [253]:
pd.DataFrame({'Age':new_entry_age,'Income':new_entry_income, 'Gender':new_entry_gender})

Unnamed: 0,Age,Income,Gender
0,38,60641,0


Normalization: 

In [254]:
age_min = np.min(age)
age_max = np.max(age)


income_min = np.min(income)
income_max = np.max(income)


In [255]:
age_norm = (age - age_min)/(age_max - age_min)

In [256]:
income_norm = (income - income_min)/(income_max - income_min)

In [257]:
pd.DataFrame({'Age':age_norm,'Income':income_norm, 'Gender':gender})

Unnamed: 0,Age,Income,Gender
0,0.043478,1.0,0
1,0.891304,0.432888,0
2,1.0,0.722188,0
3,0.152174,0.279065,0
4,0.608696,0.62728,1
5,0.108696,0.826142,1
6,0.021739,0.954008,1
7,0.0,0.996715,0
8,0.478261,0.426997,1
9,0.5,0.0,1


__Comments__: 

- Since binary variable has value either 1 or 0 (no value in between) it may have slightly more influence over other features in KNN. According to what I have read normalizing binary variable doesn't make sense. Please correct me if my understanding is incorrect. Thanks.

In [258]:
new_entry_age_norm = (new_entry_age - age_min)/(age_max - age_min)
new_entry_income_norm = (new_entry_income - income_min)/(income_max - income_min)

In [259]:
print('Normalized Age:{} & Normalized Income: {}'.format(new_entry_age_norm,new_entry_income_norm))

Normalized Age:[0.39130435] & Normalized Income: [0.04118615]


Calculate distance using Euclidean distance formula (including gender): 

In [263]:
distances_norm = np.sqrt((new_entry_age_norm - age_norm)**2 + (new_entry_income_norm - income_norm)**2 + (new_entry_gender-gender)**2)

In [264]:
matrix_norm = pd.DataFrame({'Age':age, 'Income':income,'Gender':gender, 'distance':distances_norm})

In [265]:
matrix_norm.sort_values(by=['distance'])

Unnamed: 0,Age,Income,Gender,distance
3,27,98801,0,0.337297
1,61,123477,0,0.635162
2,66,169886,0,0.913386
9,43,54034,1,1.006733
0,22,214452,0,1.019954
7,20,213925,0,1.032547
8,42,122532,1,1.075366
4,48,154661,1,1.179307
5,25,186562,1,1.302314
6,21,207074,1,1.403503


In [266]:
pd.DataFrame({'Age':new_entry_age,'Income':new_entry_income, 'Gender':new_entry_gender})

Unnamed: 0,Age,Income,Gender
0,38,60641,0


Calculate distance using Euclidean distance formula (excluding gender): 

In [267]:
distances_norm = np.sqrt((new_entry_age_norm - age_norm)**2 + (new_entry_income_norm - income_norm)**2)

In [268]:
matrix_norm = pd.DataFrame({'Age':age, 'Income':income,'Gender':gender, 'distance':distances_norm})

In [269]:
matrix_norm.sort_values(by=['distance'])

Unnamed: 0,Age,Income,Gender,distance
9,43,54034,1,0.116237
3,27,98801,0,0.337297
8,42,122532,1,0.395489
4,48,154661,1,0.625112
1,61,123477,0,0.635162
5,25,186562,1,0.83428
2,66,169886,0,0.913386
6,21,207074,1,0.984795
0,22,214452,0,1.019954
7,20,213925,0,1.032547


In [266]:
pd.DataFrame({'Age':new_entry_age,'Income':new_entry_income, 'Gender':new_entry_gender})

Unnamed: 0,Age,Income,Gender
0,38,60641,0


__Observations__: 

- When gender was included in feature set (binary encoding) the KNN gave more preference to gender hence the results weren't as good as they are without gender. 
- Seems like binary encoding is not correct way. Please correct me if I am have incorrect understanding. 

---------------------------

### II. With One-Hot Encoding: 

In [270]:
gender_F = np.array([0,0,0,1,1,1,0,1,0,1])
gender_M = np.array([1,1,1,0,0,0,1,0,1,0])

In [271]:
pd.DataFrame({'Age':age,'Income':income, 'Female':gender_F, 'Male':gender_M})

Unnamed: 0,Age,Income,Female,Male
0,22,214452,0,1
1,61,123477,0,1
2,66,169886,0,1
3,27,98801,1,0
4,48,154661,1,0
5,25,186562,1,0
6,21,207074,0,1
7,20,213925,1,0
8,42,122532,0,1
9,43,54034,1,0


In [272]:
new_entry_gender_F = np.array([0])
new_entry_gender_M = np.array([1])

In [273]:
pd.DataFrame({'Age':new_entry_age,'Income':new_entry_income, 'Female':new_entry_gender_F, 'Male':new_entry_gender_M})

Unnamed: 0,Age,Income,Female,Male
0,38,60641,0,1


Normalization: 

In [274]:
age_min = np.min(age)
age_max = np.max(age)


income_min = np.min(income)
income_max = np.max(income)


In [275]:
age_norm = (age - age_min)/(age_max - age_min)

In [276]:
income_norm = (income - income_min)/(income_max - income_min)

In [277]:
pd.DataFrame({'Age':age_norm,'Income':income_norm, 'Female':gender_F, 'Male':gender_M})

Unnamed: 0,Age,Income,Female,Male
0,0.043478,1.0,0,1
1,0.891304,0.432888,0,1
2,1.0,0.722188,0,1
3,0.152174,0.279065,1,0
4,0.608696,0.62728,1,0
5,0.108696,0.826142,1,0
6,0.021739,0.954008,0,1
7,0.0,0.996715,1,0
8,0.478261,0.426997,0,1
9,0.5,0.0,1,0


In [278]:
new_entry_age_norm = (new_entry_age - age_min)/(age_max - age_min)
new_entry_income_norm = (new_entry_income - income_min)/(income_max - income_min)

In [279]:
print('Normalized Age:{} & Normalized Income: {}'.format(new_entry_age_norm,new_entry_income_norm))

Normalized Age:[0.39130435] & Normalized Income: [0.04118615]


Calculate distance using Euclidean distance formula: 

In [280]:
distances_norm = np.sqrt((new_entry_age_norm - age_norm)**2 + (new_entry_income_norm - income_norm)**2 
                         + (new_entry_gender_F- gender_F)**2 + (new_entry_gender_M- gender_M)**2)

In [281]:
matrix_norm = pd.DataFrame({'Age':age, 'Income':income,'Female':gender_F,'Male':gender_M, 'distance':distances_norm})

In [282]:
matrix_norm.sort_values(by=['distance'])

Unnamed: 0,Age,Income,Female,Male,distance
8,42,122532,0,1,0.395489
1,61,123477,0,1,0.635162
2,66,169886,0,1,0.913386
6,21,207074,0,1,0.984795
0,22,214452,0,1,1.019954
9,43,54034,1,0,1.418982
3,27,98801,1,0,1.453881
4,48,154661,1,0,1.54621
5,25,186562,1,0,1.641957
7,20,213925,1,0,1.751044


In [283]:
pd.DataFrame({'Age':new_entry_age,'Income':new_entry_income, 'Female':new_entry_gender_F, 'Male':new_entry_gender_M})

Unnamed: 0,Age,Income,Female,Male
0,38,60641,0,1


__Observations__: 

- Again KNN gave more preference to gender hence the results aren't good in terms of income & gender. 
- Maybe normalization of one hot encoding variables will improve the results. 

---------

### III. With One-Hot Encoding (normalize): 

In [284]:
gender_F = np.array([0,0,0,1,1,1,0,1,0,1])
gender_M = np.array([1,1,1,0,0,0,1,0,1,0])

In [286]:
pd.DataFrame({'Age':age,'Income':income, 'Female':gender_F, 'Male':gender_M})

Unnamed: 0,Age,Income,Female,Male
0,22,214452,0,1
1,61,123477,0,1
2,66,169886,0,1
3,27,98801,1,0
4,48,154661,1,0
5,25,186562,1,0
6,21,207074,0,1
7,20,213925,1,0
8,42,122532,0,1
9,43,54034,1,0


In [287]:
new_entry_gender_F = np.array([0])
new_entry_gender_M = np.array([1])

In [288]:
pd.DataFrame({'Age':new_entry_age,'Income':new_entry_income, 'Female':new_entry_gender_F, 'Male':new_entry_gender_M})

Unnamed: 0,Age,Income,Female,Male
0,38,60641,0,1


Normalization: 

In [289]:
age_min = np.min(age)
age_max = np.max(age)


income_min = np.min(income)
income_max = np.max(income)

female_min = np.min(gender_F)
female_max = np.max(gender_F)

male_min = np.min(gender_M)
male_max = np.max(gender_M)


In [290]:
age_norm = (age - age_min)/(age_max - age_min)

In [291]:
income_norm = (income - income_min)/(income_max - income_min)

In [292]:
female_norm = (gender_F - female_min)/(female_max - female_min)

In [293]:
female_norm

array([0., 0., 0., 1., 1., 1., 0., 1., 0., 1.])

__Observations__: 

- Normalization of one hot encoded variable won't help because these are already on extreme ends (0 & 1) we can't have values in between by normalization. 