In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score
from collections import Counter

df = pd.read_csv("adult.csv")
df = df.replace(' ?', np.nan).dropna()

df = df.sample(1000, random_state=42).reset_index(drop=True)

In [2]:
df.replace({'education': {
    'Preschool': 'dropout',
    '10th': 'dropout',
    '11th': 'dropout',
    '12th': 'dropout',
    '1st-4th': 'dropout',
    '5th-6th': 'dropout',
    '7th-8th': 'dropout',
    '9th': 'dropout',
    'HS-Grad': 'HighGrad',
    'HS-grad': 'HighGrad',
    'Some-college': 'CommunityCollege',
    'Assoc-acdm': 'CommunityCollege',
    'Assoc-voc': 'CommunityCollege',
    'Bachelors': 'Bachelors',
    'Masters': 'Masters',
    'Prof-school': 'Masters',
    'Doctorate': 'Doctorate'
}}, inplace=True)

df.replace({'marital-status': {
    'Never-married': 'NotMarried',
    'Married-AF-spouse': 'Married',
    'Married-civ-spouse': 'Married',
    'Married-spouse-absent': 'NotMarried',
    'Separated': 'Separated',
    'Divorced': 'Separated',
    'Widowed': 'Widowed'
}}, inplace=True)


In [None]:
cont_cols = ['age', 'educational-num', 'capital-gain', 'capital-loss', 'hours-per-week']
cat_cols = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'gender', 'native-country']
target_col = 'income'

In [4]:
ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
X_cat = ohe.fit_transform(df[cat_cols])

scaler = MinMaxScaler()
X_cont = scaler.fit_transform(df[cont_cols])

X = np.hstack([X_cont, X_cat])
y = (df[target_col] == ' >50K').astype(int).values  

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

num_features = X_cont.shape[1]

In [5]:
def mixed_distance(x1, x2, alpha=1.0, beta=1.0, split=num_features):
    num_dist = np.linalg.norm(x1[:split] - x2[:split])  # Euclidean
    cat_dist = np.sum(x1[split:] != x2[split:])          # Hamming 
    return alpha * num_dist + beta * cat_dist


In [6]:
def knn_predict(X_train, y_train, x_test, k=5, alpha=1.0, beta=1.0):
    distances = []
    for i in range(len(X_train)):
        dist = mixed_distance(X_train[i], x_test, alpha, beta)
        distances.append((dist, y_train[i]))
    distances.sort(key=lambda x: x[0])
    k_nearest_labels = [label for _, label in distances[:k]]
    return Counter(k_nearest_labels).most_common(1)[0][0]


In [7]:
k = 5
alpha = 0.9
beta = 0.8

y_pred = [knn_predict(X_train, y_train, x, k, alpha, beta) for x in X_test]

acc = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred, zero_division=0)
rec = recall_score(y_test, y_pred, zero_division=0)

print(f"KNN (k={k}, α={alpha}, β={beta})")
print(f"Accuracy: {acc:.4f}")
print(f"Precision: {prec:.4f}")
print(f"Recall: {rec:.4f}")


KNN (k=5, α=0.9, β=0.8)
Accuracy: 1.0000
Precision: 0.0000
Recall: 0.0000
