## Import necessary libraries

In [1]:
import numpy as np
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, classification_report, jaccard_score

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
def tr_te(df: pd.DataFrame, ts: float = 0.2, rs: int = 123, print_: bool = True):
    xs = df.drop(["income"], axis=1)
    ys = df["income"]

    xtr, xte, ytr, yte = train_test_split(xs, ys, test_size=ts, random_state=rs)
    return xtr, xte, ytr, yte

In [4]:
def filter_corr(df: pd.DataFrame, min_c: float):
    corr_df = df.copy()
    corr_list = df.corr()["income"]
    for col in df.columns:
        if corr_list[col] <= min_c:
            corr_df.drop([col], axis=1, inplace=True)
    return corr_df

In [5]:
def filter_class(data: pd.DataFrame) -> pd.DataFrame:
    df = data.copy()
    length: int = len(df)
    this_col_vals: dict = dict()
    this_col_uniques: list
    for col in df.columns:
        this_col_uniques = list(df[col].unique())
        
        for val in this_col_uniques:
            if (df[df[col] == val]["income"].sum()) > \
               (len(df[df[col] == val]) / 2):
                this_col_vals[val] = 1
            else:
                this_col_vals[val] = 0
        
        for i in range(length):
            df[col][i] = this_col_vals.get(df[col][i], 0)
            
        this_col_vals = dict()
    
    return df

In [6]:
def evaluate(p: list, df: pd.DataFrame
             , model: KNeighborsClassifier
             , trte_func, print_res: bool = False):
    xtrain, xtest, ytrain, ytest = trte_func(df)
    model.fit(xtrain, ytrain)
    y_pred = model.predict(xtest)
    if print_res:
        print(f"{p}: {accuracy_score(ytest, y_pred):.2%}")
        print(f"f1score: {f1_score(ytest, y_pred)}")
    return (accuracy_score(ytest, y_pred), f1_score(ytest, y_pred))

In [7]:
def train_evaluate(classified_df: pd.DataFrame, model: KNeighborsClassifier,
                   min_corr: float, evaluate_func, filter_func,
                   trte_func, ps: list, output: bool):
    global data, data_, corr_list
    for col in data_.columns:
        if corr_list[col] < min_corr:
            data_[col] = classified_df[col]
    
    return evaluate_func([ps, min_corr], filter_func(data_, min_corr),
                  model, trte_func, output)

## Read Data

In [8]:
data = pd.read_csv("../adult.csv")

In [9]:
le = LabelEncoder()

In [10]:
for col in data.columns:
    if data[col].dtype == "object":
       data[col] = le.fit_transform(data[col].values)

In [11]:
length: int = len(data)

this_col_vals: np.ndarray = np.zeros(length, dtype=np.uint8)
for col in ["workclass", "race"]:
    for i in range(length):
        if data[col][i] == 4:
            this_col_vals[i] = 0
            continue
        this_col_vals[i] = 1
    this_col_vals: np.ndarray = np.zeros(length, dtype=np.uint8)
    data[col] = this_col_vals

In [12]:
data_ = data.copy()
corr_list = data.corr()["income"]
classified_data = filter_class(data)

## Train model and find the best model

In [13]:
max_acc: list = [0, []]
max_f1: list = [0, []]
max_acc_f1: list = [0, []]

acc: float = 0.0
f1: float = 0.0

In [14]:
ss: list = []
for n in range(3, 15):
    for w in ["uniform", "distance"]:
        for a in ["auto", "brute"]:
            ss.append([n, w, a])

In [15]:
for ps in ss:
    [n, w, a] = ps
    for min_corr in [0, 0.1, 0.3]:
        knn_model = KNeighborsClassifier(n_neighbors=n, weights=w,
        algorithm=a, p=2, metric="minkowski")
        try:
            acc, f1 = train_evaluate(classified_data, knn_model,
                        min_corr, evaluate, filter_corr
                        , tr_te, [n, w, a], False)
            if acc > max_acc[0]:
                max_acc[0] = acc
                max_acc[1] = [n, w, a, min_corr]
            if f1 > max_f1[0]:
                max_f1[0] = f1
                max_f1[1] = [n, w, a, min_corr]
            if (acc + f1) > (max_acc_f1[0]):
                max_acc_f1[0] = acc + f1
                max_acc_f1[1] = [n, w, a, min_corr]
        except:
            continue

In [16]:
print (max_acc)

[0.9135018937455216, [5, 'distance', 'brute', 0.1]]


In [17]:
print (max_f1)

[0.8033472803347281, [13, 'distance', 'brute', 0]]


In [18]:
print (max_acc_f1)

[1.7167468094574634, [13, 'distance', 'brute', 0]]


### Best model

In [19]:
for col in data.columns:
    if corr_list[col] < min_corr:
        data[col] = classified_data[col]

In [20]:
knn_model = KNeighborsClassifier(n_neighbors=13,
            weights="distance", algorithm="brute")

In [21]:
x_train, x_test, y_train, y_test = tr_te(filter_corr(data_, 0.0))

In [22]:
knn_model.fit(x_train, y_train)

In [23]:
y_pred = knn_model.predict(x_test)

In [24]:
print (f"Accuracy Score: {accuracy_score(y_test, y_pred)}")
print (f"F1 Score: {f1_score(y_test, y_pred)}")
print (f"Jaccard Score: {jaccard_score(y_test, y_pred)}")

Accuracy Score: 0.9133995291227351
F1 Score: 0.8033472803347281
Jaccard Score: 0.6713286713286714


In [25]:
print (classification_report(y_pred, y_test))

              precision    recall  f1-score   support

           0       0.97      0.92      0.94      7819
           1       0.73      0.89      0.80      1950

    accuracy                           0.91      9769
   macro avg       0.85      0.90      0.87      9769
weighted avg       0.92      0.91      0.92      9769

