In [94]:
import pandas as pd
from scipy.io import arff
import numpy as np
# Read the ARFF file; only the first element of the value returned by
# loadarff (the records) is used to build the DataFrame.
dataarff = arff.loadarff("phpMawTba.arff")
adult_census = pd.DataFrame(dataarff[0])
# Keep four feature columns plus the label. The explicit .copy() makes this an
# independent frame, so assigning into one of its columns later does not
# trigger pandas' SettingWithCopyWarning.
adult_census = adult_census[["age", "capital-gain", "capital-loss", "hours-per-week", "class"]].copy()
adult_census.head()

Unnamed: 0,age,capital-gain,capital-loss,hours-per-week,class
0,25.0,0.0,0.0,40.0,b'<=50K'
1,38.0,0.0,0.0,50.0,b'<=50K'
2,28.0,0.0,0.0,40.0,b'>50K'
3,44.0,7688.0,0.0,40.0,b'>50K'
4,18.0,0.0,0.0,30.0,b'<=50K'


In [95]:
target_name = "class"


def _as_text(label):
    """Return ``label`` decoded from UTF-8 when it is ``bytes``; otherwise unchanged."""
    if isinstance(label, bytes):
        return label.decode('utf-8')
    return label


# Replace byte-string labels with plain strings, then pull out the target column.
adult_census[target_name] = adult_census[target_name].map(_as_text)
target = adult_census[target_name]
target

0        <=50K
1        <=50K
2         >50K
3         >50K
4        <=50K
         ...  
48837    <=50K
48838     >50K
48839    <=50K
48840    <=50K
48841     >50K
Name: class, Length: 48842, dtype: object

In [96]:
# The feature matrix is every column of the census frame except the target.
data = adult_census.drop(target_name, axis="columns")
data.head()

Unnamed: 0,age,capital-gain,capital-loss,hours-per-week
0,25.0,0.0,0.0,40.0
1,38.0,0.0,0.0,50.0
2,28.0,0.0,0.0,40.0
3,44.0,7688.0,0.0,40.0
4,18.0,0.0,0.0,30.0


In [97]:
# Show which columns are left in the feature matrix.
data.columns

Index(['age', 'capital-gain', 'capital-loss', 'hours-per-week'], dtype='object')

In [98]:
# Discard every sample that has at least one missing feature value, and keep
# the labels in step: without re-aligning `target`, any dropped row would leave
# `data` and `target` with different lengths and break the positional
# train/test indexing used further down.
data = data.dropna(axis=0, how="any")
target = target.loc[data.index]

In [99]:
# Report the size of the feature matrix: rows are samples, columns are features.
n_samples, n_features = data.shape
print(f"The dataset contains {n_samples} samples and {n_features} features")

The dataset contains 48842 samples and 4 features


In [100]:
# Count how many samples carry each label to see how balanced the classes are.
target.value_counts()

class
<=50K    37155
>50K     11687
Name: count, dtype: int64

In [109]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import accuracy_score 

# Score a 50-nearest-neighbours classifier on three stratified random
# train/test splits (33% of the samples held out each time).
# ShuffleSplit and KFold are alternative splitters; each of them likewise
# yields pairs of train/test index arrays.
model = KNeighborsClassifier(n_neighbors=50)
sss = StratifiedShuffleSplit(n_splits=3, test_size=0.33, random_state=42)

scores = []
for train_index, test_index in sss.split(data, target):
    # The splitter yields positional indices, hence .iloc.
    X_train = data.iloc[train_index]
    y_train = target.iloc[train_index]
    X_test = data.iloc[test_index]
    y_test = target.iloc[test_index]

    # Refit on this split's training part and score its held-out part.
    model.fit(X_train, y_train)
    pred = model.predict(X_test)
    scores.append(accuracy_score(y_test, pred))

print(scores)


[0.8265293460727138, 0.8258468792654176, 0.8214418662365057]


In [110]:
# `model` and `X_test` are whatever the final iteration of the split loop left
# behind, so these are predictions for the last split only.
target_predicted = model.predict(X_test)

In [111]:
# Peek at the first five predicted labels.
target_predicted[:5]

array(['<=50K', '<=50K', '<=50K', '<=50K', '<=50K'], dtype=object)

In [112]:
# First five true labels of the held-out samples, for comparison with the predictions.
y_test.head(5)

36253    <=50K
24236    <=50K
20085    <=50K
5000      >50K
325      <=50K
Name: class, dtype: object

In [113]:
# Element-wise check of the first five true labels against the first five predictions.
y_test.iloc[:5] == target_predicted[:5]

36253     True
24236     True
20085     True
5000     False
325       True
Name: class, dtype: bool

In [114]:
# Count the matches among the first five held-out samples.
n_correct = (y_test[:5] == target_predicted[:5]).sum()
print(f"Number of correct predictions: {n_correct} / 5")

Number of correct predictions: 4 / 5


In [115]:
# Fraction of held-out labels that equal the prediction: the mean of the
# boolean match vector.
(y_test==target_predicted).mean()

0.8214418662365057

In [116]:
# Let the estimator score itself on the held-out samples and report the result
# together with the estimator's class name.
model_name = type(model).__name__
accuracy = model.score(X_test, y_test)

print(f"The test accuracy using a {model_name} is {accuracy:.3f}")

The test accuracy using a KNeighborsClassifier is 0.821
