
This notebook performs binary classification on the "Income Classification Dataset" using two different methods: Random Forest and k-nearest neighbors (KNN).

Dataset: https://www.kaggle.com/lodetomasi1995/income-classification

References:
https://towardsdatascience.com/understanding-random-forest-58381e0602d2
https://towardsdatascience.com/machine-learning-basics-with-the-k-nearest-neighbors-algorithm-6a6e71d01761
https://www.kaggle.com/prashant111/random-forest-classifier-feature-importance
https://www.kaggle.com/prashant111/knn-classifier-tutorial


In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file below the Kaggle input directory,
# then print the paths one per line.
input_paths = (
    os.path.join(folder, file_name)
    for folder, _subdirs, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_path in input_paths:
    print(input_path)

# Any results you write to the current directory are saved as output.

In [None]:
# importing the required libraries

import time
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import category_encoders as ce
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import RobustScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [None]:
# Seed shared by the stochastic steps further down (train/test split, PCA).
random_state = 0

# Load the raw income dataset.
# NOTE: later cells address columns with a leading space (e.g. ' income'), so the
# column names are deliberately left exactly as they come out of the CSV.
df = pd.read_csv('/kaggle/input/income-classification/income_evaluation.csv')

# Keep the preview as the last expression of the cell: Jupyter only renders the value of
# the final expression, so a `df.head()` followed by another statement displays nothing.
df.head()

In [None]:
# Split the columns by dtype: object-typed columns are treated as categorical,
# every other column as numerical.
categorical = [name for name in df.columns if df[name].dtype == 'O']
numerical = [name for name in df.columns if df[name].dtype != 'O']

# Print the relative frequency of every category so that invalid values stand out.
# `np.float` was removed in NumPy 1.24 (AttributeError); it is not needed anyway,
# because `/` is true division and already yields float frequencies.
for var in categorical:
    print(df[var].value_counts() / len(df))


In [None]:
# Categorical columns found to contain unknown values: workclass, occupation and
# native-country. Show the distinct values of each one.
for label, column in (
    ('workclass column', ' workclass'),
    ('occupation column', ' occupation'),
    ('native-country columns', ' native-country'),
):
    print(f"{label}: {df[column].unique()}\n")

In [None]:
# Imputation step: replace the unknown marker ' ?' with the column's most frequent value.
# NOTE(review): the mode is computed on the full dataset, i.e. before the train/test
# split performed later in the notebook.
cols_with_unk = [' workclass', ' occupation', ' native-country']

for col in cols_with_unk:
    # Assign the result back to `df[col]` instead of calling replace/fillna with
    # `inplace=True` on `df[col]`: that chained in-place form operates on a temporary
    # Series and leaves `df` untouched under pandas Copy-on-Write.
    # `np.nan` is used because the `np.NaN` alias was removed in NumPy 2.0.
    cleaned = df[col].replace(' ?', np.nan)
    df[col] = cleaned.fillna(cleaned.mode()[0])
    print(df[col].value_counts())


In [None]:
# Cardinality check: count the distinct values of every categorical column first,
# then report the counts column by column.
cardinality = {name: len(df[name].unique()) for name in categorical}
for name, n_labels in cardinality.items():
    print(name, ' contains ', n_labels, ' labels')

In [None]:
# Correlation between the numerical columns, rendered as a colour-graded table.
# The frame is restricted to `numerical` explicitly: since pandas 2.0 `DataFrame.corr()`
# no longer drops non-numeric columns silently and raises on the string-valued ones.
df[numerical].corr().style.format("{:.4}").background_gradient(cmap=plt.get_cmap('coolwarm'), axis=1)

In [None]:
# Separate the target column (' income') from the feature columns.
y = df[' income']
x = df.drop(columns=[' income'])

# Hold out 20% of the rows as the test set, stratifying on the target.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    stratify=y,
    random_state=random_state,
)

# Total number of missing feature values in the training and the test part.
for features in (X_train, X_test):
    print(features.isna().sum().sum())

In [None]:
# One-hot encode the categorical feature columns of the training and test sets.
# The target is excluded with a filter rather than `categorical.remove(' income')`:
# `list.remove` mutates the shared list in place and raises ValueError when the entry is
# already gone (for example when this cell is executed a second time).
categorical = [name for name in categorical if name != ' income']
encoder = ce.OneHotEncoder(cols=categorical)

# Fit the encoder on the training set only, then apply the same encoding to the test set.
X_train = encoder.fit_transform(X_train)
X_test = encoder.transform(X_test)


In [None]:
# Sanity check of the encoded inputs: first rows and shape of the training set,
# followed by the same two outputs for the test set.
for features in (X_train, X_test):
    print(features.head())
    print(features.shape)


In [None]:
# Scale the features with RobustScaler to reduce the effect of outliers.
cols = X_train.columns
# Remember the row labels: the scaler output is rebuilt into DataFrames below.
train_index, test_index = X_train.index, X_test.index

scaler = RobustScaler()
# Fit on the training set only and reuse the fitted scaler for the test set.
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Rebuild the DataFrames from the scaled values.
# `cols` is passed directly: wrapping it in a list (`columns=[cols]`) makes pandas build a
# one-level MultiIndex whose labels are tuples instead of the original column names.
# Restoring the index keeps the rows labelled consistently with y_train / y_test.
X_train = pd.DataFrame(X_train, columns=cols, index=train_index)
X_test = pd.DataFrame(X_test, columns=cols, index=test_index)

In [None]:
# Random Forest classifier.
# Seeded with the notebook-wide `random_state` instead of a separate hard-coded 0, so a
# single setting controls the split, the forest and the PCA step.
rfc = RandomForestClassifier(random_state=random_state)

# Time the training on the scaled training set.
start_time = time.time()
rfc.fit(X_train, y_train)
print(f'Random Forest training took {time.time()-start_time}.')

# Time the prediction on the held-out test set.
start_time = time.time()
y_pred = rfc.predict(X_test)
print(f'Random Forest test took {time.time()-start_time}.')

In [None]:
# Accuracy of the Random Forest predictions.
# The number of trees is read from the fitted model instead of being hard-coded as "10",
# so the message always matches the classifier that was actually trained.
print('Random-Forest accuracy ({0} decision-trees) : {1:0.4f}'.format(
    rfc.n_estimators, accuracy_score(y_test, y_pred)))

print(classification_report(y_test, y_pred))

# Confusion matrix.
# scikit-learn puts the TRUE classes on the rows and the PREDICTED classes on the
# columns, both in the order given by `labels`. The axis captions are therefore derived
# from the class labels themselves; the previous fixed captions ("Real"/"Predicted",
# positive class first) had both the orientation and the class order reversed.
class_labels = rfc.classes_
cm = confusion_matrix(y_test, y_pred, labels=class_labels)
print('Random Forest confusion matrix\n\n', cm)
cm_matrix = pd.DataFrame(
    data=cm,
    index=[f'Actual: {str(label).strip()}' for label in class_labels],
    columns=[f'Predicted: {str(label).strip()}' for label in class_labels],
)
sns.heatmap(cm_matrix, annot=True, fmt='d')


In [None]:
# KNN classifier with k taken as the square root of the size of the training set.
# k is derived from the data instead of being hard-coded (previously 161), so it keeps
# following that rule if the dataset or the split ratio changes. `max(1, ...)` guards
# against a degenerate k of 0.
n_neighbors = max(1, int(len(X_train) ** 0.5))

# Dimension reduction to 2 components using PCA.
pca = PCA(n_components=2, random_state=random_state)
knn = KNeighborsClassifier(n_neighbors=n_neighbors)

# Fit the projection on the training set only and apply it unchanged to the test set.
X_train_reduced = pca.fit_transform(X_train)
X_test_reduced = pca.transform(X_test)

# Train the KNN on the reduced training set and time it.
start_time = time.time()
knn.fit(X_train_reduced, y_train)
print(f'KNN training took {time.time()-start_time}.')

# Predict on the reduced test set and time it.
start_time = time.time()
y_pred = knn.predict(X_test_reduced)
print(f'KNN test took {time.time()-start_time}.')


In [None]:
# Accuracy of the KNN predictions.
# Computed from the `y_pred` already produced by `knn.predict`, like the report and the
# confusion matrix below, instead of `knn.score(...)`, which would run the prediction on
# the whole test set a second time.
print('KNN accuracy : {0:0.4f}'.format(accuracy_score(y_test, y_pred)))

print(classification_report(y_test, y_pred))

# Confusion matrix.
# scikit-learn puts the TRUE classes on the rows and the PREDICTED classes on the
# columns, both in the order given by `labels`. The axis captions are therefore derived
# from the class labels themselves; the previous fixed captions ("Real"/"Predicted",
# positive class first) had both the orientation and the class order reversed.
class_labels = knn.classes_
cm = confusion_matrix(y_test, y_pred, labels=class_labels)
print('KNN confusion matrix\n\n', cm)
cm_matrix = pd.DataFrame(
    data=cm,
    index=[f'Actual: {str(label).strip()}' for label in class_labels],
    columns=[f'Predicted: {str(label).strip()}' for label in class_labels],
)
sns.heatmap(cm_matrix, annot=True, fmt='d')


