In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split

#Import models for classification task

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score,recall_score, accuracy_score,confusion_matrix, plot_confusion_matrix, classification_report, auc,roc_auc_score
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV


import xgboost

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Activation, Dense, BatchNormalization, Dropout
from tensorflow.keras import optimizers

print(f"Tensorflow Version: {tf.version.VERSION}")


import missingno
import warnings
warnings.filterwarnings("ignore")


## Import Data

In [None]:
# Load the training and test splits from the Kaggle input directory
# (paths are relative to the notebook's working directory).
train = pd.read_csv("../input/income-adult/adult_data.csv")
test = pd.read_csv("../input/income-adult/adult_test.csv")

In [None]:
train.head()

In [None]:
test.head()

In [None]:
train.columns

In [None]:
# The raw headers carry leading/trailing whitespace. Trim the training headers
# once and apply the cleaned names to both frames (the test frame is given the
# training header names, so both frames must share the same column order).
new_cols = list(map(str.strip, train.columns))
train.columns = new_cols
test.columns = new_cols

In [None]:
train.columns

In [None]:
# Split each frame into a feature matrix (everything except "salary") and a
# single-column target frame (just "salary").
target_col = "salary"

X_train = train.drop(columns = [target_col])
y_train = train.loc[:, [target_col]]

X_test = test.drop(columns = [target_col])
y_test = test.loc[:, [target_col]]

In [None]:
X_train.shape, y_train.shape

In [None]:
X_test.shape, y_test.shape

In [None]:
X_train.dtypes

In [None]:
# Partition the features by dtype: int64 columns are treated as numerical and
# object columns as categorical. Note that both results are DataFrames (not
# lists of names); later cells read the names through `.columns`.
num_cols = X_train.select_dtypes(include = ["int64"])
cat_cols = X_train.select_dtypes(include = ["object"])

num_cols.head()

In [None]:
cat_cols.head()

## Plot features for better understanding

In [None]:
# Class balance of the target. The series is passed as `x=` explicitly: recent
# seaborn releases treat the first positional argument as `data`, so a
# positional series no longer means "count the values of this variable".
sns.countplot(x = train["salary"])
plt.title("Count of variable to predict")
plt.show()

In [None]:
print(f"There are {len(cat_cols.columns)} categorical variables in the training set.")

In [None]:
# Plot one count plot per categorical variable on a two-column grid.
n_plot_cols = 2
# Ceiling division: the grid always has enough rows for every variable instead
# of relying on a hard-coded row count.
n_plot_rows = (len(cat_cols.columns) + n_plot_cols - 1) // n_plot_cols

plt.figure(figsize = (23,15))

for i, var in enumerate(cat_cols.columns):
    plt.subplot(n_plot_rows, n_plot_cols, i + 1)
    # Pass the series as `x=`; recent seaborn releases interpret the first
    # positional argument as `data` rather than as the variable to count.
    sns.countplot(x = X_train[var])
plt.subplots_adjust(hspace = 0.4)
plt.show()
    

In [None]:
print(f"There are {len(num_cols.columns)} numerical variables in the training set.")

In [None]:
# Histogram of every numerical feature, laid out on a 2 x 3 grid.
hist_fig = plt.figure(figsize = (23,12))
for position, var in enumerate(num_cols.columns, start = 1):
    hist_ax = hist_fig.add_subplot(2, 3, position)
    sns.histplot(X_train[var], ax = hist_ax)
plt.show()

# Data Cleaning
## Check for missing values and null values

### Training set

In [None]:

# Fraction of missing values per training column. `isnull` is an alias of
# `isna` in pandas, so the two printouts are identical.
# NOTE(review): this only counts NaN/None; placeholder strings used to mark
# missing entries (if the raw file has any) would not show up here — confirm.
print(X_train.isna().mean())
print(X_train.isnull().mean())
# Visual overview of where values are missing, row by row.
missingno.matrix(X_train, figsize = (10,10))
plt.show()

### Test set

In [None]:
# Fraction of missing values per test column. `isnull` is an alias of `isna`
# in pandas, so the two printouts are identical.
# NOTE(review): this only counts NaN/None; placeholder strings used to mark
# missing entries (if the raw file has any) would not show up here — confirm.
print(X_test.isna().mean())
print(X_test.isnull().mean())

# Visual overview of where values are missing, row by row.
missingno.matrix(X_test, figsize = (10,10))
plt.show()

In [None]:
X_train.head()

### Clean label column

In [None]:
# Replace <=50k with 0 and >50k with 1 for modelling.
#
# The class is derived from the label text itself instead of from the order in
# which the labels first appear. Mapping `unique()[0]` -> 0 and `unique()[1]`
# -> 1 silently inverts the encoding whenever the first row of a frame belongs
# to the >50k class (train and test could then disagree), and it flips already
# encoded labels if the cell is run a second time.
def _encode_salary(labels):
    """Return a copy of `labels` whose "salary" column is 0 (<=50k) or 1 (>50k).

    A label is treated as the positive class when its text contains ">", which
    holds regardless of surrounding whitespace or trailing punctuation.
    Frames whose "salary" column is already numeric are returned unchanged so
    that re-running the cell is harmless.
    """
    if pd.api.types.is_numeric_dtype(labels["salary"]):
        return labels
    is_high_income = labels["salary"].str.contains(">", regex = False)
    return labels.assign(salary = is_high_income.astype("int64"))


y_train = _encode_salary(y_train)
y_test = _encode_salary(y_test)

In [None]:
# Report the feature-matrix sizes, then remember the row counts so the combined
# frame can be cut back into its training and test parts after encoding.
for split_name, split_frame in (("X_train", X_train), ("X_test", X_test)):
    print(f"The shape of {split_name} is {split_frame.shape}.")

X_train_len, X_test_len = len(X_train), len(X_test)

### One Hot Encoding

Combine the training and test sets before one-hot encoding so that every category appearing in either set gets a column in both.

In [None]:
# Stack the training rows on top of the test rows (training first, so the first
# X_train_len rows of the result are the training set).
X = pd.concat((X_train, X_test), axis = "index")
print(f"The shape of X is {X.shape}.")

X.head()

In [None]:
# One-hot encode every categorical column.
#
# The indicator dtype is pinned to uint8: pandas 2.0 changed the default to
# bool, and a frame mixing bool and float columns converts to an object array,
# which numeric consumers (e.g. the Keras model fitted later) cannot use.
X_new = pd.get_dummies(X, columns = cat_cols.columns, dtype = "uint8")
print(X_new.shape)
X_new.head()

Split the encoded frame back into the training and test sets at the original row boundary.

In [None]:
# Recreate X_train and X_test from the one-hot encoded frame.
#
# Explicit copies are taken because X_test is written to later (feature
# scaling); assigning into a bare `iloc` slice is chained assignment on a view
# of X_new, which pandas warns about and does not guarantee to behave
# consistently.
X_train = X_new.iloc[:X_train_len].copy()
X_test = X_new.iloc[X_train_len:].copy()

print(f"The shape of X_train is {X_train.shape}.")
print(f"The shape of X_test is {X_test.shape}.")

In [None]:
X_train.head()

Create a validation set from the training data for model fine tuning

In [None]:
# Split the training data into training (70%) and validation (30%) sets,
# stratified on the target so both keep the same class proportions.
# Note: this rebinds X_train / y_train, so running the cell twice shrinks the
# training set again.


X_train, X_val, y_train, y_val = train_test_split(X_train,y_train, test_size = 0.3, random_state = 1, stratify = y_train)

In [None]:
# Print the shape of every feature matrix, then of every target frame.
feature_splits = (("X_train", X_train), ("X_val", X_val), ("X_test", X_test))
target_splits = (("y_train", y_train), ("y_val", y_val), ("y_test", y_test))

for split_name, split_frame in feature_splits:
    print(f"The shape of {split_name} is {split_frame.shape}.")

for split_name, split_frame in target_splits:
    print(f"The shape of {split_name} is {split_frame.shape}.")

Check that the class proportions are preserved after the split, since the dataset is imbalanced.

In [None]:
# Compare the class balance of the three splits, one panel per split.
split_targets = (("Train Data", y_train), ("Validation Data", y_val), ("Test Data", y_test))

for position, (panel_title, target) in enumerate(split_targets, start = 1):
    plt.subplot(1, 3, position)
    # Pass the series as `x=`; recent seaborn releases interpret the first
    # positional argument as `data` rather than as the variable to count.
    sns.countplot(x = target["salary"])
    plt.title(panel_title)
plt.subplots_adjust(wspace = 2)
plt.show()

Scale the data 

In [None]:
# Standardise the numerical columns. The scaler is fitted on the training split
# only and then applied unchanged to the validation and test splits, so no
# statistics from held-out data leak into training.
# Note: the frames are overwritten in place, so re-running this cell would
# scale already-scaled values.
sc = StandardScaler()

X_train[num_cols.columns] =  sc.fit_transform(X_train[num_cols.columns])
X_val[num_cols.columns] =  sc.transform(X_val[num_cols.columns])
X_test[num_cols.columns] =  sc.transform(X_test[num_cols.columns])

X_test.head()

In [None]:
y_test["salary"].value_counts()

## Modelling

### Logistic Regression

In [None]:
# Baseline logistic regression with default hyper-parameters, scored on all
# three splits.
log_reg = LogisticRegression()
log_reg.fit(X_train,y_train)

for split_label, features, target in (
    ("training", X_train, y_train),
    ("validation", X_val, y_val),
    ("test", X_test, y_test),
):
    print(f"Accuracy on the {split_label} set is {log_reg.score(features,target)}.")

Grid Search for tuning

In [None]:
# Grid search over the regularisation type and strength.
#
# Each penalty is paired with a solver that supports it. The default "lbfgs"
# solver only handles "l2", so searching "l1"/"elasticnet" with it makes those
# fits fail and they are never really evaluated. "saga" supports all three
# penalties; "elasticnet" additionally requires `l1_ratio`. "saga" converges
# more slowly, hence the larger iteration budget for those candidates. The
# plain "l2" candidates keep the default solver so they stay comparable with
# the baseline model above.
C_values = [0.01, 0.05, 0.1, 0.5, 1, 2]

params = [
    {"penalty": ["l2"], "C": C_values},
    {"penalty": ["l1"], "solver": ["saga"], "max_iter": [1000], "C": C_values},
    {"penalty": ["elasticnet"], "solver": ["saga"], "l1_ratio": [0.5],
     "max_iter": [1000], "C": C_values},
]

grid_cv = GridSearchCV(LogisticRegression(n_jobs = -1),params)

grid_cv.fit(X_train, y_train)

In [None]:
print(grid_cv.best_params_)
#Best params are the default values so no need for a new model

In [None]:
# Predict the test set with the baseline logistic regression and tabulate the
# confusion matrix (rows: true class, columns: predicted class).
log_preds = log_reg.predict(X_test)
log_cm = confusion_matrix(y_test,log_preds)
print(log_cm)


In [None]:
# Test-set metrics for logistic regression.
log_precision, log_recall, log_accuracy = (
    score_fn(y_test, log_preds)
    for score_fn in (precision_score, recall_score, accuracy_score)
)

for label, value in (("Precision", log_precision), ("Recall", log_recall), ("Accuracy", log_accuracy)):
    print("{} = {}".format(label, value))

# ROC AUC is computed from the continuous decision scores, not the hard labels.
log_scores = log_reg.decision_function(X_test)
print("Area under the curve: {}.".format(roc_auc_score(y_test,log_scores)))

### K-nearest Neighbors

In [None]:
# Baseline k-nearest-neighbours classifier (default k), scored on all splits.
knn = KNeighborsClassifier(n_jobs = -1)
knn.fit(X_train,y_train)

for split_label, features, target in (
    ("training", X_train, y_train),
    ("validation", X_val, y_val),
    ("test", X_test, y_test),
):
    print(f"Accuracy on the {split_label} set is {knn.score(features,target)}.")

Manually calculate the best k-value using the validation set

In [None]:
# Fit one k-NN model per candidate k (5, 10, ..., 45) and record its accuracy
# on the validation set, in the same order as the candidates.
validation_accuracies = []

for candidate_k in range(5,50,5):
    knn = KNeighborsClassifier(n_neighbors = candidate_k, n_jobs = -1)
    knn.fit(X_train,y_train)
    candidate_accuracy = knn.score(X_val, y_val)
    validation_accuracies.append(candidate_accuracy)
    

In [None]:
# Validation accuracy as a function of k; the x-values repeat the same range
# that the sweep iterated over.
plt.plot(range(5,50,5), validation_accuracies)
plt.title("Validation accuracies for different k values")
plt.xlabel("k values")
plt.ylabel("Accuracy")
plt.show()

In [None]:
# Look up the k whose validation accuracy is highest (first one on ties).
k_values = range(5,50,5)
best_k_position = validation_accuracies.index(max(validation_accuracies))

print(f"The best k-value is: {k_values[best_k_position]}.")

In [None]:
# Refit k-NN with the k that scored best on the validation set. The value is
# looked up from the sweep results rather than typed in by hand, so the final
# model always matches what the validation sweep actually selected.
best_k = k_values[validation_accuracies.index(max(validation_accuracies))]

knn_best = KNeighborsClassifier(n_jobs = -1,n_neighbors = best_k )
knn_best.fit(X_train,y_train)

In [None]:
# Evaluate the tuned k-NN model on the held-out test set.
knn_preds = knn_best.predict(X_test)
knn_cm = confusion_matrix(y_test,knn_preds)
print(knn_cm)

knn_precision, knn_recall, knn_accuracy = (
    score_fn(y_test, knn_preds)
    for score_fn in (precision_score, recall_score, accuracy_score)
)

for label, value in (("Precision", knn_precision), ("Recall", knn_recall), ("Accuracy", knn_accuracy)):
    print("{} = {}".format(label, value))


### XGBoost

In [None]:
# Baseline XGBoost classifier with default hyper-parameters.
# `nthread = -1` asks XGBoost to use all available cores.
xgb = xgboost.XGBClassifier(nthread = -1)
xgb.fit(X_train,y_train)

In [None]:
# Accuracy of the baseline XGBoost model on each split.
for split_label, features, target in (
    ("training", X_train, y_train),
    ("validation", X_val, y_val),
    ("test", X_test, y_test),
):
    print(f"Accuracy on the {split_label} set is {xgb.score(features,target)}.")

In [None]:
# Randomised search over the XGBoost learning rate and number of trees.
#
# `eta` and `learning_rate` are two names for the same booster parameter, so
# searching them as separate dimensions makes the effective step size of each
# candidate ambiguous. All candidate values are merged into the single
# `learning_rate` dimension instead.
parameters = {
     "learning_rate" : [0.001, 0.01, 0.05, 0.1, 0.15, 0.30],
     "n_estimators" : [ 50,100,200],
     #"min_child_weight" : [ 1, 5, 7 ],
     #"gamma"            : [ 0.0, 0.2 , 0.4 ],
     #"colsample_bytree" : [ 0.3, 0.5 , 0.7 ],
     }

# `n_jobs` is the scikit-learn-style thread setting of XGBClassifier; the
# misspelled `nthreads` keyword is not a recognised parameter and had no effect.
random_cv = RandomizedSearchCV(xgboost.XGBClassifier(n_jobs = -1),
                    parameters, n_jobs=-1,
                    cv=3, random_state = 0)

random_cv.fit(X_train, y_train)

In [None]:
random_cv.best_params_

In [None]:
# Final XGBoost model.
# - `n_jobs` replaces the misspelled `nthreads` keyword, which is not a
#   recognised parameter.
# - Only `learning_rate` is set: `eta` is an alias of the same booster
#   parameter, so passing both with different values (0.15 vs 0.1) left the
#   effective step size ambiguous.
# NOTE(review): n_estimators / learning_rate are presumably copied from the
# search output above — confirm they still match `random_cv.best_params_`.
xgb_best = xgboost.XGBClassifier(n_jobs = -1,n_estimators = 200, learning_rate = 0.1 )
xgb_best.fit(X_train,y_train)

In [None]:
# Evaluate the tuned XGBoost model on the held-out test set.
xgb_preds = xgb_best.predict(X_test)
xgb_cm = confusion_matrix(y_test,xgb_preds)
print(xgb_cm)

xgb_precision, xgb_recall, xgb_accuracy = (
    score_fn(y_test, xgb_preds)
    for score_fn in (precision_score, recall_score, accuracy_score)
)

for label, value in (("Precision", xgb_precision), ("Recall", xgb_recall), ("Accuracy", xgb_accuracy)):
    print("{} = {}".format(label, value))
    

### Random Forest

In [None]:
# Baseline random forest with default hyper-parameters; seeded so the result is
# reproducible, and parallelised over all cores.
rf = RandomForestClassifier(n_jobs = -1, random_state = 0)
rf.fit(X_train, y_train)

In [None]:
# Accuracy of the baseline random forest on each split.
for split_label, features, target in (
    ("training", X_train, y_train),
    ("validation", X_val, y_val),
    ("test", X_test, y_test),
):
    print(f"Accuracy on the {split_label} set is {rf.score(features,target)}.")

The random forest is clearly overfitting

In [None]:
# Randomised search over the main random-forest hyper-parameters.
params = {"n_estimators":range(10,400,50),
         "max_depth":range(5,100,10),
          "criterion":["gini", "entropy"],
          "min_samples_split":range(2,10,2)
         }

# The estimator itself is seeded as well as the search: with an unseeded forest
# the cross-validation scores (and therefore the selected parameters) change
# from run to run even though the sampled candidates are fixed.
random_cv = RandomizedSearchCV(RandomForestClassifier(random_state = 0), params, n_jobs = -1, random_state = 1)

random_cv.fit(X_train,y_train)

In [None]:
random_cv.best_params_

In [None]:
# Fit one forest per candidate tree count (160, 170, ..., 290) and record its
# validation accuracy, in the same order as the candidates.
# Note: this reuses (and overwrites) the `validation_accuracies` and `rf`
# names from earlier cells.
validation_accuracies = []

for candidate_trees in range(160,300,10):
    rf = RandomForestClassifier(n_estimators = candidate_trees, n_jobs = -1, random_state = 0)
    rf.fit(X_train,y_train)
    candidate_accuracy = rf.score(X_val, y_val)
    validation_accuracies.append(candidate_accuracy)

In [None]:
# Validation accuracy as a function of the number of trees; the x-values repeat
# the same range that the sweep iterated over.
plt.plot(range(160,300,10), validation_accuracies)
plt.title("Validation accuracies for different number of estimators")
plt.xlabel("number of estimators")
plt.ylabel("Accuracy")
plt.show()

In [None]:
# Look up the tree count whose validation accuracy is highest (first on ties).
n_estimators = range(160,300,10)
best_trees_position = validation_accuracies.index(max(validation_accuracies))

print(f"The best number of estimators is: {n_estimators[best_trees_position]}.")

In [None]:
# Final random forest.
# `random_state` makes the reported test metrics reproducible (an unseeded
# forest gives slightly different numbers on every run); `n_jobs = -1` only
# parallelises the fit and does not change the result.
# NOTE(review): the hyper-parameter values are presumably taken from the
# search / sweep output above — confirm they still match.
rf_best = RandomForestClassifier(min_samples_split = 6, max_depth = 25, criterion = 'entropy', n_estimators = 200,
                                 n_jobs = -1, random_state = 0)
rf_best.fit(X_train,y_train)

In [None]:
# Evaluate the tuned random forest on the held-out test set.
rf_preds = rf_best.predict(X_test)
rf_cm = confusion_matrix(y_test,rf_preds)
print(rf_cm)

rf_precision, rf_recall, rf_accuracy = (
    score_fn(y_test, rf_preds)
    for score_fn in (precision_score, recall_score, accuracy_score)
)

for label, value in (("Precision", rf_precision), ("Recall", rf_recall), ("Accuracy", rf_accuracy)):
    print("{} = {}".format(label, value))

### Neural Network

In [None]:
def build_nn_model(metric = "accuracy", learning_rate = 0.01, input_dim = None):
    """Build and compile a small feed-forward binary classifier.

    The network is Dense(32, linear) -> Dense(64, relu) -> Dense(128, relu)
    -> Dense(128, relu) -> Dense(1). The final layer has no activation, so the
    model outputs raw logits; the loss is binary cross-entropy configured with
    `from_logits = True` to match.

    Parameters
    ----------
    metric : str or keras metric
        Metric reported during training and evaluation.
    learning_rate : float
        Learning rate of the Adam optimizer.
    input_dim : int, optional
        Number of input features. Defaults to the column count of the global
        `X_train`, which is what the function previously always used.

    Returns
    -------
    A compiled Keras `Sequential` model.

    NOTE(review): string metrics such as "binary_accuracy" threshold the model
    output at 0.5 by default, while this model emits logits (decision boundary
    at 0) — the accuracy reported during training may therefore be slightly
    off; confirm whether a logit-aware metric is wanted.
    NOTE(review): the first hidden layer has no activation — confirm that a
    purely linear first layer is intended.
    """
    if input_dim is None:
        input_dim = X_train.shape[1]

    # Activations are given by name rather than by wrapping an `Activation`
    # layer object in the `activation` argument.
    model = Sequential([
        Dense(32, input_shape = (input_dim,)),
        Dense(64, activation = "relu"),
        Dense(128, activation = "relu"),
        Dense(128, activation = "relu"),
        Dense(1),
    ])

    model.compile(loss = tf.keras.losses.BinaryCrossentropy(from_logits = True),
                 optimizer = optimizers.Adam(learning_rate = learning_rate),
                 metrics = [metric])

    return model

# Build the network with binary accuracy as the tracked metric (later cells
# read the "binary_accuracy" / "val_binary_accuracy" history keys) and print
# its layer summary.
model = build_nn_model(metric = "binary_accuracy")
model.summary()

In [None]:
# Training configuration: number of passes over the training data and the
# mini-batch size.
EPOCHS = 30
batch_size = 16


# Train on the training split and evaluate on the validation split after every
# epoch; `history` keeps the per-epoch loss and metric values.
history = model.fit(
    X_train,
    y_train,
    batch_size = batch_size, 
    epochs = EPOCHS,
    verbose = 1,
    validation_data = (X_val,y_val))

In [None]:
score = model.evaluate(X_test, y_test)
score

In [None]:
# Tabulate the per-epoch training history, with the epoch index as a column.
hist = pd.DataFrame(history.history).assign(epoch = history.epoch)
hist.head()

In [None]:
# Training vs. validation accuracy per epoch.
plt.figure(figsize = (12,8))
for curve_key in ("binary_accuracy", "val_binary_accuracy"):
    plt.plot(history.history[curve_key])
plt.xlabel("Epoch")
plt.ylabel("Accuracy")
plt.legend(["Training Accuracy","Validation Accuracy"])
plt.show()

In [None]:
# Evaluate the neural network on the held-out test set.
#
# The model's last layer has no sigmoid (it is trained with
# `BinaryCrossentropy(from_logits = True)`), so `predict` returns raw logits,
# not probabilities. A probability of 0.5 corresponds to a logit of 0, so the
# decision threshold must be 0 — thresholding the logits at 0.5 would
# under-predict the positive class.
nn_logits = model.predict(X_test)
nn_preds = (nn_logits > 0)

nn_precision =precision_score(y_test, nn_preds)
nn_recall = recall_score(y_test, nn_preds)
nn_accuracy = accuracy_score(y_test, nn_preds)

print("Precision = {}".format(nn_precision))
print("Recall = {}".format(nn_recall))
print("Accuracy = {}".format(nn_accuracy))

## Comparing Models

In [None]:
# Gather the test-set metrics of every model into one comparison table.
# Each row is: model name, precision, recall, accuracy.
result_columns = ["Model","Precision", "Recall", "Accuracy"]

log_results = ["Logistic Regression",log_precision, log_recall, log_accuracy]   # Logistic regression
knn_results = ["K-Nearest Neighbours",knn_precision, knn_recall, knn_accuracy]  # k-NN
rf_results = ["Random Forest",rf_precision, rf_recall, rf_accuracy]             # Random forest
xgb_results = ["XGBoost",xgb_precision, xgb_recall, xgb_accuracy]               # XGBoost
nn_results = ["Neural Network",nn_precision, nn_recall, nn_accuracy]            # Neural network

result_rows = [log_results, knn_results, rf_results, xgb_results, nn_results]
all_results = pd.DataFrame(result_rows, columns = result_columns)

all_results

In [None]:
# Grouped bar chart of precision / recall / accuracy per model.
# The axes are created first and handed to `DataFrame.plot`; otherwise pandas
# opens its own new figure, leaving the (10, 10) figure empty and its size
# unused.
fig, ax = plt.subplots(figsize = (10,10))
all_results.plot(x = "Model", y = ["Precision", "Recall", "Accuracy"], kind = "bar", ax = ax)
ax.legend(loc = "upper right", bbox_to_anchor=(1.3, 1))
plt.show()

In [None]:
# Plot the XGBoost confusion matrix as an annotated heatmap.
plt.figure(figsize = (6,5))
ax = plt.subplot()
# `fmt = "d"` prints the cell counts as plain integers; the default annotation
# format (".2g") renders large counts in scientific notation.
sns.heatmap(xgb_cm, ax=ax, annot = True, fmt = "d")
ax.set_ylabel("True Labels")
ax.set_xlabel("Predicted Labels")
ax.set_title("Confusion Matrix for XGBoost Classifier")
plt.show()