# Import Data

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the raw CSV. header=None tells pandas the file has no header row,
# so the columns receive default integer labels at this point.
df = pd.read_csv("../input/adult-dataset/adult.csv", header=None)

In [None]:
df.head()

In [None]:
# Descriptive names for the header-less columns. Abbreviations used:
#   gain    ==> capital gain
#   loss    ==> capital loss
#   hpw     ==> hours per week
#   country ==> native country
columns = [
    'age', 'workclass', 'fnlwgt', 'education', 'education_num',
    'marital_status', 'occupation', 'relationship', 'race', 'gender',
    'gain', 'loss', 'hpw', 'country', 'income',
]
df.columns = columns

# Column Values Cleaning

In [None]:
def missing_count(X):
    """Summarise missing values for every column of a DataFrame.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``X`` (indexed by column name) with the
        fields "Name", "Total" (row count of ``X``), "Missing" (number of
        null entries), "Missing(%)" (rounded to two decimals) and "Dtypes".
    """
    n_rows = len(X)
    n_missing = X.isnull().sum()
    pct_missing = (n_missing / n_rows * 100).round(2)
    summary = pd.DataFrame(
        {
            "Name": X.columns,
            "Total": [n_rows] * X.shape[1],
            "Missing": n_missing,
            "Missing(%)": pct_missing,
            "Dtypes": X.dtypes,
        }
    )
    return summary
missing_count(df)

In [None]:
# Names of the columns stored with the generic object dtype ("O").
categorical = [name for name, dtype in df.dtypes.items() if dtype == "O"]
print(categorical)

In [None]:
# Names of every column whose dtype is anything other than object ("O").
numerical = [name for name, dtype in df.dtypes.items() if dtype != "O"]
print(numerical)

## Unique Values

In [None]:
# Print the distinct values of each object-dtype column under a banner
# carrying the column name, with a blank line between columns.
for col in categorical:
    print("=====", col, "=====")
    print(df[col].unique(), end="\n\n")

In [None]:
# Remove the padding around every string value.
# The previous version dropped the first character unconditionally
# (val[1:]), which truncated real data whenever a value had no leading
# blank -- including on a second run of this cell -- and raised on missing
# (float NaN) entries. `.str.strip()` only removes whitespace, leaves NaN
# untouched and is safe to re-run.
# NOTE(review): assumes the padding in the raw file is whitespace -- confirm
# against the output of the unique-values cell.
for col in categorical:
    df[col] = df[col].str.strip()

## "workclass", "occupation", "country"

In [None]:
df["workclass"].value_counts()

In [None]:
# Percentage of rows whose workclass is the "?" placeholder. Computed from
# the data rather than from a hard-coded count so it stays correct if the
# input file changes. We will take care of these values later.
(df["workclass"] == "?").mean() * 100

In [None]:
df["occupation"].value_counts()

In [None]:
# Percentage of rows whose occupation is the "?" placeholder. Computed from
# the data rather than from a hard-coded count so it stays correct if the
# input file changes. We will take care of these values later.
(df["occupation"] == "?").mean() * 100

In [None]:
df["country"].value_counts()

In [None]:
# Percentage of rows whose country is the "?" placeholder. Computed from
# the data rather than from a hard-coded count so it stays correct if the
# input file changes. We will take care of these values later.
(df["country"] == "?").mean() * 100

In [None]:
# Turn the "?" placeholder into a real missing value.
# - `np.nan` is used because the `np.NaN` alias was removed in NumPy 2.0.
# - The result is assigned back to the column: calling
#   `df[col].replace(..., inplace=True)` operates on an intermediate object
#   and does not update `df` when pandas copy-on-write is active.
for col in ["workclass", "occupation", "country"]:
    df[col] = df[col].replace("?", np.nan)

In [None]:
for col in ["workclass", "occupation", "country"]:
    print(df[col].unique(), end="\n\n")

## "income" Column

In [None]:
df["income"].unique()

In [None]:
def mapper(val):
    """Encode an income label as a binary target.

    Returns 0 when ``val`` equals "<=50K" and 1 for every other value.
    """
    return 0 if val == "<=50K" else 1
df["income"] = df["income"].apply(mapper)

# Filling Null Values Using the Mode

In [None]:
# Index labels of every row that contains at least one missing value.
# A single vectorised mask replaces the previous Python loop, which called
# `df.loc[i]` once per row (very slow on tens of thousands of rows) and
# only worked when the index was exactly 0..len(df)-1.
null_rows = df.index[df.isnull().any(axis=1)].tolist()

In [None]:
len(null_rows)

In [None]:
len(null_rows) / len(df) * 100

In [None]:
missing_count(df)

In [None]:
# Impute the missing categorical values with each column's most frequent
# value. The filled column is assigned back explicitly:
# `df[col].fillna(..., inplace=True)` acts on an intermediate object, is
# deprecated in recent pandas and leaves `df` unchanged under copy-on-write.
# NOTE(review): the mode is taken over the whole dataset, i.e. before the
# train/test split further down.
for col in ["workclass", "occupation", "country"]:
    most_frequent = df[col].mode()[0]
    df[col] = df[col].fillna(most_frequent)

In [None]:
missing_count(df)

# Model Building

In [None]:
# Target vector: the binary income column; feature matrix: everything else.
y = df["income"]
X = df.drop(columns=["income"])

In [None]:
import category_encoders as ce

# Columns handed to the one-hot encoder.
onehot_cols = [
    'workclass', 'education', 'marital_status', 'occupation',
    'relationship', 'race', 'gender', 'country',
]
encoder = ce.OneHotEncoder(cols=onehot_cols)
X = encoder.fit_transform(X)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=77,
)

In [None]:
X_train.shape, X_test.shape

In [None]:
cols = X.columns

In [None]:
from sklearn.preprocessing import RobustScaler

# Learn the scaling statistics from the training split only, then apply the
# same fitted scaler to both splits.
robust = RobustScaler().fit(X_train)
X_train = robust.transform(X_train)
X_test = robust.transform(X_test)

In [None]:
# Wrap the scaled arrays back into DataFrames.
# - `columns=cols` passes the column Index directly. The previous
#   `columns=[cols]` wrapped it in a list, which makes pandas build a
#   one-level MultiIndex whose labels are tuples such as ('age',).
# - The row index is taken from the matching target so features and labels
#   stay aligned; otherwise the frames get a fresh 0..n-1 index while
#   y_train / y_test keep the shuffled labels from the split.
X_train = pd.DataFrame(X_train, columns=cols, index=y_train.index)
X_test = pd.DataFrame(X_test, columns=cols, index=y_test.index)

In [None]:
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
gnb.fit(X_train, y_train)

In [None]:
y_pred = gnb.predict(X_test)

In [None]:
"train score", gnb.score(X_train, y_train)

In [None]:
"test score", gnb.score(X_test, y_test)

In [None]:
# Null accuracy: the score obtained by always predicting the class that is
# most frequent in the training labels.
maxi = y_train.value_counts().idxmax()
majority_hits = y_test.value_counts().loc[maxi]
null_acc = majority_hits / len(y_test)
print("Null Accuracy: ", null_acc)

In [None]:
from sklearn.metrics import confusion_matrix

def confusion_heatmap(y_test, y_pred, label_mapping=None, normalize=None):
    """Plot the confusion matrix of a set of predictions as a heatmap.

    Parameters
    ----------
    y_test : array-like
        True labels.
    y_pred : array-like
        Predicted labels.
    label_mapping : mapping, optional
        Maps each raw label to the name shown on the axes. When omitted
        (or empty) the raw labels are shown.
    normalize : optional
        Forwarded unchanged to ``confusion_matrix``.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    # Use every label seen in either vector so the matrix stays square even
    # when a class is never predicted.
    labels = np.unique(np.concatenate((np.unique(y_test), np.unique(y_pred)), axis=0))
    cm = confusion_matrix(y_test, y_pred, labels=labels, normalize=normalize)

    # Fix: the original looked up the undefined name `name_mapping` here,
    # raising NameError whenever a mapping was supplied.
    if label_mapping:
        tick_names = [label_mapping[label] for label in labels]
    else:
        tick_names = labels

    frame = pd.DataFrame(cm, index=tick_names, columns=tick_names)

    sns.heatmap(frame, annot=True, fmt=".4g", cmap="Blues")
    plt.ylabel('True label', fontsize=12)
    plt.xlabel('Predicted label', fontsize=12)
    plt.show()
confusion_heatmap(y_test, y_pred)

In [None]:
from sklearn.metrics import classification_report

round(pd.DataFrame(classification_report(y_test, y_pred, output_dict=True)).T, 2)

In [None]:
from sklearn.metrics import confusion_matrix

# scikit-learn lays the matrix out with true labels on the rows and
# predicted labels on the columns, both in sorted label order (0, 1).
# Class 1 is treated as the positive class, matching `pos_label=1` in the
# ROC cell.
# Fix: the previous assignments called cm[0, 0] "TP" and cm[1, 1] "TN" and
# swapped FP/FN, so precision, recall, FPR and specificity below were
# computed from the wrong cells.
cm = confusion_matrix(y_test, y_pred)
TN = cm[0, 0]  # true 0, predicted 0
FP = cm[0, 1]  # true 0, predicted 1
FN = cm[1, 0]  # true 1, predicted 0
TP = cm[1, 1]  # true 1, predicted 1

In [None]:
classification_accuracy = (TP + TN) / float(TP + TN + FP + FN)
classification_accuracy

In [None]:
classification_error = (FP + FN) / float(TP + TN + FP + FN)
classification_error

In [None]:
precision = TP / float(TP + FP)
precision

In [None]:
recall = TP / float(TP + FN)
recall

In [None]:
true_positive_rate = TP / float(TP + FN)
true_positive_rate

In [None]:
false_positive_rate = FP / float(FP + TN)
false_positive_rate

In [None]:
specificity = TN / (TN + FP)
specificity

In [None]:
# NOTE: this rebinds `y_pred`. From here on it holds the second column of
# predict_proba (presumably P(income == 1), given the 0/1 target encoding)
# instead of the hard class labels predicted earlier, so re-running the
# confusion-matrix / classification-report cells after this one will not
# see class labels any more.
y_pred = gnb.predict_proba(X_test)[:, 1]

# Histogram of the predicted probabilities, with the x axis fixed to [0, 1].
plt.rcParams['font.size'] = 12
plt.hist(y_pred, bins =10)
plt.title('Histogram of predicted probabilities of 1')
plt.xlim(0,1)
plt.xlabel('Predicted probabilities of 1')
plt.ylabel('Frequency')
plt.show()

In [None]:
from sklearn.metrics import roc_curve

# Compute the positive-class scores here instead of relying on `y_pred`
# having been overwritten with probabilities by an earlier cell: if the
# cells run in a different order `y_pred` holds hard 0/1 labels and the
# ROC curve silently degenerates to a single operating point.
y_score = gnb.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_score, pos_label=1)

# Set the font size before drawing so it applies to the whole figure.
plt.rcParams['font.size'] = 12
plt.figure(figsize=(6, 4))
plt.plot(fpr, tpr, linewidth=2)
plt.plot([0, 1], [0, 1], 'k--')  # chance-level diagonal for reference
plt.title('ROC curve for Gaussian Naive Bayes Classifier for Predicting Salaries')
plt.xlabel('False Positive Rate (1 - Specificity)')
plt.ylabel('True Positive Rate (Sensitivity)')
plt.show()

In [None]:
from sklearn.metrics import roc_auc_score

# Score with the predicted probability of class 1, computed here so the
# result does not depend on whether `y_pred` currently holds hard labels
# or probabilities (it is rebound between cells).
ROC_AUC = roc_auc_score(y_test, gnb.predict_proba(X_test)[:, 1])
ROC_AUC