In [112]:
import sys
import numpy as np
import pandas as pd
from aif360.datasets import BinaryLabelDataset
from aif360.metrics import BinaryLabelDatasetMetric, ClassificationMetric
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn import metrics, preprocessing
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor, plot_tree
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import Markdown, display
import copy
np.random.seed(1)

In [113]:
cols = ['age', 'workclass', 'fnlwgt', 'education', 'education.num', 'marital', 'occupation', 'relationship', 'race', 'gender', 'capgain', 'caploss', 'hours', 'country', 'income']
df_train = pd.read_csv('adult.data', names=cols, sep=",")
df_test = pd.read_csv('adult.test', names=cols, sep=",")

In [114]:
 def preprocess(df):
    df.isin(['?']).sum(axis=0)

    # replace missing values (?) to nan and then drop the columns
    df['country'] = df['country'].replace('?',np.nan)
    df['workclass'] = df['workclass'].replace('?',np.nan)
    df['occupation'] = df['occupation'].replace('?',np.nan)

    # dropping the NaN rows now
    df.dropna(how='any',inplace=True)
            
    df['income'] = df['income'].map({'<=50K': 0, '>50K': 1}).astype(int)
    df['gender'] = df['gender'].map({'Male': 1, 'Female': 0}).astype(int)
    df['workclass'] = df['workclass'].map({'State-gov': 0, 'Self-emp-not-inc': 1, 'Private': 2, 'Federal-gov': 3, 'Local-gov': 4, '?': 5,
                                           'Self-emp-inc': 6, 'Without-pay': 7, 'Never-worked': 8}).astype(int)
    df['education'] = df['education'].map({'Bachelors': 0, 'HS-grad': 1, '11th': 2, 'Masters': 3, '9th': 4, 
                                           'Some-college': 5, 'Assoc-acdm': 6, 'Assoc-voc': 7, '7th-8th': 8, 'Doctorate': 9, 
                                           'Prof-school': 10, '5th-6th': 11, '10th': 12, '1st-4th': 13, 'Preschool': 14, '12th': 15}).astype(int)
    df['marital'] = df['marital'].map({'Never-married': 0, 'Married-civ-spouse': 1, 'Divorced': 2, 'Married-spouse-absent': 3, 
                                                     'Separated': 4, 'Married-AF-spouse': 5, 'Widowed': 6}).astype(int)
    df['occupation'] = df['occupation'].map({'Adm-clerical': 0, 'Exec-managerial': 1, 'Handlers-cleaners': 2, 
                                             'Prof-specialty': 3, 'Other-service': 4, 'Sales': 5, 'Craft-repair': 6, 'Transport-moving': 7, 'Farming-fishing': 8, 
                                             'Machine-op-inspct': 9, 'Tech-support': 10, '?': 11, 'Protective-serv': 12, 'Armed-Forces': 13, 'Priv-house-serv': 14}).astype(int)
    df['relationship'] = df['relationship'].map({'Not-in-family': 0, 'Husband': 1, 'Wife': 2, 
                                                 'Own-child': 3, 'Unmarried': 4, 'Other-relative': 5}).astype(int)
    df['race'] = df['race'].map({'White': 0, 'Black': 1, 'Asian-Pac-Islander': 2, 'Amer-Indian-Eskimo': 3, 'Other': 4}).astype(int)
    df['country'] = df['country'].map({'United-States': 0, 'Cuba': 1, 'Jamaica': 2, 'India': 3, '?': 4, 'Mexico': 5, 'South': 6, 'Puerto-Rico': 7, 
                                       'Honduras': 8, 'England': 9, 'Canada': 10, 'Germany': 11, 'Iran': 12, 'Philippines': 13, 'Italy': 14, 
                                       'Poland': 15, 'Columbia': 16, 'Cambodia': 17, 'Thailand': 18, 'Ecuador': 19, 'Laos': 20, 'Taiwan': 21, 
                                       'Haiti': 22, 'Portugal': 23, 'Dominican-Republic': 24, 'El-Salvador': 25, 'France': 26, 'Guatemala': 27, 
                                       'China': 28, 'Japan': 29, 'Yugoslavia': 30, 'Peru': 31, 'Outlying-US(Guam-USVI-etc)': 32, 'Scotland': 33,
                                       'Trinadad&Tobago': 34, 'Greece': 35, 'Nicaragua': 36, 'Vietnam': 37, 'Hong': 38, 'Ireland': 39, 'Hungary': 40, 
                                       'Holand-Netherlands': 41}).astype(int)
    
    
    labels = df['age']
    proc = []
    for v in labels:
            if v <= 30:
                proc.append(1)
            elif v <= 40:
                proc.append(2)
            elif v <= 50:
                proc.append(3)
            else:
                proc.append(4)
    df['age']=proc 
    
    labels = df['hours']
    proc=[]
    for v in labels:
        if v<=25:
            proc.append(1)
        elif v<=41:
            proc.append(2)
        elif v<=55:
            proc.append(3)
        else:
            proc.append(4)
    df['hours']=proc
    
    df = df.drop(['fnlwgt', 'education.num', 'capgain', 'caploss', 'country'], axis = 1, inplace = True) 

**One-hot encoding**

In [115]:
 def one_hot_encode(df):
    df.isin(['?']).sum(axis=0)

    # replace missing values (?) to nan and then drop the columns
    df['country'] = df['country'].replace('?',np.nan)
    df['workclass'] = df['workclass'].replace('?',np.nan)
    df['occupation'] = df['occupation'].replace('?',np.nan)

    # dropping the NaN rows now
    df.dropna(how='any',inplace=True)
    df['income'] = df['income'].map({'<=50K': 0, '>50K': 1}).astype(int)
    df = pd.concat([df, pd.get_dummies(df['gender'], prefix='gender')],axis=1)
    df = pd.concat([df, pd.get_dummies(df['race'], prefix='race')],axis=1)
    df = pd.concat([df, pd.get_dummies(df['marital'], prefix='marital')],axis=1)
    df = pd.concat([df, pd.get_dummies(df['workclass'], prefix='workclass')],axis=1)
    df = pd.concat([df, pd.get_dummies(df['relationship'], prefix='relationship')],axis=1)
    df = pd.concat([df, pd.get_dummies(df['occupation'], prefix='occupation')],axis=1)

    df = df.drop(columns=['workclass', 'gender', 'fnlwgt', 'education', 'occupation', \
                      'relationship', 'marital', 'race', 'country', 'capgain', \
                      'caploss'])
    return df

In [116]:
# not one-hot (for randomforestclassifier and such)
# preprocess(df_train)
# preprocess(df_test)

# one-hot encoding (for regression mdoels)
df_train = one_hot_encode(df_train)
df_test = one_hot_encode(df_test)

In [117]:
df_train = df_train.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

**Parametric Model**

In [118]:
X_train = df_train.drop(columns='income')
y_train = df_train['income']

X_test = df_test.drop(columns='income')
y_test = df_test['income']

# Scale data
X_train = preprocessing.scale(X_train)
X_test = preprocessing.scale(X_test)

# clf = RandomForestClassifier(max_depth=10, random_state=0)

# from sklearn.svm import SVC
# from sklearn.pipeline import make_pipeline
# from sklearn.preprocessing import StandardScaler
# clf = make_pipeline(StandardScaler(), SVC(gamma='auto'))

# Logistic regression?
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression(random_state=0)

clf.fit(X_train, y_train)
y_pred = clf.predict_proba(X_test)
num_params = len(clf.coef_.transpose())
# print(classification_report(y_test, y_pred, target_names=['0', '1']))
# clf.classes_
# clf.coef_

**Loss function**

Twice differentiable

In [119]:
import math
def logistic_loss(y_test, y_pred):
    loss = 0
    for i in range(len(y_test)):
        loss += - y_test[i] * math.log(y_pred[i][1]) - (1 - y_test[i]) * math.log(y_pred[i][0])
    return loss/len(y_test)

print(logistic_loss(y_test, y_pred))

0.3609074672032075


**First-order derivative of loss function at z with respect to model parameters**

In [120]:
def del_L_del_theta_i(num_params, y_true, x, y_pred):
    del_L_del_theta = []
    for j in range(num_params):
        del_L_del_theta_j = ((1 - y_true) * y_pred[0] + y_true * (1 - y_pred[1])) * x[j]
        del_L_del_theta.append(del_L_del_theta_j)
    return del_L_del_theta

In [121]:
# loss_gradient_at_z(clf.coef_, y_train[0], X_train[0], clf.predict_proba(X_train[0]))
# print(len(clf.coef_.transpose()))
# print(y_train[0])
# print(X_train[0][4])
# print(clf.predict_proba(np.reshape(X_train[0], (1, 44))))

y_pred = clf.predict_proba(np.reshape(X_train[0], (1, num_params)))
# print(y_pred[0][0])
loss_grad_at_z = del_L_del_theta_i(num_params, y_train[0], X_train[0], y_pred[0])

**Second-order partial derivative of loss function with respect to model parameters**

In [122]:
import numpy
def hessian_one_point(num_params, y_true, x, y_pred):
    multiplier = y_pred[0] * y_pred[1]
    Hx = numpy.zeros((num_params, num_params))
    for i in range(num_params):
        for j in range(i + 1):
            Hx[i][j] = x[i] * x[j]
    
    i_lower = np.tril_indices(num_params, -1)
    Hx.T[i_lower] = Hx[i_lower]     
    return Hx

**Hessian matrix of loss function with respect to model parameters**

Verified symmetric

In [125]:
def compute_hessian(num_params, X_train, y_train):
    H = numpy.zeros((num_params, num_params))
    for i in range(len(X_train)):
        y_pred = clf.predict_proba(np.reshape(X_train[i], (1, num_params)))
        H += hessian_one_point(num_params, y_train[i], X_train[i], y_pred[0])
    H /= len(X_train)
    return H

In [128]:
hxx = compute_hessian(num_params, X_train, y_train)

In [129]:
hxx

array([[ 1.        ,  0.04352609,  0.10159876, ..., -0.02931911,
        -0.01847592,  0.03239653],
       [ 0.04352609,  1.        ,  0.15252207, ...,  0.02548368,
         0.05856886, -0.12592687],
       [ 0.10159876,  0.15252207,  1.        , ..., -0.00514497,
        -0.02148375,  0.07279196],
       ...,
       [-0.02931911,  0.02548368, -0.00514497, ...,  1.        ,
        -0.06484214, -0.08610771],
       [-0.01847592,  0.05856886, -0.02148375, ..., -0.06484214,
         1.        , -0.04140508],
       [ 0.03239653, -0.12592687,  0.07279196, ..., -0.08610771,
        -0.04140508,  1.        ]])