In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn import metrics
import matplotlib.pyplot as plt

In [2]:
# Load the raw dataset from a CSV file in the working directory.
DATA_FILE = 'adult.csv'
df = pd.read_csv(DATA_FILE)

In [3]:
# Count the '?' placeholders per column ('?' is treated as the missing-value
# marker below). The result is printed explicitly: a bare expression is only
# displayed by the notebook when it is the last statement of the cell.
print(df.isin(['?']).sum(axis=0))

# Replace the '?' placeholder with NaN in the columns handled here, so that
# dropna() can recognise those entries as missing.
for column in ('native.country', 'workclass', 'occupation'):
    df[column] = df[column].replace('?', np.nan)

# Drop every ROW (not column) that contains a NaN in any column.
# The frame is modified in place so later cells keep working on `df`.
df.dropna(how='any', inplace=True)

In [4]:
# Encode the categorical columns as integers.
# Each entry is: column -> (labels in code order, code given to the first label).
# A label's code is its position in the list plus the starting code.
# A value that is not listed maps to NaN, which makes astype(int) fail loudly.
label_orders = {
    'income': (['<=50K', '>50K'], 0),
    'sex': (['Male', 'Female'], 0),
    'race': (['Black', 'Asian-Pac-Islander', 'Other', 'White', 'Amer-Indian-Eskimo'], 0),
    'marital.status': (
        ['Married-spouse-absent', 'Widowed', 'Married-civ-spouse', 'Separated',
         'Divorced', 'Never-married', 'Married-AF-spouse'],
        0,
    ),
    'workclass': (
        ['Self-emp-inc', 'State-gov', 'Federal-gov', 'Without-pay', 'Local-gov',
         'Private', 'Self-emp-not-inc', 'Never-worked'],
        0,
    ),
    # occupation codes start at 1, unlike the other columns
    'occupation': (
        ['Exec-managerial', 'Machine-op-inspct', 'Prof-specialty', 'Other-service',
         'Adm-clerical', 'Craft-repair', 'Transport-moving', 'Handlers-cleaners',
         'Sales', 'Farming-fishing', 'Tech-support', 'Protective-serv',
         'Armed-Forces', 'Priv-house-serv'],
        1,
    ),
    'relationship': (
        ['Not-in-family', 'Wife', 'Other-relative', 'Unmarried', 'Husband', 'Own-child'],
        0,
    ),
    'native.country': (
        ['United-States', 'Cambodia', 'England', 'Puerto-Rico', 'Canada', 'Germany',
         'Outlying-US(Guam-USVI-etc)', 'India', 'Japan', 'Greece', 'South', 'China',
         'Cuba', 'Iran', 'Honduras', 'Philippines', 'Italy', 'Poland', 'Jamaica',
         'Vietnam', 'Mexico', 'Portugal', 'Ireland', 'France', 'Dominican-Republic',
         'Laos', 'Ecuador', 'Taiwan', 'Haiti', 'Columbia', 'Hungary', 'Guatemala',
         'Nicaragua', 'Scotland', 'Thailand', 'Yugoslavia', 'El-Salvador',
         'Trinadad&Tobago', 'Peru', 'Hong', 'Holand-Netherlands'],
        0,
    ),
}

for column_name, (ordered_labels, first_code) in label_orders.items():
    label_to_code = {label: code for code, label in enumerate(ordered_labels, start=first_code)}
    df[column_name] = df[column_name].map(label_to_code).astype(int)

In [5]:
# Split the data into train (80%) and test (20%) sets.
# 'stratify' keeps the distribution of the class label (income) similar in both parts.
# A fixed random_state makes the split, and everything computed from it,
# reproducible across kernel restarts.
train, test = train_test_split(df, test_size=0.2, stratify=df['income'], random_state=42)

In [6]:
# Logistic regression with income as the target and a hand-picked feature subset.
# 'education.num' is used in place of 'education' (it is exposed under the shorter
# name 'education' in the feature frame).
train_source_columns = ['relationship', 'education.num', 'race', 'occupation', 'sex', 'marital.status', 'workclass']
train_feature_names = ['relationship','education','race','occupation','sex','marital','workclass']

# Stack the selected columns side by side and relabel them with the feature names.
train_matrix = np.c_[tuple(train[source] for source in train_source_columns)]
X_train = pd.DataFrame(train_matrix, columns=train_feature_names)
y_train = train['income']

# Create the classifier with default settings and fit it on the training data.
# The fit call stays last so the notebook displays the fitted estimator.
reg = LogisticRegression()
reg.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [7]:
# Build the test feature frame with the same columns, in the same order and under
# the same names as the training features, then predict income for every test row.
test_source_columns = ['relationship', 'education.num', 'race', 'occupation', 'sex', 'marital.status', 'workclass']
test_feature_names = ['relationship','education','race','occupation','sex','marital','workclass']

test_matrix = np.c_[tuple(test[source] for source in test_source_columns)]
X_test = pd.DataFrame(test_matrix, columns=test_feature_names)
y_test = test['income']
y_pred = reg.predict(X_test)

In [8]:
# check if data satisfies statistical parity. Compute statistical parity difference
# P(income='>50K' | sex='female') - P(income='>50K' | sex='male')
def compute_statistical_parity_difference(X_test, y_pred):
    """Return the statistical parity difference of predictions between sexes.

    The value is
    P(pred == 1 | sex == 1) - P(pred == 1 | sex == 0),
    where sex == 1 encodes 'Female', sex == 0 encodes 'Male' and a prediction
    of 1 means high income.

    Parameters
    ----------
    X_test : pandas.DataFrame
        Feature frame with a 'sex' column encoded as 0 (male) / 1 (female).
    y_pred : array-like
        Predicted labels, one per row of ``X_test``, matched to the rows by
        position (a pandas Series is read positionally, its index is ignored).

    Returns
    -------
    float
        Female positive rate minus male positive rate. ``nan`` when either
        group has no rows, because the conditional probability is undefined.

    Raises
    ------
    ValueError
        If ``y_pred`` and ``X_test`` do not have the same number of rows.
    """
    # Work on plain arrays so rows and predictions are paired by position,
    # regardless of any pandas index carried by either input.
    sex = np.asarray(X_test['sex'])
    predicted_high = np.asarray(y_pred) == 1

    if len(sex) != len(predicted_high):
        raise ValueError(
            "y_pred has %d entries but X_test has %d rows"
            % (len(predicted_high), len(sex))
        )

    is_female = sex == 1
    is_male = sex == 0
    n_female = int(is_female.sum())
    n_male = int(is_male.sum())

    # With an empty group the conditional probability has no meaning.
    if n_female == 0 or n_male == 0:
        return float('nan')

    # conditional probability of income='high' given sex='Female'
    p_income_high_sex_female = int((is_female & predicted_high).sum()) / n_female
    # conditional probability of income='high' given sex='Male'
    p_income_high_sex_male = int((is_male & predicted_high).sum()) / n_male

    return p_income_high_sex_female - p_income_high_sex_male

In [9]:
# Rebuild the training features and target from `train`.
X_train = pd.DataFrame(np.c_[train['relationship'], train['education.num'], train['race'], train['occupation'], train['sex'], train['marital.status'], train['workclass']], columns = ['relationship','education','race','occupation','sex','marital','workclass'])
y_train = train['income']

# Leave-one-out retraining: refit the model without training row i and measure
# the statistical parity difference of its predictions on X_test.
# NOTE: this runs one full model fit per training row, so a complete pass is slow.
reg = LogisticRegression()

# Entry i holds the statistical parity difference obtained after removing
# training row i, so the values stay available after the loop instead of
# existing only as printed text.
parity_diff_without_row = []
for i in range(0, len(X_train)):
    # X_train and y_train carry different indexes, so each is dropped by the
    # label found at position i to remove the same row from both.
    X_train_remove_one_row = X_train.drop(X_train.index[i])
    y_train_remove_one_row = y_train.drop(y_train.index[i])
    reg.fit(X_train_remove_one_row, y_train_remove_one_row)
    y_pred = reg.predict(X_test)
    parity_diff = compute_statistical_parity_difference(X_test, y_pred)
    parity_diff_without_row.append(parity_diff)
    print(parity_diff)

-0.18608385337009872
-0.18608385337009872
-0.18608385337009872
-0.18608385337009872
-0.18608385337009872
-0.18608385337009872
-0.18608385337009872
-0.18608385337009872
-0.18608385337009872
-0.18608385337009872
-0.18608385337009872
-0.18608385337009872
-0.18608385337009872


KeyboardInterrupt: 