In [1]:
import numpy as np
import pandas as pd
import io
import requests
import seaborn as sns
from matplotlib import pyplot as plt
import pickle
import os
from pandas.api.types import CategoricalDtype
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import FeatureUnion
from sklearn.model_selection import cross_val_score
%matplotlib inline


In [2]:
def load_dataset(path, urls):
    if not os.path.exists(path):
        os.mkdir(path)

    for url in urls:
        data = requests.get(url).content
        filename = os.path.join(path, os.path.basename(url))
        with open(filename, "wb") as file:
            file.write(data)

In [3]:
urls = ["http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data",
        "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.names",
        "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test"]
load_dataset('data', urls)


In [4]:
columns = ["age", "workClass", "fnlwgt", "education", "education-num","marital-status", "occupation", "relationship",
          "race", "sex", "capital-gain", "capital-loss", "hours-per-week", "native-country", "income"]
           
           
train_data = pd.read_csv('data/adult.data', names=columns, 
             sep=' *, *', na_values='?')
test_data  = pd.read_csv('data/adult.test', names=columns, 
             sep=' *, *', skiprows=1, na_values='?')


  train_data = pd.read_csv('data/adult.data', names=columns,
  test_data  = pd.read_csv('data/adult.test', names=columns,


In [5]:
train_data

Unnamed: 0,age,workClass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
32557,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
32558,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
32559,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K


In [6]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             32561 non-null  int64 
 1   workClass       30725 non-null  object
 2   fnlwgt          32561 non-null  int64 
 3   education       32561 non-null  object
 4   education-num   32561 non-null  int64 
 5   marital-status  32561 non-null  object
 6   occupation      30718 non-null  object
 7   relationship    32561 non-null  object
 8   race            32561 non-null  object
 9   sex             32561 non-null  object
 10  capital-gain    32561 non-null  int64 
 11  capital-loss    32561 non-null  int64 
 12  hours-per-week  32561 non-null  int64 
 13  native-country  31978 non-null  object
 14  income          32561 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


In [7]:
num_attributes = train_data.select_dtypes(include=['int'])
print(num_attributes.columns)
['age', 'fnlwgt', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week']


Index(['age', 'fnlwgt', 'education-num', 'capital-gain', 'capital-loss',
       'hours-per-week'],
      dtype='object')


['age',
 'fnlwgt',
 'education-num',
 'capital-gain',
 'capital-loss',
 'hours-per-week']

In [8]:
cat_attributes = train_data.select_dtypes(include=['object'])
print(cat_attributes.columns)
['workClass', 'education', 'marital-status', 'occupation',        'relationship', 'race', 'sex', 'native-country', 'income']


Index(['workClass', 'education', 'marital-status', 'occupation',
       'relationship', 'race', 'sex', 'native-country', 'income'],
      dtype='object')


['workClass',
 'education',
 'marital-status',
 'occupation',
 'relationship',
 'race',
 'sex',
 'native-country',
 'income']

In [9]:
class ColumnsSelector(BaseEstimator, TransformerMixin):
  
    def __init__(self, type):
        self.type = type
    
    def fit(self, X, y=None):
        return self
    
    def transform(self,X):
        return X.select_dtypes(include=[self.type])

In [10]:
num_pipeline = Pipeline(steps=[
    ("num_attr_selector", ColumnsSelector(type='int')),
    ("scaler", StandardScaler())
])


In [11]:
class CategoricalImputer(BaseEstimator, TransformerMixin):
  
    def __init__(self, columns = None, strategy='most_frequent'):
        self.columns = columns
        self.strategy = strategy
      
      
    def fit(self,X, y=None):
        if self.columns == None:
            self.columns = X.columns
        
        if self.strategy == 'most_frequent':
            self.fill = {column: X[column].value_counts().index[0] for column in self.columns}
        else:
            self.fill ={column: '0' for column in self.columns}
        return self
        
    def transform(self,X):
        X_copy = X.copy()
        for column in self.columns:
            X_copy[column] = X_copy[column].fillna(self.fill[column])
        return X_copy


In [12]:
class CategoricalEncoder(BaseEstimator, TransformerMixin):
  
    def __init__(self, dropFirst=True):
        self.categories=dict()
        self.dropFirst=dropFirst
      
    def fit(self, X, y=None):
        join_df = pd.concat([train_data, test_data])
        join_df = join_df.select_dtypes(include=['object'])
        for column in join_df.columns:
            self.categories[column] = join_df[column].value_counts().index.tolist()
        return self
      
    def transform(self, X):
        X_copy = X.copy()
        X_copy = X_copy.select_dtypes(include=['object'])
        for column in X_copy.columns:
            X_copy[column] = X_copy[column].astype({column: CategoricalDtype(self.categories[column])})
        return pd.get_dummies(X_copy, drop_first=self.dropFirst)


In [13]:
cat_pipeline = Pipeline(steps=[
    ("cat_attr_selector", ColumnsSelector(type='object')),
    ("cat_imputer", CategoricalImputer(columns=
          ['workClass','occupation', 'native-country'])),
    ("encoder", CategoricalEncoder(dropFirst=True))
])


In [14]:
full_pipeline = FeatureUnion([("num_pipe", num_pipeline), ("cat_pipeline", cat_pipeline)])


In [15]:
train_data.drop(['fnlwgt', 'education'], axis=1, inplace=True)
test_data.drop(['fnlwgt', 'education'], axis=1, inplace=True)


In [16]:
train_copy = train_data.copy()
train_copy["income"] = train_copy["income"].apply(lambda x:0 if 
                        x=='<=50K' else 1)
X_train = train_copy.drop('income', axis =1)
Y_train = train_copy['income']

In [17]:
X_train_processed=full_pipeline.fit_transform(X_train)
model = LogisticRegression(random_state=0, max_iter=10000)
model.fit(X_train_processed, Y_train)


LogisticRegression(max_iter=10000, random_state=0)

In [63]:
test_copy = test_data.copy()
test_copy["income"] = test_copy["income"].apply(lambda x:0 if 
                      x=='<=50K.' else 1)
X_test = test_copy.drop('income', axis =1)
Y_test = test_copy['income']

In [64]:
FRAC_MALE = (X_test['sex'] == 'Male').mean()
FRAC_MALE

0.6670351943983784

In [65]:
X_test['sex'] = 'Male'

In [70]:
X_test_processed = full_pipeline.fit_transform(X_test)
predicted_classes = model.predict_proba(X_test_processed)


In [71]:
X_test['sex'] = 'Female'

In [72]:
X_test_processed = full_pipeline.fit_transform(X_test)
predicted_classes_2 = model.predict_proba(X_test_processed)


In [73]:
EO_prob = []

for x in range(0,len(predicted_classes)):
    EO_prob.append( (FRAC_MALE) * predicted_classes[x][1] + ((1 - FRAC_MALE) * predicted_classes_2[x][1]))
    

In [80]:
threshold = pd.Series(EO_prob).quantile(1 - float(Y_test.values.sum())/float(len(Y_test)))

In [81]:
EO_prob_binary = []
for x in EO_prob:
    if x > threshold:
        EO_prob_binary.append(1)
    else:
        EO_prob_binary.append(0)

In [82]:
accuracy_score(EO_prob_binary, Y_test.values)


0.8441127694859039

In [84]:
import random

In [94]:
EO_prob_binary_rand = []
for x in EO_prob:
    if x > random.random():
        EO_prob_binary_rand.append(1)
    else:
        EO_prob_binary_rand.append(0)

In [95]:
accuracy_score(EO_prob_binary_rand, Y_test.values)


0.7902462993673607

In [38]:
predicted_classes

array([0, 0, 0, ..., 1, 0, 1])

In [39]:
Y_test.values

array([0, 0, 1, ..., 0, 0, 1])

In [40]:
train_data.drop(['sex'], axis=1, inplace=True)
test_data.drop(['sex'], axis=1, inplace=True)

In [41]:
train_copy = train_data.copy()
train_copy["income"] = train_copy["income"].apply(lambda x:0 if 
                        x=='<=50K' else 1)
X_train = train_copy.drop('income', axis =1)
Y_train = train_copy['income']

X_train_processed=full_pipeline.fit_transform(X_train)
model = LogisticRegression(random_state=0, max_iter=10000)
model.fit(X_train_processed, Y_train)

LogisticRegression(max_iter=10000, random_state=0)

In [43]:
test_copy = test_data.copy()
test_copy["income"] = test_copy["income"].apply(lambda x:0 if 
                      x=='<=50K.' else 1)
X_test = test_copy.drop('income', axis =1)
Y_test = test_copy['income']

X_test_processed = full_pipeline.fit_transform(X_test)
predicted_classes = model.predict(X_test_processed)

accuracy_score(predicted_classes, Y_test.values)


0.8516061666973773