In [10]:
#Import all of the necessary libraries in
import pandas as pd
import numpy as np
import warnings
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn import metrics
from sklearn import tree
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
from folktables import ACSDataSource, ACSIncome
import math
warnings.filterwarnings('ignore')

In [2]:
def s_parity(test_data, predictors, prediction_df, sensitive_attr, concat_col):
    """Return the statistical parity difference of a set of predictions.

    The feature rows and the predicted labels are placed side by side, then
    the share of positive predictions (``concat_col == 1``) is computed
    separately for rows where ``sensitive_attr == 1`` (privileged group) and
    rows where ``sensitive_attr == 0`` (unprivileged group).

    Parameters
    ----------
    test_data : array-like or DataFrame
        Feature rows the predictions were made on.
    predictors : list of str
        Column names used to label (or select) the columns of ``test_data``.
    prediction_df : array-like or DataFrame
        Predicted labels, one per row of ``test_data``.
    sensitive_attr : str
        Name of the 0/1 sensitive-attribute column among ``predictors``.
    concat_col : str
        Column name under which the predictions are stored.

    Returns
    -------
    float
        Positive rate of the privileged group minus the positive rate of the
        unprivileged group. NaN is returned when either group has no rows,
        because the rate of an empty group is undefined.
    """
    # Put the features and the predictions in one frame so they can be
    # filtered together.
    features_df = pd.DataFrame(test_data, columns=predictors)
    outcome_df = pd.DataFrame(prediction_df, columns=[concat_col])
    combined_df = pd.concat([features_df, outcome_df], axis=1)

    is_priv = combined_df[sensitive_attr] == 1
    is_unpriv = combined_df[sensitive_attr] == 0
    is_positive = combined_df[concat_col] == 1

    # Group sizes
    total_priv = int(is_priv.sum())
    total_unpriv = int(is_unpriv.sum())

    # Guard against dividing by zero when one group is absent from the data.
    if total_priv == 0 or total_unpriv == 0:
        return float('nan')

    # Share of each group that received a positive prediction
    p_priv = int((is_positive & is_priv).sum()) / total_priv
    p_unpriv = int((is_positive & is_unpriv).sum()) / total_unpriv

    return p_priv - p_unpriv

In [3]:
def logistic_regression(train_dataset, independent_var, dependent_var, concat_col,
                        test_size=0.2, random_state=310, max_iter=100):
    """Fit a logistic regression on a split of ``train_dataset`` and score it.

    ``train_dataset`` is itself split into a fitting part and a hold-out part;
    the model is fitted on the former and evaluated on the latter.

    Parameters
    ----------
    train_dataset : DataFrame
        Frame holding both the predictor columns and the label column.
    independent_var : list of str
        Names of the predictor columns.
    dependent_var : str
        Name of the label column.
    concat_col : str
        Column name given to the predictions in the returned frame.
    test_size : float, default 0.2
        Fraction of ``train_dataset`` held out for evaluation.
    random_state : int, default 310
        Seed for the split, so repeated calls use the same partition.
    max_iter : int, default 100
        Iteration cap passed to ``LogisticRegression``.

    Returns
    -------
    accuracy : float
        Accuracy on the hold-out rows, as a percentage (0-100).
    test_demo_df : DataFrame
        Hold-out feature rows, with ``independent_var`` as column names.
    predicted_df : DataFrame
        Predictions for the hold-out rows in a single ``concat_col`` column.
    """
    x = train_dataset[independent_var].values
    y = train_dataset[dependent_var].values
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=test_size, random_state=random_state
    )

    model = LogisticRegression(class_weight=None, max_iter=max_iter)
    model.fit(x_train, y_train)
    prediction = model.predict(x_test)

    accuracy = accuracy_score(y_test, prediction) * 100

    # Return the hold-out rows and their predictions as frames so the caller
    # can join them for fairness metrics.
    test_demo_df = pd.DataFrame(x_test, columns=independent_var)
    predicted_df = pd.DataFrame(prediction, columns=[concat_col])

    return accuracy, test_demo_df, predicted_df

In [None]:
from folktables import ACSDataSource, ACSIncome

# Two-letter abbreviations of the 50 US states
states_abbr = [
    'AL', 'AK', 'AZ', 'AR', 'CA', 'CO', 'CT', 'DE', 'FL', 'GA',
    'HI', 'ID', 'IL', 'IN', 'IA', 'KS', 'KY', 'LA', 'ME', 'MD',
    'MA', 'MI', 'MN', 'MS', 'MO', 'MT', 'NE', 'NV', 'NH', 'NJ',
    'NM', 'NY', 'NC', 'ND', 'OH', 'OK', 'OR', 'PA', 'RI', 'SC',
    'SD', 'TN', 'TX', 'UT', 'VT', 'VA', 'WA', 'WV', 'WI', 'WY'
]

# The data source configuration is the same for every state, so build it once
# instead of on every loop iteration.
data_source = ACSDataSource(survey_year='2018', horizon='1-Year', survey='person')

# size[i] is the number of ACSIncome rows for states_abbr[i]
size = []

for state in states_abbr:
    data = data_source.get_data(states=[state], download=True)
    features, labels, _ = ACSIncome.df_to_pandas(data)
    size.append(len(features))


In [36]:
# Pair every state's row count with its abbreviation and order the pairs by
# row count, smallest first.
combined_data = sorted(zip(size, states_abbr), key=lambda pair: abs(pair[0]))

# State abbreviations in that order, and the row counts on their own, also
# ascending.
sorted_indices_list = [abbr for _, abbr in combined_data]
sorted_values = sorted(size)

<h3>State 1: Connecticut</h3>

In [51]:
# Fetch the 2018 1-Year person survey for Connecticut, convert it to the
# ACSIncome feature and label frames, and join the two column-wise.
data_source = ACSDataSource(survey_year='2018', horizon='1-Year', survey='person')
data = data_source.get_data(states=['CT'], download=True)
features, labels, _ = ACSIncome.df_to_pandas(data)
connecticut_df = pd.concat([features, labels], axis='columns')

<h5>Code the columns</h5>

In [53]:
# Recode the numeric columns into named groups.
# SCHL (educational attainment) is collapsed into these labels:
#   GS  : codes 1-15  - grade school, no diploma
#   HSD : codes 16-17 - high school diploma / GED
#   SC  : codes 18-20 - some college (2 years at most)
#   BD  : code 21     - bachelor's degree
#   ME  : code 22     - master's degree
#   PD  : code 23     - professional degree
#   DE  : code 24     - doctorate or equivalent
# Each entry maps a label to its inclusive (low, high) code range.
ranges = {'GS': (1, 15), 'HSD': (16, 17), 'SC': (18, 20), 'BD': (21,21), 'ME': (22,22), 'PD': (23,23), 'DE': (24,24)}
# Cache of value -> label for values that have already been looked up.
group_dict = {}

def assign_group(value):
    """Return the label of the range in ``ranges`` that contains ``value``.

    Reads the module-level ``ranges`` and ``group_dict`` at call time, so
    rebinding them before a call changes the mapping that is used. Matched
    values are cached in ``group_dict``; a value outside every range gives
    ``None`` and is not cached.
    """
    if value in group_dict:
        return group_dict[value]
    for label, bounds in ranges.items():
        lower, upper = bounds[0], bounds[1]
        if lower <= value <= upper:
            group_dict[value] = label
            return label
    return None

# Replace every SCHL code with the label assign_group returns for it.
connecticut_df['SCHL'] = connecticut_df['SCHL'].apply(assign_group)

In [55]:
# Group OCCP (occupation) codes into broad occupation families.
# Reference table: OCCP range, SOC major group(s), description. The trailing
# number is presumably a record count - TODO confirm its source.
#   0010-0440  11-0000            Management Occupations: 170997
#   0500-0960  13-0000            Business and Financial Operations Occupations: 91842
#   1005-1240  15-0000            Computer and Mathematical Occupations: 50817
#   1305-1560  17-0000            Architecture and Engineering Occupations: 31718
#   1600-1980  19-0000            Life, Physical, and Social Science Occupations: 16529
#   2001-2970  21-0000 - 27-0000  Education, Legal, Community Service, Arts, and Media Occupations: 193762
#   3000-3550  29-0000            Healthcare Practitioners and Technical Occupations: 100986
#   3601-4655  31-0000 - 39-0000  Service Occupations: 283912
#   4700-5940  41-0000 - 43-0000  Sales and Office Occupations: 358340
#   6005-7640  45-0000 - 49-0000  Natural Resources, Construction, and Maintenance Occupations: 143613
#   7700-9760  51-0000 - 53-0000  Production, Transportation, and Material Moving Occupations: 216280
#   9800-9920                     Military Specific Occupations: 5704

# Rebind the globals read by assign_group: new label ranges, empty cache.
ranges = dict(
    management=(10, 440),
    business=(500, 960),
    computer=(1005, 1240),
    engineering=(1305, 1560),
    life=(1600, 1980),
    education_arts=(2001, 2970),
    healthcare=(3000, 3550),
    service=(3601, 4655),
    sales=(4700, 5940),
    environmental_construction=(6005, 7640),
    production_transportation=(7700, 9760),
    military=(9800, 9920),
)
group_dict = {}
connecticut_df['OCCP'] = connecticut_df['OCCP'].apply(assign_group)

In [56]:
# Group RELP (relationship) codes:
#   0-7   : family
#   8-10  : in-laws and other family
#   11-17 : non-family
# Rebind the globals read by assign_group: new label ranges, empty cache.
ranges = dict([
    ('family', (0, 7)),
    ('inlaws/other', (8, 10)),
    ('non-family', (11, 17)),
])
group_dict = {}
connecticut_df['RELP'] = connecticut_df['RELP'].apply(assign_group)

In [57]:
# Recode COW (class of worker) from numeric codes to string labels in one pass.
cow_labels = {
    1: 'private_business',
    2: 'non-profit',
    3: 'local_gov',
    4: 'state_gov',
    5: 'federal_gov',
    6: 'SE_no_business',
    7: 'SE_business',
    8: 'no_pay_work',
    9: 'unemployed',
}
connecticut_df['COW'] = connecticut_df['COW'].replace(cow_labels)

In [58]:
# Recode MAR (marital status) from numeric codes to string labels in one pass.
mar_labels = {
    1: 'married',
    2: 'widowed',
    3: 'divorced',
    4: 'seperated',
    5: 'never_married',
}
connecticut_df['MAR'] = connecticut_df['MAR'].replace(mar_labels)

In [59]:
# Recode SEX from numeric codes to string labels: 1 -> male, 2 -> female.
sex_labels = {1: 'male', 2: 'female'}
connecticut_df['SEX'] = connecticut_df['SEX'].replace(sex_labels)

In [60]:
# Recode RAC1P (race) from numeric codes to string labels in one pass.
race_labels = {
    1: 'white',
    2: 'black',
    3: 'american_indian',
    4: 'alaska_native',
    5: 'native_american',
    6: 'asian',
    7: 'native_hawaiian',
    8: 'other',
    9: 'mixed_race',
}
connecticut_df['RAC1P'] = connecticut_df['RAC1P'].replace(race_labels)

In [64]:
# Remove the age and race columns so they are not part of the encoded data.
# NOTE(review): RAC1P was recoded to labels in the cell just before this one
# and is discarded here - confirm that recoding is still needed.
connecticut_df = connecticut_df.drop(columns=['AGEP','RAC1P'])

In [73]:
# One-hot encode the categorical columns. POBP is encoded from its raw codes,
# which produces columns such as POBP_429.0 in the output below.
encoded_data = pd.get_dummies(connecticut_df, columns=['RELP','COW','SCHL','MAR','OCCP','POBP','SEX'])

In [74]:
# Display the one-hot encoded frame.
encoded_data

Unnamed: 0,WKHP,PINCP,RELP_family,RELP_inlaws/other,RELP_non-family,COW_SE_business,COW_SE_no_business,COW_federal_gov,COW_local_gov,COW_no_pay_work,...,POBP_429.0,POBP_436.0,POBP_440.0,POBP_442.0,POBP_444.0,POBP_457.0,POBP_459.0,POBP_501.0,SEX_female,SEX_male
0,18.0,False,False,False,True,False,False,False,False,False,...,False,False,False,False,False,False,False,False,True,False
1,2.0,False,False,False,True,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
2,40.0,False,False,False,True,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
3,29.0,False,False,False,True,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
4,16.0,False,False,False,True,False,False,False,False,False,...,False,False,False,False,False,False,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2995,50.0,True,True,False,False,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
2996,50.0,False,True,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
2997,55.0,False,True,False,False,False,True,False,False,False,...,False,False,False,False,False,False,False,False,True,False
2998,55.0,True,True,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True


<h5>Build the Model and get Parity and Accuracy</h5>

In [82]:
# The label column is PINCP; every other column of the encoded frame is a
# predictor.
target_var = 'PINCP'
acs_columns_list = list(encoded_data.columns)
predictors = [column for column in acs_columns_list if column != target_var]

# x holds the independent variables, y holds the dependent variable.
x = encoded_data[predictors].values
y = encoded_data[target_var].values

# 80/20 split with a fixed seed:
#   x_train / y_train - training rows and their labels
#   x_test  / y_test  - held-out rows and their ground-truth labels
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=310
)

# Fit the logistic regression classifier and score it on the held-out rows.
clf = LogisticRegression(class_weight=None, max_iter=100)
lr = clf.fit(x_train, y_train)
prediction = lr.predict(x_test)
# Class probabilities for the training rows (not the test rows).
prediction_prob = lr.predict_proba(x_train)
accuracy = accuracy_score(y_test, prediction) * 100
print(accuracy)


82.5


In [83]:
# Join the held-out feature rows with their predictions, then compare the
# positive-prediction rate of the two groups defined by SEX_male:
#   SEX_male == 1 -> privileged group
#   SEX_male == 0 -> unprivileged group
test_demo_df = pd.DataFrame(x_test, columns=predictors)
predicted_df = pd.DataFrame(prediction, columns=['income'])
concat_df = pd.concat([test_demo_df, predicted_df], axis=1)

is_priv = concat_df['SEX_male'] == 1
is_unpriv = concat_df['SEX_male'] == 0
is_positive = concat_df['income'] == 1

# Size of each group
total_unpriv = len(concat_df[is_unpriv])
total_priv = len(concat_df[is_priv])

# Number of positive predictions within each group
total_credit_unpriv = len(concat_df[is_positive & is_unpriv])
total_credit_priv = len(concat_df[is_positive & is_priv])

# Share of each group that received a positive prediction
p_unpriv = total_credit_unpriv / total_unpriv
p_priv = total_credit_priv / total_priv

statistical_parity = p_priv - p_unpriv
print(f'Statistical Parity: {statistical_parity:.5f}')

Statistical Parity: 0.16867


<h5>Iterative Flipping</h5>

In [77]:
# Iterative flipping needs the training features and labels in one frame, so
# that a single label can be edited in place instead of juggling two separate
# numpy arrays. Rebuild both pieces as frames and join them column-wise.
train_demo_df = pd.DataFrame(x_train, columns=predictors)
train_outcome_df = pd.DataFrame(y_train, columns=['PINCP'])
train_full_df = pd.concat([train_demo_df, train_outcome_df], axis='columns')

In [87]:
# Display the training labels.
train_outcome_df

Unnamed: 0,PINCP
0,True
1,False
2,False
3,False
4,True
...,...
2395,True
2396,True
2397,True
2398,False


In [88]:
# Set PINCP to True on every row where it compares equal to 1.
# NOTE(review): the labels displayed above are already booleans, so this
# presumably leaves the column unchanged - confirm whether the cell is needed.
train_full_df.loc[train_full_df['PINCP'] == 1, 'PINCP'] = True

In [91]:
# Convert the label column to integer 0/1 (True -> 1, False -> 0) with an
# explicit cast. This gives a well-defined integer dtype instead of relying on
# replace() to infer one, and re-running the cell leaves the column unchanged.
train_full_df['PINCP'] = train_full_df['PINCP'].astype(int)

In [92]:
# Display the combined training frame with the 0/1 label column.
train_full_df

Unnamed: 0,WKHP,RELP_family,RELP_inlaws/other,RELP_non-family,COW_SE_business,COW_SE_no_business,COW_federal_gov,COW_local_gov,COW_no_pay_work,COW_non-profit,...,POBP_436.0,POBP_440.0,POBP_442.0,POBP_444.0,POBP_457.0,POBP_459.0,POBP_501.0,SEX_female,SEX_male,PINCP
0,50.0,True,False,False,True,False,False,False,False,False,...,False,False,False,False,False,False,False,False,True,0
1,19.0,True,False,False,False,False,False,False,False,True,...,False,False,False,False,False,False,False,True,False,0
2,65.0,True,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,0
3,40.0,True,False,False,False,False,False,False,False,True,...,False,False,False,False,False,False,False,True,False,0
4,40.0,False,False,True,False,True,False,False,False,False,...,False,False,False,False,False,False,False,False,True,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2395,50.0,True,False,False,False,False,False,True,False,False,...,False,False,False,False,False,False,False,False,True,1
2396,40.0,True,False,False,False,False,False,False,False,True,...,False,False,False,False,False,False,False,False,True,1
2397,50.0,True,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,True,1
2398,35.0,True,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,True,0


In [93]:
# Iterative label flipping.
# For every training row: flip its PINCP label (1 -> 0 or 0 -> 1), refit and
# score the model through logistic_regression, compute statistical parity on
# the returned predictions, then put the original label back so only one label
# differs from the original data at any time.

# list_flip[i]   : index of the i-th flipped row
# list_acc[i]    : accuracy obtained with that row's label flipped
# list_parity[i] : statistical parity obtained with that row's label flipped
list_parity = []
list_acc = []
list_flip = []

for index, row in train_full_df.iterrows():
    original_label = row['PINCP']
    if original_label == 1:
        flipped_label, restored_label = 0, 1
    elif original_label == 0:
        flipped_label, restored_label = 1, 0
    else:
        # Rows whose label is neither 0 nor 1 are left untouched.
        continue

    # Flip the label for this iteration only.
    train_full_df.at[index, 'PINCP'] = flipped_label
    try:
        # logistic_regression splits train_full_df itself and returns the
        # accuracy, the held-out feature rows and their predictions; the
        # predictions are stored under the 'income' column.
        accuracy, test_datapoints, test_prediction = logistic_regression(
            train_full_df, predictors, target_var, 'income'
        )
        parity = s_parity(test_datapoints, predictors, test_prediction, 'SEX_male', 'income')
    finally:
        # Always restore the original label, even if fitting or scoring
        # failed, so a re-run does not start from corrupted labels.
        train_full_df.at[index, 'PINCP'] = restored_label

    # Record the results together so the three lists stay the same length.
    list_flip.append(index)
    list_acc.append(accuracy)
    list_parity.append(parity)