<a href="https://colab.research.google.com/github/s-thandri/labelflipping/blob/main/acs_income.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

<h1>Import the libraries and build the parity and LR functions</h1>

In [1]:
#Import all of the necessary libraries in
import pandas as pd
import numpy as np
import warnings
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn import metrics
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
import math
from scipy.io import arff
warnings.filterwarnings('ignore')

In [2]:
#This function is being used to calculate the statistical parity of a set of predictions
#test_data: The test datapoints (DataFrame or 2-D array) the predictions were made on
#predictors: List of the all the column names AKA IV's
#prediction_df: Contains the predictions made by the model, one per row of test_data
#sensitive_attr: Name of the binary sensitive attr column (1 = privileged, 0 = unprivileged)
#concat_col: Name given to the prediction column

def s_parity(test_data, predictors, prediction_df, sensitive_attr, concat_col):
    """Return the statistical parity difference of a set of predictions.

    The value is P(prediction == 1 | sensitive_attr == 1) minus
    P(prediction == 1 | sensitive_attr == 0); a positive value means the
    group coded 1 receives the positive outcome more often.

    Rows are matched by position: row i of ``prediction_df`` is the
    prediction for row i of ``test_data``.

    Raises:
        ValueError: if the two inputs do not have the same number of rows.
        ZeroDivisionError: if either group (sensitive_attr == 0 or == 1) is empty.
    """
    #Drop whatever index the inputs carry so rows pair up by position; pandas
    #otherwise aligns on index labels and a shuffled/sliced input would silently
    #be paired with the wrong (or no) prediction
    features = pd.DataFrame(test_data, columns = predictors).reset_index(drop=True)
    predictions = pd.DataFrame(prediction_df, columns = [concat_col]).reset_index(drop=True)
    if len(features) != len(predictions):
        raise ValueError(
            f'test_data has {len(features)} rows but prediction_df has {len(predictions)}'
        )

    #Boolean masks for the two groups and for the positive outcome
    is_unpriv = features[sensitive_attr] == 0
    is_priv = features[sensitive_attr] == 1
    is_positive = predictions[concat_col] == 1

    #Get the two groups of people totals
    total_unpriv = int(is_unpriv.sum())
    total_priv = int(is_priv.sum())

    #Number of people in each group predicted positive
    positive_unpriv = int((is_positive & is_unpriv).sum())
    positive_priv = int((is_positive & is_priv).sum())

    #Share of each group predicted positive (plain ints, so an empty group raises)
    p_unpriv = positive_unpriv/total_unpriv
    p_priv = positive_priv/total_priv

    #Calculate the parity
    return p_priv - p_unpriv

In [3]:
#Function used to fit and score the logistic regression model
#train_dataset: Dataframe holding both the predictor columns and the label column
#independent_var: Column names of the predictors
#dependent_var: Name of the label column
#concat_col: Name given to the prediction column in the returned dataframe
def logistic_regression(train_dataset, independent_var, dependent_var, concat_col):
    """Fit a logistic regression on an 80/20 split of ``train_dataset``.

    The split is seeded (random_state=310), so every call holds out the same
    row positions.

    Returns a tuple ``(accuracy, test_demo_df, predicted_df)``: the accuracy on
    the held-out 20% as a percentage, the held-out datapoints as a dataframe
    with ``independent_var`` columns, and the predictions for those datapoints
    as a one-column dataframe named ``concat_col``.
    """
    features = train_dataset[independent_var].values
    labels = train_dataset[dependent_var].values
    x_train, x_test, y_train, y_test = train_test_split(
        features, labels, test_size=0.2, random_state=310
    )

    #Fit on the 80% portion and predict the held-out 20%
    model = LogisticRegression(class_weight=None, max_iter=100).fit(x_train, y_train)
    prediction = model.predict(x_test)

    #Accuracy on the held-out portion, expressed as a percentage
    accuracy = accuracy_score(y_test, prediction)*100

    #Hand back the held-out datapoints and their predicted labels as dataframes
    return (
        accuracy,
        pd.DataFrame(x_test, columns = independent_var),
        pd.DataFrame(prediction, columns = [concat_col]),
    )

<h1>Load the data into the dataframe</h1>

In [4]:
#Load all the data into the acs_dataframe
acs_dataframe = pd.read_csv('acs_data/acs_income.csv', index_col=None, sep=',')

In [5]:
acs_dataframe

Unnamed: 0,AGEP,COW,SCHL,MAR,OCCP,POBP,RELP,WKHP,SEX,RAC1P,ST,PINCP
0,18.0,1.0,18.0,5.0,4720.0,13.0,17.0,21.0,2.0,2.0,1.0,1600.0
1,53.0,5.0,17.0,5.0,3605.0,18.0,16.0,40.0,1.0,1.0,1.0,10000.0
2,41.0,1.0,16.0,5.0,7330.0,1.0,17.0,40.0,1.0,1.0,1.0,24000.0
3,18.0,6.0,18.0,5.0,2722.0,1.0,17.0,2.0,2.0,1.0,1.0,180.0
4,21.0,5.0,19.0,5.0,3870.0,12.0,17.0,50.0,1.0,1.0,1.0,29000.0
...,...,...,...,...,...,...,...,...,...,...,...,...
1664495,39.0,6.0,16.0,5.0,6260.0,72.0,0.0,20.0,1.0,1.0,72.0,9600.0
1664496,38.0,6.0,14.0,5.0,4251.0,72.0,0.0,32.0,1.0,8.0,72.0,2400.0
1664497,37.0,1.0,19.0,3.0,7750.0,17.0,13.0,40.0,2.0,9.0,72.0,19700.0
1664498,47.0,1.0,16.0,1.0,8990.0,72.0,1.0,40.0,1.0,8.0,72.0,18700.0


<h3>Conducting basic information gathering on the dataset</h3>
<h4>Important information about the dataset regarding columns</h4>
<ul>
<li>AGEP: Age</li>
<li>COW: Class of Worker</li>
<li>SCHL: Educational Attainment</li>
<li>MAR: Marital Status</li>
<li>OCCP: Occupation</li>
<li>POBP: Place of Birth</li>
<li>RELP: Relationship to Householders</li>
<li>WKHP: Usual Hours worked per week</li>
<li>SEX: Sex</li>
<li>RAC1P: Race</li>
<li>ST: State Codes</li>
<li>PINCP: Total Annual Income</li>
</ul>



In [6]:
#Fixing the notation: display every float with a fixed two decimal places
pd.set_option('display.float_format', lambda x: '%.2f' % x)  # Set decimal precision


In [7]:
#Describe the dataset
acs_dataframe.describe()

Unnamed: 0,AGEP,COW,SCHL,MAR,OCCP,POBP,RELP,WKHP,SEX,RAC1P,ST,PINCP
count,1664500.0,1664500.0,1664500.0,1664500.0,1664500.0,1664500.0,1664500.0,1664500.0,1664500.0,1664500.0,1664500.0,1664500.0
mean,43.41,2.08,18.62,2.52,4180.52,65.82,2.24,38.33,1.48,1.87,28.13,56663.86
std,15.3,1.83,3.3,1.8,2658.72,93.06,4.39,13.08,0.5,2.08,16.32,73067.45
min,17.0,1.0,1.0,1.0,10.0,1.0,0.0,1.0,1.0,1.0,1.0,104.0
25%,30.0,1.0,16.0,1.0,2205.0,18.0,0.0,35.0,1.0,1.0,12.0,20000.0
50%,43.0,1.0,19.0,1.0,4200.0,36.0,1.0,40.0,1.0,1.0,28.0,39000.0
75%,56.0,3.0,21.0,5.0,5740.0,48.0,2.0,44.0,2.0,1.0,42.0,68000.0
max,96.0,8.0,24.0,5.0,9830.0,554.0,17.0,99.0,2.0,9.0,72.0,1423000.0


In [8]:
#Understand the spread of the OCCUPATIONS
#0010-0440	11-0000	Management Occupations: 170997
#0500-0960	13-0000	Business and Financial Operations Occupations: 91842
#1005-1240	15-0000	Computer and mathematical occupations: 50817
#1305-1560	17-0000	Architecture and Engineering Occupations: 31718
#1600-1980	19-0000	Life, Physical, and Social Science Occupations: 16529
#2001-2970	21-0000 - 27-0000	Education, Legal, Community Service, Arts, and Media Occupations: 193762
#3000-3550	29-0000	Healthcare Practitioners and Technical Occupations: 100986
#3601-4655	31-0000 - 39-0000	Service Occupations: 283912
#4700-5940	41-0000 - 43-0000	Sales and Office Occupations: 358340
#6005-7640	45-0000 - 49-0000	Natural Resources, Construction, and Maintenance Occupations: 143613
#7700-9760	51-0000 - 53-0000	Production, Transportation, and Material Moving Occupations: 216280
#9800-9920                      Military Specific Occupations: 5704
((acs_dataframe['OCCP'] >= 9800) & (acs_dataframe['OCCP'] <= 9920)).sum()

5704

<h4>Level of Education Grouped</h4>
<ul>
<li>Grade School (No Diploma): 109,882</li>
<li>High School Diploma/GED: 400,706 </li>
<li>Some College (2 Years at Most): 531,044 </li>
<li>Bachelor's Degree: 366,380</li>
<li>Master's Degree: 160,594 </li>
<li>Professional Degree: 41,426</li>
<li>Doctorate Or Equivalent: 27530</li>
</ul>

<h2>Group info in columns into ranges so there are fewer unique values when making dummies</h2>

In [9]:
#Group the SCHL codes into a single label per education level
#Each entry maps a label to the inclusive (low, high) range of SCHL codes it covers
#GS: Grade School (No Diploma)
#HSD: High School Diploma/GED
#SC: Some College (2 Years at Most)
#BD: Bachelor's Degree
#ME: Master's Degree
#PD: Professional Degree
#DE: Doctorate Or Equivalent
ranges = {'GS': (1, 15), 'HSD': (16, 17), 'SC': (18, 20), 'BD': (21,21), 'ME': (22,22), 'PD': (23,23), 'DE': (24,24)}
#Cache of value -> label filled in by assign_group; starts empty for this column
group_dict = {}

def assign_group(value):
  """Look up the label whose inclusive range in the global ``ranges`` holds ``value``.

  Matches are memoised in the global ``group_dict`` so repeated values skip
  the scan. A value that falls in no range returns None and is not cached.
  """
  #Fast path: this value has been resolved before
  if value in group_dict:
    return group_dict[value]
  #Slow path: first range (in dictionary order) that contains the value wins
  for label, bounds in ranges.items():
    if bounds[0] <= value <= bounds[1]:
      group_dict[value] = label
      return label
  return None

acs_dataframe['SCHL'] = acs_dataframe['SCHL'].apply(assign_group)

In [10]:
#Assign codes to occupations based on ranges
#0010-0440	11-0000	Management Occupations: 170997
#0500-0960	13-0000	Business and Financial Operations Occupations: 91842
#1005-1240	15-0000	Computer and mathematical occupations: 50817
#1305-1560	17-0000	Architecture and Engineering Occupations: 31718
#1600-1980	19-0000	Life, Physical, and Social Science Occupations: 16529
#2001-2970	21-0000 - 27-0000	Education, Legal, Community Service, Arts, and Media Occupations: 193762
#3000-3550	29-0000	Healthcare Practitioners and Technical Occupations: 100986
#3601-4655	31-0000 - 39-0000	Service Occupations: 283912
#4700-5940	41-0000 - 43-0000	Sales and Office Occupations: 358340
#6005-7640	45-0000 - 49-0000	Natural Resources, Construction, and Maintenance Occupations: 143613
#7700-9760	51-0000 - 53-0000	Production, Transportation, and Material Moving Occupations: 216280
#9800-9920                      Military Specific Occupations: 5704

#Label -> inclusive (low, high) range of OCCP codes; replaces the SCHL ranges read by assign_group
ranges = {
    'management': (10,440),
    'business': (500,960),
    'computer': (1005,1240),
    'engineering': (1305,1560),
    'life': (1600,1980),
    'education_arts': (2001,2970),
    'healthcare': (3000,3550),
    'service': (3601,4655),
    'sales': (4700,5940),
    'environmental_construction': (6005,7640),
    'production_transportation': (7700,9760),
    'military': (9800,9920)
}
#Reset the cache: assign_group memoises by value, so entries from the previous column would be reused
group_dict = {}
acs_dataframe['OCCP'] = acs_dataframe['OCCP'].apply(assign_group)

In [11]:
#Dont group the states because there is no reason
#Ask if I should group it but probably wont need to

In [11]:
#Add family codes (inclusive RELP code ranges)
#0-7: Family
#8-10: inlaws and other family
#11-17: Non-Family
ranges = {
    'family': (0,7),
    'inlaws/other': (8,10),
    'non-family': (11,17)
}
#Reset the cache: assign_group memoises by value, so entries from the previous column would be reused
group_dict = {}
acs_dataframe['RELP'] = acs_dataframe['RELP'].apply(assign_group)

In [12]:
#Code the Class of worker column from numeric to strings
#One lookup table applied in a single replace; codes missing from the table are left as they are
cow_labels = {
    1: 'private_business',
    2: 'non-profit',
    3: 'local_gov',
    4: 'state_gov',
    5: 'federal_gov',
    6: 'SE_no_business',
    7: 'SE_business',
    8: 'no_pay_work',
    9: 'unemployed',
}
acs_dataframe['COW'] = acs_dataframe['COW'].replace(cow_labels)


In [13]:
#Code the Marital Status column from numeric to strings
#One lookup table applied in a single replace; codes missing from the table are left as they are
marital_labels = {
    1: 'married',
    2: 'widowed',
    3: 'divorced',
    4: 'seperated',
    5: 'never_married',
}
acs_dataframe['MAR'] = acs_dataframe['MAR'].replace(marital_labels)

In [14]:
#Code the sex column from numeric to string (1 -> male, 2 -> female)
sex_labels = {1: 'male', 2: 'female'}
acs_dataframe['SEX'] = acs_dataframe['SEX'].replace(sex_labels)

In [15]:
#Code the race column from numeric to string
#One lookup table applied in a single replace; codes missing from the table are left as they are
race_labels = {
    1: 'white',
    2: 'black',
    3: 'american_indian',
    4: 'alaska_native',
    5: 'native_american',
    6: 'asian',
    7: 'native_hawaiian',
    8: 'other',
    9: 'mixed_race',
}
acs_dataframe['RAC1P'] = acs_dataframe['RAC1P'].replace(race_labels)

In [16]:
#Code the PINCP column so that it is binary
#0: annual income of at most INCOME_THRESHOLD, 1: annual income above it
#A single comparison leaves no gap between the classes; the earlier 0-80000 / 80001-2000000
#range lookup returned None for anything between 80000 and 80001 or above 2000000
#NOTE: PINCP is overwritten in place, so re-running this cell on already-binarised data
#turns every label into 0 -- reload the CSV before running it again
INCOME_THRESHOLD = 80000
acs_dataframe['PINCP'] = (acs_dataframe['PINCP'] > INCOME_THRESHOLD).astype(int)

<h4>Build the model to check accuracy and original parity</h4>

In [17]:
#Encode the non-numeric columns
columns_to_encode = ['AGEP','COW','SCHL','MAR','OCCP','POBP','RELP','SEX','RAC1P']
encoded_data = pd.get_dummies(acs_dataframe, columns=columns_to_encode, drop_first=True)

In [18]:
#Split the encoded column names into the label and the model inputs
#target_var: name of the binary income label; predictors: every other column
target_var = 'PINCP'
acs_columns_list = encoded_data.columns.tolist()

predictors = [column for column in acs_columns_list if column != 'PINCP']

In [19]:
#Setting the columns to each part of the logistic regression
#x conatins the IV's
#y contains the DV
x = encoded_data[predictors].values
y = encoded_data[target_var].values

In [20]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=310)
#X-train is all the data points for training
#y_train contains the labels for each of the training data points
#x_test contains all the testing data points
#y_test contains the ground truth for each of the test data points

In [21]:
#Fit the baseline LR classifier on the untouched training labels
clf = LogisticRegression(max_iter=100, class_weight=None)
lr = clf.fit(x_train, y_train)

#Class probabilities for the training rows and hard predictions for the test rows
prediction_prob = lr.predict_proba(x_train)
prediction = lr.predict(x_test)

#Baseline accuracy on the test split, as a percentage
accuracy = accuracy_score(y_test, prediction)*100
print(accuracy)

85.9954941423851


In [22]:
#Baseline statistical parity of the unflipped model on the held-out test split
#Privileged group: SEX_male == 1 (male); unprivileged group: SEX_male == 0 (female)
#Reuses s_parity so the baseline is computed exactly the same way as every
#post-flip value it is later compared against
statistical_parity = s_parity(x_test, predictors, prediction, 'SEX_male', 'income')
print(f'Statistical Parity: {statistical_parity:.5f}')

Statistical Parity: 0.13554


<h3>Implement the different solutions now</h3>
<ul>
<li>Iterative Flipping</li>
<li>Uncertainty Reduction</li>
<li>Model Uncertainty</li>
</ul>

In [29]:
#Build one dataframe holding the training datapoints and their labels for iterative flipping
#Working on a single DF makes it easy to flip a label in place,
#instead of juggling the two numpy.ndarrays x_train and y_train
train_demo_df = pd.DataFrame(data=x_train, columns=predictors)
train_outcome_df = pd.DataFrame(data=y_train, columns=['PINCP'])
train_full_df = pd.concat(objs=[train_demo_df, train_outcome_df], axis='columns')

In [27]:
train_full_df

Unnamed: 0,WKHP,ST,AGEP_18.0,AGEP_19.0,AGEP_20.0,AGEP_21.0,AGEP_22.0,AGEP_23.0,AGEP_24.0,AGEP_25.0,...,SEX_male,RAC1P_american_indian,RAC1P_asian,RAC1P_black,RAC1P_mixed_race,RAC1P_native_american,RAC1P_native_hawaiian,RAC1P_other,RAC1P_white,PINCP
0,40.00,12.00,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,True,0
1,40.00,40.00,False,False,False,False,False,False,False,False,...,True,False,False,False,False,False,False,True,False,0
2,53.00,34.00,False,False,False,False,False,False,False,False,...,True,False,True,False,False,False,False,False,False,1
3,45.00,30.00,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,True,1
4,40.00,30.00,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,True,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1331595,44.00,21.00,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,True,0
1331596,40.00,48.00,False,False,False,False,False,False,False,False,...,True,False,False,False,False,False,False,False,True,0
1331597,40.00,15.00,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,True,False,False,0
1331598,25.00,25.00,False,False,False,False,False,False,False,False,...,True,False,False,False,False,False,False,False,True,0


In [None]:
#Leave-one-flip-out pass over the training data:
#for every row, flip its label, retrain, record accuracy and parity, then put the label back
#so each measurement reflects exactly one flipped label

#list_parity holds the parity value measured after each single flip
list_parity = []
#list_acc holds the accuracy measured after each single flip
list_acc = []
#list_flip holds the row index that was flipped for each measurement
list_flip = []

#Iterating through the training dataset
for index, row in train_full_df.iterrows():
    #Work out the flipped label; rows whose label is neither 0 nor 1 are skipped
    if row['PINCP'] == 1:
        flipped_label = 0
    elif row['PINCP'] == 0:
        flipped_label = 1
    else:
        continue

    #Flip the label
    train_full_df.at[index, 'PINCP'] = flipped_label

    #Retrain on the dataframe with the single flipped label
    #train_full_df: training dataset, predictors: IV's, target_var: DV
    #'income': name of the prediction column in the returned dataframe
    accuracy,test_datapoints, test_prediction = logistic_regression(train_full_df, predictors, target_var, 'income')

    #Record which row was flipped and the accuracy that resulted
    list_flip.append(index)
    list_acc.append(accuracy)

    ##################################################################################
    #Parity of the retrained model with respect to SEX_male
    parity = s_parity(test_datapoints, predictors, test_prediction, 'SEX_male', 'income')
    list_parity.append(parity)

    #Flips the label back to its original value
    train_full_df.at[index, 'PINCP'] = 1 - flipped_label

In [None]:
#parity_difference: change in parity magnitude per flip, abs(flipped parity) - abs(original parity)
parity_difference = []
#large_influence: the parity_difference values that pass the threshold test below
large_influence = []
#li_row: position (within parity_difference) of each value kept in large_influence
li_row = []

#Takes each of the parity values after flipping and compares it to the original parity value
#Appends the difference to a new list
for value in list_parity:
    difference = abs(value) - abs(statistical_parity)
    parity_difference.append(difference)


#Keep the differences that pass a threshold of +/- the original parity
#NOTE(review): both conditions below are joined with `or`, and every number is either
#<= t or >= -t when t > 0 (likewise >= t or <= -t when t <= 0), so the test is always true
#and every row is kept. It is unclear whether the intent was "inside the band" (`and`) or
#"outside the band" (comparisons swapped) -- confirm before relying on large_influence
#NOTE(review): li_row stores the enumerate position, not list_flip[position]; presumably
#these coincide because every row is flipped in order -- verify
for index, item in enumerate(parity_difference):
  if statistical_parity > 0:
    if item <= statistical_parity or item >= -1*(statistical_parity):
        large_influence.append(item)
        li_row.append(index)
  else:
    if item >= statistical_parity or item <= -1*(statistical_parity):
        large_influence.append(item)
        li_row.append(index)


In [None]:
# Pair every influence value with the row it belongs to
combined_data = list(zip(large_influence, li_row))

# Rank the pairs by the magnitude of the parity change, largest first
combined_data.sort(key=lambda x: abs(x[0]), reverse=True)

# Unzip the ranked pairs so that sorted_values[i] is the value for sorted_indices_list[i]
# (sorting the values separately by signed value would break that pairing)
sorted_indices_list = [index for _, index in combined_data]
sorted_values = [value for value, _ in combined_data]

In [None]:
#After ranking the rows, flip their labels one after another in ranked order
#Flips are cumulative: a label stays flipped, and the model is retrained after each one
#Parity values after each flip
ranked_parity = []
#Accuracy after every flip
ranked_acc = []
#Index/row of the flip
ranked_flip = []

#Plain-list copy of the ranked row numbers
ranked_indices_list = list(sorted_indices_list)

#Iterate through the ranked indices and flip labels in that order
for row_num in ranked_indices_list:
    #The label lives in the target_var column of train_full_df
    current_label = train_full_df.at[row_num, target_var]
    #Only binary labels can be flipped
    if current_label not in (0, 1):
        continue

    #1 -> 0 and 0 -> 1
    train_full_df.at[row_num, target_var] = 1 - current_label

    accuracy,test_datapoints, test_prediction = logistic_regression(train_full_df, predictors, target_var, 'income')

    ranked_flip.append(row_num)
    ranked_acc.append(accuracy)

    ##################################################################################
    parity = s_parity(test_datapoints, predictors, test_prediction, 'SEX_male', 'income')
    ranked_parity.append(parity)




In [None]:
#Copy every ranked-flip parity value, and the row flipped at each step, for graphing
top_ranked_parity = list(ranked_parity)
top_ranked_index = list(ranked_flip)

In [None]:
#Place the original parity at the beginning of the list
top_ranked_parity.insert(0,statistical_parity)

<h4>Entropy/Uncertainty Reduction</h4>

In [None]:
#Create new training dataset with the original (unflipped) labels
#The label column is named target_var because logistic_regression is always called with it
train_demo_df_entropy = pd.DataFrame(x_train, columns = predictors)
train_outcome_df_entropy = pd.DataFrame(y_train, columns = [target_var])
#Combine the two frames built above rather than the ones made for iterative flipping
train_full_df_entropy = pd.concat([train_demo_df_entropy, train_outcome_df_entropy], axis=1)

In [None]:
# Obtain predicted probabilities on the training data
predicted_probabilities = lr.predict_proba(x_train)

In [None]:
#Shannon entropy (base 2) of the predicted class distribution of every training row:
#    H = -(p1*log2(p1) + p2*log2(p2))
#p1 and p2 are the first and second columns of predicted_probabilities
#A probability of exactly 0 contributes 0 to the sum (the limit of p*log2(p));
#calling math.log on it row by row raised "math domain error" instead
class_probs = np.asarray(predicted_probabilities, dtype=float)[:, :2]
#Swap zeros for 1 before taking the log so log2 never sees 0; p*log2(1) keeps the term at 0
safe_probs = np.where(class_probs > 0, class_probs, 1.0)
entropy_values = -(class_probs * np.log2(safe_probs)).sum(axis=1)

#index_list: positional row number of each training point
#entropy_list: entropy of the row at the same position
index_list = list(range(len(class_probs)))
entropy_list = entropy_values.tolist()

In [None]:
#Rank the training rows from most to least uncertain

#Pair each entropy value with its row number
combined_list = list(zip(entropy_list, index_list))

#Order the pairs by entropy, largest first (ties keep their original order)
sorted_list = list(combined_list)
sorted_list.sort(key = lambda pair: pair[0], reverse=True)

#Split the ordered pairs back into an entropy tuple and a row-number tuple
sorted_ent_list, sorted_index_list = zip(*sorted_list)

In [None]:
#Make sure the label column holds plain integers before flipping
#The label column is the one named by target_var; the frame has no 'income' column
train_full_df_entropy[target_var] = train_full_df_entropy[target_var].astype('int')

In [None]:
#Convert each of the tuples into lists
ranked_indices_list = list(sorted_index_list)
ranked_entropy_list = list(sorted_ent_list)

In [None]:
#Flip the training labels in order of decreasing entropy
#Flips are cumulative: a label stays flipped, and the model is retrained after each one

#ranked_parity starts with the unflipped parity so the series begins at the baseline
ranked_parity = [statistical_parity]
ranked_acc = []
ranked_flip = []

#Go through the training dataset and flip the points based on their entropy levels
for row_num in ranked_indices_list:
  #The label lives in the target_var column
  current_label = train_full_df_entropy.at[row_num, target_var]
  #Only binary labels can be flipped
  if current_label not in (0, 1):
    continue

  #1 -> 0 and 0 -> 1
  train_full_df_entropy.at[row_num, target_var] = 1 - current_label
  accuracy,test_datapoints, test_prediction = logistic_regression(train_full_df_entropy, predictors, target_var, 'income')

  ranked_flip.append(row_num)
  ranked_acc.append(accuracy)

  #Parity with respect to SEX_male; 'income' must match the prediction column name used above
  parity = s_parity(test_datapoints, predictors, test_prediction, 'SEX_male', 'income')
  ranked_parity.append(parity)


<h4>Maximum Expected Utility</h4>

In [None]:
#Create a fresh training dataset with the original (unflipped) labels
#The label column is named target_var because logistic_regression is always called with it
train_demo_df_MEU = pd.DataFrame(x_train, columns = predictors)
train_outcome_df_MEU = pd.DataFrame(y_train, columns = [target_var])
#Combine the two frames built above rather than the ones made for iterative flipping
train_full_df_MEU = pd.concat([train_demo_df_MEU, train_outcome_df_MEU], axis=1)

In [None]:
# Obtain predicted probabilities on the training data
predicted_probabilities = lr.predict_proba(x_train)

In [None]:
#Make a copy of the original training dataframe to make changes on
copy_train_full_df = train_full_df_MEU.copy()

In [None]:
#Grab the original label of every datapoint in the copy dataframe, along with its row index
#The label column is the one named by target_var; the frame has no 'income' column
#label[i] and row_index[i] describe the same row
label = copy_train_full_df[target_var].tolist()
row_index = copy_train_full_df.index.tolist()

In [None]:
#Calculate the first part of the MEU equation:
#    P(row keeps its current label) * original parity
#Labels are coded 0/1, and predict_proba orders its columns by class, so column k
#holds P(label == k) -- the current label doubles as the column number
#NOTE(review): assumes lr.classes_ is [0, 1]; confirm if the label coding changes
#One value is appended per row, keeping the list aligned with label / row_index
MEU_part1_list = []
for index, value in enumerate(label):
  if value in (0, 1):
    prob_current = predicted_probabilities[index, int(value)]
    MEU_part1_calc = prob_current * statistical_parity
    MEU_part1_list.append(MEU_part1_calc)

In [None]:
#Iterate through the copied dataset
#For every row: flip its label, retrain, record accuracy and parity, then restore the label
#so each measurement reflects exactly one flipped label

#MEU_part2_list_parity holds the parity value measured after each single flip
MEU_part2_list_parity = []
#MEU_part2_list_acc holds the accuracy measured after each single flip
MEU_part2_list_acc = []
#MEU_part2_list_flip holds the row index that was flipped for each measurement
MEU_part2_list_flip = []

#Iterating through the training dataset
for index, row in copy_train_full_df.iterrows():
    #The label lives in the target_var column; rows that are not 0/1 are skipped
    original_label = row[target_var]
    if original_label not in (0, 1):
        continue
    flipped_label = 0 if original_label == 1 else 1

    #Flip the label
    copy_train_full_df.at[index, target_var] = flipped_label

    #Retrain on the dataframe with the single flipped label
    #'income_01' names the prediction column and must match the s_parity call below
    accuracy,test_datapoints, test_prediction = logistic_regression(copy_train_full_df, predictors, target_var, 'income_01')

    #Record which row was flipped and the accuracy that resulted
    MEU_part2_list_flip.append(index)
    MEU_part2_list_acc.append(accuracy)

    ##################################################################################
    #Parity of the retrained model with respect to SEX_male
    parity = s_parity(test_datapoints, predictors, test_prediction, 'SEX_male', 'income_01')
    MEU_part2_list_parity.append(parity)

    #Flips the label back to its original 0/1 value
    copy_train_full_df.at[index, target_var] = 1 - flipped_label

In [None]:
#Second part of the MEU calculation:
#    P(row has the opposite label) * parity measured after flipping that row
#Labels are coded 0/1, and predict_proba orders its columns by class, so the
#opposite label (1 - value) is also the column holding its probability
#NOTE(review): assumes lr.classes_ is [0, 1] and that MEU_part2_list_parity has one
#entry per row in the same order as label -- confirm
MEU_part2_list = []
for index, value in enumerate(label):
  if value in (0, 1):
    prob_flipped = predicted_probabilities[index, 1 - int(value)]
    MEU_part2_calc = prob_flipped * MEU_part2_list_parity[index]
    MEU_part2_list.append(MEU_part2_calc)

In [None]:
#Expected utility of each row = its part-1 term plus its part-2 term
#zip stops at the shorter list, so only rows present in both lists get a value
expected_utility_values = [
    part1 + part2 for part1, part2 in zip(MEU_part1_list, MEU_part2_list)
]

In [None]:
#Ranking score for each row: expected utility minus the original parity
#EU - OG Parity = New ranked value to flip on
new_ranked_value = [value - statistical_parity for value in expected_utility_values]

In [None]:
#Rank the rows by the magnitude of their MEU score, largest first

#Pair each score with its row index
combined_list = list(zip(new_ranked_value, row_index))

#Order the pairs by absolute score in decreasing order (ties keep their original order)
sorted_list = list(combined_list)
sorted_list.sort(key = lambda pair: abs(pair[0]), reverse = True)

#Split the ordered pairs back into a score tuple and a row-index tuple
sorted_eu_list, sorted_index_list = zip(*sorted_list)

In [None]:
#Flip the training labels in MEU-ranked order
#Flips are cumulative: a label stays flipped, and the model is retrained after each one

#MEU_ranked_parity starts with the unflipped parity so the series begins at the baseline
MEU_ranked_parity = [statistical_parity]
MEU_ranked_acc = []
MEU_ranked_flip = []

#Go through the training dataset and flip the points in ranked order
for row_num in sorted_index_list:
  #Work out the flipped label; rows whose label is neither 0 nor 1 are skipped
  current_label = train_full_df_MEU.at[row_num,'PINCP']
  if current_label == 1:
    new_label = 0
  elif current_label == 0:
    new_label = 1
  else:
    continue

  train_full_df_MEU.at[row_num,'PINCP'] = new_label
  accuracy,test_datapoints, test_prediction = logistic_regression(train_full_df_MEU, predictors, target_var, 'income')

  MEU_ranked_flip.append(row_num)
  MEU_ranked_acc.append(accuracy)

  #Parity of the retrained model with respect to SEX_male
  parity = s_parity(test_datapoints, predictors, test_prediction, 'SEX_male', 'income')
  MEU_ranked_parity.append(parity)




In [None]:
#Convert the ranked row indices and the recorded parity series into lists
#NOTE(review): ranked_eu_list is built from MEU_ranked_parity (parity after each flip,
#with the baseline first), not from sorted_eu_list -- so it is not position-aligned with
#ranked_indices_list. Confirm which series was intended
ranked_indices_list = list(sorted_index_list)
ranked_eu_list = list(MEU_ranked_parity)

#Keep the first 500 entries of each list
top_index_points = ranked_indices_list[:500]
top_eu_list = ranked_eu_list[:500]

<h4>Random Flipping</h4>

In [None]:
#Create a new dataframe with the original (unflipped) data for the random flipping
#The label column is named target_var because logistic_regression is always called with it
train_demo_df_random = pd.DataFrame(x_train, columns = predictors)
train_outcome_df_random = pd.DataFrame(y_train, columns = [target_var])
#Combine the two frames built above rather than the ones made for iterative flipping
train_full_df_random = pd.concat([train_demo_df_random, train_outcome_df_random], axis=1)

In [None]:
#Randomized flipping baseline
#Each iteration picks one training row at random, flips its label, retrains and records
#accuracy and parity. Flips are cumulative (labels are NOT restored), and a row may be
#picked more than once, in which case it flips back
import random

random_list_flip = []
random_list_acc = []
random_list_parity = []

col_name = 'PINCP'
#Number of random flips to perform
budget = 332900
#Dedicated, seeded generator so the random baseline is reproducible across runs
#(same seed value as the train/test split)
rng = random.Random(310)

for flip_number in range(budget):
    random_row = rng.choice(train_full_df_random.index)
    current_label = train_full_df_random.at[random_row,col_name]
    #Only binary labels can be flipped
    if current_label not in (0, 1):
        continue

    #1 -> 0 and 0 -> 1
    train_full_df_random.at[random_row,col_name] = 1 - current_label

    accuracy, test_datapoints, test_prediction = logistic_regression(train_full_df_random, predictors, target_var,'income')

    #random_list_flip records the iteration number of the flip, not the row that was flipped
    random_list_flip.append(flip_number)
    random_list_acc.append(accuracy)

    ##################################################################################
    parity = s_parity(test_datapoints, predictors, test_prediction, 'SEX_male','income')
    random_list_parity.append(parity)




In [None]:
#Keep at most the first 332900 accuracy values and flipped rows (the same number as the random-flip budget)
#NOTE(review): ranked_acc / ranked_flip hold whichever ranked run was executed last -- confirm this is the intended series
top_ranked_acc = ranked_acc[:332900]
top_ranked_noABS_index = ranked_flip[:332900]

<h4>Graphing Solutions</h4>

In [None]:
#Plot the parity series of every flipping strategy on one chart
#Each series is drawn against its own flip count, so the lines may differ in length

#(values, legend label, line colour) for each strategy, in drawing order
parity_series = [
    (list_parity, 'Ranked List', 'blue'),
    (random_list_parity, 'Random List', 'red'),
    (ranked_parity, 'Uncertainty Reduction', 'black'),
    (MEU_ranked_parity, 'MEU', 'brown'),
]
for values, legend_label, line_color in parity_series:
    plt.plot(range(len(values)), values, label=legend_label, color=line_color)

#Add the labels
plt.xlabel('Flips')
plt.ylabel('Parity')
plt.title('Change in parity for the Random and Ranked list')

#Add a legend
plt.legend()

#Show the plot
plt.show()