In [2]:
import pandas as pd
import random
import pyspark
import re
import numpy as np

In [3]:
# Set JAVA_HOME for this kernel session.
# NOTE(review): presumably needed so PySpark can locate a Java 8 runtime; the path is machine-specific — confirm.
%env JAVA_HOME = /usr/lib/jvm/java-8-openjdk-amd64

env: JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64


In [None]:
# Reuse the running SparkContext if there is one, so re-running this cell in the
# same kernel does not fail with "Cannot run multiple SparkContexts at once".
sc = pyspark.SparkContext.getOrCreate()

In [None]:
# Read the two sampled CSV extracts as RDDs of raw text lines
# (the samples have 10,000 and 50,000 rows).
application = sc.textFile("application_data_sample.csv")
previous_application = sc.textFile("previous_application_sample.csv")

In [None]:
# Turn every raw text line into a list of string fields.
# Plain comma split: quoted fields that contain commas are not handled.
cleaned_application = application.map(lambda line: line.split(','))
cleaned_previous = previous_application.map(lambda line: line.split(','))

# Positional indices of the columns retained from each file
application_keep = [0, 3, 4, 5, 7, 8, 9, 10, 13, 14, 17, 18, 25, 27, 116, 117, 118, 119]
previous_keep = [1, 16]

# Project each row onto the retained columns, in the order listed above
filtered_application = cleaned_application.map(lambda row: [row[idx] for idx in application_keep])
filtered_previous = cleaned_previous.map(lambda row: [row[idx] for idx in previous_keep])

# Build (key, values) pairs: the first kept column (the ID) is the key,
# the remaining kept columns form the value list
kv_application = filtered_application.map(lambda row: (row[0], row[1:]))
kv_previous = filtered_previous.map(lambda row: (row[0], row[1:]))

In [None]:
# Kept columns in application: show the first (key, values) pair of the keyed RDD.
# NOTE(review): presumably this first record is the CSV header row, so it lists the kept column names — confirm.
kv_application.first()

In [None]:
# Kept columns in previous: show the first (key, values) pair of the keyed RDD.
# NOTE(review): presumably this first record is the CSV header row, so it lists the kept column names — confirm.
kv_previous.first()

In [None]:
# Inner-join the two keyed RDDs on their key, then drop the key and
# concatenate both value lists into one flat row per match
joined = kv_application.join(kv_previous)
joined_no_index = joined.map(lambda pair: pair[1][0] + pair[1][1])

# Keep only rows whose last field (the decision) is 'Approved' or 'Refused'
filtered_decision = joined_no_index.filter(lambda row: row[-1] in ('Approved', 'Refused'))
# Replace the decision text with a binary flag: 'Approved' -> '1', 'Refused' -> '0'
binary_decision = filtered_decision.map(lambda row: row[:-1] + ['1' if row[-1] == 'Approved' else '0'])

In [None]:
# Inspect one joined row: the kept application columns followed by the binary decision ('1'/'0') as the last element.
binary_decision.first()

In [None]:
'''
Categories to check bias for: 
- Gender 
- Ownership 
- Education type
- Family status 
- Age 
- Technology 
'''

In [None]:
import matplotlib.pyplot as plt

# Bring the income field (position 3 of each joined row) to the driver as floats
incomes = binary_decision.map(lambda row: float(row[3])).collect()

# Histogram of incomes, used to choose the income brackets
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(incomes, bins=30, edgecolor='black')
ax.set_title('Income Distribution')
ax.set_xlabel('Income')
ax.set_ylabel('Frequency')
ax.grid(True)
plt.show()

In [None]:
'''
To implement conditional demographic parity, add a column for income bracket.
Above, I looked at the income distribution to decide on buckets to use. 
1 = below 100k
2 = 100k to below 200k
3 = 200k and above
'''

def income_cat(x):
    """Return a copy of row ``x`` with an income-bracket label appended.

    The income is read from ``x[3]`` and parsed with ``float``.

    Returns:
        A new list with the elements of ``x`` followed by one label:
        '1' if income < 100000, '2' if income < 200000, otherwise '3'.
    """
    income = float(x[3])
    if income < 100000:
        bracket = '1'
    elif income < 200000:
        bracket = '2'
    else:
        bracket = '3'
    return list(x) + [bracket]

# Every row gains its income bracket as a new last element
income_category = binary_decision.map(income_cat)

In [None]:
# Preview the first row with its income bracket appended.
# (income_cat is the row-level function; the RDD it produces is income_category.)
income_category.first()

In [None]:
# Gender analysis

# Safe division 
def safe_div(n, d):
    """Return ``n / d``, or 0 when the denominator ``d`` equals zero."""
    if d == 0:
        return 0
    return n / d

# Number of accepted loans per gender.
# x[0] is the gender code and x[-2] the binary decision ('1'/'0'); the last
# element of each row is the income bracket.
gender_key = income_category.map(lambda x: (x[0], x[-2]))
# Cast to int BEFORE reducing: reduceByKey does not call the function for a key
# that has a single record, which would otherwise leave that value as a string.
gender_accepted = gender_key.mapValues(int).reduceByKey(lambda x, y: x + y)

# Total loan requests per gender
gender_total_key = income_category.map(lambda x: (x[0], 1))
gender_total = gender_total_key.reduceByKey(lambda x, y: x + y)

# Bring the per-gender aggregates to the driver.
# collect() instead of take(2): if more than two distinct gender codes exist,
# take(2) could silently skip 'F' or 'M'. The result has one entry per code.
gender_accepted_list = gender_accepted.collect()
gender_total_list = gender_total.collect()

accepted_by_gender = dict(gender_accepted_list)
total_by_gender = dict(gender_total_list)

# Counts default to 0 when a gender is absent from the data
accepted_men = accepted_by_gender.get('M', 0)
total_men = total_by_gender.get('M', 0)
accepted_women = accepted_by_gender.get('F', 0)
total_women = total_by_gender.get('F', 0)

total_accepted = accepted_women + accepted_men

# Women and men approval rates
women_AR = safe_div(accepted_women, total_women)
men_AR = safe_div(accepted_men, total_men)

# Demographic parity difference
DPD = abs(women_AR - men_AR)

# Disparate Impact Ratio (0 when the men's approval rate is 0, per safe_div)
DIR = abs(safe_div(women_AR, men_AR))

# Balance for positive class: gap between each gender's share of all accepted loans
accepted_women_ratio = safe_div(accepted_women, total_accepted)
accepted_men_ratio = safe_div(accepted_men, total_accepted)
BPC = abs(accepted_women_ratio - accepted_men_ratio)

# Number of accepted loans per (gender, income bracket).
# x[0] is the gender code, x[-1] the income bracket and x[-2] the binary decision.
gender_income_key = income_category.map(lambda x: ((x[0], x[-1]), x[-2]))
# Cast to int BEFORE reducing so a group with a single record does not keep
# its value as a string (reduceByKey does not call the function in that case).
gender_income_accepted = gender_income_key.mapValues(int).reduceByKey(lambda x, y: x + y)

# Total number of loan requests per (gender, income bracket)
gender_income_total_key = income_category.map(lambda x: ((x[0], x[-1]), 1))
gender_income_total = gender_income_total_key.reduceByKey(lambda x, y: x + y)

# Bring the aggregates to the driver as Python lists.
# The accepted list must come from the REDUCED accepted RDD; collecting
# gender_income_total_key instead yields one ((gender, bracket), 1) pair per
# row, which makes every accepted count equal to 1.
gender_income_accepted_list = gender_income_accepted.collect()
gender_income_total_list = gender_income_total.collect()

accepted_by_group = dict(gender_income_accepted_list)
total_by_group = dict(gender_income_total_list)

# Bracket labels: '1' = low, '2' = mid, '3' = high.
# A (gender, bracket) group that is absent from the data counts as 0.
accepted_men_low = accepted_by_group.get(('M', '1'), 0)
total_men_low = total_by_group.get(('M', '1'), 0)

accepted_men_mid = accepted_by_group.get(('M', '2'), 0)
total_men_mid = total_by_group.get(('M', '2'), 0)

accepted_men_high = accepted_by_group.get(('M', '3'), 0)
total_men_high = total_by_group.get(('M', '3'), 0)

accepted_women_low = accepted_by_group.get(('F', '1'), 0)
total_women_low = total_by_group.get(('F', '1'), 0)

accepted_women_mid = accepted_by_group.get(('F', '2'), 0)
total_women_mid = total_by_group.get(('F', '2'), 0)

accepted_women_high = accepted_by_group.get(('F', '3'), 0)
total_women_high = total_by_group.get(('F', '3'), 0)
        
# Women and men approval rates per income bracket
women_AR_low = safe_div(accepted_women_low, total_women_low)
women_AR_mid = safe_div(accepted_women_mid, total_women_mid)
women_AR_high = safe_div(accepted_women_high, total_women_high)

men_AR_low = safe_div(accepted_men_low, total_men_low)
men_AR_mid = safe_div(accepted_men_mid, total_men_mid)
men_AR_high = safe_div(accepted_men_high, total_men_high)

# Conditional demographic parity: approval-rate gap within each income bracket
CDP_low = abs(women_AR_low - men_AR_low)
CDP_mid = abs(women_AR_mid - men_AR_mid)
CDP_high = abs(women_AR_high - men_AR_high)

CDP_list = [CDP_low, CDP_mid, CDP_high]

# Size of each income group, both genders combined
group_low = total_women_low + total_men_low
group_mid = total_women_mid + total_men_mid
group_high = total_women_high + total_men_high

total_size = group_low + group_mid + group_high
group_list = [group_low, group_mid, group_high]

# Explained disparity: group-size-weighted average of the per-bracket gaps.
# safe_div keeps this at 0 instead of raising when there are no rows at all.
# NOTE(review): confirm that labelling this weighted gap "explained" (and
# DPD - ED "unexplained") matches the intended definition.
ED = sum(safe_div(cdp * size, total_size) for cdp, size in zip(CDP_list, group_list))

# Unexplained disparity
UD = DPD - ED