<a href="https://colab.research.google.com/github/s-thandri/labelflipping/blob/main/acs_income.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

<h1>Import the libraries and build the parity and LR function <h1>

In [74]:
#Import all of the necessary libraries in
import pandas as pd
import numpy as np
import warnings
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn import metrics
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
from scipy.io import arff
warnings.filterwarnings('ignore')

In [75]:
#This function is being used to calculate the statistical parity of the data set
#test_data: The original data frame containing the test data
#predictors: List of the all the column names AKA IV's
#prediction_df: Contains the predicitons made by the DT model
#sensitive_attr: Name of the sensitive attr
#sensitive_attr_cutoff: Cut off value for sensitive attr
#concat_col: Name of column I am creating new dataframe on

def s_parity(test_data, predictors, prediction_df, sensitive_attr, concat_col):
    #Creating a new DF that contains all the datapoints from the test data and the predictions made from LR model
    #Concat_col: outcome
    test_demo_df = pd.DataFrame(test_data, columns = predictors)
    predicted_df = pd.DataFrame(prediction_df, columns = [concat_col])
    concat_df = pd.concat([test_demo_df,predicted_df], axis=1)

    #Get the two groups of people totals
    total_unpriv = (len(concat_df[concat_df[sensitive_attr]==0]))
    total_priv = (len(concat_df[concat_df[sensitive_attr]==1]))

    #Number of people accepted
    total_credit_unpriv = len(concat_df[(concat_df[concat_col] == 1) & (concat_df[sensitive_attr] == 0)])
    total_credit_priv = len(concat_df[(concat_df[concat_col] == 1) & (concat_df[sensitive_attr] == 1)])

    #Percentage of approved people
    p_unpriv = total_credit_unpriv/total_unpriv
    p_priv = total_credit_priv/total_priv


    #Calculate the parity
    parity = p_priv - p_unpriv


    return parity

In [76]:
#Make changes to make more efficient
#Function used to run the DT model
#train_dataset: Training dataset to train the model
#independent_var: Column names
#dependent_var: Prediction column name
#concat_col: Name of column creating new DF on
def logistic_regression(train_dataset, independent_var, dependent_var, concat_col):
        #Split the data up into train and test values and then run the DT model
        #These steps aren't neccessary to consistenly do over and over again
        x = train_dataset[independent_var].values
        y = train_dataset[dependent_var].values
        x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2,random_state=310)
        clf = LogisticRegression(class_weight=None, max_iter=100)
        log = clf.fit(x_train,y_train)
        prediction = log.predict(x_test)

        #Printing the Overall accuracy of the model after one run
        #F1_Score=metrics.f1_score(y_test, prediction, average='weighted')
        #file.write(f'\nAccuracy of the model on Testing Sample Data: {F1_Score}')

        #Prints out the average across all ten run throughs
        #Accuracy_Values=cross_val_score(log, x , y, cv=10, scoring='f1_weighted')

        accuracy = accuracy_score(y_test,prediction)*100

        #After running the model I return a df with the datapoints and the labels
        test_demo_df = pd.DataFrame(x_test, columns = independent_var)
        #Concat_col: credit_risk_12
        predicted_df = pd.DataFrame(prediction, columns = [concat_col])

        return accuracy, test_demo_df, predicted_df

<h1>Load the data into the dataframe <h1>

In [109]:
#Load all the data into the acs_dataframe
acs_dataframe = pd.read_csv('acs_income.csv', index_col=None, sep=',')

In [78]:
acs_dataframe

Unnamed: 0,AGEP,COW,SCHL,MAR,OCCP,POBP,RELP,WKHP,SEX,RAC1P,ST,PINCP
0,18.00,1.00,18.00,5.00,4720.00,13.00,17.00,21.00,2.00,2.00,1.00,1600.00
1,53.00,5.00,17.00,5.00,3605.00,18.00,16.00,40.00,1.00,1.00,1.00,10000.00
2,41.00,1.00,16.00,5.00,7330.00,1.00,17.00,40.00,1.00,1.00,1.00,24000.00
3,18.00,6.00,18.00,5.00,2722.00,1.00,17.00,2.00,2.00,1.00,1.00,180.00
4,21.00,5.00,19.00,5.00,3870.00,12.00,17.00,50.00,1.00,1.00,1.00,29000.00
...,...,...,...,...,...,...,...,...,...,...,...,...
1664495,39.00,6.00,16.00,5.00,6260.00,72.00,0.00,20.00,1.00,1.00,72.00,9600.00
1664496,38.00,6.00,14.00,5.00,4251.00,72.00,0.00,32.00,1.00,8.00,72.00,2400.00
1664497,37.00,1.00,19.00,3.00,7750.00,17.00,13.00,40.00,2.00,9.00,72.00,19700.00
1664498,47.00,1.00,16.00,1.00,8990.00,72.00,1.00,40.00,1.00,8.00,72.00,18700.00


<h3> Conducting basic information gathering on a dataset <h3>
<h4> Important information about the dataset regarding columns <h4>
<ul>
<li>AGEP: Age</li>
<li>COW: Class of Worker</li>
<li>SCHL: Educational Attainment</li>
<li>MAR: Marital Status</li>
<li>OCCP: Occupation</li>
<li>POBP: Place of Birth</li>
<li>RELP: Relationship to Householders</li>
<li>WKHP: Usual Hours worked per week</li>
<li>SEX: Sex</li>
<li>RAC1P: Race</li>
<li>ST: State Codes</li>
<li>PINCP: Total Annual Income</li>
</ul>



In [108]:
#Fixing the noation 
pd.set_option('display.float_format', lambda x: '%.2f' % x)  # Set decimal precision


In [80]:
#Describe the dataset
acs_dataframe.describe()

Unnamed: 0,AGEP,COW,SCHL,MAR,OCCP,POBP,RELP,WKHP,SEX,RAC1P,ST,PINCP
count,1664500.0,1664500.0,1664500.0,1664500.0,1664500.0,1664500.0,1664500.0,1664500.0,1664500.0,1664500.0,1664500.0,1664500.0
mean,43.41,2.08,18.62,2.52,4180.52,65.82,2.24,38.33,1.48,1.87,28.13,56663.86
std,15.3,1.83,3.3,1.8,2658.72,93.06,4.39,13.08,0.5,2.08,16.32,73067.45
min,17.0,1.0,1.0,1.0,10.0,1.0,0.0,1.0,1.0,1.0,1.0,104.0
25%,30.0,1.0,16.0,1.0,2205.0,18.0,0.0,35.0,1.0,1.0,12.0,20000.0
50%,43.0,1.0,19.0,1.0,4200.0,36.0,1.0,40.0,1.0,1.0,28.0,39000.0
75%,56.0,3.0,21.0,5.0,5740.0,48.0,2.0,44.0,2.0,1.0,42.0,68000.0
max,96.0,8.0,24.0,5.0,9830.0,554.0,17.0,99.0,2.0,9.0,72.0,1423000.0


In [81]:
#Understand the spread of the OCCUPATIONS
#0010-0440	11-0000	Management Occupations: 170997
#0500-0960	13-0000	Business and Financial Operations Occupations: 91842
#1005-1240	15-0000	Computer and mathematical occupations: 50817
#1305-1560	17-0000	Architecture and Engineering Occupations: 31718
#1600-1980	19-0000	Life, Physical, and Social Science Occupations: 16529
#2001-2970	21-0000 - 27-0000	Education, Legal, Community Service, Arts, and Media Occupations: 193762
#3000-3550	29-0000	Healthcare Practitioners and Technical Occupations: 100986
#3601-4655	31-0000 - 39-0000	Service Occupations: 283912
#4700-5940	41-0000 - 43-0000	Sales and Office Occupations: 358340
#6005-7640	45-0000 - 49-0000	Natural Resources, Construction, and Maintenance Occupations: 143613
#7700-9760	51-0000 - 53-0000	Production, Transportation, and Material Moving Occupations: 216280
#9800-9920                      Military Specific Occupations: 5704
((acs_dataframe['OCCP'] >= 9800) & (acs_dataframe['OCCP'] <= 9920)).sum()

5704

<h4>Level of Education Grouped<h4>
<ul>
<li>Grade School (No Diploma): 109,882</li>
<li>High School Diploma/GED: 400,706 </li>
<li>Some College (2 Years at Most): 531,044 </li>
<li>Bachelor's Degree: 366,380</li>
<li>Master's Degree: 160,594 </li>
<li>Professional Degree: 41,426</li>
<li>Doctorate Or Equivalent: 27530</li>
</ul>

<h2>Group Info in columns to ranges so there is less unique values for making Dummies <h2>

In [113]:
#Group the SCHL columns into a single values
#1: Grade School (No Diploman)
#2: High School Diploma/GED
#3: Some College (2 Years at Most)
#4: Bachelor's Degree
#5: Master's Degree
#6: Professional Degree
#7: Doctorate Or Equivalent
column_dtype = acs_dataframe.dtypes['SCHL']
value = acs_dataframe.at[17, 'SCHL']

for index, row in acs_dataframe.iterrows():
    if isinstance(acs_dataframe.at[index,'SCHL'], str):
        print('String')
        break
    elif isinstance(acs_dataframe.at[index,'SCHL'], int):
        acs_dataframe.loc[(acs_dataframe['SCHL'] >= 1) & (acs_dataframe['SCHL'] <= 15), 'SCHL'] = 'GS'
        acs_dataframe.loc[(acs_dataframe['SCHL'] >= 16) & (acs_dataframe['SCHL'] <= 17), 'SCHL'] = 'HSD'
        acs_dataframe.loc[(acs_dataframe['SCHL'] >= 18) & (acs_dataframe['SCHL'] <= 20), 'SCHL'] = 'SC'
        acs_dataframe.loc[(acs_dataframe['SCHL'] == 21), 'SCHL'] = 'BD'
        acs_dataframe.loc[(acs_dataframe['SCHL'] == 22), 'SCHL'] = 'MD'
        acs_dataframe.loc[(acs_dataframe['SCHL'] == 23), 'SCHL'] = 'PD'
        acs_dataframe.loc[(acs_dataframe['SCHL'] == 24), 'SCHL'] = 'DE'






In [None]:
#incomplete copy bottom cell
ranges = [(1,15),(16,17),(18,20),(21),(22),(23),(24)]
ranges = {'Group A': (1, 15), 'Group B': (16, 27), 'Group C': (18, 20), 'Group D': (21), 'Group E': (22), 'Group F': (23), 'Group G': (24)}

# Define a function to assign each value to its corresponding group based on the ranges
def assign_group(value):
    for i, (start, end) in enumerate(ranges):
        if start <= value <= end:
            return groups[i]
    return None 

acs_dataframe['group'] = acs_dataframe['SCHL'].apply(assign_group)

In [119]:
df = pd.DataFrame({'data': [1, 5, 10, 15, 20, 23]})
ranges = {'Group A': (1, 10), 'Group B': (11, 20), 'Group C': (21, 24)}
group_dict = {}

def assign_group(value):
  """Assigns a group label to a value based on the ranges dictionary."""
  if value not in group_dict:
    for group_name, group_range in ranges.items():
      if group_range[0] <= value <= group_range[1]:
        group_dict[value] = group_name
        break
  return group_dict.get(value)

df['group'] = df['data'].apply(assign_group)

print(df)

   data    group
0     1  Group A
1     5  Group A
2    10  Group A
3    15  Group B
4    20  Group B
5    23  Group C


In [115]:
acs_dataframe

Unnamed: 0,AGEP,COW,SCHL,MAR,OCCP,POBP,RELP,WKHP,SEX,RAC1P,ST,PINCP
0,18.00,1.00,18.00,5.00,4720.00,13.00,17.00,21.00,2.00,2.00,1.00,1600.00
1,53.00,5.00,17.00,5.00,3605.00,18.00,16.00,40.00,1.00,1.00,1.00,10000.00
2,41.00,1.00,16.00,5.00,7330.00,1.00,17.00,40.00,1.00,1.00,1.00,24000.00
3,18.00,6.00,18.00,5.00,2722.00,1.00,17.00,2.00,2.00,1.00,1.00,180.00
4,21.00,5.00,19.00,5.00,3870.00,12.00,17.00,50.00,1.00,1.00,1.00,29000.00
...,...,...,...,...,...,...,...,...,...,...,...,...
1664495,39.00,6.00,16.00,5.00,6260.00,72.00,0.00,20.00,1.00,1.00,72.00,9600.00
1664496,38.00,6.00,14.00,5.00,4251.00,72.00,0.00,32.00,1.00,8.00,72.00,2400.00
1664497,37.00,1.00,19.00,3.00,7750.00,17.00,13.00,40.00,2.00,9.00,72.00,19700.00
1664498,47.00,1.00,16.00,1.00,8990.00,72.00,1.00,40.00,1.00,8.00,72.00,18700.00
