In [2]:
# import pandas and statistics

import pandas as pd
import statistics as st

In [3]:
# load the lookup table pairing each category value with its numeric code
# Gender: Male = 1, Female = 2; Pilot: Yes = 1, No = 0

mapping_codes = pd.read_csv(filepath_or_buffer = "Mapping Codes.csv")
mapping_codes

Unnamed: 0,Category,Value,Code
0,Gender,Male,1
1,Gender,Female,2
2,Pilot,Yes,1
3,Pilot,No,0
4,Race,White,1
5,Race,Black or African American,2
6,Race,American Indian or Alaska Native,3
7,Race,Asian,4
8,Race,Native Hawaiian or Pacific Islander,5
9,Race,Other,6


In [4]:
# load the pilot / non-pilot summary table from CSV into a DataFrame

data = pd.read_csv(filepath_or_buffer = "Pilot Non-Pilot SUMMARY Fairness.csv")
data

Unnamed: 0,IS_PILOT,AGE,Gender,Race,PSS,JSS,MFI,GF,PF,RA,RM,MF
0,1,21,1,5,0.464286,0.4,0.53,0.65,0.75,0.65,0.5,0.1
1,1,19,2,1,0.410714,0.5,0.568421,0.5,0.933333,0.8,0.45,0.25
2,1,21,1,1,0.357143,0.25,0.54,0.55,0.35,0.6,0.4,0.8
3,1,19,1,1,0.160714,0.0,0.5,0.55,0.45,0.45,0.5,0.55
4,1,18,2,1,0.392857,0.3,0.42,0.3,0.4,0.6,0.5,0.3
5,1,20,1,1,0.428571,0.3,0.59,0.6,0.7,0.5,0.6,0.55
6,1,19,2,1,0.375,0.3,0.547368,0.533333,0.5,0.65,0.5,0.55
7,1,18,2,1,0.464286,0.5,0.55,0.55,0.7,0.55,0.55,0.4
8,1,20,2,1,0.321429,0.2,0.57,0.5,0.55,0.55,0.6,0.65
9,1,18,1,1,0.285714,0.0,0.53,0.75,0.65,0.35,0.45,0.45


In [5]:
# discard every row that has a NaN in at least one column
# (axis = 0 and how = "any" are the dropna defaults, spelled out here)

data = data.dropna(axis = 0, how = "any")
data

Unnamed: 0,IS_PILOT,AGE,Gender,Race,PSS,JSS,MFI,GF,PF,RA,RM,MF
0,1,21,1,5,0.464286,0.4,0.53,0.65,0.75,0.65,0.5,0.1
1,1,19,2,1,0.410714,0.5,0.568421,0.5,0.933333,0.8,0.45,0.25
2,1,21,1,1,0.357143,0.25,0.54,0.55,0.35,0.6,0.4,0.8
3,1,19,1,1,0.160714,0.0,0.5,0.55,0.45,0.45,0.5,0.55
4,1,18,2,1,0.392857,0.3,0.42,0.3,0.4,0.6,0.5,0.3
5,1,20,1,1,0.428571,0.3,0.59,0.6,0.7,0.5,0.6,0.55
6,1,19,2,1,0.375,0.3,0.547368,0.533333,0.5,0.65,0.5,0.55
7,1,18,2,1,0.464286,0.5,0.55,0.55,0.7,0.55,0.55,0.4
8,1,20,2,1,0.321429,0.2,0.57,0.5,0.55,0.55,0.6,0.65
9,1,18,1,1,0.285714,0.0,0.53,0.75,0.65,0.35,0.45,0.45


In [6]:
# rename two columns: AGE to Age and IS_PILOT to Pilot
# the age header is matched both with and without a trailing space ("AGE " / "AGE");
# rename() silently skips keys that are not present, so listing only one spelling
# could turn this step into a no-op and surface later as a KeyError on "Age"

data = data.rename(columns = {"AGE ": "Age", "AGE": "Age", "IS_PILOT": "Pilot"})

In [7]:
# relabel Gender codes: 1 becomes "male" and 2 becomes "female"
# replace() builds a new column instead of writing strings into the numeric
# column through .loc, which is an incompatible-dtype assignment that newer
# pandas versions warn about / reject
# codes other than 1 and 2 are left unchanged, exactly as before

data["Gender"] = data["Gender"].replace({1: "male", 2: "female"})

In [8]:
# relabel Pilot codes: 1 becomes "yes" and 0 becomes "no"
# replace() builds a new column instead of writing strings into the numeric
# column through .loc, which is an incompatible-dtype assignment that newer
# pandas versions warn about / reject
# codes other than 1 and 0 are left unchanged, exactly as before

data["Pilot"] = data["Pilot"].replace({1: "yes", 0: "no"})

In [9]:
# keep only the rows where Gender is male/female AND Pilot is yes/no;
# any row with a different value in either column is discarded

data = data.loc[
    data["Gender"].isin(["male", "female"]) & data["Pilot"].isin(["yes", "no"])
]

In [10]:
# replace the row index with consecutive labels 1..len(data)

data = data.set_axis(pd.RangeIndex(start = 1, stop = len(data) + 1), axis = 0)

In [11]:
# leave Race out of the final table
# original note: only interested in age demographic information for reducing bias
# (Gender is also retained, alongside Pilot and the score columns)

final_data = data.loc[:, ["Pilot", "Age", "Gender", "PSS", "JSS", "MFI", "GF", "PF", "RA", "RM", "MF"]]
final_data

Unnamed: 0,Pilot,Age,Gender,PSS,JSS,MFI,GF,PF,RA,RM,MF
1,yes,21,male,0.464286,0.4,0.53,0.65,0.75,0.65,0.5,0.1
2,yes,19,female,0.410714,0.5,0.568421,0.5,0.933333,0.8,0.45,0.25
3,yes,21,male,0.357143,0.25,0.54,0.55,0.35,0.6,0.4,0.8
4,yes,19,male,0.160714,0.0,0.5,0.55,0.45,0.45,0.5,0.55
5,yes,18,female,0.392857,0.3,0.42,0.3,0.4,0.6,0.5,0.3
6,yes,20,male,0.428571,0.3,0.59,0.6,0.7,0.5,0.6,0.55
7,yes,19,female,0.375,0.3,0.547368,0.533333,0.5,0.65,0.5,0.55
8,yes,18,female,0.464286,0.5,0.55,0.55,0.7,0.55,0.55,0.4
9,yes,20,female,0.321429,0.2,0.57,0.5,0.55,0.55,0.6,0.65
10,yes,18,male,0.285714,0.0,0.53,0.75,0.65,0.35,0.45,0.45


In [12]:
# write the preprocessed table to disk; index = False leaves the row index out of the file
final_data.to_csv(path_or_buf = "preprocessed_pilot_non-pilot_data.csv", index = False)