In [1]:
import json
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt

In [2]:
# Path to the raw GPA data; it is read with header=None, so the column
# names are supplied explicitly below.
f = "./gpa_data_download/data.csv"

# Column names in the order they are passed to read_csv.
columns_raw = [
    "Gender",
    "Physics",
    "Biology",
    "History",
    "Second_Language",
    "Geography",
    "Literature",
    "Portuguese_and_Essay",
    "Math",
    "Chemistry",
    "GPA",
]

df = pd.read_csv(f, header=None, names=columns_raw)
df

Unnamed: 0,Gender,Physics,Biology,History,Second_Language,Geography,Literature,Portuguese_and_Essay,Math,Chemistry,GPA
0,0,622.60,491.56,439.93,707.64,663.65,557.09,711.37,731.31,509.80,1.33333
1,1,538.00,490.58,406.59,529.05,532.28,447.23,527.58,379.14,488.64,2.98333
2,1,455.18,440.00,570.86,417.54,453.53,425.87,475.63,476.11,407.15,1.97333
3,0,756.91,679.62,531.28,583.63,534.42,521.40,592.41,783.76,588.26,2.53333
4,1,584.54,649.84,637.43,609.06,670.46,515.38,572.52,581.25,529.04,1.58667
...,...,...,...,...,...,...,...,...,...,...,...
43298,1,519.55,622.20,660.90,543.48,643.05,579.90,584.80,581.25,573.92,2.76333
43299,1,816.39,851.95,732.39,621.63,810.68,666.79,705.22,781.01,831.76,3.81667
43300,0,798.75,817.58,731.98,648.42,751.30,648.67,662.05,773.15,835.25,3.75000
43301,0,527.66,443.82,545.88,624.18,420.25,676.80,583.41,395.46,509.80,2.50000


In [3]:
def make_seldonian_dataset(input_path, output_path_data, output_path_metadata,
                           columns_raw, gpa_threshold=3.0) -> pd.DataFrame:
    """Convert the raw GPA CSV into a classification dataset plus metadata.

    The raw file is read without a header row using ``columns_raw`` as the
    column names. Every column other than "Gender" and "GPA" is treated as
    a feature and standardized with ``StandardScaler``. "GPA" is turned
    into the binary label "GPA_class" and "Gender" is turned into the two
    sensitive columns "M" and "F".

    Parameters
    ----------
    input_path : path of the raw, header-less CSV file.
    output_path_data : path the processed CSV is written to (no index,
        no header row).
    output_path_metadata : path the JSON metadata file is written to.
    columns_raw : names for the raw CSV columns; must contain "Gender"
        and "GPA". Their position in the list does not matter.
    gpa_threshold : GPA values greater than or equal to this become
        class 1.0, all others 0.0. Defaults to 3.0.

    Returns
    -------
    pandas.DataFrame
        Columns ordered as "M", "F", the scaled features (in their raw
        order), then "GPA_class".

    Raises
    ------
    KeyError
        If "Gender" or "GPA" is missing from ``columns_raw``.
    """
    raw = pd.read_csv(input_path, header=None, names=columns_raw)

    required = ("Gender", "GPA")
    missing = [name for name in required if name not in raw.columns]
    if missing:
        raise KeyError(f"columns_raw is missing required column(s): {missing}")

    # Select features by name rather than by position so the result does
    # not depend on where Gender and GPA sit in columns_raw.
    feature_columns = [name for name in raw.columns if name not in required]

    # Build a fresh frame from the scaled values instead of writing them
    # back into the parsed frame, so the outcome does not depend on the
    # dtypes pandas inferred for the raw score columns.
    scaled = StandardScaler().fit_transform(raw[feature_columns])
    outdf = pd.DataFrame(scaled, columns=feature_columns, index=raw.index)

    # Sensitive columns go first. M mirrors the Gender column as-is;
    # F is 0 where Gender equals 1 and 1 for every other value.
    outdf.insert(0, "M", raw["Gender"])
    outdf.insert(1, "F", raw["Gender"].ne(1).astype(int))

    # Binary label stored as float: 1.0 if GPA >= gpa_threshold else 0.0.
    outdf["GPA_class"] = raw["GPA"].ge(gpa_threshold).astype(float)

    # Save final dataframe
    outdf.to_csv(output_path_data, index=False, header=False)
    print(f"Saved data file to: {output_path_data}\n")

    # Save metadata json file describing the columns just written
    metadata_dict = {
        "regime": "supervised",
        "sub_regime": "classification",
        "columns": list(outdf.columns),
        "label_column": "GPA_class",
        "sensitive_columns": ["M", "F"],
    }
    with open(output_path_metadata, 'w') as outfile:
        json.dump(metadata_dict, outfile, indent=2)
    print(f"Saved metadata file to: {output_path_metadata}\n")

    print("Here is what the final dataframe looks like:")
    return outdf

In [4]:
# Write the processed CSV and its JSON metadata; the returned frame is the
# cell's last expression, so the notebook displays it.
make_seldonian_dataset(
    input_path=f,
    output_path_data="gpa_classification_dataset.csv",
    output_path_metadata="metadata_classification.json",
    columns_raw=columns_raw,
)

Saved data file to: gpa_classification_dataset.csv

Saved metadata file to: metadata_classification.json

Here is what the final dataframe looks like:


Unnamed: 0,M,F,Physics,Biology,History,Second_Language,Geography,Literature,Portuguese_and_Essay,Math,Chemistry,GPA_class
0,0,1,0.403617,-0.760038,-1.495508,1.538450,0.982363,-0.282169,1.839716,1.326415,-0.551947,0.0
1,1,0,-0.331064,-0.769699,-1.849386,-0.517353,-0.465143,-1.464796,-0.269207,-1.744443,-0.740591,0.0
2,1,0,-1.050287,-1.268307,-0.105791,-1.800977,-1.332853,-1.694733,-0.865314,-0.898882,-1.467084,0.0
3,0,1,1.569989,1.093821,-0.525901,0.110934,-0.441563,-0.666367,0.474694,1.783769,0.147533,0.0
4,1,0,0.073098,0.800255,0.600797,0.403666,1.057400,-0.731171,0.246463,0.017919,-0.380420,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
43298,1,0,-0.491287,0.527786,0.849912,-0.351245,0.755381,-0.036623,0.387372,0.017919,0.019690,0.0
43299,1,0,2.086524,2.792616,1.608722,0.548363,2.602420,0.898735,1.769147,1.759789,2.318364,1.0
43300,0,1,1.933335,2.453804,1.604370,0.856751,1.948139,0.703676,1.273787,1.691252,2.349477,1.0
43301,0,1,-0.420858,-1.230650,-0.370933,0.577717,-1.699551,1.006491,0.371422,-1.602135,-0.551947,0.0
