In [1]:
import json
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt

In [13]:
def make_primary_and_addl_dataset(
    input_path,
    output_path_primary_data,
    output_path_addl_data,
    output_path_primary_metadata,
    output_path_addl_metadata,
    columns_raw,
    gpa_threshold=3.0,
    addl_frac=0.3,
    random_state=42):
    """Build the primary and additional ("addl") GPA classification datasets.

    Reads a header-less CSV, standardizes every feature column, replaces the
    continuous "GPA" column with a binary "GPA_class" label, and writes two
    header-less CSV files plus one metadata JSON file for each of them:

    * primary dataset: every row, with the "Gender" column removed;
    * addl dataset: a random sample of the same rows, with "Gender" replaced
      by two leading binary columns "M" and "F".

    The scaler is fitted on all rows before sampling, and the addl rows are
    drawn from the same frame as the primary rows (so they overlap).

    Parameters
    ----------
    input_path : path to the raw CSV (read with ``header=None``).
    output_path_primary_data : destination CSV for the primary dataset.
    output_path_addl_data : destination CSV for the addl dataset.
    output_path_primary_metadata : destination JSON for the primary metadata.
    output_path_addl_metadata : destination JSON for the addl metadata.
    columns_raw : column names to assign to the CSV, in file order. Must
        contain "Gender" and "GPA"; every other column is treated as a
        feature. The columns are located by name, so "Gender" and "GPA" do
        not have to be first and last.
    gpa_threshold : GPA values >= this become class 1.0, others 0.0.
    addl_frac : fraction of rows sampled (without replacement) for the addl
        dataset.
    random_state : seed passed to ``DataFrame.sample``.

    Returns
    -------
    None. The results are the four files written to disk.

    Raises
    ------
    KeyError
        If "Gender" or "GPA" is missing from ``columns_raw``.
    """
    gender_col = "Gender"
    gpa_col = "GPA"
    label_col = "GPA_class"

    df = pd.read_csv(input_path, header=None, names=columns_raw)

    # Fail before doing any work (and before writing any file) if the
    # columns this function relies on are absent.
    missing = [name for name in (gender_col, gpa_col) if name not in df.columns]
    if missing:
        raise KeyError(f"columns_raw must include {missing}")

    # Everything that is neither the sensitive attribute nor the target is a feature.
    feature_cols = [name for name in df.columns if name not in (gender_col, gpa_col)]

    # Standardize the features. Assigning whole columns (instead of writing
    # into df.iloc in place) avoids pandas' incompatible-dtype setitem
    # deprecation when the raw feature columns were parsed as integers.
    scaler = StandardScaler()
    df[feature_cols] = scaler.fit_transform(df[feature_cols])

    # Binarize the continuous GPA: 1.0 if GPA >= threshold, else 0.0
    # (NaN compares False, so missing GPAs become 0.0).
    df[label_col] = (df[gpa_col] >= gpa_threshold).astype(float)
    df = df.drop(columns=[gpa_col])

    # Primary dataset carries no gender information.
    df_primary = df.drop(columns=[gender_col])

    # Addl dataset: a reproducible sample that keeps the gender information.
    sampled = df.sample(frac=addl_frac, replace=False, random_state=random_state)
    gender = sampled[gender_col]
    df_addl = sampled.drop(columns=[gender_col])
    # Sensitive attributes go first: "M" mirrors Gender, "F" is 0 where
    # Gender == 1 and 1 otherwise. Inserting F then M at position 0 yields
    # the column order M, F, <features>, GPA_class.
    df_addl.insert(0, "F", (gender != 1).astype(int))
    df_addl.insert(0, "M", gender)

    # Save final dataframes
    df_primary.to_csv(output_path_primary_data, index=False, header=False)
    print(f"Saved primary data file to: {output_path_primary_data}\n")

    df_addl.to_csv(output_path_addl_data, index=False, header=False)
    print(f"Saved addl data file to: {output_path_addl_data}\n")

    # Save metadata json files
    primary_metadata_dict = {
        "regime": "supervised_learning",
        "sub_regime": "classification",
        "all_col_names": list(df_primary.columns),
        "label_col_names": [label_col],
        "sensitive_col_names": []
    }

    addl_metadata_dict = {
        "regime": "supervised_learning",
        "sub_regime": "classification",
        "all_col_names": list(df_addl.columns),
        "label_col_names": [label_col],
        "sensitive_col_names": ["M", "F"]
    }

    with open(output_path_primary_metadata, 'w') as outfile:
        json.dump(primary_metadata_dict, outfile, indent=2)
    print(f"Saved primary metadata file to: {output_path_primary_metadata}\n")

    with open(output_path_addl_metadata, 'w') as outfile:
        json.dump(addl_metadata_dict, outfile, indent=2)
    print(f"Saved addl metadata file to: {output_path_addl_metadata}\n")

    return

In [14]:
# Raw GPA data file (home-relative path).
f = "~/beri/code/notebooks/gpa_data_download/data.csv"

# Names assigned to the CSV's columns, in file order.
columns_raw = [
    "Gender",
    "Physics",
    "Biology",
    "History",
    "Second_Language",
    "Geography",
    "Literature",
    "Portuguese_and_Essay",
    "Math",
    "Chemistry",
    "GPA",
]

# Arguments are passed positionally, in the function's declared order:
# input, primary data, addl data, primary metadata, addl metadata, columns.
make_primary_and_addl_dataset(
    f,
    "gpa_classification_primary_dataset.csv",
    "gpa_classification_addl_dataset.csv",
    "primary_metadata_classification.json",
    "addl_metadata_classification.json",
    columns_raw,
)

Saved primary data file to: gpa_classification_primary_dataset.csv

Saved addl data file to: gpa_classification_addl_dataset.csv

Saved primary metadata file to: primary_metadata_classification.json

Saved addl metadata file to: addl_metadata_classification.json



In [6]:
# Print the primary metadata JSON file written by make_primary_and_addl_dataset.
!cat primary_metadata_classification.json

{
  "regime": "supervised_learning",
  "sub_regime": "classification",
  "all_col_names": [
    "Physics",
    "Biology",
    "History",
    "Second_Language",
    "Geography",
    "Literature",
    "Portuguese_and_Essay",
    "Math",
    "Chemistry",
    "GPA_class"
  ],
  "label_col_names": [
    "GPA_class"
  ],
  "sensitive_col_names": []
}

In [7]:
# Print the addl metadata JSON file written by make_primary_and_addl_dataset.
# NOTE(review): the output shown below looks stale. It lists a "Gender" column,
# but make_primary_and_addl_dataset drops "Gender" before saving, and this
# cell's execution count is lower than that of the cell that writes the file.
# Re-run this cell after regenerating the files.
!cat addl_metadata_classification.json

{
  "regime": "supervised_learning",
  "sub_regime": "classification",
  "all_col_names": [
    "M",
    "F",
    "Gender",
    "Physics",
    "Biology",
    "History",
    "Second_Language",
    "Geography",
    "Literature",
    "Portuguese_and_Essay",
    "Math",
    "Chemistry",
    "GPA_class"
  ],
  "label_col_names": [
    "GPA_class"
  ],
  "sensitive_col_names": [
    "M",
    "F"
  ]
}

In [8]:
# Preview the first rows of the addl dataset CSV (written without a header row).
# NOTE(review): the rows shown below have 13 fields (three leading 0/1 columns),
# which matches a file that still contained "Gender" after "M" and "F". The
# current make_primary_and_addl_dataset drops "Gender", so this output is
# presumably from an earlier run. Re-run to refresh.
!head gpa_classification_addl_dataset.csv

1,0,1,-1.139908112595362,-1.0733193284219045,-0.9471784674922604,1.2934891718699535,-1.1671344974533673,-0.3605373141363532,0.5599504682061117,0.2087959121959728,0.32271489099911604,0.0
0,1,0,-0.33792444634589314,1.5224381743711217,-1.179735672410228,-0.08337684280818243,0.48068901147110454,0.6573871962429912,-0.647180906333141,-0.3091610673703959,-0.24098787782837444,1.0
1,0,1,-0.6701810253464856,-0.9392531654339613,-0.45202586094624453,-0.16694884570562624,0.5837124137341861,0.9281232520386816,-0.7431088368573631,0.4831212754477907,-0.8032642281359781,1.0
0,1,0,0.09046596249099081,-0.26507780611444515,-0.9879370100246654,0.11093381957595705,-1.7741461798783027,-0.6663667910809083,-0.22904528572277577,-0.9592232361696014,-0.2397397616234732,1.0
1,0,1,2.306059820316946,1.2246338520280509,0.475443265741662,-0.2606507883482147,0.6916941615607307,0.37416391083604755,0.8643724773936243,1.2994703062054247,1.438887382809994,1.0
1,0,1,0.09046596249099081,-0.4916101903396608,0.16731292987