In [1]:
import json
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt

In [12]:
def make_primary_and_addl_dataset(
    input_path,
    output_path_primary_data,
    output_path_addl_data,
    output_path_primary_metadata,
    output_path_addl_metadata,
    columns_raw):
    
    df = pd.read_csv(input_path,header=None,names=columns_raw)
    
    # extract features
    features = df.iloc[:,1:-1]

    # scale features using standard scaler
    scaler = StandardScaler()
    scaler.fit(features)
    transformed_features = scaler.transform(features)

    # replace features in original dataframe with transformed features
    df.iloc[:,1:-1] = transformed_features

    # convert continuous GPA labels to binary: 0 if GPA < 3, 1 if GPA >= 3
    # Make a new column called GPA_class
    df.loc[:,'GPA_class'] = df['GPA'].apply(lambda x: float(int(x>=3.0)))

    # Drop original GPA column
    df = df.drop(columns=['GPA'])

    # Primary dataset has no gender label
    df_primary = df.iloc[:,1:]

    # Sample 80% of df for addl dataset (still has gender label)
    df_addl = df.sample(frac=0.8,replace=False,random_state=42)

     # Make gender column two columns: "M" and "F" for men and women, both binary valued
    df_addl['M'] = df_addl.loc[:,'Gender']
    df_addl['F'] = df_addl.loc[:,'Gender'].apply(lambda x: 0 if x==1 else 1)

    # reorder columns so that sensitive attributes M and F are first
    cols = df_addl.columns.tolist()
    cols_reorder = cols[-2:] + cols[0:-2]
    df_addl = df_addl[cols_reorder]

    
    # Save final dataframes
    df_primary.to_csv(output_path_primary_data,index=False,header=False)
    print(f"Saved primary data file to: {output_path_primary_data}\n")
    
    df_addl.to_csv(output_path_addl_data,index=False,header=False)
    print(f"Saved addl data file to: {output_path_addl_data}\n")

    
    # Save metadata json files
    
    primary_metadata_dict = {
        "regime":"supervised",
        "sub_regime":"classification",
        "all_col_names":list(df_primary.columns),
        "label_col_names":["GPA_class"],
        "sensitive_col_names":[]
    }
    
    addl_metadata_dict = {
        "regime":"supervised",
        "sub_regime":"classification",
        "all_col_names":list(df_addl.columns),
        "label_col_names":["GPA_class"],
        "sensitive_col_names":["M","F"]
    }
    
    with open(output_path_primary_metadata,'w') as outfile:
        json.dump(primary_metadata_dict,outfile,indent=2)
    print(f"Saved primary metadata file to: {output_path_primary_metadata}\n")
    
    with open(output_path_addl_metadata,'w') as outfile:
        json.dump(addl_metadata_dict,outfile,indent=2)
    print(f"Saved addl metadata file to: {output_path_addl_metadata}\n")

    return 

In [13]:
f = "./gpa_data_download/data.csv"
columns_raw = ["Gender", "Physics", "Biology", 
    "History", "Second_Language", "Geography", 
    "Literature", "Portuguese_and_Essay", 
    "Math", "Chemistry", "GPA"]

make_primary_and_addl_dataset(
    input_path=f,
    output_path_primary_data="gpa_classification_primary_dataset.csv",
    output_path_addl_data="gpa_classification_addl_dataset.csv",
    output_path_primary_metadata="primary_metadata_classification.json",
    output_path_addl_metadata="addl_metadata_classification.json",
    columns_raw=columns_raw
)

Saved primary data file to: gpa_classification_primary_dataset.csv

Saved addl data file to: gpa_classification_addl_dataset.csv

Saved primary metadata file to: primary_metadata_classification.json

Saved addl metadata file to: addl_metadata_classification.json



In [15]:
!cat primary_metadata_classification.json

{
  "regime": "supervised",
  "sub_regime": "classification",
  "all_col_names": [
    "Physics",
    "Biology",
    "History",
    "Second_Language",
    "Geography",
    "Literature",
    "Portuguese_and_Essay",
    "Math",
    "Chemistry",
    "GPA_class"
  ],
  "label_col_names": [
    "GPA_class"
  ],
  "sensitive_col_names": []
}

In [16]:
!cat addl_metadata_classification.json

{
  "regime": "supervised",
  "sub_regime": "classification",
  "all_col_names": [
    "M",
    "F",
    "Gender",
    "Physics",
    "Biology",
    "History",
    "Second_Language",
    "Geography",
    "Literature",
    "Portuguese_and_Essay",
    "Math",
    "Chemistry",
    "GPA_class"
  ],
  "label_col_names": [
    "GPA_class"
  ],
  "sensitive_col_names": [
    "M",
    "F"
  ]
}