# 2022 ML Exam

STUDENTID: mv313

# Part I: Adult Dataset 

Uncomment cell bellow if libraries need to be installed. 

In [1]:
#!pip install aif360
#!pip install fairlearn
#!pip install tensorflow

# AdultDataset 



In [90]:
import numpy as np
import pandas as pd
import tensorflow as tf
from aif360.datasets import AdultDataset
from aif360.algorithms.preprocessing.optim_preproc_helpers.data_preproc_functions import load_preproc_data_adult
from aif360.algorithms.preprocessing.reweighing import Reweighing
from aif360.metrics import ClassificationMetric 
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.model_selection import ShuffleSplit
from sklearn.svm import SVC

np.random.seed(42)


In [89]:
# cd \usr\lib\aif360\data\raw\adult

In [120]:
# Representation of a standard ML classification

dataset_orig = load_preproc_data_adult(['sex'])
priv = [{'sex': 1}]
unpriv = [{'sex': 0}]
train, test = dataset_orig.split([0.7], shuffle=True)
scaler = MinMaxScaler()
X_train = scaler.fit_transform(train.features)
y_train = train.labels.ravel()
X_test = scaler.transform(test.features) 
y_test = test.labels.ravel()
clf = LogisticRegression(solver='liblinear', C = 0.5, random_state=1)  
clf.fit(X_train,y_train)
predictions = clf.predict(X_test)
test_pred = test.copy()
test_pred.labels = predictions
acc = sum(predictions==y_test)/len(y_test)
print (acc)


0.8039991810550741


# From the start

In [133]:
ds = AdultDataset()
dataset_orig = load_preproc_data_adult(['sex'])
priv = [{'sex': 1}]
unpriv = [{'sex': 0}]
train, test = dataset_orig.split([0.7], shuffle=True)
scaler = MinMaxScaler()




In [134]:
def return_af_tradeoff(acc, eo):
    ftd = (1 - abs(eo) + acc)/2
    return ftd

In [135]:
def ret_folds(train, k = 5):
    f1, f2, f3, f4, f5 = train.split(k, shuffle = True)
    folds = [f1, f2, f3, f4, f5]
    return folds

def eval_k_fold(train, clf, rw=None,k=5):
    
    if rw: 
        train = rw.fit_transform(train)
        
    train_folds = ret_folds(train)
    averages = [0, 0, 0]
    eq_opps = []
    tradeoffs = []
    accuracies = []
    true_ys = []
    y_preds = []
    metrics = {}
    k_test = train_folds[0]
    
    for i in range(k-1, -1, -1):
        for j in range(k-1):
            if j == 0:
                X_train = scaler.fit_transform(train_folds[j].features)
                y_train = train_folds[j].labels.ravel()
                sw = train_folds[j].instance_weights
                true_ys = np.concatenate((y_train, true_ys))
            else:
                if j == i:
                    pass
                else:
                    X_train = np.concatenate((X_train, scaler.fit_transform(train_folds[j].features)))
                    y_train = np.concatenate((y_train, train_folds[j].labels.ravel()))
                    sw = np.concatenate((sw, train_folds[j].instance_weights))
                    true_ys = np.concatenate((true_ys, train_folds[j].labels.ravel()))
        
        X_test = scaler.transform(k_test.features)
        y_test = k_test.labels.ravel()
        clf.fit(X_train, y_train, sample_weight = sw)
        predictions = clf.predict(k_test.features)
        test_pred = k_test.copy()
        test_pred.labels = predictions
        y_preds = np.concatenate((y_preds, test_pred.labels))
        
        metric = ClassificationMetric(k_test, test_pred, unprivileged_groups=unpriv, privileged_groups=priv)
        eo = (metric.equal_opportunity_difference())
        metrics["Fold " + str(i+1) + " metrics"] = {"eq_opp_diff": eo}
        metrics["Fold " + str(i+1) + " metrics"]["accuracy"] = acc = sum(predictions==k_test.labels.ravel())/len(k_test.labels.ravel())
        metrics["Fold " + str(i+1) + " metrics"]["tradeoff"] = return_af_tradeoff(acc, eo)
        
        eq_opps = np.append(eq_opps, eo)
        accuracies = np.append(accuracies, acc)
        tradeoffs = np.append(tradeoffs, return_af_tradeoff(acc, eo))
        
        averages[0] += sum(predictions==k_test.labels.ravel())/len(k_test.labels.ravel())
        averages[1] += (metric.equal_opportunity_difference())
        averages[2] += return_af_tradeoff(acc, eo)
        
        k_test = train_folds[i]
        
    averages = {'mean_acc': averages[0]/k, 'avg_tradeoff': averages[2]/k, 'mean_eq_opp': averages[1]/k}  # Get the average of all the folds.
    ys = {'predicted_ys' : y_preds, 'true_ys' : true_ys}

        
    return metrics, averages, ys, eq_opps
        

In [136]:
def eval_log_reg(train, silence_print = False, rw = None):
    eo_avgs = []
    acc_avgs = []
    tradeoff_avgs=[]
    models = {}
    
    for solver in logreg_solvers:
        for c in C:
            metrics, averages, ys, eq_opps = eval_k_fold(train, clf = LogisticRegression(solver = solver, C = c), rw = rw)
            models[f'{solver}_LogReg_{c}'] = {'averages':averages}
            models[f'{solver}_LogReg_{c}']['metrics'] = metrics
            models[f'{solver}_LogReg_{c}']['ys'] = ys
            models[f'{solver}_LogReg_{c}']['eq_opps'] = eq_opps
            
            if not silence_print:
                print (f"Results for LogReg with C = {c}, and solver = {solver} : \n \n Averages: {averages}")
                print ("\n")
    return models

def eval_svc(train, silence_print = False, rw = None):
    eo_avgs = []
    acc_avgs = []
    tradeoff_avgs=[]
    models = {}
    
    for kernel in svm_kernels:
        for c in C:
            metrics, averages, ys, eq_opps = eval_k_fold(train, clf = SVC(kernel = kernel, C = c, gamma = g), rw = rw)
            models[f'{kernel}_SVC, c:{c}'] = {'averages':averages}
            models[f'{kernel}_SVC, c:{c}']['metrics'] = metrics
            models[f'{kernel}_SVC, c:{c}']['ys'] = ys
            models[f'{kernel}_SVC, c:{c}']['eq_opps'] = eq_opps

            if not silence_print:
                print (f"Results for SVC with C = {c}, gamma = {g} and kernel = {kernel} : \n \n Averages: {averages}")
                print ("\n")
    return models            

In [235]:
C = [0.00001, 0.002, 0.1, 0.5, 1, 2, 5]
logreg_solvers = ['newton-cg', 'liblinear']
g = 'auto'
svm_kernels = ['linear', 'poly', 'rbf', 'sigmoid']

svc_models = eval_svc(train)
logreg_models = eval_log_reg(train)

Results for SVC with C = 1e-05, gamma = auto and kernel = linear : 
 
 Averages: {'mean_acc': 0.7611804487762357, 'avg_tradeoff': 0.880590224388118, 'mean_eq_opp': 0.0}


Results for SVC with C = 0.002, gamma = auto and kernel = linear : 
 
 Averages: {'mean_acc': 0.7611805942264068, 'avg_tradeoff': 0.8805902971132035, 'mean_eq_opp': 0.0}


Results for SVC with C = 0.1, gamma = auto and kernel = linear : 
 
 Averages: {'mean_acc': 0.7877387259754284, 'avg_tradeoff': 0.8830924821313193, 'mean_eq_opp': 0.0017860368592069408}


Results for SVC with C = 0.5, gamma = auto and kernel = linear : 
 
 Averages: {'mean_acc': 0.7877388029784601, 'avg_tradeoff': 0.8808220023333053, 'mean_eq_opp': 0.0022054832781169932}


Results for SVC with C = 1, gamma = auto and kernel = linear : 
 
 Averages: {'mean_acc': 0.7877385805252575, 'avg_tradeoff': 0.8785066264268803, 'mean_eq_opp': 0.0012844402124119192}


Results for SVC with C = 2, gamma = auto and kernel = linear : 
 
 Averages: {'mean_acc': 0.787

In [319]:
def select_best(models):
    best = 0
    best_key = ''
    for key in models.keys():
        if best < models[key]['averages']['avg_tradeoff']:
            best = models[key]['averages']['avg_tradeoff']
            best_key = key
        else:
            pass
    return best_key, best

def select_fair(models):
    best = 0
    best_key = ''
    for key in models.keys():
        if best < (1 - (abs(models[key]['averages']['mean_eq_opp']))):
            best = models[key]['averages']['mean_eq_opp']
            best_key = key
        else:
            pass
    return best_key, best

def select_acc(models):
    best = 0
    best_key = ''
    for key in models.keys():
        if best < models[key]['averages']['mean_acc']:
            best = models[key]['averages']['mean_acc']
            best_key = key
        else:
            pass
    return best_key, best

def better(lg, svc, lg_val, svc_val):
    if lg_val > svc_val:
        winner = lg
    elif lg_val < svc_val:
        winner = svc

    return winner 

In [339]:
svc_best, svc_val = select_best(svc_models)
lg_best, lg_val = select_best(logreg_models)

svc_fair, svc_fair_val = select_fair(svc_models)
lg_fair, lg_fair_val = select_fair(logreg_models)

svc_acc, svc_acc_val = select_acc(svc_models)
lg_acc, lg_acc_val = select_acc(logreg_models)

print (better(lg_fair, svc_fair, lg_fair_val, svc_fair_val))
print (better(lg_acc, svc_acc, lg_acc_val, svc_acc_val))
print (better(lg_best, svc_best, lg_val, svc_val))

sigmoid_SVC, c:5
newton-cg_LogReg_2
linear_SVC, c:5


In [321]:
def test_selected_model(train, test, clf = None):
    scaler = MinMaxScaler()
    X_train = scaler.fit_transform(train.features)
    y_train = train.labels.ravel()
    X_test = scaler.transform(test.features)
    clf = clf
    clf.fit(X_train,y_train)
    predictions = clf.predict(X_test)
    test_pred = test.copy()
    test_pred.labels = predictions
    acc = sum(predictions==y_test)/len(y_test)
    metric = ClassificationMetric(test, test_pred, unprivileged_groups=unpriv, privileged_groups=priv)
    eo = (metric.equal_opportunity_difference())
    tradeoff = return_af_tradeoff(acc, eo)
    return acc, eo, tradeoff

In [340]:
best_standard_adult = test_selected_model(train, test, clf = SVC(kernel = 'linear', C =5, gamma='auto'))
fair_standard_adult = test_selected_model(train, test, clf = SVC(kernel = 'sigmoid', C =5, gamma='auto'))
accurate_standard_adult = test_selected_model(train, test, clf = LogisticRegression(solver = 'newton-cg', C =2))

### Applying reweighting method

In [324]:
train, test = dataset_orig.split([0.7], shuffle=True)
rw = Reweighing(unprivileged_groups=unpriv,
                privileged_groups=priv)

rw_svc_models = eval_svc(train, silence_print = False, rw = rw) 
rw_logreg_models = eval_log_reg(train, silence_print = False, rw= rw)

Results for SVC with C = 1e-05, gamma = auto and kernel = linear : 
 
 Averages: {'mean_acc': 0.7613267331468063, 'avg_tradeoff': 0.8806633665734032, 'mean_eq_opp': 0.0}


Results for SVC with C = 0.002, gamma = auto and kernel = linear : 
 
 Averages: {'mean_acc': 0.7613267245909139, 'avg_tradeoff': 0.8806633622954569, 'mean_eq_opp': 0.0}


Results for SVC with C = 0.1, gamma = auto and kernel = linear : 
 
 Averages: {'mean_acc': 0.7870367962837309, 'avg_tradeoff': 0.8754591435908019, 'mean_eq_opp': 0.015263651131620305}


Results for SVC with C = 0.5, gamma = auto and kernel = linear : 
 
 Averages: {'mean_acc': 0.7870365567187433, 'avg_tradeoff': 0.8845714187042297, 'mean_eq_opp': 0.016480646550506163}


Results for SVC with C = 1, gamma = auto and kernel = linear : 
 
 Averages: {'mean_acc': 0.7870659547650823, 'avg_tradeoff': 0.8851047835182758, 'mean_eq_opp': 0.01685638772853081}


Results for SVC with C = 2, gamma = auto and kernel = linear : 
 
 Averages: {'mean_acc': 0.787065

In [338]:
svc_acc, svc_acc_val = select_acc(rw_svc_models)
lg_acc, lg_acc_val = select_acc(rw_logreg_models)

svc_fair, svc_fair_val = select_fair(rw_svc_models)
lg_fair, lg_fair_val = select_fair(rw_logreg_models)

svc_best, svc_best_val = select_best(rw_svc_models)
lg_best, lg_best_val = select_best(rw_logreg_models)

print (better(lg_acc, svc_acc, lg_acc_val, svc_acc_val))
print (better(lg_fair, svc_fair, lg_fair_val, svc_fair_val))
print (better(lg_best, svc_best, lg_best_val, svc_best_val))

poly_SVC, c:1
sigmoid_SVC, c:5
poly_SVC, c:5


In [341]:
accurate_rw_adult = test_selected_model(train, test, clf = SVC(kernel = 'poly', C =1, gamma = 'auto'))
fair_rw_adult = test_selected_model(train, test, clf = SVC(kernel = 'sigmoid', C =5, gamma='auto'))
best_rw_adult = test_selected_model(train, test, clf = SVC(kernel = 'poly', C =5, gamma='auto'))



# Part II:  GermanDataset

In [353]:
# cd \usr\lib\aif360\data\raw\german

In [91]:
from aif360.datasets import GermanDataset
from aif360.algorithms.preprocessing.optim_preproc_helpers.data_preproc_functions import load_preproc_data_german

In [92]:
data = GermanDataset()
dataset_orig = load_preproc_data_german(['age', 'sex'])

In [15]:
priv = [{'age': 1, 'sex' : 1}]
unpriv = [{'age': 0, 'sex': 0}]
train, test = dataset_orig.split([0.7], shuffle=True)
scaler = MinMaxScaler()

In [16]:
C = [0.00001, 0.002, 0.1, 0.5, 1, 2, 5]
logreg_solvers = ['newton-cg', 'liblinear']
g = 'auto'
svm_kernels = ['linear', 'poly', 'rbf', 'sigmoid']

In [17]:
logreg_models_german = eval_log_reg(train, silence_print = False)
svc_models_german = eval_svc(train, silence_print = False)


Results for LogReg with C = 1e-05, and solver = newton-cg : 
 
 Averages: {'mean_acc': 0.7, 'avg_tradeoff': 0.85, 'mean_eq_opp': 0.0}


Results for LogReg with C = 0.002, and solver = newton-cg : 
 
 Averages: {'mean_acc': 0.7, 'avg_tradeoff': 0.85, 'mean_eq_opp': 0.0}


Results for LogReg with C = 0.1, and solver = newton-cg : 
 
 Averages: {'mean_acc': 0.7042857142857143, 'avg_tradeoff': 0.814642857142857, 'mean_eq_opp': -0.075}


Results for LogReg with C = 0.5, and solver = newton-cg : 
 
 Averages: {'mean_acc': 0.71, 'avg_tradeoff': 0.7367316017316017, 'mean_eq_opp': -0.23653679653679652}


Results for LogReg with C = 1, and solver = newton-cg : 
 
 Averages: {'mean_acc': 0.7128571428571429, 'avg_tradeoff': 0.6697619047619048, 'mean_eq_opp': -0.37333333333333335}


Results for LogReg with C = 2, and solver = newton-cg : 
 
 Averages: {'mean_acc': 0.7071428571428571, 'avg_tradeoff': 0.6841117216117215, 'mean_eq_opp': -0.33891941391941394}


Results for LogReg with C = 5, and solver

In [359]:
svc_best, svc_val = select_best(svc_models_german)
lg_best, lg_val = select_best(logreg_models_german)

svc_fair, svc_fair_val = select_fair(svc_models_german)
lg_fair, lg_fair_val = select_fair(logreg_models_german)

svc_acc, svc_acc_val = select_acc(svc_models_german)
lg_acc, lg_acc_val = select_acc(logreg_models_german)

print (better(lg_acc, svc_acc, lg_acc_val, svc_acc_val))
print (better(lg_fair, svc_fair, lg_fair_val, svc_fair_val))
print (better(lg_best, svc_best, lg_best_val, svc_best_val))

newton-cg_LogReg_1
sigmoid_SVC, c:5
rbf_SVC, c:5


In [362]:
best_standard_german = test_selected_model(train, test, clf = SVC(kernel = 'rbf', C =5))
fair_standard_german = test_selected_model(train, test, clf = SVC(kernel = 'sigmoid', C =5, gamma = 'auto'))
accurate_standard_german = test_selected_model(train, test, LogisticRegression(solver = 'newton-cg', C =1))

In [331]:
train, test = dataset_orig.split([0.7], shuffle=True)
rw = Reweighing(unprivileged_groups=unpriv,
                privileged_groups=priv)
rw_logreg_models_german = eval_log_reg(train, silence_print = False, rw = rw)
rw_svc_models_german = eval_svc(train, silence_print = False, rw = rw)


Results for LogReg with C = 1e-05, and solver = newton-cg : 
 
 Averages: {'mean_acc': 0.760478839930504, 'avg_tradeoff': 0.880239419965252, 'mean_eq_opp': 0.0}


Results for LogReg with C = 0.002, and solver = newton-cg : 
 
 Averages: {'mean_acc': 0.7710371705184652, 'avg_tradeoff': 0.8803105407010469, 'mean_eq_opp': -0.010416089116371634}


Results for LogReg with C = 0.1, and solver = newton-cg : 
 
 Averages: {'mean_acc': 0.7904586570080908, 'avg_tradeoff': 0.8823027010358657, 'mean_eq_opp': 0.012094496453721492}


Results for LogReg with C = 0.5, and solver = newton-cg : 
 
 Averages: {'mean_acc': 0.7914535532899267, 'avg_tradeoff': 0.8759903101849469, 'mean_eq_opp': -0.025273719317228528}


Results for LogReg with C = 1, and solver = newton-cg : 
 
 Averages: {'mean_acc': 0.7903417022367198, 'avg_tradeoff': 0.885165170496075, 'mean_eq_opp': 0.009887582928353945}


Results for LogReg with C = 2, and solver = newton-cg : 
 
 Averages: {'mean_acc': 0.7905468939265698, 'avg_tradeoff

KeyboardInterrupt: 

In [363]:
svc_best, svc_val = select_best(rw_svc_models_german)
lg_best, lg_val = select_best(rw_logreg_models_german)

svc_fair, svc_fair_val = select_fair(rw_svc_models_german)
lg_fair, lg_fair_val = select_fair(rw_logreg_models_german)

svc_acc, svc_acc_val = select_acc(rw_svc_models_german)
lg_acc, lg_acc_val = select_acc(rw_logreg_models_german)


print (better(lg_acc, svc_acc, lg_acc_val, svc_acc_val))
print (better(lg_fair, svc_fair, lg_fair_val, svc_fair_val))
print (better(lg_best, svc_best, lg_best_val, svc_best_val))

newton-cg_LogReg_0.5
sigmoid_SVC, c:5
poly_SVC, c:0.002


In [367]:
accurate_rw_german = test_selected_model(train, test, clf = LogisticRegression(solver = 'newton-cg', C =0.5))
fair_rw_german = test_selected_model(train, test, clf = SVC(kernel = 'linear', C =5, gamma='auto'))
best_rw_german = test_selected_model(train, test, clf = SVC(kernel = 'poly', C =0.002, gamma='auto'))



KeyboardInterrupt: 

In [368]:
final_reports_adult = {'Standard_Fair': {'Sigmoid_SVC, C = 5':fair_standard_adult}, 'Standard_Accurate': {'poly_SVC, C = 1': accurate_standard_adult}, 'Standard_Best':{'Poly_SVC, C = 5':best_standard_adult} , 'RW_Fair':{'Sigmoid_SVC, C = 5':fair_rw_adult} , 'RW_Accurate':{'Poly_SVC, C = 5':accurate_rw_adult} , 'RW_Best':{'Poly_SVC, C = 5':best_rw_adult} }
final_reports_german = {'Standard_Fair': {'Sigmoid_SVC, C=5':fair_standard_german}, 'Standard_Accurate': {'Newton-cg_LogReg, C=1': accurate_standard_german}, 'Standard_Best':{'rbf_SVC, C=5':best_standard_german} , 'RW_Fair':{'Sigmoid_SVC, C=0.5':fair_rw_german} , 'RW_Accurate':{'Newton-cg_LogReg, C= 0.5':accurate_rw_german} , 'RW_Best':{'Poly_SVC, C=0.002':best_rw_german} }


In [369]:
df_adult = pd.DataFrame.from_dict(final_reports_adult)
df_german = pd.DataFrame.from_dict(final_reports_german)

In [370]:
df_adult.T

Unnamed: 0,"Sigmoid_SVC, C = 5","poly_SVC, C = 1","Poly_SVC, C = 5"
Standard_Fair,"(0.6617757455811096, -0.09851645964232264, 0.7...",,
Standard_Accurate,,"(0.6908482904524671, -0.44508279824264957, 0.6...",
Standard_Best,,,"(0.6584999658772948, -0.008779855301363548, 0...."
RW_Fair,"(0.6617757455811096, -0.09851645964232264, 0.7...",,
RW_Accurate,,,"(0.6901658363475056, -0.44711051030753635, 0.6..."
RW_Best,,,"(0.6859346208967447, -0.45927678269685707, 0.6..."


In [371]:
df_german.T

Unnamed: 0,"Sigmoid_SVC, C=5","Newton-cg_LogReg, C=1","rbf_SVC, C=5","Sigmoid_SVC, C=0.5","Newton-cg_LogReg, C= 0.5","Poly_SVC, C=0.002"
Standard_Fair,"(0.6617757455811096, -0.09851645964232264, 0.7...",,,,,
Standard_Accurate,,"(0.6908482904524671, -0.44508279824264957, 0.6...",,,,
Standard_Best,,,"(0.6890056643690712, -0.454883406556269, 0.617...",,,
RW_Fair,,,,"(0.6584999658772948, -0.008779855301363548, 0....",,
RW_Accurate,,,,,"(0.6908482904524671, -0.44508279824264957, 0.6...",
RW_Best,,,,,,"(0.7624377260629223, 0.0, 0.8812188630314611)"


### Model Selection