In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score
import joblib
from sklearn.tree import DecisionTreeClassifier
import tqdm

In [27]:
sensitive_groups = {'Sex': ['persoon_geslacht_vrouw'],
                    'Nationality': ['persoonlijke_eigenschappen_dagen_sinds_taaleis', 'persoonlijke_eigenschappen_taaleis_schrijfv_ok', 'persoonlijke_eigenschappen_taaleis_voldaan', 'persoonlijke_eigenschappen_nl_begrijpen3', 'persoonlijke_eigenschappen_nl_lezen3', 
                                    'persoonlijke_eigenschappen_nl_lezen4', 'persoonlijke_eigenschappen_nl_schrijven0', 'persoonlijke_eigenschappen_nl_schrijven1', 'persoonlijke_eigenschappen_nl_schrijven2', 'persoonlijke_eigenschappen_nl_schrijven3', 'persoonlijke_eigenschappen_nl_schrijvenfalse',
                                    'persoonlijke_eigenschappen_nl_spreken1', 'persoonlijke_eigenschappen_nl_spreken2', 'persoonlijke_eigenschappen_nl_spreken3', 'belemmering_hist_taal', 'contacten_onderwerp_boolean_taaleis___voldoet', 'persoonlijke_eigenschappen_spreektaal', 
                                    'persoonlijke_eigenschappen_spreektaal_anders', 'contacten_onderwerp_boolean_beoordelen_taaleis'], 
                    'Marital Status': ['relatie_partner_huidige_partner___partner__gehuwd_', 'relatie_partner_totaal_dagen_partner'],
                    'Disablity': ['ontheffing_dagen_hist_vanwege_uw_medische_omstandigheden','ontheffing_reden_hist_medische_gronden'], 
                    'Age': ['persoon_leeftijd_bij_onderzoek']}

In [3]:
X_test_complete_thr = joblib.load("X_test_complete_thr.pkl")
y_test_complete_thr = joblib.load("y_test_complete_thr.pkl")

In [28]:
scaler = StandardScaler()

In [29]:
df = pd.read_csv("investigation_train_large_checked.csv", na_values='?')

In [30]:
X = df.drop(['checked', 'Ja', 'Nee'],  axis=1)
y = df['checked']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [12]:
svm_basic = SVC(probability=False) #remove probability=True if we do not want thresholding. It will train faster.
svm_basic.fit(X_train_scaled, y_train)
predict_basic = svm_basic.predict(X_test_complete_thr)
print(classification_report(y_test_complete_thr, predict_basic))
print(accuracy_score(y_test_complete_thr, predict_basic))
joblib.dump(svm_basic, "svm_basic.pkl")

              precision    recall  f1-score   support

       False       0.91      0.99      0.95     13834
        True       0.85      0.45      0.59      2416

    accuracy                           0.91     16250
   macro avg       0.88      0.72      0.77     16250
weighted avg       0.90      0.91      0.89     16250

0.9065230769230769


['svm_basic.pkl']

In [31]:
df_features_zeroed = df.copy()
for column in df_features_zeroed.columns:
    for vals in sensitive_groups.values():
        if column in vals:
            df_features_zeroed[column] = df_features_zeroed[column] * 0
        
X_features_zeroed = df_features_zeroed.drop(['checked', 'Ja', 'Nee'],  axis=1)
y_features_zeroed = df_features_zeroed['checked']
        
X_train_fz, X_test_fz, y_train_fz, y_test_fz = train_test_split(X_features_zeroed, y_features_zeroed, test_size=0.25, random_state=42)

X_train_scaled_fz = scaler.fit_transform(X_train_fz)
X_test_scaled_fz = scaler.transform(X_test_fz)

In [14]:
svm_features_zero = SVC(probability=False) #remove probability=True if we do not want thresholding. It will train faster.
svm_features_zero.fit(X_train_scaled_fz, y_train_fz)
predict_features_zero = svm_features_zero.predict(X_test_complete_thr)
print(classification_report(y_test_complete_thr, predict_features_zero))
print(accuracy_score(y_test_complete_thr, predict_features_zero))
joblib.dump(svm_features_zero, "svm_features_zero.pkl")

              precision    recall  f1-score   support

       False       0.93      0.98      0.95     13834
        True       0.82      0.55      0.66      2416

    accuracy                           0.92     16250
   macro avg       0.87      0.76      0.80     16250
weighted avg       0.91      0.92      0.91     16250

0.9150769230769231


['svm_features_zero.pkl']

In [32]:
synth_data = pd.read_csv("C:/Academics/TU Delft/Quarter 2/Software Engineering and Testing for AI Systems/Social-Welfare-Dataset/data/01_raw/synth_data_train_labeled.csv")
synth_data["checked"] = (synth_data["Ja"] > 0.7).astype(bool)
synth_data.head()

Unnamed: 0,adres_aantal_brp_adres,adres_aantal_verschillende_wijken,adres_aantal_verzendadres,adres_aantal_woonadres_handmatig,adres_dagen_op_adres,adres_recentst_onderdeel_rdam,adres_recentste_buurt_groot_ijsselmonde,adres_recentste_buurt_nieuwe_westen,adres_recentste_buurt_other,adres_recentste_buurt_oude_noorden,...,typering_hist_ind,typering_hist_sector_zorg,typering_ind,typering_indicatie_geheime_gegevens,typering_other,typering_transport__logistiek___tuinbouw,typering_zorg__schoonmaak___welzijn,Ja,Nee,checked
0,2,1,0,1,17740,1,0,0,0,0,...,1,0,1,0,1,0,0,0.732221,0.267779,True
1,4,2,0,0,473,1,0,0,0,0,...,1,0,0,0,0,0,0,0.379286,0.620714,False
2,2,1,0,1,3498,1,0,0,0,0,...,1,0,0,0,0,0,0,0.449903,0.550097,False
3,5,3,0,1,5441,1,0,0,0,0,...,1,0,1,0,0,0,0,0.480557,0.519443,False
4,1,1,0,0,22522,1,0,0,1,0,...,1,0,0,0,0,0,0,0.54039,0.45961,False


In [33]:
filtered_good_data = pd.DataFrame(synth_data.loc[((synth_data['checked']==True) & (synth_data['persoonlijke_eigenschappen_taaleis_schrijfv_ok']==1)) | ((synth_data['checked'] == True) & (synth_data['persoon_leeftijd_bij_onderzoek']>=30))])

In [16]:

filtered_good_data.describe()

Unnamed: 0,adres_aantal_brp_adres,adres_aantal_verschillende_wijken,adres_aantal_verzendadres,adres_aantal_woonadres_handmatig,adres_dagen_op_adres,adres_recentst_onderdeel_rdam,adres_recentste_buurt_groot_ijsselmonde,adres_recentste_buurt_nieuwe_westen,adres_recentste_buurt_other,adres_recentste_buurt_oude_noorden,...,typering_hist_inburgeringsbehoeftig,typering_hist_ind,typering_hist_sector_zorg,typering_ind,typering_indicatie_geheime_gegevens,typering_other,typering_transport__logistiek___tuinbouw,typering_zorg__schoonmaak___welzijn,Ja,Nee
count,6572.0,6572.0,6572.0,6572.0,6572.0,6572.0,6572.0,6572.0,6572.0,6572.0,...,6572.0,6572.0,6572.0,6572.0,6572.0,6572.0,6572.0,6572.0,6572.0,6572.0
mean,3.128728,2.232958,0.38451,0.572581,8440.243457,0.943396,0.00213,0.0035,0.50426,0.000761,...,0.005173,1.0,0.008521,0.718503,0.068016,0.47535,0.01476,0.008369,0.729755,0.270245
std,1.588513,0.990008,0.517733,0.583909,6121.938095,0.231102,0.046109,0.059059,0.50002,0.027574,...,0.071746,0.0,0.091922,0.449764,0.251792,0.515622,0.120598,0.091105,0.023427,0.023427
min,1.0,1.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.700004,0.156573
25%,2.0,2.0,0.0,0.0,3285.75,1.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.711066,0.256736
50%,3.0,2.0,0.0,1.0,7223.0,1.0,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.724445,0.275555
75%,4.0,3.0,1.0,1.0,12607.5,1.0,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.743264,0.288934
max,11.0,7.0,3.0,3.0,24331.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,2.0,1.0,1.0,0.843427,0.299996


In [11]:
filtered_good_data["checked"].value_counts()

checked
True    6572
Name: count, dtype: int64

In [34]:
X_gd = filtered_good_data.drop(['checked', 'Ja', 'Nee'],  axis=1)
y_gd = filtered_good_data['checked']
good_data_X = pd.concat([X_train, X_gd])
good_data_y = pd.concat([y_train, y_gd])
#TODO SANKALP This should be a combination of the extra generated data, and the already existing X_train, y_train. The checked, ja, nee columns should be cut from the extra generated data
#something like
#X_gd = extra_data.drop(['checked', 'Ja', 'Nee'],  axis=1)
#y_gd = extra_data['checked'] 
#good_data_X = X_gd + X_train
#good_data_y = y_gd + y_train


In [35]:
X_train_gd, y_train_gd = good_data_X.copy(), good_data_y.copy()
X_test_gd, y_test_gd = X_test.copy(), y_test.copy()

X_train_scaled_gd = scaler.fit_transform(X_train_gd)
X_test_scaled_gd = scaler.transform(X_test_gd)

In [21]:
svm_gd = SVC(probability=False) #remove probability=True if we do not want thresholding. It will train faster.
svm_gd.fit(X_train_scaled_gd, y_train_gd)
predict_gd = svm_gd.predict(X_test_complete_thr)
print(classification_report(y_test_complete_thr, predict_gd))
print(accuracy_score(y_test_complete_thr, predict_gd))
joblib.dump(svm_gd, "svm_gd.pkl")

              precision    recall  f1-score   support

       False       0.93      0.96      0.95     13834
        True       0.75      0.60      0.67      2416

    accuracy                           0.91     16250
   macro avg       0.84      0.78      0.81     16250
weighted avg       0.91      0.91      0.91     16250

0.9113230769230769


['svm_gd.pkl']

In [36]:
X_train_scaled_gd_df = pd.DataFrame(X_train_scaled_gd, columns=X_train_gd.columns)

In [37]:
X_train_complete = X_train_scaled_gd_df.copy()
for column in X_train_complete.columns:
    for vals in sensitive_groups.values():
        if column in vals:
            X_train_complete[column] = X_train_complete[column] * 0
y_train_complete = y_train_gd.copy()
X_test_complete = X_test_scaled_fz.copy()
y_test_complete = y_test_gd.copy()

In [24]:
svm_complete = SVC(probability=False) #remove probability=True if we do not want thresholding. It will train faster.
svm_complete.fit(X_train_complete, y_train_complete)
predict_complete = svm_complete.predict(X_test_complete_thr)
print(classification_report(y_test_complete_thr, predict_complete))
print(accuracy_score(y_test_complete_thr, predict_complete))
joblib.dump(svm_complete, "svm_complete.pkl")



              precision    recall  f1-score   support

       False       0.95      0.95      0.95     13834
        True       0.70      0.69      0.70      2416

    accuracy                           0.91     16250
   macro avg       0.82      0.82      0.82     16250
weighted avg       0.91      0.91      0.91     16250

0.9097846153846154


['svm_complete.pkl']

In [38]:
X_train_complete, y_train_complete, 
X_test_complete_thr, X_validation_complete_thr, y_test_complete_thr, y_validation_complete_thr = train_test_split(X_test_complete, y_test_complete, test_size=0.5, random_state=42)

In [26]:
svm_complete_thr = SVC(probability=True) #remove probability=True if we do not want thresholding. It will train faster.
svm_complete_thr.fit(X_train_complete, y_train_complete)
predict_complete_thr = svm_complete_thr.predict(X_test_complete_thr)
print(classification_report(y_test_complete_thr, predict_complete_thr))
print(accuracy_score(y_test_complete_thr, predict_complete_thr))
joblib.dump(svm_complete_thr, "svm_complete_for_thresholding.pkl")



              precision    recall  f1-score   support

       False       0.95      0.95      0.95     13834
        True       0.70      0.69      0.70      2416

    accuracy                           0.91     16250
   macro avg       0.82      0.82      0.82     16250
weighted avg       0.91      0.91      0.91     16250

0.9097846153846154


['svm_complete_for_thresholding.pkl']

In [59]:
dtree_basic = DecisionTreeClassifier(random_state=42)
dtree_basic.fit(X_train_scaled, y_train)
y_pred_bad_model = dtree_basic.predict(X_test_complete_thr)
print(classification_report(y_test_complete_thr, y_pred_bad_model))
print(accuracy_score(y_test_complete_thr, y_pred_bad_model))
joblib.dump(dtree_basic, "dtree_basic.pkl")

              precision    recall  f1-score   support

       False       0.88      0.96      0.92     13834
        True       0.51      0.27      0.35      2416

    accuracy                           0.85     16250
   macro avg       0.70      0.61      0.63     16250
weighted avg       0.83      0.85      0.83     16250

0.8529846153846153


['dtree_basic.pkl']

In [None]:
adaboost_basic = AdaBoostClassifier(random_state=42)

In [49]:
# bias against young people who identify as male
filtered_bad_data = pd.DataFrame(synth_data[((synth_data['checked']==True) & (synth_data['persoon_geslacht_vrouw']==0)) & ((synth_data['checked'] == True) & (synth_data['persoon_leeftijd_bij_onderzoek']<=30))])

In [50]:
X_bd = filtered_bad_data.drop(['checked', 'Ja', 'Nee'],  axis=1)
y_bd = filtered_bad_data['checked']

In [51]:
#TODO SANKALP Same as before but bad data evil data bad hurr durr
bad_data_X = pd.concat([X_train, X_bd])
bad_data_y = pd.concat([y_train, y_bd])

In [52]:
X_train_bd, y_train_bd = bad_data_X.copy(), bad_data_y.copy()
X_test_bd, y_test_bd = X_test.copy(), y_test.copy()
X_train_scaled_bd = scaler.fit_transform(X_train_bd)
X_test_scaled_bd = scaler.transform(X_test_bd)

In [None]:
dtree_bd = DecisionTreeClassifier(random_state=42)
dtree_bd.fit(X_train_scaled_bd, y_train_bd)
y_pred_bad_model_bd = dtree_bd.predict(X_test_complete_thr)
print(classification_report(y_test_complete_thr, y_pred_bad_model_bd))
print(accuracy_score(y_test_complete_thr, y_pred_bad_model_bd))
joblib.dump(dtree_bd, "dtree_bd.pkl")

              precision    recall  f1-score   support

       False       0.88      0.96      0.92     13834
        True       0.53      0.26      0.35      2416

    accuracy                           0.86     16250
   macro avg       0.70      0.61      0.64     16250
weighted avg       0.83      0.86      0.83     16250

0.8552


['dtree_bd.pkl']

In [13]:
from tqdm import tqdm
def mutation_tests(name, mutations):
    # Train the mutated model
    mutated_accuracies = []
    for i in tqdm(range(len(mutations))):
        mutated_model = mutations[i]
        print("Mutated model: ", mutated_model)
        mutated_model.fit(X_train_scaled, y_train)
        y_pred_mutated = mutated_model.predict(X_test_complete_thr)
        mutated_accuracy = accuracy_score(y_test_complete_thr, y_pred_mutated)
        print(mutated_accuracy)
        mutated_accuracies.append(mutated_accuracy)
        filename = "{0}_mutation_{1}.pkl".format(name, i)
        joblib.dump(mutated_model, filename)
    return mutated_accuracies

In [34]:
joblib.dump(X_test_complete_thr, "X_test_complete_thr.pkl")
joblib.dump(y_test_complete_thr, "y_test_complete_thr.pkl")

['y_test_complete_thr.pkl']

In [22]:
joblib.dump(X_validation_complete_thr, "X_validation_complete_thr.pkl")
joblib.dump(y_validation_complete_thr, "y_validation_complete_thr.pkl")

['y_validation_complete_thr.pkl']

In [60]:
joblib.dump(X_train_scaled_bd, "X_train_scaled_bd.pkl")
joblib.dump(y_train_bd, "y_train_bd.pkl")

['y_train_bd.pkl']

In [66]:
from sklearn.ensemble import AdaBoostClassifier

adaboost_bd = AdaBoostClassifier(random_state=42, learning_rate=0.6, estimator=DecisionTreeClassifier(max_depth=3))
adaboost_bd.fit(X_train_bd, y_train_bd)
ada_predict_bd = adaboost_bd.predict(X_test_complete_thr)
print(classification_report(y_test_complete_thr, ada_predict_bd))
print(accuracy_score(y_test_complete_thr, ada_predict_bd))




              precision    recall  f1-score   support

       False       0.94      0.86      0.90     13834
        True       0.46      0.70      0.56      2416

    accuracy                           0.83     16250
   macro avg       0.70      0.78      0.73     16250
weighted avg       0.87      0.83      0.85     16250

0.8340307692307692


In [63]:
joblib.dump(adaboost_bd, "ada_predict_bd.pkl")

['ada_predict_bd.pkl']

In [15]:
mutations_svc = [SVC(kernel='poly'), SVC(C=2), SVC(gamma='auto')]
mutated_accuracies_SVC = mutation_tests("SVC", mutations=mutations_svc)

  0%|          | 0/3 [00:00<?, ?it/s]

Mutated model:  SVC(kernel='poly')


 33%|███▎      | 1/3 [51:50<1:43:41, 3110.54s/it]

0.8822153846153846
Mutated model:  SVC(C=2)


 67%|██████▋   | 2/3 [1:29:14<43:21, 2601.00s/it]

0.9074461538461538
Mutated model:  SVC(gamma='auto')


100%|██████████| 3/3 [1:51:39<00:00, 2233.17s/it]

0.9065846153846154





In [14]:
mutations_dtree = [DecisionTreeClassifier(max_depth=2, random_state=42), DecisionTreeClassifier(criterion='entropy', random_state=42), DecisionTreeClassifier(random_state=1)]
mutated_accuracies_dtree = mutation_tests("dtree", mutations=mutations_dtree)

  0%|          | 0/3 [00:00<?, ?it/s]

Mutated model:  DecisionTreeClassifier(max_depth=2, random_state=42)


 33%|███▎      | 1/3 [00:00<00:01,  1.38it/s]

0.851323076923077
Mutated model:  DecisionTreeClassifier(criterion='entropy', random_state=42)


 67%|██████▋   | 2/3 [00:05<00:03,  3.35s/it]

0.8540923076923077
Mutated model:  DecisionTreeClassifier(random_state=1)


100%|██████████| 3/3 [00:13<00:00,  4.35s/it]

0.8531076923076923



