In [1]:
import numpy as np

In [2]:
from jenga.tasks.income import IncomeEstimationTask

# Set up the income estimation task with a fixed seed of 42. Every later cell
# reads its train/test data and computes its scores through this `task` object.
task = IncomeEstimationTask(seed=42)

In [3]:
# Fit the task's baseline model. The returned `model` is the object the later
# cells call `predict_proba` on for both corrupted and cleaned data.
model = task.fit_baseline_model()

In [4]:
# Report the baseline performance (labelled as ROC AUC on the test data); this
# is the reference value the corruption experiments below are compared against.
f"The ROC AUC score on the test data is {task.get_baseline_performance()}"

'The ROC AUC score on the test data is 0.8771970030335221'

In [5]:
# Display the test data that the corruptions below are applied to.
task.test_data

Unnamed: 0,workclass,occupation,marital_status,education,hours_per_week,age
16309,Private,Tech-support,Married-civ-spouse,HS-grad,40,38
29284,Private,Priv-house-serv,Never-married,5th-6th,40,27
29738,Self-emp-not-inc,Sales,Never-married,HS-grad,40,31
24747,Private,Tech-support,Never-married,HS-grad,10,18
30140,Private,Adm-clerical,Never-married,Some-college,25,21
...,...,...,...,...,...,...
5020,,,Never-married,Some-college,20,18
5002,Private,Exec-managerial,Married-civ-spouse,HS-grad,40,26
1156,Private,Exec-managerial,Married-civ-spouse,HS-grad,45,46
1173,Private,Other-service,Never-married,Assoc-voc,30,20


### Missing values in the 'age' column

In [6]:
from jenga.corruptions.generic import MissingValues

# Corruption that injects missing values into the 'age' column, writing -1.0 as
# the placeholder (fraction=0.8 -- presumably 80% of the rows; confirm against
# the MissingValues documentation).
age_corruption = MissingValues(
    column="age",
    fraction=0.8,
    na_value=-1.0,
)
corrupted_test_data = age_corruption.transform(task.test_data)

# Score the baseline model on the corrupted test data.
y_pred = model.predict_proba(corrupted_test_data)
f"The ROC AUC score on the corrupted test data is {task.score_on_test_data(y_pred)}"

'The ROC AUC score on the corrupted test data is 0.8529848786552953'

Cleaning via mean imputation

In [7]:
# Mean of the 'age' column in the training data; the next cell uses it as the
# fill value for the placeholder entries.
mean_age = np.mean(task.train_data["age"].to_numpy())
mean_age

38.593596437346434

In [8]:
# Work on a deep copy (the default for DataFrame.copy) so the corrupted frame
# itself is left untouched.
clean_test_data = corrupted_test_data.copy()

# The corruption wrote -1.0 as the placeholder, so a negative age marks a
# missing entry; overwrite those entries with the training-set mean.
age_is_missing = clean_test_data["age"] < 0
clean_test_data.loc[age_is_missing, "age"] = mean_age
clean_test_data

Unnamed: 0,workclass,occupation,marital_status,education,hours_per_week,age
16309,Private,Tech-support,Married-civ-spouse,HS-grad,40,38.593596
29284,Private,Priv-house-serv,Never-married,5th-6th,40,38.593596
29738,Self-emp-not-inc,Sales,Never-married,HS-grad,40,38.593596
24747,Private,Tech-support,Never-married,HS-grad,10,18.000000
30140,Private,Adm-clerical,Never-married,Some-college,25,21.000000
...,...,...,...,...,...,...
5020,,,Never-married,Some-college,20,38.593596
5002,Private,Exec-managerial,Married-civ-spouse,HS-grad,40,38.593596
1156,Private,Exec-managerial,Married-civ-spouse,HS-grad,45,38.593596
1173,Private,Other-service,Never-married,Assoc-voc,30,38.593596


In [9]:
# Score the baseline model on the mean-imputed data.
y_pred_cleaned = model.predict_proba(clean_test_data)

# The message names the *cleaned* data: this cell scores `clean_test_data`, not
# the corrupted frame, so the label must not repeat the one used for the
# corrupted score above.
f"The ROC AUC score on the cleaned test data is {task.score_on_test_data(y_pred_cleaned)}"

'The ROC AUC score on the cleaned test data is 0.8768871175335453'

### Missing values in 'marital_status'

In [10]:
from jenga.corruptions.generic import MissingValues

# Corruption that injects missing values into the categorical 'marital_status'
# column, using NaN as the placeholder (fraction=0.8 -- presumably 80% of the
# rows; confirm against the MissingValues documentation).
marital_status_corruption = MissingValues(column='marital_status', fraction=0.8, na_value=np.nan)

In [11]:
# Apply the 'marital_status' corruption to the task's test data and score the
# baseline model on the result.
# NOTE: this rebinds `corrupted_test_data` and `y_pred`, which previously held
# the results of the 'age' experiment.
corrupted_test_data = marital_status_corruption.transform(task.test_data)
y_pred = model.predict_proba(corrupted_test_data)

f"The ROC AUC score on the corrupted test data is {task.score_on_test_data(y_pred)}"

'The ROC AUC score on the corrupted test data is 0.8127799042685139'

In [12]:
# Display the corrupted frame to inspect the missing 'marital_status' entries.
corrupted_test_data 

Unnamed: 0,workclass,occupation,marital_status,education,hours_per_week,age
16309,Private,Tech-support,,HS-grad,40,38
29284,Private,Priv-house-serv,Never-married,5th-6th,40,27
29738,Self-emp-not-inc,Sales,,HS-grad,40,31
24747,Private,Tech-support,,HS-grad,10,18
30140,Private,Adm-clerical,Never-married,Some-college,25,21
...,...,...,...,...,...,...
5020,,,,Some-college,20,18
5002,Private,Exec-managerial,Married-civ-spouse,HS-grad,40,26
1156,Private,Exec-managerial,,HS-grad,45,46
1173,Private,Other-service,,Assoc-voc,30,20


Cleaning via mode imputation

In [13]:
# Count each 'marital_status' category in the training data; the most frequent
# one (the mode) is the fill value used for imputation in the next cell.
task.train_data['marital_status'].value_counts()

Married-civ-spouse       12003
Never-married             8525
Divorced                  3569
Separated                  814
Widowed                    786
Married-spouse-absent      335
Married-AF-spouse           16
Name: marital_status, dtype: int64

In [14]:
# Work on a deep copy so the corrupted frame itself is left untouched.
clean_test_data = corrupted_test_data.copy(deep=True)

# Derive the fill value from the training data instead of hard-coding it, so the
# imputation stays correct if the data changes. `mode()` returns the most
# frequent value(s) in sorted order; take the first one.
marital_status_mode = task.train_data['marital_status'].mode().iloc[0]

# Assign the filled column back rather than calling `fillna(..., inplace=True)`
# on the selected column: that chained in-place call operates on an intermediate
# object and does not update the frame under pandas Copy-on-Write (recent pandas
# versions warn about it).
clean_test_data['marital_status'] = clean_test_data['marital_status'].fillna(marital_status_mode)
clean_test_data

Unnamed: 0,workclass,occupation,marital_status,education,hours_per_week,age
16309,Private,Tech-support,Married-civ-spouse,HS-grad,40,38
29284,Private,Priv-house-serv,Never-married,5th-6th,40,27
29738,Self-emp-not-inc,Sales,Married-civ-spouse,HS-grad,40,31
24747,Private,Tech-support,Married-civ-spouse,HS-grad,10,18
30140,Private,Adm-clerical,Never-married,Some-college,25,21
...,...,...,...,...,...,...
5020,,,Married-civ-spouse,Some-college,20,18
5002,Private,Exec-managerial,Married-civ-spouse,HS-grad,40,26
1156,Private,Exec-managerial,Married-civ-spouse,HS-grad,45,46
1173,Private,Other-service,Married-civ-spouse,Assoc-voc,30,20


In [15]:
# Score the baseline model on the mode-imputed data.
y_pred_cleaned = model.predict_proba(clean_test_data)

# The message names the *cleaned* data: this cell scores `clean_test_data`, not
# the corrupted frame, so the label must not repeat the one used for the
# corrupted score above.
f"The ROC AUC score on the cleaned test data is {task.score_on_test_data(y_pred_cleaned)}"

'The ROC AUC score on the cleaned test data is 0.8127799042685139'