In [1]:
import numpy as np

In [2]:
from jenga.tasks.income import IncomeEstimationTask

# Seed handed to the task (presumably for reproducible setup — confirm against jenga).
SEED = 42
task = IncomeEstimationTask(seed=SEED)

In [3]:
# Fit the task's baseline model; later cells call predict_proba on the returned object.
model = task.fit_baseline_model()

In [4]:
# Ask the task for the baseline model's performance and report it.
baseline_score = task.get_baseline_performance()
f"The ROC AUC score on the test data is {baseline_score}"

'The ROC AUC score on the test data is 0.8835656042788887'

In [5]:
# Inspect the task's test data; the corruptions below are applied to this frame.
task.test_data

Unnamed: 0,workclass,occupation,marital_status,education,hours_per_week,age
23038,Self-emp-not-inc,Transport-moving,Separated,HS-grad,70,35
30795,Private,Craft-repair,Married-civ-spouse,HS-grad,45,34
10734,Private,Craft-repair,Married-civ-spouse,Some-college,45,35
9733,Private,Adm-clerical,Widowed,HS-grad,24,69
13522,Federal-gov,Tech-support,Never-married,Some-college,24,20
...,...,...,...,...,...,...
23795,Private,Sales,Divorced,HS-grad,25,31
8091,Private,Exec-managerial,Married-civ-spouse,Masters,45,59
29601,,,Married-civ-spouse,HS-grad,20,65
26017,Private,Transport-moving,Never-married,Some-college,50,25


### Missing values in the 'age' column

In [6]:
from jenga.corruptions.generic import MissingValues

# Corruption that marks 'age' values as missing using the placeholder -1.0
# (fraction=0.8 is presumably the share of affected rows — confirm against jenga).
age_corruption = MissingValues(
    column='age',
    fraction=0.8,
    na_value=-1.0,
)

# Corrupt the test data, predict with the baseline model, and score the predictions.
corrupted_test_data = age_corruption.transform(task.test_data)
y_pred = model.predict_proba(corrupted_test_data)
corrupted_score = task.score_on_test_data(y_pred)

f"The ROC AUC score on the corrupted test data is {corrupted_score}"

'The ROC AUC score on the corrupted test data is 0.8702776815283744'

Cleaning via mean imputation: replace the placeholder ages (-1.0) with the mean age of the training data.

In [7]:
# Mean of the 'age' column in the training data; used below as the imputation value.
train_ages = task.train_data['age'].values
mean_age = train_ages.mean()
mean_age

38.601082616707615

In [8]:
# Work on a deep copy so the corrupted frame stays available for comparison.
clean_test_data = corrupted_test_data.copy(deep=True)

# Negative ages can only be the -1.0 placeholder written by the corruption above.
is_missing_age = clean_test_data['age'] < 0
clean_test_data.loc[is_missing_age, 'age'] = mean_age

clean_test_data

Unnamed: 0,workclass,occupation,marital_status,education,hours_per_week,age
23038,Self-emp-not-inc,Transport-moving,Separated,HS-grad,70,38.601083
30795,Private,Craft-repair,Married-civ-spouse,HS-grad,45,38.601083
10734,Private,Craft-repair,Married-civ-spouse,Some-college,45,38.601083
9733,Private,Adm-clerical,Widowed,HS-grad,24,69.000000
13522,Federal-gov,Tech-support,Never-married,Some-college,24,20.000000
...,...,...,...,...,...,...
23795,Private,Sales,Divorced,HS-grad,25,38.601083
8091,Private,Exec-managerial,Married-civ-spouse,Masters,45,38.601083
29601,,,Married-civ-spouse,HS-grad,20,38.601083
26017,Private,Transport-moving,Never-married,Some-college,50,38.601083


In [9]:
# Predict on the mean-imputed test data and score the predictions.
# The label says "cleaned" (not "corrupted") because the score is for clean_test_data.
y_pred_cleaned = model.predict_proba(clean_test_data)

f"The ROC AUC score on the cleaned test data is {task.score_on_test_data(y_pred_cleaned)}"

'The ROC AUC score on the cleaned test data is 0.8784751318388343'

### Missing values in the 'marital_status' column

In [10]:
from jenga.corruptions.generic import MissingValues

# Corruption that marks 'marital_status' values as missing using NaN
# (fraction=0.8 is presumably the share of affected rows — confirm against jenga).
marital_status_corruption = MissingValues(
    column='marital_status',
    fraction=0.8,
    na_value=np.nan,
)

In [11]:
# Corrupt the original test data, predict with the baseline model, and score the predictions.
corrupted_test_data = marital_status_corruption.transform(task.test_data)
y_pred = model.predict_proba(corrupted_test_data)
corrupted_score = task.score_on_test_data(y_pred)

f"The ROC AUC score on the corrupted test data is {corrupted_score}"

'The ROC AUC score on the corrupted test data is 0.8173750037271653'

In [12]:
# Inspect the test data after the 'marital_status' corruption has been applied.
corrupted_test_data 

Unnamed: 0,workclass,occupation,marital_status,education,hours_per_week,age
23038,Self-emp-not-inc,Transport-moving,,HS-grad,70,35
30795,Private,Craft-repair,Married-civ-spouse,HS-grad,45,34
10734,Private,Craft-repair,,Some-college,45,35
9733,Private,Adm-clerical,,HS-grad,24,69
13522,Federal-gov,Tech-support,Never-married,Some-college,24,20
...,...,...,...,...,...,...
23795,Private,Sales,,HS-grad,25,31
8091,Private,Exec-managerial,Married-civ-spouse,Masters,45,59
29601,,,,HS-grad,20,65
26017,Private,Transport-moving,,Some-college,50,25


Cleaning via mode imputation: replace the missing values with the most frequent marital status in the training data.

In [13]:
# Count how often each 'marital_status' category occurs in the training data;
# the top entry is the mode used for imputation.
task.train_data['marital_status'].value_counts()

Married-civ-spouse       11996
Never-married             8545
Divorced                  3540
Separated                  824
Widowed                    791
Married-spouse-absent      332
Married-AF-spouse           20
Name: marital_status, dtype: int64

In [14]:
# Work on a deep copy so the corrupted frame stays available for comparison.
clean_test_data = corrupted_test_data.copy(deep=True)

# Derive the imputation value from the training data instead of hard-coding it.
most_frequent_marital_status = task.train_data['marital_status'].mode().iloc[0]

# Assign the filled column back: calling fillna(..., inplace=True) on the
# selected column is chained in-place mutation, which pandas warns about and
# which does not update the frame once Copy-on-Write is enabled.
clean_test_data['marital_status'] = clean_test_data['marital_status'].fillna(
    most_frequent_marital_status
)

clean_test_data

Unnamed: 0,workclass,occupation,marital_status,education,hours_per_week,age
23038,Self-emp-not-inc,Transport-moving,Married-civ-spouse,HS-grad,70,35
30795,Private,Craft-repair,Married-civ-spouse,HS-grad,45,34
10734,Private,Craft-repair,Married-civ-spouse,Some-college,45,35
9733,Private,Adm-clerical,Married-civ-spouse,HS-grad,24,69
13522,Federal-gov,Tech-support,Never-married,Some-college,24,20
...,...,...,...,...,...,...
23795,Private,Sales,Married-civ-spouse,HS-grad,25,31
8091,Private,Exec-managerial,Married-civ-spouse,Masters,45,59
29601,,,Married-civ-spouse,HS-grad,20,65
26017,Private,Transport-moving,Married-civ-spouse,Some-college,50,25


In [15]:
# Predict on the mode-imputed test data and score the predictions.
# The label says "cleaned" (not "corrupted") because the score is for clean_test_data.
# NOTE(review): the stored score equals the score on the un-cleaned corrupted data —
# presumably the baseline pipeline already imputes missing categorical values; confirm.
y_pred_cleaned = model.predict_proba(clean_test_data)

f"The ROC AUC score on the cleaned test data is {task.score_on_test_data(y_pred_cleaned)}"

'The ROC AUC score on the cleaned test data is 0.8173750037271653'