In [1]:
import numpy as np

In [2]:
from jenga.tasks.income import IncomeEstimationTask

# Create the income estimation task with a fixed seed of 42.
task = IncomeEstimationTask(seed=42)

In [3]:
# Fit the task's baseline model; every prediction further down reuses this model.
model = task.fit_baseline_model()

In [4]:
# Baseline performance as reported by the task, shown as the cell output.
baseline_auc = task.get_baseline_performance()
f"The ROC AUC score on the test data is {baseline_auc}"

'The ROC AUC score on the test data is 0.8835845614128887'

In [5]:
# Display the task's test data; this is the frame the corruptions below are applied to.
task.test_data

Unnamed: 0,workclass,occupation,marital_status,education,hours_per_week,age
20470,Private,Sales,Divorced,11th,20,19
25039,Private,Sales,Married-civ-spouse,HS-grad,45,33
10752,Local-gov,Sales,Married-civ-spouse,Bachelors,40,45
24062,Private,Prof-specialty,Married-civ-spouse,Some-college,40,24
17646,Private,Prof-specialty,Married-civ-spouse,Bachelors,15,41
...,...,...,...,...,...,...
5679,,,Married-civ-spouse,Bachelors,10,76
28363,Self-emp-inc,Sales,Married-civ-spouse,Some-college,90,52
7694,Self-emp-inc,Exec-managerial,Married-civ-spouse,Some-college,50,37
283,Private,Prof-specialty,Divorced,Bachelors,42,60


### Missing values in the 'age' column

In [6]:
from jenga.corruptions.generic import MissingValues

# Missing-value corruption for the 'age' column: fraction=0.8, with -1.0 as
# the placeholder written in place of the removed values.
age_corruption = MissingValues(
    column='age',
    fraction=0.8,
    na_value=-1.0,
)

# Corrupt the test data, predict with the baseline model and score the result.
corrupted_test_data = age_corruption.transform(task.test_data)
y_pred = model.predict_proba(corrupted_test_data)
corrupted_auc = task.score_on_test_data(y_pred)

f"The ROC AUC score on the corrupted test data is {corrupted_auc}"

'The ROC AUC score on the corrupted test data is 0.8687577049503954'

Cleaning via mean imputation

In [7]:
# Mean of the 'age' column in the training data; used below as the imputation value.
train_ages = task.train_data['age'].values
mean_age = np.mean(train_ages)
mean_age

38.56979422604422

In [8]:
# Work on a deep copy so the corrupted frame itself is left untouched.
clean_test_data = corrupted_test_data.copy(deep=True)

# Negative ages mark the -1.0 placeholders set by the corruption;
# overwrite them with the mean age computed from the training data.
age_is_missing = clean_test_data['age'] < 0
clean_test_data.loc[age_is_missing, 'age'] = mean_age
clean_test_data

Unnamed: 0,workclass,occupation,marital_status,education,hours_per_week,age
20470,Private,Sales,Divorced,11th,20,38.569794
25039,Private,Sales,Married-civ-spouse,HS-grad,45,38.569794
10752,Local-gov,Sales,Married-civ-spouse,Bachelors,40,38.569794
24062,Private,Prof-specialty,Married-civ-spouse,Some-college,40,24.000000
17646,Private,Prof-specialty,Married-civ-spouse,Bachelors,15,41.000000
...,...,...,...,...,...,...
5679,,,Married-civ-spouse,Bachelors,10,38.569794
28363,Self-emp-inc,Sales,Married-civ-spouse,Some-college,90,38.569794
7694,Self-emp-inc,Exec-managerial,Married-civ-spouse,Some-college,50,38.569794
283,Private,Prof-specialty,Divorced,Bachelors,42,38.569794


In [9]:
# Score the baseline model on the mean-imputed test data.
y_pred_cleaned = model.predict_proba(clean_test_data)

# The label says "cleaned" so this score is not confused with the one
# reported for the corrupted (un-imputed) data.
f"The ROC AUC score on the cleaned test data is {task.score_on_test_data(y_pred_cleaned)}"

'The ROC AUC score on the cleaned test data is 0.8788907107729987'

### Missing values in 'marital_status'

In [10]:
from jenga.corruptions.generic import MissingValues

# Missing-value corruption for the 'marital_status' column: fraction=0.8,
# with NaN as the placeholder for the removed values.
marital_status_corruption = MissingValues(
    column='marital_status',
    fraction=0.8,
    na_value=np.nan,
)

In [11]:
# Corrupt 'marital_status' in the test data, predict with the baseline model
# and score the predictions.
corrupted_test_data = marital_status_corruption.transform(task.test_data)
y_pred = model.predict_proba(corrupted_test_data)
corrupted_auc = task.score_on_test_data(y_pred)

f"The ROC AUC score on the corrupted test data is {corrupted_auc}"

'The ROC AUC score on the corrupted test data is 0.8220291407778619'

In [12]:
# Display the test data after the 'marital_status' corruption was applied.
corrupted_test_data 

Unnamed: 0,workclass,occupation,marital_status,education,hours_per_week,age
20470,Private,Sales,,11th,20,19
25039,Private,Sales,Married-civ-spouse,HS-grad,45,33
10752,Local-gov,Sales,,Bachelors,40,45
24062,Private,Prof-specialty,,Some-college,40,24
17646,Private,Prof-specialty,Married-civ-spouse,Bachelors,15,41
...,...,...,...,...,...,...
5679,,,,Bachelors,10,76
28363,Self-emp-inc,Sales,Married-civ-spouse,Some-college,90,52
7694,Self-emp-inc,Exec-managerial,,Some-college,50,37
283,Private,Prof-specialty,,Bachelors,42,60


Cleaning via mode imputation

In [13]:
# Count how often each 'marital_status' value occurs in the training data.
task.train_data['marital_status'].value_counts()

Married-civ-spouse       11996
Never-married             8536
Divorced                  3541
Separated                  815
Widowed                    790
Married-spouse-absent      352
Married-AF-spouse           18
Name: marital_status, dtype: int64

In [14]:
# Most frequent 'marital_status' in the training data, computed rather than
# hard-coded so the imputation value always matches the data.
mode_marital_status = task.train_data['marital_status'].mode().iloc[0]

# Work on a deep copy so the corrupted frame itself is left untouched.
clean_test_data = corrupted_test_data.copy(deep=True)

# Assign the filled column back instead of calling fillna(inplace=True) on a
# column selection: that chained in-place form is deprecated in pandas and
# does not modify the frame when Copy-on-Write is enabled.
clean_test_data['marital_status'] = clean_test_data['marital_status'].fillna(mode_marital_status)
clean_test_data

Unnamed: 0,workclass,occupation,marital_status,education,hours_per_week,age
20470,Private,Sales,Married-civ-spouse,11th,20,19
25039,Private,Sales,Married-civ-spouse,HS-grad,45,33
10752,Local-gov,Sales,Married-civ-spouse,Bachelors,40,45
24062,Private,Prof-specialty,Married-civ-spouse,Some-college,40,24
17646,Private,Prof-specialty,Married-civ-spouse,Bachelors,15,41
...,...,...,...,...,...,...
5679,,,Married-civ-spouse,Bachelors,10,76
28363,Self-emp-inc,Sales,Married-civ-spouse,Some-college,90,52
7694,Self-emp-inc,Exec-managerial,Married-civ-spouse,Some-college,50,37
283,Private,Prof-specialty,Married-civ-spouse,Bachelors,42,60


In [15]:
# Score the baseline model on the mode-imputed test data.
y_pred_cleaned = model.predict_proba(clean_test_data)

# The label says "cleaned" so this score is not confused with the one
# reported for the corrupted (un-imputed) data.
f"The ROC AUC score on the cleaned test data is {task.score_on_test_data(y_pred_cleaned)}"

'The ROC AUC score on the cleaned test data is 0.8163766375108087'