In [1]:
import numpy as np

In [2]:
from jenga.tasks.income import IncomeEstimationTask

# Create the income-estimation task that supplies the train/test data, labels,
# baseline model and scoring used in every cell below.
# NOTE(review): seed=42 is presumably what makes the run reproducible — confirm in jenga.
task = IncomeEstimationTask(seed=42)

In [3]:
# Fit the task's baseline model on the training features and labels.
# This model is reused unchanged for all predictions below.
model = task.fit_baseline_model(task.train_data, task.train_labels)

Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    2.6s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:    3.2s finished


In [4]:
# Predict on the untouched test data; this score is the reference point for
# the corrupted and cleaned variants evaluated further down.
y_pred = model.predict_proba(task.test_data)

f"The ROC AUC score on the test data is {task.score_on_test_data(y_pred)}"

'The ROC AUC score on the test data is 0.8868040711646413'

In [5]:
# Show the test features; this is the frame the corruptions below are applied to.
task.test_data

Unnamed: 0,workclass,occupation,marital_status,education,hours_per_week,age
5595,Private,Craft-repair,Married-civ-spouse,HS-grad,45,60
29818,Private,Other-service,Divorced,HS-grad,40,24
480,Private,Craft-repair,Married-civ-spouse,Assoc-acdm,45,33
28532,Private,Machine-op-inspct,Married-civ-spouse,11th,40,52
32419,Private,Exec-managerial,Married-civ-spouse,Bachelors,55,36
...,...,...,...,...,...,...
7854,Self-emp-not-inc,Farming-fishing,Never-married,Some-college,50,25
7946,Private,Machine-op-inspct,Divorced,Some-college,40,29
12168,Federal-gov,Handlers-cleaners,Divorced,Some-college,40,38
30644,Private,Other-service,Never-married,Assoc-acdm,30,37


### Missing values in the 'age' column

In [6]:
from jenga.corruptions.generic import MissingValues

# Corrupt the 'age' column of the test data: a fraction of 0.8 is requested and
# -1.0 is the placeholder written in place of the removed values.
age_corruption = MissingValues(
    column="age",
    fraction=0.8,
    na_value=-1.0,
)
corrupted_test_data = age_corruption.transform(task.test_data)

# Score the unchanged model on the corrupted features.
y_pred = model.predict_proba(corrupted_test_data)
f"The ROC AUC score on the corrupted test data is {task.score_on_test_data(y_pred)}"

'The ROC AUC score on the corrupted test data is 0.871314973781681'

Cleaning via mean imputation

In [7]:
# Imputation value: the average age, taken from the training data rather than
# from the (corrupted) test data.
mean_age = task.train_data["age"].values.mean()
mean_age

38.61793611793612

In [8]:
# Work on a deep copy so corrupted_test_data itself is left untouched.
clean_test_data = corrupted_test_data.copy(deep=True)
# mean_age is fractional. Cast the column to float up front so the masked
# assignment below never writes a float into an integer-typed column, which is
# deprecated since pandas 2.1 and slated to become an error.
clean_test_data['age'] = clean_test_data['age'].astype(float)
# Negative ages are the -1.0 placeholder that marks the removed values.
clean_test_data.loc[clean_test_data['age'] < 0, 'age'] = mean_age
clean_test_data

Unnamed: 0,workclass,occupation,marital_status,education,hours_per_week,age
5595,Private,Craft-repair,Married-civ-spouse,HS-grad,45,38.617936
29818,Private,Other-service,Divorced,HS-grad,40,38.617936
480,Private,Craft-repair,Married-civ-spouse,Assoc-acdm,45,38.617936
28532,Private,Machine-op-inspct,Married-civ-spouse,11th,40,52.000000
32419,Private,Exec-managerial,Married-civ-spouse,Bachelors,55,36.000000
...,...,...,...,...,...,...
7854,Self-emp-not-inc,Farming-fishing,Never-married,Some-college,50,38.617936
7946,Private,Machine-op-inspct,Divorced,Some-college,40,38.617936
12168,Federal-gov,Handlers-cleaners,Divorced,Some-college,40,38.617936
30644,Private,Other-service,Never-married,Assoc-acdm,30,38.617936


In [9]:
# Score the unchanged model on the mean-imputed data.
y_pred_cleaned = model.predict_proba(clean_test_data)

# The label says "cleaned": this score is for the imputed frame, not the corrupted one.
f"The ROC AUC score on the cleaned test data is {task.score_on_test_data(y_pred_cleaned)}"

'The ROC AUC score on the corrupted test data is 0.8818042547827303'

### Missing values in the 'marital_status' column

In [10]:
from jenga.corruptions.generic import MissingValues

# Corruption for the categorical 'marital_status' column: a fraction of 0.8 is
# requested and NaN is the placeholder for the removed values.
marital_status_corruption = MissingValues(
    column="marital_status",
    fraction=0.8,
    na_value=np.nan,
)


In [11]:
# Apply the marital_status corruption to the original test data (this rebinds
# corrupted_test_data, replacing the age-corrupted frame) and score the
# unchanged model on the result.
corrupted_test_data = marital_status_corruption.transform(task.test_data)
y_pred = model.predict_proba(corrupted_test_data)

f"The ROC AUC score on the corrupted test data is {task.score_on_test_data(y_pred)}"

'The ROC AUC score on the corrupted test data is 0.8334395757145934'

In [12]:
# Inspect the corrupted frame produced by the marital_status corruption.
corrupted_test_data 

Unnamed: 0,workclass,occupation,marital_status,education,hours_per_week,age
5595,Private,Craft-repair,,HS-grad,45,60
29818,Private,Other-service,Divorced,HS-grad,40,24
480,Private,Craft-repair,,Assoc-acdm,45,33
28532,Private,Machine-op-inspct,,11th,40,52
32419,Private,Exec-managerial,Married-civ-spouse,Bachelors,55,36
...,...,...,...,...,...,...
7854,Self-emp-not-inc,Farming-fishing,,Some-college,50,25
7946,Private,Machine-op-inspct,Divorced,Some-college,40,29
12168,Federal-gov,Handlers-cleaners,,Some-college,40,38
30644,Private,Other-service,,Assoc-acdm,30,37


Cleaning via mode imputation

In [13]:
# Category frequencies in the training data; the most frequent category is the
# mode used as the fill value for imputation.
task.train_data['marital_status'].value_counts()

Married-civ-spouse       12038
Never-married             8497
Divorced                  3542
Widowed                    807
Separated                  806
Married-spouse-absent      338
Married-AF-spouse           20
Name: marital_status, dtype: int64

In [14]:
# Work on a deep copy so corrupted_test_data itself is left untouched.
clean_test_data = corrupted_test_data.copy(deep=True)
# Take the fill value from the training data instead of hard-coding the
# category name; .mode() returns the most frequent value(s), so take the first.
mode_marital_status = task.train_data['marital_status'].mode().iloc[0]
# Assign the filled column back explicitly. Calling fillna(..., inplace=True)
# on a column selection is chained assignment: it warns in recent pandas and
# leaves the frame unchanged when Copy-on-Write is enabled.
clean_test_data['marital_status'] = clean_test_data['marital_status'].fillna(mode_marital_status)
clean_test_data

Unnamed: 0,workclass,occupation,marital_status,education,hours_per_week,age
5595,Private,Craft-repair,Married-civ-spouse,HS-grad,45,60
29818,Private,Other-service,Divorced,HS-grad,40,24
480,Private,Craft-repair,Married-civ-spouse,Assoc-acdm,45,33
28532,Private,Machine-op-inspct,Married-civ-spouse,11th,40,52
32419,Private,Exec-managerial,Married-civ-spouse,Bachelors,55,36
...,...,...,...,...,...,...
7854,Self-emp-not-inc,Farming-fishing,Married-civ-spouse,Some-college,50,25
7946,Private,Machine-op-inspct,Divorced,Some-college,40,29
12168,Federal-gov,Handlers-cleaners,Married-civ-spouse,Some-college,40,38
30644,Private,Other-service,Married-civ-spouse,Assoc-acdm,30,37


In [15]:
# Score the unchanged model on the mode-imputed data.
y_pred_cleaned = model.predict_proba(clean_test_data)

# The label says "cleaned": this score is for the imputed frame, not the corrupted one.
f"The ROC AUC score on the cleaned test data is {task.score_on_test_data(y_pred_cleaned)}"

'The ROC AUC score on the corrupted test data is 0.8276910272741881'