In [1]:
import numpy as np

In [2]:
from jenga.tasks.income import IncomeEstimationTask

# Set up the income-estimation task. It supplies the train/test data, the
# training labels, the baseline model and the scoring used in the rest of
# this notebook.
# NOTE(review): seed=42 presumably fixes the task's random state — confirm in jenga.
task = IncomeEstimationTask(seed=42)

In [3]:
# Train the task's baseline model on the training features and labels.
# The log printed below shows this runs a cross-validated search
# (5 folds for each of 12 candidates).
model = task.fit_baseline_model(task.train_data, task.train_labels)

Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   12.6s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:   14.2s finished


In [4]:
# Score the untouched test set. This is the reference value that the
# corrupted and cleaned runs further down are compared against.
y_pred = model.predict_proba(task.test_data)

# score_on_test_data receives the predicted probabilities; per the message
# it reports ROC AUC.
f"The ROC AUC score on the test data is {task.score_on_test_data(y_pred)}"

'The ROC AUC score on the test data is 0.8808628129208451'

In [5]:
# Show the test features: four categorical columns plus hours_per_week and age.
task.test_data

Unnamed: 0,workclass,occupation,marital_status,education,hours_per_week,age
30462,Private,Craft-repair,Married-civ-spouse,Assoc-voc,57,51
32069,State-gov,Exec-managerial,Married-civ-spouse,HS-grad,40,36
30287,Private,Craft-repair,Married-civ-spouse,HS-grad,40,58
14177,Private,Handlers-cleaners,Never-married,12th,20,19
24751,Private,Machine-op-inspct,Never-married,HS-grad,40,39
...,...,...,...,...,...,...
7505,Private,Adm-clerical,Never-married,Assoc-acdm,40,53
9846,Self-emp-not-inc,Transport-moving,Never-married,Some-college,30,43
6301,Self-emp-not-inc,Craft-repair,Married-civ-spouse,HS-grad,40,30
28860,Private,Transport-moving,Never-married,HS-grad,45,28


### Missing values in the 'age' column

In [6]:
from jenga.corruptions.generic import MissingValues

# Replace 'age' with the sentinel -1.0 in part of the test set. The cleaning
# cell below identifies the corrupted rows by that sentinel (age < 0).
# NOTE(review): fraction=0.8 presumably means ~80% of rows are affected — confirm in jenga.
age_corruption = MissingValues(column='age', fraction=0.8, na_value=-1.0)

# The marital_status section later starts from task.test_data again and its
# output still shows the original ages, so transform() appears to return a
# corrupted copy rather than modifying its input.
corrupted_test_data = age_corruption.transform(task.test_data)
y_pred = model.predict_proba(corrupted_test_data)

f"The ROC AUC score on the corrupted test data is {task.score_on_test_data(y_pred)}"

'The ROC AUC score on the corrupted test data is 0.870056537714585'

Cleaning via mean imputation

In [7]:
# Imputation value: the average age in the training split. It is taken from
# the training data, so the cleaning step does not look at the test set.
mean_age = task.train_data['age'].values.mean()
mean_age

38.56476504914005

In [8]:
# Mean-impute the corrupted ages: every sentinel (negative) age becomes mean_age.
# The column is cast to float explicitly so that writing the fractional mean
# never relies on pandas silently upcasting an integer column (deprecated
# since pandas 2.1). assign() builds a new frame, so corrupted_test_data stays
# untouched and the cell can be re-run safely.
age_is_missing = corrupted_test_data['age'] < 0
clean_test_data = corrupted_test_data.assign(
    age=corrupted_test_data['age'].astype(float).mask(age_is_missing, mean_age)
)
clean_test_data

Unnamed: 0,workclass,occupation,marital_status,education,hours_per_week,age
30462,Private,Craft-repair,Married-civ-spouse,Assoc-voc,57,38.564765
32069,State-gov,Exec-managerial,Married-civ-spouse,HS-grad,40,38.564765
30287,Private,Craft-repair,Married-civ-spouse,HS-grad,40,38.564765
14177,Private,Handlers-cleaners,Never-married,12th,20,19.000000
24751,Private,Machine-op-inspct,Never-married,HS-grad,40,39.000000
...,...,...,...,...,...,...
7505,Private,Adm-clerical,Never-married,Assoc-acdm,40,38.564765
9846,Self-emp-not-inc,Transport-moving,Never-married,Some-college,30,38.564765
6301,Self-emp-not-inc,Craft-repair,Married-civ-spouse,HS-grad,40,38.564765
28860,Private,Transport-moving,Never-married,HS-grad,45,38.564765


In [9]:
# Re-score the model after mean imputation of 'age'.
y_pred_cleaned = model.predict_proba(clean_test_data)

# The label says "cleaned" because these predictions come from
# clean_test_data, not from the corrupted frame scored earlier.
f"The ROC AUC score on the cleaned test data is {task.score_on_test_data(y_pred_cleaned)}"

'The ROC AUC score on the corrupted test data is 0.8776545471725048'

### Missing values in the 'marital_status' column

In [10]:
from jenga.corruptions.generic import MissingValues

# Blank out 'marital_status' with NaN in part of the test set.
# NOTE(review): fraction=0.8 presumably means ~80% of rows are affected — confirm in jenga.
marital_status_corruption = MissingValues(column='marital_status', fraction=0.8, na_value=np.nan)

In [11]:
# Start again from the original test data, so only 'marital_status' is
# corrupted here; corrupted_test_data and y_pred from the 'age' section are
# overwritten.
corrupted_test_data = marital_status_corruption.transform(task.test_data)
y_pred = model.predict_proba(corrupted_test_data)

f"The ROC AUC score on the corrupted test data is {task.score_on_test_data(y_pred)}"

'The ROC AUC score on the corrupted test data is 0.8188891313171542'

In [12]:
# Inspect the corrupted frame: missing marital_status entries appear as empty/NaN cells.
corrupted_test_data 

Unnamed: 0,workclass,occupation,marital_status,education,hours_per_week,age
30462,Private,Craft-repair,,Assoc-voc,57,51
32069,State-gov,Exec-managerial,Married-civ-spouse,HS-grad,40,36
30287,Private,Craft-repair,,HS-grad,40,58
14177,Private,Handlers-cleaners,,12th,20,19
24751,Private,Machine-op-inspct,Never-married,HS-grad,40,39
...,...,...,...,...,...,...
7505,Private,Adm-clerical,,Assoc-acdm,40,53
9846,Self-emp-not-inc,Transport-moving,Never-married,Some-college,30,43
6301,Self-emp-not-inc,Craft-repair,,HS-grad,40,30
28860,Private,Transport-moving,,HS-grad,45,28


Cleaning via mode imputation

In [13]:
# Category frequencies in the training split. The most frequent category is
# the fill value used for mode imputation in the next cell.
task.train_data['marital_status'].value_counts()

Married-civ-spouse       11952
Never-married             8585
Divorced                  3544
Separated                  839
Widowed                    775
Married-spouse-absent      335
Married-AF-spouse           18
Name: marital_status, dtype: int64

In [14]:
# Mode-impute the missing marital_status values.
# The fill value is derived from the training split instead of being typed in
# by hand, so it follows the data if the task or seed changes.
marital_status_mode = task.train_data['marital_status'].mode().iloc[0]

# Fill through the frame with a {column: value} mapping, which returns a new
# frame. Calling fillna(..., inplace=True) on a selected column instead acts on
# an intermediate object: pandas warns about that from 2.2, and under
# Copy-on-Write the frame would be left unchanged.
clean_test_data = corrupted_test_data.fillna({'marital_status': marital_status_mode})
clean_test_data

Unnamed: 0,workclass,occupation,marital_status,education,hours_per_week,age
30462,Private,Craft-repair,Married-civ-spouse,Assoc-voc,57,51
32069,State-gov,Exec-managerial,Married-civ-spouse,HS-grad,40,36
30287,Private,Craft-repair,Married-civ-spouse,HS-grad,40,58
14177,Private,Handlers-cleaners,Married-civ-spouse,12th,20,19
24751,Private,Machine-op-inspct,Never-married,HS-grad,40,39
...,...,...,...,...,...,...
7505,Private,Adm-clerical,Married-civ-spouse,Assoc-acdm,40,53
9846,Self-emp-not-inc,Transport-moving,Never-married,Some-college,30,43
6301,Self-emp-not-inc,Craft-repair,Married-civ-spouse,HS-grad,40,30
28860,Private,Transport-moving,Married-civ-spouse,HS-grad,45,28


In [15]:
# Re-score the model after mode imputation of 'marital_status'.
y_pred_cleaned = model.predict_proba(clean_test_data)

# The label says "cleaned" because these predictions come from
# clean_test_data, not from the corrupted frame scored earlier.
f"The ROC AUC score on the cleaned test data is {task.score_on_test_data(y_pred_cleaned)}"

'The ROC AUC score on the corrupted test data is 0.810777852679363'