In [3]:
import numpy as np

In [5]:
from jenga.tasks.income import IncomeEstimationTask

# Set up the income-estimation task that supplies the train/test data, the
# baseline model and the scoring function used in the rest of this notebook.
# NOTE(review): seed=42 presumably pins the task's randomness (e.g. the
# train/test split) — confirm against the jenga implementation.
task = IncomeEstimationTask(seed=42)

In [6]:
# Fit the task's baseline model on the training data. This single fitted model
# is reused for every prediction in the cells below; none of them refits it.
model = task.fit_baseline_model(task.train_data, task.train_labels)

Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    2.3s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:    2.8s finished


In [7]:
# Reference point: predictions of the baseline model on the untouched test data.
y_pred = model.predict_proba(task.test_data)

# A bare f-string as the last expression makes the notebook display the score.
f"The ROC AUC score on the test data is {task.score_on_test_data(y_pred)}"

'The ROC AUC score on the test data is 0.8816049110483648'

In [8]:
# Display the clean test features so they can be compared with the corrupted
# and imputed frames shown further down.
task.test_data

Unnamed: 0,workclass,occupation,marital_status,education,hours_per_week,age
14160,Private,Adm-clerical,Divorced,Some-college,38,27
27048,State-gov,Exec-managerial,Married-civ-spouse,HS-grad,40,45
28868,Private,Exec-managerial,Married-civ-spouse,Bachelors,55,29
5667,Private,Machine-op-inspct,Never-married,Bachelors,40,30
7827,Self-emp-not-inc,Craft-repair,Divorced,Some-college,50,29
...,...,...,...,...,...,...
1338,Private,Tech-support,Divorced,Bachelors,16,71
24534,Local-gov,Prof-specialty,Married-civ-spouse,Some-college,40,55
18080,Private,Prof-specialty,Married-civ-spouse,Prof-school,48,47
10354,Private,Adm-clerical,Never-married,Bachelors,40,27


### Missing values in the 'age' column

In [47]:
from jenga.corruptions.generic import MissingValues

# Configure a missing-value corruption for the 'age' column: fraction=0.8 and
# -1.0 as the placeholder written where a value is treated as missing.
# NOTE(review): how `fraction` selects the affected rows is defined in jenga,
# not in this notebook — confirm there.
age_corruption = MissingValues(column='age', fraction=0.8, na_value=-1.0)

# Corrupt the test data and score the unchanged baseline model on it.
# NOTE(review): assumes transform() returns a new frame and leaves
# task.test_data untouched (it is reused as clean input later) — confirm.
corrupted_test_data = age_corruption.transform(task.test_data)
y_pred = model.predict_proba(corrupted_test_data)

f"The ROC AUC score on the corrupted test data is {task.score_on_test_data(y_pred)}"

'The ROC AUC score on the corrupted test data is 0.8667148083909569'

Cleaning via mean imputation

In [54]:
# Imputation value: the mean age of the *training* data, so the replacement
# value is not estimated from the (corrupted) test set.
mean_age = np.mean(task.train_data.age.values)
mean_age

38.57651259213759

In [55]:
# Mean imputation for 'age': work on a deep copy so the corrupted frame stays
# available for comparison.
clean_test_data = corrupted_test_data.copy(deep=True)

# The imputed mean is fractional, so make the column float explicitly. Writing
# a float into an integer column through .loc relies on silent upcasting,
# which pandas has deprecated (FutureWarning since 2.1).
clean_test_data['age'] = clean_test_data['age'].astype('float64')

# Negative ages are the -1.0 placeholders written by the corruption step;
# replace them with the training-set mean.
clean_test_data.loc[clean_test_data['age'] < 0, 'age'] = mean_age
clean_test_data

Unnamed: 0,workclass,occupation,marital_status,education,hours_per_week,age
14160,Private,Adm-clerical,Divorced,Some-college,38,27.000000
27048,State-gov,Exec-managerial,Married-civ-spouse,HS-grad,40,38.576513
28868,Private,Exec-managerial,Married-civ-spouse,Bachelors,55,38.576513
5667,Private,Machine-op-inspct,Never-married,Bachelors,40,38.576513
7827,Self-emp-not-inc,Craft-repair,Divorced,Some-college,50,38.576513
...,...,...,...,...,...,...
1338,Private,Tech-support,Divorced,Bachelors,16,71.000000
24534,Local-gov,Prof-specialty,Married-civ-spouse,Some-college,40,55.000000
18080,Private,Prof-specialty,Married-civ-spouse,Prof-school,48,38.576513
10354,Private,Adm-clerical,Never-married,Bachelors,40,38.576513


In [52]:
# Score the unchanged baseline model on the mean-imputed test data.
y_pred_cleaned = model.predict_proba(clean_test_data)

# The label names the *cleaned* data; it previously repeated the "corrupted"
# wording although the score is computed on the imputed frame.
f"The ROC AUC score on the cleaned test data is {task.score_on_test_data(y_pred_cleaned)}"

'The ROC AUC score on the cleaned test data is 0.8778996769914844'

### Missing values in 'marital_status'

In [56]:
from jenga.corruptions.generic import MissingValues

# Configure a missing-value corruption for the categorical 'marital_status'
# column: fraction=0.8, with NaN as the placeholder for a missing entry.
# NOTE(review): how `fraction` selects the affected rows is defined in jenga —
# confirm there.
marital_status_corruption = MissingValues(column='marital_status', fraction=0.8, na_value=np.nan)


In [57]:
# Start again from the clean test data, so this frame carries only the
# 'marital_status' corruption. This rebinds corrupted_test_data and y_pred,
# replacing the values from the 'age' experiment.
corrupted_test_data = marital_status_corruption.transform(task.test_data)
y_pred = model.predict_proba(corrupted_test_data)

f"The ROC AUC score on the corrupted test data is {task.score_on_test_data(y_pred)}"

'The ROC AUC score on the corrupted test data is 0.8229758643936114'

In [58]:
# Display the corrupted frame to inspect the missing 'marital_status' entries.
corrupted_test_data 

Unnamed: 0,workclass,occupation,marital_status,education,hours_per_week,age
14160,Private,Adm-clerical,,Some-college,38,27
27048,State-gov,Exec-managerial,,HS-grad,40,45
28868,Private,Exec-managerial,,Bachelors,55,29
5667,Private,Machine-op-inspct,,Bachelors,40,30
7827,Self-emp-not-inc,Craft-repair,,Some-college,50,29
...,...,...,...,...,...,...
1338,Private,Tech-support,,Bachelors,16,71
24534,Local-gov,Prof-specialty,,Some-college,40,55
18080,Private,Prof-specialty,,Prof-school,48,47
10354,Private,Adm-clerical,,Bachelors,40,27


Cleaning via mode imputation

In [59]:
# Category frequencies in the training data; the most frequent category is
# the mode used as the imputation value in the next step.
task.train_data['marital_status'].value_counts()

Married-civ-spouse       12026
Never-married             8557
Divorced                  3523
Separated                  816
Widowed                    785
Married-spouse-absent      322
Married-AF-spouse           19
Name: marital_status, dtype: int64

In [60]:
# Mode imputation for 'marital_status': work on a deep copy so the corrupted
# frame stays available for comparison.
clean_test_data = corrupted_test_data.copy(deep=True)

# Take the mode from the training data rather than hard-coding the category,
# so the cell stays correct if the training data changes. mode() returns the
# most frequent value(s); the first entry is used.
marital_status_mode = task.train_data['marital_status'].mode().iloc[0]

# Assign the filled column back. Calling fillna(..., inplace=True) on a column
# selection is a chained in-place operation: it warns on pandas 2.x and leaves
# the DataFrame unchanged under Copy-on-Write.
clean_test_data['marital_status'] = clean_test_data['marital_status'].fillna(marital_status_mode)
clean_test_data

Unnamed: 0,workclass,occupation,marital_status,education,hours_per_week,age
14160,Private,Adm-clerical,Married-civ-spouse,Some-college,38,27
27048,State-gov,Exec-managerial,Married-civ-spouse,HS-grad,40,45
28868,Private,Exec-managerial,Married-civ-spouse,Bachelors,55,29
5667,Private,Machine-op-inspct,Married-civ-spouse,Bachelors,40,30
7827,Self-emp-not-inc,Craft-repair,Married-civ-spouse,Some-college,50,29
...,...,...,...,...,...,...
1338,Private,Tech-support,Married-civ-spouse,Bachelors,16,71
24534,Local-gov,Prof-specialty,Married-civ-spouse,Some-college,40,55
18080,Private,Prof-specialty,Married-civ-spouse,Prof-school,48,47
10354,Private,Adm-clerical,Married-civ-spouse,Bachelors,40,27


In [61]:
# Score the unchanged baseline model on the mode-imputed test data.
y_pred_cleaned = model.predict_proba(clean_test_data)

# The label names the *cleaned* data; it previously repeated the "corrupted"
# wording although the score is computed on the imputed frame.
f"The ROC AUC score on the cleaned test data is {task.score_on_test_data(y_pred_cleaned)}"

'The ROC AUC score on the cleaned test data is 0.8150534745376089'