In [1]:
import numpy as np
import itertools
import random
from sklearn.compose import ColumnTransformer
from sklearn.datasets import fetch_openml
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV

# Seed every random number generator this notebook has in scope so that
# Restart Kernel -> Run All reproduces the same split and scores.
# NumPy's global generator drives train_test_split further down (no
# random_state is passed there). The stdlib generator is seeded as well in
# case downstream code draws from `random` -- TODO confirm which generator
# the jenga corruptions actually use.
SEED = 0
np.random.seed(SEED)
random.seed(SEED)

# Load the Titanic passenger data from https://www.openml.org/d/40945.
# return_X_y=True hands back the (features, target) pair directly.
X, y = fetch_openml("titanic", version=1, as_frame=True, return_X_y=True)

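# Alternatively, fetch the whole dataset object (without return_X_y) and
# derive X and y from its frame attribute:
# titanic = fetch_openml("titanic", version=1, as_frame=True)
# X = titanic.frame.drop('survived', axis=1)
# y = titanic.frame['survived']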

# We will train our classifier with the following features:
# Numeric Features:
# - age: float.
# - fare: float.
# Categorical Features:
# - embarked: categories encoded as strings {'C', 'S', 'Q'}.
# - sex: categories encoded as strings {'female', 'male'}.
# - pclass: ordinal integers {1, 2, 3}.

# Column groups: each group is routed through its own preprocessing
# sub-pipeline before reaching the classifier.
numeric_features = ["age", "fare"]
categorical_features = ["embarked", "sex", "pclass"]

# Numeric columns: median imputation followed by a StandardScaler.
numeric_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ]
)

# Categorical columns: gaps are filled with the constant 'missing', then the
# values are one-hot encoded with handle_unknown='ignore'.
categorical_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="constant", fill_value="missing")),
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ]
)

# Apply each sub-pipeline to its own column group.
preprocessor = ColumnTransformer(
    transformers=[
        ("numerical_features", numeric_transformer, numeric_features),
        ("categorical_features", categorical_transformer, categorical_features),
    ]
)

# Full prediction pipeline: preprocessing followed by logistic regression.
clf = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("classifier", LogisticRegression()),
    ]
)

# Hold out 20% of the rows for evaluation; no random_state is passed, so the
# split relies on NumPy's global seed set earlier in this cell.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Fit the full pipeline on the training rows, then report its score on the
# held-out rows.
clf.fit(X_train, y_train)
test_score = clf.score(X_test, y_test)
print("model score: %.3f" % test_score)

model score: 0.790


In [2]:
from jenga.cleaning.ppp import PipelineWithPPP
# Wrap the already-fitted classifier in jenga's PipelineWithPPP and fit it on
# the training data. NOTE(review): per the log this cell emits, fit_ppp
# generates perturbed copies of the training data for each listed fraction,
# repeated num_repetitions times -- confirm details against the jenga source.
ppp = PipelineWithPPP(
    clf,
    numerical_columns=numeric_features,
    categorical_columns=categorical_features,
    num_repetitions=10,
    perturbation_fractions=[0.1, 0.2, 0.3, 0.4, 0.5, 0.75, 0.9],
)
ppp.fit_ppp(X_train, y_train)

# Compare the score predicted for the clean test set with the score the
# classifier actually achieves on it.
ppp_score_test = ppp.predict_ppp(X_test)
clf_score_test = clf.score(X_test, y_test)
print(f'Predicted score: {ppp_score_test:.4f}, true score {clf_score_test:.4f}')

Generating perturbed training data on 1047 rows ...
	... perturbation 0/840: swapped, col ('age', 'fare'), fraction: 0.1
	... perturbation 1/840: swapped, col ('sex', 'pclass'), fraction: 0.1
	... perturbation 2/840: scaling, col ['fare'], fraction: 0.1
	... perturbation 3/840: outlier, col ['fare'], fraction: 0.1
	... perturbation 4/840: missing_MCAR, col fare, fraction: 0.1
	... perturbation 5/840: missing_MAR, col fare, fraction: 0.1
	... perturbation 6/840: missing_MNAR, col fare, fraction: 0.1
	... perturbation 7/840: missing_MCAR, col sex, fraction: 0.1
	... perturbation 8/840: missing_MAR, col sex, fraction: 0.1
	... perturbation 9/840: missing_MNAR, col sex, fraction: 0.1
	... perturbation 10/840: missing_high_entropy, col ['pclass'], fraction: 0.1
	... perturbation 11/840: missing_low_entropy, col ['embarked'], fraction: 0.1
	... perturbation 12/840: swapped, col ('age', 'fare'), fraction: 0.2
	... perturbation 13/840: swapped, col ('sex', 'pclass'), fraction: 0.2
	... perturb

	... perturbation 126/840: missing_MNAR, col age, fraction: 0.4
	... perturbation 127/840: missing_MCAR, col pclass, fraction: 0.4
	... perturbation 128/840: missing_MAR, col pclass, fraction: 0.4
	... perturbation 129/840: missing_MNAR, col pclass, fraction: 0.4
	... perturbation 130/840: missing_high_entropy, col ['embarked'], fraction: 0.4
	... perturbation 131/840: missing_low_entropy, col ['embarked'], fraction: 0.4
	... perturbation 132/840: swapped, col ('age', 'fare'), fraction: 0.5
	... perturbation 133/840: swapped, col ('embarked', 'pclass'), fraction: 0.5
	... perturbation 134/840: scaling, col ['fare'], fraction: 0.5
	... perturbation 135/840: outlier, col ['fare'], fraction: 0.5
	... perturbation 136/840: missing_MCAR, col fare, fraction: 0.5
	... perturbation 137/840: missing_MAR, col fare, fraction: 0.5
	... perturbation 138/840: missing_MNAR, col fare, fraction: 0.5
	... perturbation 139/840: missing_MCAR, col sex, fraction: 0.5
	... perturbation 140/840: missing_MAR, 

	... perturbation 246/840: missing_MNAR, col fare, fraction: 0.9
	... perturbation 247/840: missing_MCAR, col sex, fraction: 0.9
	... perturbation 248/840: missing_MAR, col sex, fraction: 0.9
	... perturbation 249/840: missing_MNAR, col sex, fraction: 0.9
	... perturbation 250/840: missing_high_entropy, col ['sex'], fraction: 0.9
	... perturbation 251/840: missing_low_entropy, col ['sex'], fraction: 0.9
	... perturbation 252/840: swapped, col ('age', 'fare'), fraction: 0.1
	... perturbation 253/840: swapped, col ('sex', 'pclass'), fraction: 0.1
	... perturbation 254/840: scaling, col ['age'], fraction: 0.1
	... perturbation 255/840: outlier, col ['age'], fraction: 0.1
	... perturbation 256/840: missing_MCAR, col age, fraction: 0.1
	... perturbation 257/840: missing_MAR, col age, fraction: 0.1
	... perturbation 258/840: missing_MNAR, col age, fraction: 0.1
	... perturbation 259/840: missing_MCAR, col pclass, fraction: 0.1
	... perturbation 260/840: missing_MAR, col pclass, fraction: 0.1

	... perturbation 374/840: scaling, col ['fare'], fraction: 0.4
	... perturbation 375/840: outlier, col ['fare'], fraction: 0.4
	... perturbation 376/840: missing_MCAR, col fare, fraction: 0.4
	... perturbation 377/840: missing_MAR, col fare, fraction: 0.4
	... perturbation 378/840: missing_MNAR, col fare, fraction: 0.4
	... perturbation 379/840: missing_MCAR, col embarked, fraction: 0.4
	... perturbation 380/840: missing_MAR, col embarked, fraction: 0.4
	... perturbation 381/840: missing_MNAR, col embarked, fraction: 0.4
	... perturbation 382/840: missing_high_entropy, col ['sex'], fraction: 0.4
	... perturbation 383/840: missing_low_entropy, col ['pclass'], fraction: 0.4
	... perturbation 384/840: swapped, col ('age', 'fare'), fraction: 0.5
	... perturbation 385/840: swapped, col ('embarked', 'pclass'), fraction: 0.5
	... perturbation 386/840: scaling, col ['age'], fraction: 0.5
	... perturbation 387/840: outlier, col ['age'], fraction: 0.5
	... perturbation 388/840: missing_MCAR, co

	... perturbation 507/840: outlier, col ['age'], fraction: 0.1
	... perturbation 508/840: missing_MCAR, col age, fraction: 0.1
	... perturbation 509/840: missing_MAR, col age, fraction: 0.1
	... perturbation 510/840: missing_MNAR, col age, fraction: 0.1
	... perturbation 511/840: missing_MCAR, col embarked, fraction: 0.1
	... perturbation 512/840: missing_MAR, col embarked, fraction: 0.1
	... perturbation 513/840: missing_MNAR, col embarked, fraction: 0.1
	... perturbation 514/840: missing_high_entropy, col ['pclass'], fraction: 0.1
	... perturbation 515/840: missing_low_entropy, col ['sex'], fraction: 0.1
	... perturbation 516/840: swapped, col ('age', 'fare'), fraction: 0.2
	... perturbation 517/840: swapped, col ('sex', 'pclass'), fraction: 0.2
	... perturbation 518/840: scaling, col ['age'], fraction: 0.2
	... perturbation 519/840: outlier, col ['age'], fraction: 0.2
	... perturbation 520/840: missing_MCAR, col age, fraction: 0.2
	... perturbation 521/840: missing_MAR, col age, fra

	... perturbation 640/840: missing_MCAR, col fare, fraction: 0.5
	... perturbation 641/840: missing_MAR, col fare, fraction: 0.5
	... perturbation 642/840: missing_MNAR, col fare, fraction: 0.5
	... perturbation 643/840: missing_MCAR, col pclass, fraction: 0.5
	... perturbation 644/840: missing_MAR, col pclass, fraction: 0.5
	... perturbation 645/840: missing_MNAR, col pclass, fraction: 0.5
	... perturbation 646/840: missing_high_entropy, col ['embarked'], fraction: 0.5
	... perturbation 647/840: missing_low_entropy, col ['pclass'], fraction: 0.5
	... perturbation 648/840: swapped, col ('age', 'fare'), fraction: 0.75
	... perturbation 649/840: swapped, col ('embarked', 'pclass'), fraction: 0.75
	... perturbation 650/840: scaling, col ['fare'], fraction: 0.75
	... perturbation 651/840: outlier, col ['fare'], fraction: 0.75
	... perturbation 652/840: missing_MCAR, col fare, fraction: 0.75
	... perturbation 653/840: missing_MAR, col fare, fraction: 0.75
	... perturbation 654/840: missing_

	... perturbation 771/840: outlier, col ['fare'], fraction: 0.2
	... perturbation 772/840: missing_MCAR, col fare, fraction: 0.2
	... perturbation 773/840: missing_MAR, col fare, fraction: 0.2
	... perturbation 774/840: missing_MNAR, col fare, fraction: 0.2
	... perturbation 775/840: missing_MCAR, col pclass, fraction: 0.2
	... perturbation 776/840: missing_MAR, col pclass, fraction: 0.2
	... perturbation 777/840: missing_MNAR, col pclass, fraction: 0.2
	... perturbation 778/840: missing_high_entropy, col ['sex'], fraction: 0.2
	... perturbation 779/840: missing_low_entropy, col ['pclass'], fraction: 0.2
	... perturbation 780/840: swapped, col ('age', 'fare'), fraction: 0.3
	... perturbation 781/840: swapped, col ('sex', 'pclass'), fraction: 0.3
	... perturbation 782/840: scaling, col ['fare'], fraction: 0.3
	... perturbation 783/840: outlier, col ['fare'], fraction: 0.3
	... perturbation 784/840: missing_MCAR, col fare, fraction: 0.3
	... perturbation 785/840: missing_MAR, col fare, f

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


In [5]:
from jenga.corruptions.numerical import SwappedValues, Outliers, Scaling
# Corrupt a copy of the test set so X_test itself stays untouched.
# NOTE(review): presumably 0.5 is the fraction of rows whose 'age' and 'fare'
# values get swapped -- confirm against the jenga SwappedValues signature.
swap_age_fare = SwappedValues(0.5, ['age', 'fare'])
X_swapped = swap_age_fare(X_test.copy())

# NOTE(review): ppp_prediction is not used below; the report recomputes the
# prediction through predict_ppp instead.
ppp_prediction, corrupted_idx = ppp.predict_and_explain_ppp(
    X_swapped,
    num_percentile_neighbors=5,
    num_top_meta_features=3,
)

# Report the classifier's score on clean data, the score predicted for the
# corrupted data, and the score actually achieved on the corrupted data.
clf_score_clean = clf.score(X_test, y_test)
ppp_score_swapped = ppp.predict_ppp(X_swapped)
clf_score_swapped = clf.score(X_swapped, y_test)
print(f'Clean Score: {clf_score_clean}, Predicted score: {ppp_score_swapped:.4f}, true score {clf_score_swapped:.4f}')

# Display the rows selected by the positions returned in corrupted_idx.
X_swapped.iloc[corrupted_idx]

Clean Score: 0.7862595419847328, Predicted score: 0.7653, true score 0.7557


Unnamed: 0,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
27,1.0,"Bishop, Mrs. Dickinson H (Helen Walton)",female,91.0792,1.0,0.0,11967,19.0,B49,C,7,,"Dowagiac, MI"
1050,3.0,"Nakid, Mrs. Said (Waika 'Mary' Mowad)",female,15.7417,1.0,1.0,2653,19.0,,C,C,,
381,2.0,"Corbett, Mrs. Walter H (Irene Colvin)",female,30.0,0.0,0.0,237249,13.0,,S,,,"Provo, UT"
1047,3.0,"Najib, Miss. Adele Kiamie 'Jane'",female,15.0,0.0,0.0,2667,7.225,,C,C,,
1305,3.0,"Zabour, Miss. Thamine",female,14.4542,1.0,0.0,2665,0.0,,C,,,
92,1.0,"Dick, Mrs. Albert Adrian (Vera Gillespie)",female,57.0,1.0,0.0,17474,17.0,B20,S,3,,"Calgary, AB"
541,2.0,"Quick, Miss. Winifred Vera",female,26.0,1.0,1.0,26360,8.0,,S,11,,"Plymouth, Devon / Detroit, MI"
557,2.0,"Shelley, Mrs. William (Imanita Parrish Hall)",female,26.0,0.0,1.0,230433,25.0,,S,12,,"Deer Lodge, MT"
436,2.0,"Hart, Mrs. Benjamin (Esther Ada Bloomfield)",female,26.25,1.0,1.0,F.C.C. 13529,45.0,,S,14,,"Ilford, Essex / Winnipeg, MB"
349,2.0,"Brown, Miss. Amelia 'Mildred'",female,24.0,0.0,0.0,248733,13.0,F33,S,11,,"London / Montreal, PQ"


In [8]:
from jenga.corruptions.numerical import SwappedValues, Outliers, Scaling

# Corrupt a copy of the test set so X_test itself stays untouched.
# NOTE(review): presumably 0.6 is the fraction of rows whose 'age' value is
# replaced by an outlier -- confirm against the jenga Outliers signature.
age_outliers = Outliers(0.6, ['age'])
X_outliers = age_outliers(X_test.copy())

# NOTE(review): ppp_prediction is not used below; the report recomputes the
# prediction through predict_ppp instead.
ppp_prediction, corrupted_idx = ppp.predict_and_explain_ppp(
    X_outliers,
    num_percentile_neighbors=10,
    num_top_meta_features=1,
)

# Report the classifier's score on clean data, the score predicted for the
# corrupted data, and the score actually achieved on the corrupted data.
clf_score_clean = clf.score(X_test, y_test)
ppp_score_outliers = ppp.predict_ppp(X_outliers)
clf_score_outliers = clf.score(X_outliers, y_test)
print(f'Clean Score: {clf_score_clean}, Predicted score: {ppp_score_outliers:.4f}, true score {clf_score_outliers:.4f}')

# Display the rows selected by the positions returned in corrupted_idx.
X_outliers.iloc[corrupted_idx]

Clean Score: 0.7862595419847328, Predicted score: 0.7707, true score 0.7634


Unnamed: 0,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
1012,3.0,"McNeill, Miss. Bridget",female,0.0,0.0,0.0,370368,7.75,,Q,,,
961,3.0,"Lennon, Miss. Mary",female,0.0,1.0,0.0,370371,15.5,,Q,,,
1028,3.0,"Moran, Miss. Bertha",female,0.0,1.0,0.0,371110,24.15,,Q,16,,
1149,3.0,"Riordan, Miss. Johanna 'Hannah'",female,0.213428,0.0,0.0,334915,7.7208,,Q,13,,
1050,3.0,"Nakid, Mrs. Said (Waika 'Mary' Mowad)",female,19.0,1.0,1.0,2653,15.7417,,C,C,,
425,2.0,"Greenberg, Mr. Samuel",male,-65.453213,0.0,0.0,250647,13.0,,S,,19.0,"Bronx, NY"
1240,3.0,"Thomas, Master. Assad Alexander",male,-80.418642,0.0,1.0,2625,8.5167,,C,16,,
482,2.0,"Lehmann, Miss. Bertha",female,60.52681,0.0,0.0,SC 1748,12.0,,C,12,,"Berne, Switzerland / Central City, IA"
47,1.0,"Calderhead, Mr. Edward Pennington",male,-33.77,0.0,0.0,PC 17476,26.2875,E24,S,5,,"New York, NY"
361,2.0,"Caldwell, Mrs. Albert Francis (Sylvia Mae Harb...",female,35.897021,1.0,1.0,248738,29.0,,S,13,,"Bangkok, Thailand / Roseville, IL"
