In [1]:
import numpy as np
import itertools
import random
from sklearn.compose import ColumnTransformer
from sklearn.datasets import fetch_openml
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
import pandas as pd

from jenga.cleaning.ppp import PipelineWithPPP
from jenga.cleaning.autoclean import AutoClean
from jenga.cleaning.outlier_removal import SKLearnIsolationForest
from jenga.cleaning.imputation import SimpleImputation

from jenga.corruptions.numerical import SwappedValues, Outliers, Scaling
from jenga.corruptions.text import BrokenCharacters
from jenga.corruptions.missing import ( MissingValuesHighEntropy, 
                                  MissingValuesLowEntropy, 
                                  MissingValues
                                )

from jenga.corruptions.numerical import SwappedValues, Outliers, Scaling

np.random.seed(0)

# Load data from https://www.openml.org/d/40945
X, y = fetch_openml("titanic", version=1, as_frame=True, return_X_y=True)

# Alternatively X and y can be obtained directly from the frame attribute:
# X = titanic.frame.drop('survived', axis=1)
# y = titanic.frame['survived']

# We will train our classifier with the following features:
# Numeric Features:
# - age: float.
# - fare: float.
# Categorical Features:
# - embarked: categories encoded as strings {'C', 'S', 'Q'}.
# - sex: categories encoded as strings {'female', 'male'}.
# - pclass: ordinal integers {1, 2, 3}.

# We create the preprocessing pipelines for both numeric and categorical data.
numeric_features = ['age', 'fare']
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

categorical_features = ['embarked', 'sex', 'pclass']
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer(
    transformers=[
        ('numerical_features', numeric_transformer, numeric_features),
        ('categorical_features', categorical_transformer, categorical_features)])

# Append classifier to preprocessing pipeline.
# Now we have a full prediction pipeline.
clf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', LogisticRegression())])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

clf.fit(X_train, y_train)
print("model score: %.3f" % clf.score(X_test, y_test))

model score: 0.790


In [2]:
from jenga.cleaning.ppp import PipelineWithPPP
ppp = PipelineWithPPP(clf, 
                      numerical_columns=numeric_features, 
                      categorical_columns=categorical_features,
                      num_repetitions=5,
                      perturbation_fractions=[.3, .5, .6, .75, .9]
                     )
ppp.fit_ppp(X_train, y_train)
print(f'Predicted score: {ppp.predict_ppp(X_test):.4f}, true score {clf.score(X_test, y_test):.4f}')

Generating perturbed training data on 1047 rows ...
	... perturbation 0/300: swapped, col ('age', 'fare'), fraction: 0.3
	... perturbation 1/300: swapped, col ('embarked', 'pclass'), fraction: 0.3
	... perturbation 2/300: scaling, col ['age'], fraction: 0.3
	... perturbation 3/300: outlier, col ['age'], fraction: 0.3
	... perturbation 4/300: missing_MCAR, col age, fraction: 0.3
	... perturbation 5/300: missing_MAR, col age, fraction: 0.3
	... perturbation 6/300: missing_MNAR, col age, fraction: 0.3
	... perturbation 7/300: missing_MCAR, col sex, fraction: 0.3
	... perturbation 8/300: missing_MAR, col sex, fraction: 0.3
	... perturbation 9/300: missing_MNAR, col sex, fraction: 0.3
	... perturbation 10/300: missing_high_entropy, col ['pclass'], fraction: 0.3
	... perturbation 11/300: missing_low_entropy, col ['sex'], fraction: 0.3
	... perturbation 12/300: swapped, col ('age', 'fare'), fraction: 0.5
	... perturbation 13/300: swapped, col ('embarked', 'pclass'), fraction: 0.5
	... perturb

	... perturbation 122/300: scaling, col ['age'], fraction: 0.3
	... perturbation 123/300: outlier, col ['age'], fraction: 0.3
	... perturbation 124/300: missing_MCAR, col age, fraction: 0.3
	... perturbation 125/300: missing_MAR, col age, fraction: 0.3
	... perturbation 126/300: missing_MNAR, col age, fraction: 0.3
	... perturbation 127/300: missing_MCAR, col embarked, fraction: 0.3
	... perturbation 128/300: missing_MAR, col embarked, fraction: 0.3
	... perturbation 129/300: missing_MNAR, col embarked, fraction: 0.3
	... perturbation 130/300: missing_high_entropy, col ['pclass'], fraction: 0.3
	... perturbation 131/300: missing_low_entropy, col ['embarked'], fraction: 0.3
	... perturbation 132/300: swapped, col ('age', 'fare'), fraction: 0.5
	... perturbation 133/300: swapped, col ('sex', 'pclass'), fraction: 0.5
	... perturbation 134/300: scaling, col ['fare'], fraction: 0.5
	... perturbation 135/300: outlier, col ['fare'], fraction: 0.5
	... perturbation 136/300: missing_MCAR, col f

	... perturbation 249/300: missing_MNAR, col embarked, fraction: 0.3
	... perturbation 250/300: missing_high_entropy, col ['pclass'], fraction: 0.3
	... perturbation 251/300: missing_low_entropy, col ['pclass'], fraction: 0.3
	... perturbation 252/300: swapped, col ('age', 'fare'), fraction: 0.5
	... perturbation 253/300: swapped, col ('embarked', 'pclass'), fraction: 0.5
	... perturbation 254/300: scaling, col ['fare'], fraction: 0.5
	... perturbation 255/300: outlier, col ['fare'], fraction: 0.5
	... perturbation 256/300: missing_MCAR, col fare, fraction: 0.5
	... perturbation 257/300: missing_MAR, col fare, fraction: 0.5
	... perturbation 258/300: missing_MNAR, col fare, fraction: 0.5
	... perturbation 259/300: missing_MCAR, col pclass, fraction: 0.5
	... perturbation 260/300: missing_MAR, col pclass, fraction: 0.5
	... perturbation 261/300: missing_MNAR, col pclass, fraction: 0.5
	... perturbation 262/300: missing_high_entropy, col ['sex'], fraction: 0.5
	... perturbation 263/300: 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


In [7]:
perturbation = Outliers(0.9, ['age','fare'])
X_corrupted = perturbation(X_test.copy())

ac = AutoClean(X_train, 
               y_train, 
               clf, 
               numerical_columns=['age','fare'], 
               categorical_columns=['embarked', 'sex', 'pclass'],
               outlier_removal=[SKLearnIsolationForest],
               imputation=[SimpleImputation]
              )
X_cleaned, predicted_score, cleaner_results = ac(X_corrupted.copy(deep=True))

result = {
        'perturbation': perturbation,
        'clean_data_test_score': clf.score(X_test, y_test),
        'corrupted_data_test_score': clf.score(X_corrupted, y_test),
        'cleaned_data_test_score': clf.score(X_cleaned, y_test),
        'ppp_score_corrupted': ppp.predict_ppp(X_corrupted),
        'ppp_score_cleaned': ppp.predict_ppp(X_cleaned)
    }
print('\n'.join([f'{k}:{v}' for k,v in result.items()]))
    

Generating perturbed training data on 1047 rows ...
	... perturbation 0/72: swapped, col ('age', 'fare'), fraction: 0.5
	... perturbation 1/72: swapped, col ('sex', 'pclass'), fraction: 0.5
	... perturbation 2/72: scaling, col ['age'], fraction: 0.5
	... perturbation 3/72: outlier, col ['age'], fraction: 0.5
	... perturbation 4/72: missing_MCAR, col age, fraction: 0.5
	... perturbation 5/72: missing_MAR, col age, fraction: 0.5
	... perturbation 6/72: missing_MNAR, col age, fraction: 0.5
	... perturbation 7/72: missing_MCAR, col sex, fraction: 0.5
	... perturbation 8/72: missing_MAR, col sex, fraction: 0.5
	... perturbation 9/72: missing_MNAR, col sex, fraction: 0.5
	... perturbation 10/72: missing_high_entropy, col ['pclass'], fraction: 0.5
	... perturbation 11/72: missing_low_entropy, col ['pclass'], fraction: 0.5
	... perturbation 12/72: swapped, col ('age', 'fare'), fraction: 0.7
	... perturbation 13/72: swapped, col ('embarked', 'pclass'), fraction: 0.7
	... perturbation 14/72: sca

The current behaviour of 'Series.argmax' is deprecated, use 'idxmax'
instead.
The behavior of 'argmax' will be corrected to return the positional
maximum in the future. For now, use 'series.values.argmax' or
'np.argmax(np.array(values))' to get the position of the maximum
row.
  best_cleaning_idx = pd.Series(cleaner_results).argmax()


Best cleaning type + type: 0.758802929003502
perturbation:<jenga.corruptions.numerical.Outliers object at 0x135a44ac8>
clean_data_test_score:0.7862595419847328
corrupted_data_test_score:0.7366412213740458
cleaned_data_test_score:0.6641221374045801
ppp_score_corrupted:0.7379815345431392
ppp_score_cleaned:0.7194524036930914


In [4]:
debug

> [0;32m/Users/felix/anaconda3/envs/ppp/lib/python3.6/site-packages/sklearn/utils/validation.py[0m(556)[0;36mcheck_array[0;34m()[0m
[0;32m    554 [0;31m                    [0;34m"Reshape your data either using array.reshape(-1, 1) if "[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    555 [0;31m                    [0;34m"your data has a single feature or array.reshape(1, -1) "[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m--> 556 [0;31m                    "if it contains a single sample.".format(array))
[0m[0;32m    557 [0;31m[0;34m[0m[0m
[0m[0;32m    558 [0;31m        [0;31m# in the future np.flexible dtypes will be handled like object dtypes[0m[0;34m[0m[0;34m[0m[0;34m[0m[0m
[0m
ipdb> X_cleaned
*** NameError: name 'X_cleaned' is not defined
ipdb> up
> [0;32m/Users/felix/anaconda3/envs/ppp/lib/python3.6/site-packages/sklearn/compose/_column_transformer.py[0m(630)[0;36m_check_X[0;34m()[0m
[0;32m    628 [0;31m    [0;32mif[0m [0mhasattr[0m[0;34m([0m[0mX[