In [1]:
import numpy as np
import itertools
import random
from sklearn.compose import ColumnTransformer
from sklearn.datasets import fetch_openml
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV

np.random.seed(0)

# Load data from https://www.openml.org/d/40945
X, y = fetch_openml("titanic", version=1, as_frame=True, return_X_y=True)

# Alternatively X and y can be obtained directly from the frame attribute:
# X = titanic.frame.drop('survived', axis=1)
# y = titanic.frame['survived']

# We will train our classifier with the following features:
# Numeric Features:
# - age: float.
# - fare: float.
# Categorical Features:
# - embarked: categories encoded as strings {'C', 'S', 'Q'}.
# - sex: categories encoded as strings {'female', 'male'}.
# - pclass: ordinal integers {1, 2, 3}.

# We create the preprocessing pipelines for both numeric and categorical data.
numeric_features = ['age', 'fare']
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

categorical_features = ['embarked', 'sex', 'pclass']
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer(
    transformers=[
        ('numerical_features', numeric_transformer, numeric_features),
        ('categorical_features', categorical_transformer, categorical_features)])

# Append classifier to preprocessing pipeline.
# Now we have a full prediction pipeline.
clf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', LogisticRegression())])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

clf.fit(X_train, y_train)
print("model score: %.3f" % clf.score(X_test, y_test))

model score: 0.790


In [2]:
import pandas as pd
import numpy as np
import random
import itertools
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.pipeline import Pipeline

class SwappedValues:

    def __init__(self, fraction, column_pair):
        self.fraction = fraction
        self.column_pair = column_pair

    def __call__(self, clean_df):
        # we operate on a copy of the data
        df = clean_df.copy(deep=True)

        (column_a, column_b) = self.column_pair

        values_of_column_a = list(df[column_a])
        values_of_column_b = list(df[column_b])

        for index in range(0, len(values_of_column_a)):
            if random.random() < self.fraction:
                temp_value = values_of_column_a[index]
                values_of_column_a[index] = values_of_column_b[index]
                values_of_column_b[index] = temp_value

        df[column_a] = values_of_column_a
        df[column_b] = values_of_column_b

        return df

class Outliers:

    def __init__(self, fraction, columns):
        self.fraction = fraction
        self.columns = columns

    def __call__(self, clean_df):
        # we operate on a copy of the data
        df = clean_df.copy(deep=True)
        # means = {column: np.mean(df[column]) for column in self.columns}
        stddevs = {column: np.std(df[column]) for column in self.columns}
        scales = {column: random.uniform(1, 5) for column in self.columns}

        if self.fraction > 0:
            for column in self.columns:
                rows = np.random.uniform(size=len(df))<self.fraction
                noise = np.random.normal(0, scales[column] * stddevs[column], size=rows.sum())
                df.loc[rows, column] += noise

        return df


class Scaling:

    def __init__(self, fraction, columns):
        self.fraction = fraction
        self.columns = columns

    def __call__(self, clean_df):
        # we operate on a copy of the data
        df = clean_df.copy(deep=True)

        scale_factor = np.random.choice([10, 100, 1000])
        
        if self.fraction > 0:
            for column in self.columns:
                rows = np.random.uniform(size=len(df))<self.fraction
                df.loc[rows, column] *= scale_factor

        return df

class MissingValuesHighEntropy:

    def __init__(self, 
                    fraction, 
                    model, 
                    categorical_columns, 
                    numerical_columns,
                    categorical_value_to_put_in='NULL',
                    numerical_value_to_put_in=0):
        self.fraction = fraction
        self.model = model
        self.categorical_value_to_put_in = categorical_value_to_put_in
        self.numerical_value_to_put_in = numerical_value_to_put_in
        self.categorical_columns = categorical_columns
        self.numerical_columns = numerical_columns

    def __call__(self, clean_df):
        # we operate on a copy of the data
        df = clean_df.copy(deep=True)
        df[self.categorical_columns] = df[self.categorical_columns].astype(str) 
        probas = self.model.predict_proba(df)
        # for samples with the smallest maximum probability 
        # the model is most uncertain
        cutoff = int(len(df) * (1-self.fraction))
        least_confident = probas.max(axis=1).argsort()[-cutoff:]
        for c in self.categorical_columns:
#             df.loc[df.index[least_confident], c].add_categories(self.categorical_value_to_put_in)
            df.loc[df.index[least_confident], c] = self.categorical_value_to_put_in
        for c in self.numerical_columns:
            df.loc[df.index[least_confident], c] = self.numerical_value_to_put_in

        return df

class MissingValuesLowEntropy:

    def __init__(self, 
                    fraction, 
                    model, 
                    categorical_columns, 
                    numerical_columns,
                    categorical_value_to_put_in='NULL',
                    numerical_value_to_put_in=0):
        self.fraction = fraction
        self.model = model
        self.categorical_value_to_put_in = categorical_value_to_put_in
        self.numerical_value_to_put_in = numerical_value_to_put_in
        self.categorical_columns = categorical_columns
        self.numerical_columns = numerical_columns

    def __call__(self, clean_df):
        # we operate on a copy of the data
        df = clean_df.copy(deep=True)
        df[self.categorical_columns] = df[self.categorical_columns].astype(str) 
        probas = self.model.predict_proba(df)
        # for samples with the smallest maximum probability 
        # the model is most uncertain
        cutoff = int(len(df) * (1-self.fraction))
        most_confident = probas.max(axis=1).argsort()[:cutoff]
        for c in self.categorical_columns:
#             df.loc[df.index[least_confident], c].add_categories(self.categorical_value_to_put_in)
            df.loc[df.index[most_confident], c] = self.categorical_value_to_put_in
        for c in self.numerical_columns:
            df.loc[df.index[most_confident], c] = self.numerical_value_to_put_in

        return df

class PipelineWithPPP:

    def __init__(self, 
                pipeline, 
                num_repetitions=10, 
                perturbation_fractions=np.linspace(0,1,11)):
        self.pipeline = pipeline
        self.num_repetitions = num_repetitions
        self.perturbation_fractions = perturbation_fractions
        # assuming the first step is a ColumnTransformer with transformers named 
        # 'categorical_features' or 'numerical_features'
        self.categorical_features = []
        self.numerical_features = []
        for t in pipeline.steps[0][1].transformers:
            if t[0]=='categorical_features':
                self.categorical_features = t[2]
            if t[0]=='numerical_features':
                self.numerical_features = t[2]
        print(f'Registered categorical columns: {self.categorical_features}')
        print(f'Registered numerical columns: {self.numerical_features}')

        
        self.perturbations = []
        for _ in range(self.num_repetitions):
            for fraction in self.perturbation_fractions:
                
                numerical_column_pairs = list(itertools.combinations(self.numerical_features, 2))
                swap_affected_column_pair = random.choice(numerical_column_pairs)
                affected_numeric_column = random.choice(self.numerical_features)
                affected_categorical_column = np.random.choice(self.categorical_features)

                self.perturbations += [
                    ('swapped', SwappedValues(fraction, swap_affected_column_pair)),
                    ('scaling', Scaling(fraction, [affected_numeric_column])),
                    ('outlier', Outliers(fraction, [affected_numeric_column])),
                    ('missing_high_entropy', MissingValuesHighEntropy(fraction, pipeline, [affected_categorical_column], [affected_numeric_column])),
                    ('missing_low_entropy', MissingValuesLowEntropy(fraction, pipeline, [affected_categorical_column], [affected_numeric_column])),
                ]


    @staticmethod
    def compute_ppp_features(predictions):
        probs_class_a = np.transpose(predictions)[0]
        features_a = np.percentile(probs_class_a, np.arange(0, 101, 5))
        if predictions.shape[-1] > 1:
            probs_class_b = np.transpose(predictions)[1]
            features_b = np.percentile(probs_class_b, np.arange(0, 101, 5))
            return np.concatenate((features_a, features_b), axis=0)
        else:
            return features_a

    def fit_ppp(self, X_df, y):

        print("Generating perturbed training data...")
        meta_features = []
        meta_scores = []
        for perturbation in self.perturbations:
            df_perturbed = perturbation[1](X_df)
      
            predictions = self.pipeline.predict_proba(df_perturbed)
            
            meta_features.append(self.compute_ppp_features(predictions))
            meta_scores.append(self.pipeline.score(df_perturbed, y))
 
        param_grid = {
            'learner__n_estimators': np.arange(5, 20, 5),
            'learner__criterion': ['mae']
        }

        meta_regressor_pipeline = Pipeline([
           ('scaling', StandardScaler()),
           ('learner', RandomForestRegressor(criterion='mae'))
        ])

        print("Training performance predictor...")
        self.meta_regressor = GridSearchCV(
                                meta_regressor_pipeline, 
                                param_grid, 
                                scoring='neg_mean_absolute_error')\
                                    .fit(meta_features, meta_scores)

    def predict_ppp(self, X_df):
        meta_features = self.compute_ppp_features(self.pipeline.predict_proba(X_df))
        return self.meta_regressor.predict(meta_features.reshape(1, -1))[0]

In [5]:
# from ppp import PipelineWithPPP
ppp = PipelineWithPPP(clf, num_repetitions=100)
ppp.fit_ppp(X_train, y_train)
print(f'Predicted score: {ppp.predict_ppp(X_test):.4f}, true score {clf.score(X_test, y_test):.4f}')

Registered categorical columns: ['embarked', 'sex', 'pclass']
Registered numerical columns: ['age', 'fare']
Generating perturbed training data...
Training performance predictor...
Predicted score: 0.7844, true score 0.7901
