In [1]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import SGDClassifier
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

import pandas as pd
import numpy as np

# Seed NumPy's global random number generator so the calls to
# np.random.random() further down are repeatable from a fresh kernel.
np.random.seed(1)

# Put the parent directory on the import path; presumably this is where the
# project-local `ssc` package lives -- verify against the repository layout.
import sys
sys.path.append('..')

from ssc.hilda.perturbations import MissingValues

# Silence every warning for the rest of the notebook.
# NOTE(review): this also hides deprecation warnings raised by the libraries used below.
import warnings
warnings.simplefilter("ignore")

In [2]:
class auto_validating:
    """Wrap an estimator so that fitting it also trains a performance predictor.

    `fit` trains the wrapped estimator, then repeatedly corrupts the training
    data with randomly parameterised perturbations, records the model's score
    on each corrupted copy, and fits a meta-regressor that maps percentiles of
    the predicted class probabilities to that score.

    Parameters
    ----------
    estimator : object exposing ``fit(data, labels)``; the fitted result must
        expose ``predict_proba`` and ``score``.
    missing_values_in : list of column names handed to the perturbation
        generator; ``None`` (the default) means an empty list.
    n_perturbations : number of corrupted copies used to train the
        meta-regressor.
    """

    def __init__(self, estimator, missing_values_in=None, n_perturbations=500):
        self.estimator = estimator
        # Build the empty list per instance: a `[]` default in the signature
        # is created once and would be shared by every instance.
        self.missing_values_in = [] if missing_values_in is None else missing_values_in
        self.n_perturbations = n_perturbations

    def __enter__(self):
        return self

    def __exit__(self, type, value, traceback):
        # Nothing to release; returning None lets exceptions propagate.
        pass

    def __percentiles_of_probas(self, predictions):
        """Summarise class probabilities as a fixed-length feature vector.

        Takes the first two columns of `predictions` and returns the
        0th, 5th, ..., 100th percentiles of each, concatenated
        (21 values for column 0 followed by 21 values for column 1).
        """
        quantiles = np.arange(0, 101, 5)
        per_class = np.transpose(predictions)
        return np.concatenate(
            [np.percentile(per_class[class_idx], quantiles) for class_idx in (0, 1)],
            axis=0)

    def __gen_perturbations(self, err_gen, columns, dataset_size=500):
        """Lazily yield `dataset_size` perturbations built by `err_gen`.

        Each one is constructed as ``err_gen(fraction, columns, -1)`` with
        `fraction` drawn uniformly from [0, 1) via NumPy's global RNG.
        """
        for _ in range(dataset_size):
            yield err_gen(np.random.random(), columns, -1)

    def __train_meta_regressor(self, model, data, labels, perturbations):
        """Fit a regressor predicting `model`'s score from its output distribution.

        For every perturbation the data is corrupted, the percentile features
        of ``model.predict_proba`` become one training row and
        ``model.score`` on the corrupted data becomes its target.

        Returns the fitted GridSearchCV object.
        """
        X, y = [], []
        for perturbation in perturbations:
            corrupted_data = perturbation.transform(data)
            X.append(self.__percentiles_of_probas(model.predict_proba(corrupted_data)))
            y.append(model.score(corrupted_data, labels))

        # NOTE(review): criterion 'mae' was renamed to 'absolute_error' in
        # scikit-learn 1.0 -- confirm against the installed version.
        param_grid = {
            'learner__n_estimators': np.arange(5, 20, 5),
            'learner__criterion': ['mae']
        }

        meta_regressor_pipeline = Pipeline([
            ('scaling', StandardScaler()),
            ('learner', RandomForestRegressor(criterion='mae'))
        ])

        search = GridSearchCV(meta_regressor_pipeline, param_grid,
                              scoring='neg_mean_absolute_error')
        return search.fit(np.array(X), np.array(y))

    def fit(self, data, labels):
        """Train the estimator and its meta-regressor.

        Returns a PredictionInterceptor wrapping the fitted model and the
        fitted meta-regressor (also kept on ``self.meta_regressor``).
        """
        print("intercepting training...")

        model = self.estimator.fit(data, labels)

        # NOTE(review): the meta-regressor is trained on corruptions of the
        # same data the model was just fitted on.
        perturbations = self.__gen_perturbations(MissingValues,
                                                 self.missing_values_in,
                                                 self.n_perturbations)
        self.meta_regressor = self.__train_meta_regressor(model, data, labels, perturbations)

        return PredictionInterceptor(model, self.meta_regressor)
        
    
class PredictionInterceptor:
    """Score a fitted model and compare the result with a predicted score.

    Parameters
    ----------
    transformer : fitted model exposing ``predict_proba`` and ``score``.
    meta_regressor : fitted regressor whose ``predict`` maps the percentile
        features of the model's class probabilities to an expected score.
    threshold : relative deviation between real and predicted score above
        which a warning is printed.
    """

    def __init__(self, transformer, meta_regressor, threshold=.01):
        self.transformer = transformer
        self.meta_regressor = meta_regressor
        self.threshold = threshold

    def __percentiles_of_probas(self, predictions):
        """Return the 0th, 5th, ..., 100th percentiles of the first two
        probability columns, concatenated (column 0 first)."""
        quantiles = np.arange(0, 101, 5)
        per_class = np.transpose(predictions)
        return np.concatenate(
            [np.percentile(per_class[class_idx], quantiles) for class_idx in (0, 1)],
            axis=0)

    def score(self, data, labels):
        """Return the model's score on `data`, warning when it deviates from
        the meta-regressor's prediction by more than `threshold` (relative).
        """
        print("Intercepting prediction...")

        features = self.__percentiles_of_probas(self.transformer.predict_proba(data))
        # predict() returns one value per row; we pass a single row, so pull
        # out the scalar instead of carrying a 1-element array through the
        # comparison and the string formatting below.
        predicted_score = float(np.ravel(self.meta_regressor.predict(features.reshape(1, -1)))[0])
        real_score = self.transformer.score(data, labels)

        diff = abs(real_score - predicted_score)
        if real_score:
            ratio = diff / real_score
        else:
            # A zero score has no meaningful relative deviation; treat any
            # non-zero gap as unbounded rather than dividing by zero.
            ratio = float('inf') if diff else 0.0

        print(diff, ratio)
        if ratio > self.threshold:
            print("WARNING! Performance drop: %.4f > %.2f, scores deviate by %.4f" % (ratio, self.threshold, diff))

        return real_score
    

In [3]:
# Load the data set and hold out a test split.
data = pd.read_csv('../resources/data/adult/adult.csv')
train_data, test_data = train_test_split(data)

# Boolean targets: True where the 'class' column equals '>50K'.
y_train, y_test = (np.array(split['class'] == '>50K') for split in (train_data, test_data))

# One-hot encode the categorical columns, standardise the numeric ones.
categorical_columns = ['workclass', 'occupation', 'marital_status', 'education']
numeric_columns = ['hours_per_week', 'age']
feature_transformation = ColumnTransformer(transformers=[
    ('categorical', OneHotEncoder(handle_unknown='ignore'), categorical_columns),
    ('numeric', StandardScaler(), numeric_columns),
])

pipeline = Pipeline([
    ('features', feature_transformation),
    ('learner', SGDClassifier(loss='log')),
])

# Fitting through the wrapper returns an object whose score() also checks
# the result against the meta-regressor's prediction.
with auto_validating(pipeline, missing_values_in=['education']) as validatable_pipeline:
    model = validatable_pipeline.fit(train_data, y_train)

# Score on the clean test data first ...
model.score(test_data, y_test)

# ... then on copies with an increasing share of missing 'education' values.
for missing_value_ratio in [.01, .05, .1, .2, .5, .7, .9]:
    print("Corrupting the test_data with %d%% of missing values" % int(round(100 * missing_value_ratio)))
    corruption = MissingValues(missing_value_ratio, ['education'], -1)
    corrupted_test_data = corruption.transform(test_data)
    print(model.score(corrupted_test_data, y_test))
    print()

intercepting training...
Intercepting prediction...
[0.00118108] [0.00142215]
Corrupting the test_data with 1% of missing values
Intercepting prediction...
[0.00121112] [0.00145854]
0.8303648200466773

Corrupting the test_data with 5% of missing values
Intercepting prediction...
[0.0014842] [0.00178873]
0.8297506448839209

Corrupting the test_data with 10% of missing values
Intercepting prediction...
[0.00144622] [0.00174709]
0.8277852843631004

Corrupting the test_data with 20% of missing values
Intercepting prediction...
[0.00374755] [0.00452451]
0.8282766244933055

Corrupting the test_data with 50% of missing values
Intercepting prediction...
[0.00423431] [0.00515346]
0.8216435327355361

Corrupting the test_data with 70% of missing values
Intercepting prediction...
[0.00390987] [0.00477932]
0.8180813167915489

Corrupting the test_data with 90% of missing values
Intercepting prediction...
[0.00338064] [0.0041486]
0.8148876059452156



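### Another version of the API: same validation logic, different interface

Here the wrapper is configured through `check_on` / `check_for`, and `fit` returns the trained model itself.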

In [9]:
class Validatable:
    """Estimator wrapper that validates itself against a meta-regressor.

    Configure it with `check_on` (held-out data scored right after fitting)
    and `check_for` (columns to corrupt when training the meta-regressor),
    then call `fit`. Afterwards `score` reports the fitted model's score and
    warns when it deviates from the meta-regressor's prediction.

    Attributes
    ----------
    threshold : relative deviation above which `score` prints a warning.
    """

    threshold = .01

    def __init__(self, estimator):
        self.estimator = estimator
        # Populated by fit(); score() relies on both.
        self.model = None
        self.meta_regressor = None
        # Defaults so that fit() works even if check_on()/check_for() were
        # never called.
        self.test_data = None
        self.test_labels = None
        self.missing_values_in = []

    def check_on(self, data, labels):
        """Register the data/labels scored at the end of `fit`; returns self."""
        self.test_data = data
        self.test_labels = labels
        return self

    def check_for(self, missing_values_in=None):
        """Register the columns to corrupt with missing values; returns self."""
        # Build the empty list per call: a `[]` default in the signature is
        # created once and would be shared between instances.
        self.missing_values_in = [] if missing_values_in is None else missing_values_in
        return self

    def __enter__(self):
        return self

    def __exit__(self, type, value, traceback):
        # Nothing to release; returning None lets exceptions propagate.
        pass

    def __percentiles_of_probas(self, predictions):
        """Return the 0th, 5th, ..., 100th percentiles of the first two
        probability columns, concatenated (column 0 first)."""
        quantiles = np.arange(0, 101, 5)
        per_class = np.transpose(predictions)
        return np.concatenate(
            [np.percentile(per_class[class_idx], quantiles) for class_idx in (0, 1)],
            axis=0)

    def __gen_perturbations(self, err_gen, columns, dataset_size=500):
        """Lazily yield `dataset_size` perturbations built as
        ``err_gen(fraction, columns, -1)`` with `fraction` drawn uniformly
        from [0, 1) via NumPy's global RNG."""
        for _ in range(dataset_size):
            yield err_gen(np.random.random(), columns, -1)

    def __train_meta_regressor(self, model, data, labels, perturbations):
        """Fit a regressor predicting `model`'s score from its output distribution.

        Each perturbation contributes one row: percentile features of
        ``model.predict_proba`` on the corrupted data, with ``model.score``
        on that data as the target. Returns the fitted GridSearchCV object.
        """
        print("Generating corrupted data")
        X, y = [], []
        for perturbation in perturbations:
            corrupted_data = perturbation.transform(data)
            X.append(self.__percentiles_of_probas(model.predict_proba(corrupted_data)))
            y.append(model.score(corrupted_data, labels))

        # NOTE(review): criterion 'mae' was renamed to 'absolute_error' in
        # scikit-learn 1.0 -- confirm against the installed version.
        param_grid = {
            'learner__n_estimators': np.arange(5, 20, 5),
            'learner__criterion': ['mae']
        }

        meta_regressor_pipeline = Pipeline([
            ('scaling', StandardScaler()),
            ('learner', RandomForestRegressor(criterion='mae'))
        ])

        print("Training the meta_regressor")
        search = GridSearchCV(meta_regressor_pipeline, param_grid,
                              scoring='neg_mean_absolute_error')
        return search.fit(np.array(X), np.array(y))

    def fit(self, data, labels):
        """Train the estimator and the meta-regressor, then validate once.

        Returns the trained model, like a regular ``fit``.
        """
        print("Training the model")
        # Keep the fitted model on the instance: score() must not depend on
        # whatever a notebook-global called `model` happens to hold.
        self.model = self.estimator.fit(data, labels)

        self.meta_regressor = self.__train_meta_regressor(
            self.model, data, labels,
            self.__gen_perturbations(MissingValues, self.missing_values_in, 500))

        # Only validate when check_on() supplied data.
        if self.test_data is not None:
            self.score(self.test_data, self.test_labels)

        return self.model

    def score(self, data, labels):
        """Return the fitted model's score on `data`, warning when it deviates
        from the meta-regressor's prediction by more than `threshold` (relative).

        Raises RuntimeError when called before `fit`.
        """
        if self.model is None or self.meta_regressor is None:
            raise RuntimeError("Validatable.score() called before fit()")

        print("Validating the model")
        features = self.__percentiles_of_probas(self.model.predict_proba(data))
        # predict() returns one value per row; we pass a single row, so pull
        # out the scalar instead of carrying a 1-element array around.
        predicted_score = float(np.ravel(self.meta_regressor.predict(features.reshape(1, -1)))[0])
        real_score = self.model.score(data, labels)

        diff = abs(real_score - predicted_score)
        if real_score:
            ratio = diff / real_score
        else:
            # A zero score has no meaningful relative deviation; treat any
            # non-zero gap as unbounded rather than dividing by zero.
            ratio = float('inf') if diff else 0.0

        print(diff, ratio)
        if ratio > self.threshold:
            print("WARNING! Performance drop: %.4f > %.2f, scores deviate by %.4f" % (ratio, self.threshold, diff))
        return real_score

In [11]:
# Columns that receive missing values, both for the meta-regressor's
# training perturbations and for the corrupted test sets below.
corrupted_columns = ['education', 'workclass']

with Validatable(pipeline) as validatable_pipeline:
    # check_on / check_for both return the wrapper, so they can be chained.
    validatable_pipeline.check_on(test_data, y_test).check_for(missing_values_in=corrupted_columns)

    model = validatable_pipeline.fit(train_data, y_train)

# Score of the plain trained model on the clean test data.
model.score(test_data, y_test)

# Validate through the wrapper on increasingly corrupted copies of the test data.
for missing_value_ratio in [.01, .05, .1, .2, .5, .7, .9]:
    print("Corrupting the test_data with %d%% of missing values" % int(round(100 * missing_value_ratio)))
    corruption = MissingValues(missing_value_ratio, corrupted_columns, -1)
    corrupted_test_data = corruption.transform(test_data)
    print(validatable_pipeline.score(corrupted_test_data, y_test))
    print()

Training the model
Generating corrupted data
Training the meta_regressor
Validating the model
[0.01071138] [0.01288055]
Corrupting the test_data with 1% of missing values
Validating the model
[0.01077417] [0.01295606]
0.8315931703721902

Corrupting the test_data with 5% of missing values
Validating the model
[0.0109899] [0.01322329]
0.831101830241985

Corrupting the test_data with 10% of missing values
Validating the model
[0.00846226] [0.01021671]
0.8282766244933055

Corrupting the test_data with 20% of missing values
Validating the model
[0.00896757] [0.01085415]
0.8261884289399337

Corrupting the test_data with 50% of missing values
Validating the model
[0.00856444] [0.01045951]
0.8188183269868566

Corrupting the test_data with 70% of missing values
Validating the model
[0.00382621] [0.00472243]
0.8102198747082668

Corrupting the test_data with 90% of missing values
Validating the model
[0.00100675] [0.00125053]
0.8050608033411129

