In [1]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomTreesEmbedding
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import SGDClassifier
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

import tensorflow_data_validation as tfdv
from google.protobuf import text_format
import tensorflow as tf
import pandas as pd
import numpy as np
import os

import sys
sys.path.append('..') 

from ssc.hilda.experiments import *
from ssc.hilda.datasets import *
from ssc.hilda.learners import *
from analyzers import DataType
from perturbations import *

import warnings
warnings.simplefilter("ignore")

### TFDV validation routine

In [2]:
def tfdv_validate(X_train, X_test, sep=','):
    train_filename, test_filename = '/tmp/X_train.csv', '/tmp/X_test.csv'
    
    # Saving temporary csv files for stats generation
    X_train.to_csv(train_filename, sep=sep)
    X_test.to_csv(test_filename, sep=sep)
    
    # Generating tfdv stats
    train_stats = tfdv.generate_statistics_from_csv(train_filename, delimiter=sep)
    test_stats = tfdv.generate_statistics_from_csv(test_filename, delimiter=sep)

    # Infering schema, looking for anomalies, reporting
    schema = tfdv.infer_schema(train_stats)
    anomalies = tfdv.validate_statistics(statistics=test_stats, schema=schema)
    tfdv.display_anomalies(anomalies)

### Proposed system, as a with-clause context manager

In [3]:
def percentiles_of_probas(predictions):
        probs_class_a = np.transpose(predictions)[0]
        probs_class_b = np.transpose(predictions)[1]
        features_a = np.percentile(probs_class_a, np.arange(0, 101, 5))
        features_b = np.percentile(probs_class_b, np.arange(0, 101, 5))
        return np.concatenate((features_a, features_b), axis=0)


class auto_validating:
    def __init__(self, estimator, check_columns=[], check_for=MissingValues):
        self.estimator = estimator
        self.columns_to_check = check_columns
        self.err_gen = check_for
        
    def __enter__(self):
        return self
    
    def __exit__(self, type, value, traceback):
        pass
    
    def __percentiles_of_probas(self, predictions):
        return percentiles_of_probas(predictions)
    
    def __gen_perturbations(self, dataset_size=500):
        # for fraction_of_values_to_delete in [0.01, 0.05, 0.25, 0.5, 0.75, 0.99]:
        for _ in range(dataset_size):
            yield self.err_gen.on(np.random.random(), self.columns_to_check)
    
    def fit(self, data, labels):
        print("Intercepting training...")
        
        # training the model
        model = self.estimator.fit(data, labels)
        
        # TODO We need to train a validation model for missing values here
        
        X, y = [], []
        for perturbation in self.__gen_perturbations():
            # Corrupting the data
            corrupted_data = perturbation.transform(data)
            # Generating features based on model prediction, corrupted data
            X.append(self.__percentiles_of_probas(model.predict_proba(corrupted_data)))
            # Recording model performance on corrupted data as a label
            y.append(model.score(corrupted_data, labels))

        # Building and training the pipeline
        param_grid = {
            'learner__n_estimators': np.arange(5, 20, 5),
            'learner__criterion': ['mae']
        }

        meta_regressor_pipeline = Pipeline([
            ('scaling', StandardScaler()),
            ('learner', RandomForestRegressor(criterion='mae'))
        ])
        
        self.meta_regressor = (GridSearchCV(meta_regressor_pipeline,
                                            param_grid,
                                            scoring='neg_mean_absolute_error')
                               .fit(np.array(X), np.array(y)))
        
        return PredictionInterceptor(model, self.meta_regressor)
        
    
class PredictionInterceptor:
    def __init__(self, transformer, meta_regressor):
        self.transformer = transformer
        self.meta_regressor = meta_regressor
        
    def __percentiles_of_probas(self, predictions):
        return percentiles_of_probas(predictions)
        
    def score(self, data, labels):
        print("Intercepting prediction...")
        
        # TODO We need to apply the validation model here
        
        threshold = .01
        features = self.__percentiles_of_probas(self.transformer.predict_proba(data))
        predicted_score = self.meta_regressor.predict(features.reshape(1, -1))
        real_score = self.transformer.score(data, labels)
        diff = np.abs(real_score - predicted_score)
        ratio = diff / real_score
        print diff, ratio
        if ratio > threshold:
            print("*** WARNING! Performance drop: %.4f > %.2f, scores deviate by %.4f" % (ratio, threshold, diff))
        else:
            print("*** Everything is fine")
        
        return real_score

In [4]:
np.random.seed(47)

dataset = BalancedAdultDataset()
learner = LogisticRegression('accuracy')

X_train, X_test, X_target = learner.split(dataset.df)

y_train = dataset.labels_from(X_train)
y_test = dataset.labels_from(X_test)
y_target = dataset.labels_from(X_target)

# Creating a pipeline
feature_transformation = ColumnTransformer(transformers=[
    ('categorical', OneHotEncoder(handle_unknown='ignore'), ['workclass', 'occupation', 'marital_status', 'education']),
    ('numeric', StandardScaler(), ['hours_per_week', 'age'])
])

pipeline = Pipeline([
    ('features', feature_transformation),
    ('learner', SGDClassifier(loss='log'))
])

In [5]:
with auto_validating(pipeline, check_columns=['education'], check_for=MissingValues(-1)) as validatable_pipeline:
    model = validatable_pipeline.fit(X_train, y_train)

    model.score(X_test, y_test)

    for missing_value_ratio in [.01, .05, .1, .2, .5, .7]:
        print("Corrupting the test_data with %d%% of missing values" % (int(round(100*missing_value_ratio)),))
        corrupted_X_test = MissingValues(-1).on(missing_value_ratio, ['education']).transform(X_test)
        tfdv_validate(X_train, corrupted_X_test)
        print(model.score(corrupted_X_test, y_test))
        print

Intercepting training...
Intercepting prediction...
[0.00459706] [0.00577117]
*** Everything is fine
Corrupting the test_data with 1% of missing values


Unnamed: 0_level_0,Anomaly short description,Anomaly long description
Feature name,Unnamed: 1_level_1,Unnamed: 2_level_1
'education',Unexpected string values,Examples contain values missing from the schema: -1 (~1%).


Intercepting prediction...
[0.00410815] [0.00515739]
*** Everything is fine
0.7965561224489796

Corrupting the test_data with 5% of missing values


Unnamed: 0_level_0,Anomaly short description,Anomaly long description
Feature name,Unnamed: 1_level_1,Unnamed: 2_level_1
'education',Unexpected string values,Examples contain values missing from the schema: -1 (~4%).


Intercepting prediction...
[0.00461305] [0.00579589]
*** Everything is fine
0.7959183673469388

Corrupting the test_data with 10% of missing values


Unnamed: 0_level_0,Anomaly short description,Anomaly long description
Feature name,Unnamed: 1_level_1,Unnamed: 2_level_1
'education',Unexpected string values,Examples contain values missing from the schema: -1 (~10%).


Intercepting prediction...
[0.00559097] [0.00703582]
*** Everything is fine
0.7946428571428571

Corrupting the test_data with 20% of missing values


Unnamed: 0_level_0,Anomaly short description,Anomaly long description
Feature name,Unnamed: 1_level_1,Unnamed: 2_level_1
'education',Unexpected string values,Examples contain values missing from the schema: -1 (~20%).


Intercepting prediction...
[0.01034242] [0.01313111]
0.7876275510204082

Corrupting the test_data with 50% of missing values


Unnamed: 0_level_0,Anomaly short description,Anomaly long description
Feature name,Unnamed: 1_level_1,Unnamed: 2_level_1
'education',Unexpected string values,Examples contain values missing from the schema: -1 (~50%).


Intercepting prediction...
[0.01078958] [0.01386726]
0.7780612244897959

Corrupting the test_data with 70% of missing values


Unnamed: 0_level_0,Anomaly short description,Anomaly long description
Feature name,Unnamed: 1_level_1,Unnamed: 2_level_1
'education',Unexpected string values,Examples contain values missing from the schema: -1 (~70%).


Intercepting prediction...
[2.79639556e-05] [3.57355195e-05]
*** Everything is fine
0.7825255102040817



In [6]:
columns = dataset.numerical_columns
with auto_validating(pipeline, check_columns=columns, check_for=Outliers()) as validatable_pipeline:
    model = validatable_pipeline.fit(X_train, y_train)

    model.score(X_test, y_test)

    for missing_value_ratio in [.01, .05, .1, .2, .5, .7]:
        print("Corrupting the test_data with %d%% of missing values" % (int(round(100*missing_value_ratio)),))
        corrupted_X_test = Outliers().on(missing_value_ratio, columns).transform(X_test)
        tfdv_validate(X_train, corrupted_X_test)
        print(model.score(corrupted_X_test, y_test))
        print

Intercepting training...
Intercepting prediction...
[0.00673864] [0.00839253]
*** Everything is fine
Corrupting the test_data with 1% of missing values


Intercepting prediction...
[0.00610089] [0.00760429]
*** Everything is fine
0.8022959183673469

Corrupting the test_data with 5% of missing values


Intercepting prediction...
[0.00437842] [0.00551435]
*** Everything is fine
0.7940051020408163

Corrupting the test_data with 10% of missing values


Intercepting prediction...
[0.00048271] [0.00061187]
*** Everything is fine
0.7889030612244898

Corrupting the test_data with 20% of missing values


Intercepting prediction...
[0.00143207] [0.00187124]
*** Everything is fine
0.7653061224489796

Corrupting the test_data with 50% of missing values


Intercepting prediction...
[0.02620103] [0.03786472]
0.6919642857142857

Corrupting the test_data with 70% of missing values


Intercepting prediction...
[0.0343498] [0.05038399]
0.6817602040816326



In [11]:
dataset = TrollingDataset()
learner = LogisticRegression('accuracy')

X_train, X_test, X_target = learner.split(dataset.df)

y_train = dataset.labels_from(X_train)
y_test = dataset.labels_from(X_test)
y_target = dataset.labels_from(X_target)

missing_value_ratio = .1
corrupted_X_test = Leetspeak('label', 1).on(missing_value_ratio, 'content').transform(X_test)
tfdv_validate(X_train, corrupted_X_test, sep='\t')

# Creating a pipeline
feature_transformation = ColumnTransformer(transformers=[
    ('textual_features', HashingVectorizer(ngram_range=(1, 3), n_features=100000), 'content'),
], sparse_threshold=1.0)

param_grid = {
    'learner__loss': ['log'],
    'learner__penalty': ['l2', 'l1', 'elasticnet'],
    'learner__alpha': [0.0001, 0.001, 0.01, 0.1]
}

pipeline = Pipeline([
    ('features', feature_transformation), 
    ('learner', SGDClassifier(loss='log', max_iter=1000))])

ValueError: No line was found. [while running 'DecodeData/ParseCSVRecords']

In [10]:
with auto_validating(pipeline, check_columns='content', check_for=Leetspeak('label', 1)) as validatable_pipeline:
    model = validatable_pipeline.fit(X_train, y_train)

    model.score(X_test, y_test)

    for missing_value_ratio in [.01, .05, .1, .2, .5, .7]:
        print("Corrupting the test_data with %d%% of missing values" % (int(round(100*missing_value_ratio)),))
        corrupted_X_test = Leetspeak('label', 1).on(missing_value_ratio, 'content').transform(X_test)
        tfdv_validate(X_train, corrupted_X_test, sep='\t')
        print(model.score(corrupted_X_test, y_test))
        print

Intercepting training...
Intercepting prediction...
[0.05387785] [0.06868821]
Corrupting the test_data with 1% of missing values


ValueError: No line was found. [while running 'DecodeData/ParseCSVRecords']