In [14]:
import pandas as pd
import numpy as np
import itertools, random

from jenga.tasks.reviews import VideogameReviewsTask

from jenga.cleaning.ppp import PipelineWithPPP
from jenga.cleaning.autoclean import AutoClean

from jenga.corruptions.numerical import SwappedValues, Outliers, Scaling
from jenga.corruptions.text import BrokenCharacters
from jenga.corruptions.missing import ( MissingValuesHighEntropy, 
                                  MissingValuesLowEntropy, 
                                  MissingValues
                                )

# Corruption classes grouped by the kind of data they perturb.
# NOTE(review): this mapping is not referenced by any other cell visible in
# this notebook, and SwappedValues is imported above but not listed here.
corruptions = dict(
    numeric=[Outliers, Scaling],
    text=[BrokenCharacters],
    missing=[
        MissingValuesHighEntropy,
        MissingValuesLowEntropy,
        MissingValues,
    ],
)

def get_random_perturbation(numerical_columns, 
                            categorical_columns, 
                            fractions=(.5, .7)):
    '''
    Build a randomly parameterised ``MissingValues`` corruption.

    Only missing-value corruptions on categorical columns are generated;
    no numeric corruption (swapping, outliers, scaling) is ever returned.

    Parameters
    ----------
    numerical_columns : sequence of str
        Accepted so callers can pass both column lists, but currently unused.
    categorical_columns : sequence of str
        Candidate columns; one of them is picked uniformly at random.
    fractions : sequence of float, optional
        Candidate corruption fractions; one of them is picked uniformly at
        random. Defaults to ``(.5, .7)`` (a tuple, so the default cannot be
        mutated between calls).

    Returns
    -------
    MissingValues
        Constructed positionally as
        ``MissingValues(fraction, column, None, missingness)`` where
        ``missingness`` is one of ``'MCAR'``, ``'MAR'`` or ``'MNAR'``.

    Raises
    ------
    IndexError
        Raised by ``random.choice`` if ``categorical_columns`` or
        ``fractions`` is empty.
    '''
    # The order of the three draws is kept fixed so that a seeded ``random``
    # module yields the same perturbation sequence as before.
    random_fraction = random.choice(fractions)
    missingness = random.choice(['MCAR', 'MAR', 'MNAR'])
    rand_column = random.choice(categorical_columns)
    return MissingValues(random_fraction, rand_column, None, missingness)


In [6]:
# Week-by-week evaluation: for every week the task provides, fit the baseline
# model, corrupt the week's test data with a random MissingValues perturbation,
# and compare the model's score on clean, corrupted and AutoClean-ed test data.
# NOTE(review): the `random` module is not seeded in the visible cells, so the
# chosen perturbations (and therefore the scores) differ between runs.
task = VideogameReviewsTask()

# One dict per processed week; inspected by the cells further down.
results = []

# Loop until advance_current_week() returns a falsy value.
while task.advance_current_week():

    print("----- Week", task.current_week(), "-----")

    # Training data/labels accumulated up to the current week.
    train_data = task.current_accumulated_train_data()
    train_labels = task.current_accumulated_train_labels()

    model = task.fit_baseline_model(train_data, train_labels)
    
    # Build the cleaner from the training data and the fitted model.
    ac = AutoClean(train_data, 
                   train_labels, 
                   model, 
                   numerical_columns=task.numerical_attributes, 
                   categorical_columns=task.categorical_attributes,
                   text_columns=task.text_attributes
                  )
    
    # Reference score: the model on the untouched test data of this week.
    test_data = task.current_test_data()
    predictions = model.predict_proba(test_data)
    clean_data_test_score = task.score_on_current_test_data(predictions)
    print("\tAUC on test data", clean_data_test_score)
    
    # Corrupt a deep copy so that test_data itself stays clean.
    rand_perturbation = get_random_perturbation(task.numerical_attributes,
                                            task.categorical_attributes)
    corrupted_test_data = rand_perturbation(test_data.copy(deep=True))
    
    # Clean another deep copy, so corrupted_test_data is still available
    # unmodified for the "corrupted" baseline below. The call result is
    # unpacked into three values; their exact meaning is defined by AutoClean
    # (not shown here).
    cleaned_test_data, ppp_score, cleaned_scores = ac(corrupted_test_data.copy(deep=True))
    cleaned_predictions = model.predict_proba(cleaned_test_data)
    cleaned_data_test_score = task.score_on_current_test_data(cleaned_predictions)
    
    # this is necessary to avoid crashes at prediction time, as 
    # the preprocessor simply passes on the nans and Nones
    # (numerical columns are filled with 0, categorical columns with '';
    # this mutates corrupted_test_data in place, after the cleaning step)
    corrupted_test_data[task.numerical_attributes] = corrupted_test_data[task.numerical_attributes].fillna(0)
    corrupted_test_data[task.categorical_attributes] = corrupted_test_data[task.categorical_attributes].fillna('')
    
    # Score of the model on the corrupted (uncleaned) test data.
    corrupted_predictions = model.predict_proba(corrupted_test_data)
    corrupted_data_test_score = task.score_on_current_test_data(corrupted_predictions)
    
    # Keep the perturbation object itself; a later cell reads its __dict__.
    results.append({
        'perturbation': rand_perturbation,
        'clean_data_test_score': clean_data_test_score,
        'corrupted_data_test_score': corrupted_data_test_score,
        'cleaned_data_test_score': cleaned_data_test_score,
        'ppp_score': ppp_score,
        'cleaned_scores': cleaned_scores
    })
    # Echo the perturbation parameters and every entry just recorded.
    print(f"\tScores for test data corrupted with {rand_perturbation.__dict__}")
    for k,v in results[-1].items():
        print(f'\t\t{k}:{v}')


----- Week 0 -----
Generating perturbed training data on 7798 rows ...
	... perturbation 0/78: swapped, col ('star_rating', 'total_votes'), fraction: 0.5
	... perturbation 1/78: swapped, col ('vine', 'verified_purchase'), fraction: 0.5
	... perturbation 2/78: scaling, col ['star_rating'], fraction: 0.5
	... perturbation 3/78: outlier, col ['star_rating'], fraction: 0.5
	... perturbation 4/78: missing_MCAR, col star_rating, fraction: 0.5
	... perturbation 5/78: missing_MAR, col star_rating, fraction: 0.5
	... perturbation 6/78: missing_MNAR, col star_rating, fraction: 0.5
	... perturbation 7/78: missing_MCAR, col verified_purchase, fraction: 0.5
	... perturbation 8/78: missing_MAR, col verified_purchase, fraction: 0.5
	... perturbation 9/78: missing_MNAR, col verified_purchase, fraction: 0.5
	... perturbation 10/78: missing_high_entropy, col ['vine'], fraction: 0.5
	... perturbation 11/78: missing_low_entropy, col ['vine'], fraction: 0.5
	... perturbation 12/78: broken_characters, col t

  if data_frame.columns.contains(imputation_col):
  if data_frame.columns.contains(imputation_proba_col):


no missing values detected in column star_rating
no missing values detected in column total_votes
PPP score with cleaning <class 'jenga.cleaning.imputation.DatawigImputation'>: 0.9435854338934067
Cleaning did not improve score


The current behaviour of 'Series.argmax' is deprecated, use 'idxmax'
instead.
The behavior of 'argmax' will be corrected to return the positional
maximum in the future. For now, use 'series.values.argmax' or
'np.argmax(np.array(values))' to get the position of the maximum
row.
  best_cleaning_idx = pd.Series(cleaner_results).argmax()


	Scores for test data corrupted with {'column': 'verified_purchase', 'fraction': 0.7, 'na_value': None, 'missingness': 'MCAR'}
		perturbation:<jenga.corruptions.missing.MissingValues object at 0x13c510ef0>
		clean_data_test_score:0.9692813074608936
		corrupted_data_test_score:0.9692813074608936
		cleaned_data_test_score:0.9692813074608936
		ppp_score:0.9435854338934067
		cleaned_scores:[0.9435854338934067, 0.9435854338934067]
----- Week 1 -----
Generating perturbed training data on 15530 rows ...
	... perturbation 0/78: swapped, col ('star_rating', 'total_votes'), fraction: 0.5
	... perturbation 1/78: swapped, col ('vine', 'verified_purchase'), fraction: 0.5
	... perturbation 2/78: scaling, col ['star_rating'], fraction: 0.5
	... perturbation 3/78: outlier, col ['star_rating'], fraction: 0.5
	... perturbation 4/78: missing_MCAR, col star_rating, fraction: 0.5
	... perturbation 5/78: missing_MAR, col star_rating, fraction: 0.5
	... perturbation 6/78: missing_MNAR, col star_rating, fract

  return np.log(probas)


Imputing 4742 missing values in column vine


  if data_frame.columns.contains(imputation_col):
  if data_frame.columns.contains(imputation_proba_col):


no missing values detected in column verified_purchase
no missing values detected in column star_rating
no missing values detected in column total_votes
PPP score with cleaning <class 'jenga.cleaning.imputation.DatawigImputation'>: 0.911051028731986
Best cleaning type: 0.911051028731986


The current behaviour of 'Series.argmax' is deprecated, use 'idxmax'
instead.
The behavior of 'argmax' will be corrected to return the positional
maximum in the future. For now, use 'series.values.argmax' or
'np.argmax(np.array(values))' to get the position of the maximum
row.
  best_cleaning_idx = pd.Series(cleaner_results).argmax()


	Scores for test data corrupted with {'column': 'vine', 'fraction': 0.7, 'na_value': None, 'missingness': 'MNAR'}
		perturbation:<jenga.corruptions.missing.MissingValues object at 0x1395e5978>
		clean_data_test_score:0.9703576751117735
		corrupted_data_test_score:0.9626158681441913
		cleaned_data_test_score:0.9703576751117735
		ppp_score:0.9023805824949394
		cleaned_scores:[0.911051028731986, 0.911051028731986]
----- Week 2 -----
Generating perturbed training data on 22305 rows ...
	... perturbation 0/78: swapped, col ('star_rating', 'total_votes'), fraction: 0.5
	... perturbation 1/78: swapped, col ('vine', 'verified_purchase'), fraction: 0.5
	... perturbation 2/78: scaling, col ['total_votes'], fraction: 0.5
	... perturbation 3/78: outlier, col ['total_votes'], fraction: 0.5
	... perturbation 4/78: missing_MCAR, col total_votes, fraction: 0.5
	... perturbation 5/78: missing_MAR, col total_votes, fraction: 0.5
	... perturbation 6/78: missing_MNAR, col total_votes, fraction: 0.5
	... p

  if data_frame.columns.contains(imputation_col):
  if data_frame.columns.contains(imputation_proba_col):


no missing values detected in column star_rating
no missing values detected in column total_votes
PPP score with cleaning <class 'jenga.cleaning.imputation.DatawigImputation'>: 0.9667028178343025
Cleaning did not improve score


The current behaviour of 'Series.argmax' is deprecated, use 'idxmax'
instead.
The behavior of 'argmax' will be corrected to return the positional
maximum in the future. For now, use 'series.values.argmax' or
'np.argmax(np.array(values))' to get the position of the maximum
row.
  best_cleaning_idx = pd.Series(cleaner_results).argmax()


	Scores for test data corrupted with {'column': 'verified_purchase', 'fraction': 0.7, 'na_value': None, 'missingness': 'MCAR'}
		perturbation:<jenga.corruptions.missing.MissingValues object at 0x13be5b8d0>
		clean_data_test_score:0.9726272923994546
		corrupted_data_test_score:0.9726272923994546
		cleaned_data_test_score:0.9726272923994546
		ppp_score:0.9667028178343025
		cleaned_scores:[0.9667028178343025, 0.9667028178343025]


In [15]:
# Display the per-week result dicts collected by the evaluation loop.
results

[{'perturbation': <jenga.corruptions.missing.MissingValues at 0x13c510ef0>,
  'clean_data_test_score': 0.9692813074608936,
  'corrupted_data_test_score': 0.9692813074608936,
  'cleaned_data_test_score': 0.9692813074608936,
  'ppp_score': 0.9435854338934067,
  'cleaned_scores': [0.9435854338934067, 0.9435854338934067]},
 {'perturbation': <jenga.corruptions.missing.MissingValues at 0x1395e5978>,
  'clean_data_test_score': 0.9703576751117735,
  'corrupted_data_test_score': 0.9626158681441913,
  'cleaned_data_test_score': 0.9703576751117735,
  'ppp_score': 0.9023805824949394,
  'cleaned_scores': [0.911051028731986, 0.911051028731986]},
 {'perturbation': <jenga.corruptions.missing.MissingValues at 0x13be5b8d0>,
  'clean_data_test_score': 0.9726272923994546,
  'corrupted_data_test_score': 0.9726272923994546,
  'cleaned_data_test_score': 0.9726272923994546,
  'ppp_score': 0.9667028178343025,
  'cleaned_scores': [0.9667028178343025, 0.9667028178343025]}]

In [4]:
# List the parameters (instance attributes) of each week's perturbation.
# NOTE(review): this cell's execution count (In [4]) is lower than the loop
# cell's (In [6]), so the saved output below comes from an earlier run and does
# not match the perturbations printed by the loop above; re-run after the loop.
[vars(record['perturbation']) for record in results]

[{'column': 'vine', 'fraction': 0.5, 'na_value': None, 'missingness': 'MAR'},
 {'column': 'total_votes',
  'fraction': 0.7,
  'na_value': None,
  'missingness': 'MAR'},
 {'column': 'total_votes',
  'fraction': 0.5,
  'na_value': None,
  'missingness': 'MCAR'}]