In [1]:
import pandas as pd
import numpy as np
import itertools, random

from jenga.tasks.reviews import VideogameReviewsTask

from jenga.cleaning.ppp import PipelineWithPPP
from jenga.cleaning.autoclean import AutoClean

from jenga.corruptions.numerical import SwappedValues, Outliers, Scaling
from jenga.corruptions.text import BrokenCharacters
from jenga.corruptions.missing import ( MissingValuesHighEntropy, 
                                  MissingValuesLowEntropy, 
                                  MissingValues
                                )

# Corruption classes grouped by category name. In this notebook only the
# 'numeric' group is looked up (by get_random_perturbation).
corruptions = dict(
    numeric=[Outliers, Scaling],
    text=[BrokenCharacters],
    missing=[
        MissingValuesHighEntropy,
        MissingValuesLowEntropy,
        MissingValues,
    ],
)

def get_random_perturbation(numerical_columns, 
                            categorical_columns, 
                            fractions=[.7, .9]):
    '''
    Returns a randomly chosen, randomly parameterised perturbation.

    With a roughly even chance the perturbation is either
      * a numeric corruption class drawn from ``corruptions['numeric']``,
        built for one randomly chosen numerical column, or
      * a ``MissingValues`` corruption with a randomly chosen missingness
        pattern ('MCAR', 'MAR' or 'MNAR'), built for one randomly chosen
        column out of the categorical and numerical columns together.

    If ``numerical_columns`` is empty the missingness branch is always taken,
    because there is no column a numeric corruption could be applied to.

    Parameters
    ----------
    numerical_columns : sequence of column names
    categorical_columns : sequence of column names
    fractions : sequence of candidate fractions; one is picked at random and
        passed on to the perturbation. The default list is never mutated here.

    Returns
    -------
    The instantiated perturbation object.
    '''
    random_fraction = random.choice(fractions)

    # Always draw the coin flip so the random stream is consumed in the same
    # order regardless of which columns are available.
    prefer_numeric = np.random.rand() > .5
    has_numerical = len(numerical_columns) > 0

    if prefer_numeric and has_numerical:
        random_perturbation = random.choice(corruptions['numeric'])
        # The numeric corruptions are given a list holding the single column.
        rand_column = [random.choice(numerical_columns)]
        return random_perturbation(random_fraction, rand_column)

    missingness = random.choice(['MCAR', 'MAR', 'MNAR'])
    # Coerce to lists so that tuples or other sequences concatenate instead of
    # raising (or, for array-likes, adding element-wise).
    candidate_columns = list(categorical_columns) + list(numerical_columns)
    # MissingValues is given a single column name, not a list.
    rand_column = random.choice(candidate_columns)
    return MissingValues(random_fraction, rand_column, None, missingness)
    

In [None]:
task = VideogameReviewsTask()

# One record per processed week: the perturbation applied and every score
# computed for that week.
results = []

while task.advance_current_week():

    print("----- Week", task.current_week(), "-----")

    train_data = task.current_accumulated_train_data()
    train_labels = task.current_accumulated_train_labels()

    model = task.fit_baseline_model(train_data, train_labels)

    ac = AutoClean(train_data,
                   train_labels,
                   model,
                   numerical_columns=task.numerical_attributes,
                   categorical_columns=task.categorical_attributes,
                   text_columns=task.text_attributes
                  )

    # Baseline: score the model on the untouched test data.
    test_data = task.current_test_data()
    predictions = model.predict_proba(test_data)
    clean_data_test_score = task.score_on_current_test_data(predictions)
    print("\tAUC on test data", clean_data_test_score)

    # Corrupt a deep copy so test_data itself stays clean.
    rand_perturbation = get_random_perturbation(task.numerical_attributes,
                                                task.categorical_attributes)
    corrupted_test_data = rand_perturbation(test_data.copy(deep=True))

    # this is necessary to avoid crashes at prediction time, as
    # the preprocessor simply passes on the nans and Nones
    #
    # The fill has to be assigned back: `df[cols].fillna(..., inplace=True)`
    # only fills the temporary frame produced by `df[cols]` and leaves `df`
    # untouched. A per-column fill value dict handles both column groups in
    # one call; columns not named in the dict are not filled.
    fill_values = {column: 0 for column in task.numerical_attributes}
    fill_values.update({column: '' for column in task.categorical_attributes})
    corrupted_test_data = corrupted_test_data.fillna(value=fill_values)

    corrupted_predictions = model.predict_proba(corrupted_test_data)
    corrupted_data_test_score = task.score_on_current_test_data(corrupted_predictions)

    # Let AutoClean repair the corrupted data, then score the model again.
    cleaned_test_data, ppp_score, cleaned_scores = ac(corrupted_test_data)
    cleaned_predictions = model.predict_proba(cleaned_test_data)
    cleaned_data_test_score = task.score_on_current_test_data(cleaned_predictions)
    results.append({
        'perturbation': rand_perturbation,
        'clean_data_test_score': clean_data_test_score,
        'corrupted_data_test_score': corrupted_data_test_score,
        'cleaned_data_test_score': cleaned_data_test_score,
        'ppp_score': ppp_score,
        'cleaned_scores': cleaned_scores
    })
    print(f"\tScores for test data corrupted with {rand_perturbation.__dict__}")
    for k,v in results[-1].items():
        print(f'\t\t{k}:{v}')


----- Week 0 -----


In [None]:
# Display every per-week result record collected by the evaluation loop.
results

In [None]:
# Attribute dictionary of the perturbation stored in each result record.
[vars(record['perturbation']) for record in results]