In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from Clustering import Clustering
from DataNoiseAdder import DataNoiseAdder
from DatasetCorruptor import DatasetCorruptor
from DecisionTreeEnsemble import DecisionTreeEnsemble
from SyntheticDataGenerator import SyntheticDataGenerator
from EnsembleDiversity import EnsembleDiversity
from EnsembleMetrics import EnsembleMetrics

from utils import get_dataset, get_ensemble_preds_from_models, get_precision_recall_auc, auprc_threshs
from utils import plot_precision_recall, plot_aroc_at_curve, fitness_scatter
from utils import compute_metrics_in_buckets, flatten_df, compute_cluster_metrics

from tqdm.notebook import tqdm

import warnings
warnings.filterwarnings('ignore')

### Experiment Args

In [2]:
args = {}
args['ntrls'] = 10
args['ensemble_size'] = 10
args['dataset_path'] = "/Users/scottmerrill/Documents/UNC/Research/OOD-Ensembles/datasets"
args['dataset_name'] = 'german' 

# Decision Tree/Model Pool Params
args['num_classifiers'] = 100
args['feature_fraction'] = 0.5
args['data_fraction'] = 0.8
args['max_depth'] = 10
args['min_samples_leaf'] = 5
args['random_state'] = 1
args['clusters_list'] = [3]
args['shift_feature_count'] = 5

AUCTHRESHS = np.array([0.1, 0.2, 0.3, 0.4, 1. ])

In [3]:
x_train, y_train, x_val_id, y_val_id, x_val_ood, y_val_ood = get_dataset(args['dataset_path'] , args['dataset_name'])
num_features = x_train.shape[1]

In [5]:
model_pool = DecisionTreeEnsemble(args['num_classifiers'], 
                                  args['feature_fraction'],
                                  args['data_fraction'],
                                  args['max_depth'],
                                  args['min_samples_leaf'],
                                  args['random_state'])

# train model pool
model_pool.train(x_train, y_train)

In [6]:
def get_categorical_and_float_features(data, unique_threshold=10):
    """
    Separate categorical and float features based on the number of unique values.
    
    - data: numpy array (rows: samples, cols: features)
    - unique_threshold: max number of unique values to consider a feature as categorical
    
    Returns:
    - categorical_features: list of indices for categorical features
    - float_features: list of indices for float (continuous) features
    """
    categorical_features = []
    float_features = []

    # Iterate over each column
    for col in range(data.shape[1]):
        unique_values = np.unique(data[:, col])
        if len(unique_values) <= unique_threshold:
            categorical_features.append(col)  # Consider it categorical if the unique values are below the threshold
        else:
            float_features.append(col)  # Otherwise, it's treated as a float/continuous feature

    return categorical_features, float_features

## Get most correlated shift count features

In [8]:
# Convert x_train and y_train to DataFrames (assuming x_train has multiple features)
x_train_df = pd.DataFrame(x_train)
y_train_df = pd.Series(y_train)

# Concatenate x_train_df and y_train_df
data = pd.concat([x_train_df, y_train_df], axis=1)

# Calculate correlation matrix
correlation_matrix = data.corr()

# Extract correlations between each feature and the target variable (y_train is the last column)
correlations_with_target = correlation_matrix.iloc[:-1, -1]

# Sort correlations in descending order
sorted_correlations = correlations_with_target.abs().sort_values(ascending=False)

In [9]:
# we ill add noise to these features
categorical_cols, float_cols = get_categorical_and_float_features(x_train)

add_nosie_feats = sorted_correlations.index[:args['shift_feature_count'] ]

add_nosie_feats_float = [x for x in add_nosie_feats if x in float_cols]
add_nosie_feats_categorical = [x for x in add_nosie_feats if x in categorical_cols]

In [10]:
add_nosie_feats_categorical

[8, 6, 10, 26]

In [11]:
train_noise = DataNoiseAdder(x_train)
val_noise = DataNoiseAdder(x_val_id)

### Guassian

In [12]:
x_train_noise = train_noise.add_gaussian_noise(add_nosie_feats_float)
x_train_noise = train_noise.add_categorical_noise(add_nosie_feats_categorical)

x_val_noise = val_noise.add_gaussian_noise(add_nosie_feats_float)
x_val_noise = val_noise.add_categorical_noise(add_nosie_feats_categorical)

model_pool.train_gaussian_preds = model_pool.get_individual_predictions(x_train_noise).T
model_pool.train_gaussian_pred_probs = model_pool.get_individual_probabilities(x_train_noise)

model_pool.val_gaussian_preds = model_pool.get_individual_predictions(x_val_noise).T
model_pool.val_gaussian_pred_probs = model_pool.get_individual_probabilities(x_val_noise)
del x_train_noise, x_val_noise

### Uniform

In [13]:
x_train_noise = train_noise.add_uniform_noise(add_nosie_feats_float)
x_train_noise = train_noise.add_categorical_noise(add_nosie_feats_categorical)

x_val_noise = val_noise.add_uniform_noise(add_nosie_feats_float)
x_val_noise = val_noise.add_categorical_noise(add_nosie_feats_categorical)

model_pool.train_uniform_preds = model_pool.get_individual_predictions(x_train_noise).T
model_pool.train_uniform_pred_probs = model_pool.get_individual_probabilities(x_train_noise)

model_pool.val_uniform_preds = model_pool.get_individual_predictions(x_val_noise).T
model_pool.val_uniform_pred_probs = model_pool.get_individual_probabilities(x_val_noise)
del x_train_noise, x_val_noise

### Laplace

In [14]:
x_train_noise = train_noise.add_laplace_noise(add_nosie_feats_float)
x_train_noise = train_noise.add_categorical_noise(add_nosie_feats_categorical)

x_val_noise = val_noise.add_laplace_noise(add_nosie_feats_float)
x_val_noise = val_noise.add_categorical_noise(add_nosie_feats_categorical)

model_pool.train_laplace_preds = model_pool.get_individual_predictions(x_train_noise).T
model_pool.train_laplace_pred_probs = model_pool.get_individual_probabilities(x_train_noise)

model_pool.val_laplace_preds = model_pool.get_individual_predictions(x_val_noise).T
model_pool.val_laplace_pred_probs = model_pool.get_individual_probabilities(x_val_noise)
del x_train_noise, x_val_noise

### Dropout noise

In [15]:
x_train_noise = train_noise.add_dropout_noise(add_nosie_feats_float)
x_train_noise = train_noise.add_categorical_noise(add_nosie_feats_categorical)

x_val_noise = val_noise.add_dropout_noise(add_nosie_feats_float)
x_val_noise = val_noise.add_categorical_noise(add_nosie_feats_categorical)

model_pool.train_dropout_preds = model_pool.get_individual_predictions(x_train_noise).T
model_pool.train_dropout_pred_probs = model_pool.get_individual_probabilities(x_train_noise)

model_pool.val_dropout_preds = model_pool.get_individual_predictions(x_val_noise).T
model_pool.val_dropout_pred_probs = model_pool.get_individual_probabilities(x_val_noise)
del x_train_noise, x_val_noise

### Boundary Shift

In [16]:
x_train_noise = train_noise.add_concept_shift(shift_type="boundary_shift",
                                             shift_params={'feature_col':float_cols[0]})
x_train_noise = train_noise.add_categorical_noise(add_nosie_feats_categorical)

x_val_noise = train_noise.add_concept_shift(shift_type="boundary_shift",
                                             shift_params={'feature_col':float_cols[0]})
x_val_noise = val_noise.add_categorical_noise(add_nosie_feats_categorical)

model_pool.train_boundaryshift_preds = model_pool.get_individual_predictions(x_train_noise).T
model_pool.train_boundaryshift_pred_probs = model_pool.get_individual_probabilities(x_train_noise)

model_pool.val_boundaryshift_preds = model_pool.get_individual_predictions(x_val_noise).T
model_pool.val_boundaryshift_pred_probs = model_pool.get_individual_probabilities(x_val_noise)
del x_train_noise, x_val_noise

### Scaling Shift

In [17]:
x_train_noise = train_noise.add_covariate_shift(add_nosie_feats_float, 
                                shift_type='scaling',
                                shift_params = {'scale_factor':1.2})


x_val_noise = train_noise.add_covariate_shift(add_nosie_feats_float, 
                                shift_type='scaling',
                                shift_params = {'scale_factor':1.2})

model_pool.train_upscaleshift_preds = model_pool.get_individual_predictions(x_train_noise).T
model_pool.train_upscaleshift_pred_probs = model_pool.get_individual_probabilities(x_train_noise)

model_pool.val_upscaleshift_preds = model_pool.get_individual_predictions(x_val_noise).T
model_pool.val_upscaleshift_pred_probs = model_pool.get_individual_probabilities(x_val_noise)
del x_train_noise, x_val_noise

In [18]:
x_train_noise = train_noise.add_covariate_shift(add_nosie_feats_float, 
                                shift_type='scaling',
                                shift_params = {'scale_factor':0.8})


x_val_noise = train_noise.add_covariate_shift(add_nosie_feats_float, 
                                shift_type='scaling',
                              shift_params = {'scale_factor':0.8})

model_pool.train_downscaleshift_preds = model_pool.get_individual_predictions(x_train_noise).T
model_pool.train_downscaleshift_pred_probs = model_pool.get_individual_probabilities(x_train_noise)

model_pool.val_downscaleshift_preds = model_pool.get_individual_predictions(x_val_noise).T
model_pool.val_downscaleshift_pred_probs = model_pool.get_individual_probabilities(x_val_noise)
del x_train_noise, x_val_noise

### Distribution shift

In [19]:
x_train_noise = train_noise.add_covariate_shift(add_nosie_feats_float, 
                                shift_type='distribution',
                                shift_params = {'dist_type':'uniform'})
x_val_noise = val_noise.add_covariate_shift(add_nosie_feats_float, 
                                shift_type='distribution',
                                shift_params = {'dist_type':'uniform'})

model_pool.train_distshiftuniform_preds = model_pool.get_individual_predictions(x_train_noise).T
model_pool.train_distshiftuniform_pred_probs = model_pool.get_individual_probabilities(x_train_noise)

model_pool.val_distshiftuniform_preds = model_pool.get_individual_predictions(x_val_noise).T
model_pool.val_distshiftuniform_pred_probs = model_pool.get_individual_probabilities(x_val_noise)
del x_train_noise, x_val_noise

In [20]:
x_train_noise = train_noise.add_covariate_shift(add_nosie_feats_float, 
                                shift_type='distribution',
                                shift_params = {'dist_type':'normal'})


x_val_noise = val_noise.add_covariate_shift(add_nosie_feats_float, 
                                shift_type='distribution',
                                shift_params = {'dist_type':'normal'})

In [21]:
x_train_noise = train_noise.add_covariate_shift(add_nosie_feats_float, 
                                shift_type='distribution',
                                shift_params = {'dist_type':'uniform'})
x_val_noise = val_noise.add_covariate_shift(add_nosie_feats_float, 
                                shift_type='distribution',
                                shift_params = {'dist_type':'uniform'})

model_pool.train_distshiftgaussian_preds = model_pool.get_individual_predictions(x_train_noise).T
model_pool.train_distshiftgaussian_pred_probs = model_pool.get_individual_probabilities(x_train_noise)

model_pool.val_distshiftgaussian_preds = model_pool.get_individual_predictions(x_val_noise).T
model_pool.val_distshiftgaussian_pred_probs = model_pool.get_individual_probabilities(x_val_noise)
del x_train_noise, x_val_noise

### label Shift

In [22]:
y_train_flipped = DataNoiseAdder.label_flip(y_train)
y_val_flipped = DataNoiseAdder.label_flip(y_val_id)

In [29]:
a = [x for x in dir(model_pool) if 'preds' in x]
b = [x for x in dir(model_pool) if 'pred_' in x]
attributes = [(a[i], b[i], a[i].split('_')[1]) for i in range(len(a))]

In [43]:
attributes = [(a[i], b[i], a[i].split('_')[0]+'_'+a[i].split('_')[1]) for i in range(len(a))]

In [44]:
getattr(model_pool, 'val_upscaleshift_preds')

array([[0., 1., 1., ..., 1., 0., 1.],
       [1., 1., 1., ..., 1., 1., 1.],
       [0., 1., 1., ..., 1., 0., 1.],
       ...,
       [0., 0., 1., ..., 1., 0., 0.],
       [1., 1., 1., ..., 0., 1., 1.],
       [0., 1., 1., ..., 0., 0., 0.]])

In [45]:
attributes

[('train_boundaryshift_preds',
  'train_boundaryshift_pred_probs',
  'train_boundaryshift'),
 ('train_distshiftgaussian_preds',
  'train_distshiftgaussian_pred_probs',
  'train_distshiftgaussian'),
 ('train_distshiftuniform_preds',
  'train_distshiftuniform_pred_probs',
  'train_distshiftuniform'),
 ('train_downscaleshift_preds',
  'train_downscaleshift_pred_probs',
  'train_downscaleshift'),
 ('train_dropout_preds', 'train_dropout_pred_probs', 'train_dropout'),
 ('train_gaussian_preds', 'train_gaussian_pred_probs', 'train_gaussian'),
 ('train_laplace_preds', 'train_laplace_pred_probs', 'train_laplace'),
 ('train_uniform_preds', 'train_uniform_pred_probs', 'train_uniform'),
 ('train_upscaleshift_preds',
  'train_upscaleshift_pred_probs',
  'train_upscaleshift'),
 ('val_boundaryshift_preds',
  'val_boundaryshift_pred_probs',
  'val_boundaryshift'),
 ('val_distshiftgaussian_preds',
  'val_distshiftgaussian_pred_probs',
  'val_distshiftgaussian'),
 ('val_distshiftuniform_preds',
  'val_di

In [4]:
x_train

array([[ 2.23649784,  0.11253436,  0.89375719, ...,  0.        ,
         1.        ,  0.        ],
       [ 0.57679816, -0.18336508,  0.89375719, ...,  0.        ,
         1.        ,  0.        ],
       [-0.75096158, -0.66155009,  0.00225696, ...,  1.        ,
         1.        ,  1.        ],
       ...,
       [-0.99991653, -0.78055703,  0.89375719, ...,  0.        ,
         1.        ,  0.        ],
       [ 2.23649784,  0.24376558,  0.89375719, ...,  0.        ,
         1.        ,  1.        ],
       [-0.75096158, -0.82765646,  0.89375719, ...,  0.        ,
         0.        ,  0.        ]])