In [1]:
# Copyright (c) 2022, salesforce.com, inc.
# All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause

import pandas as pd
import numpy as np
import os
import json

from scipy import sparse

## Read processed documents

In [2]:
# Load the processed corpus, stored on disk as a SciPy sparse matrix (.npz).
loaded = sparse.load_npz("./mnli_government_travel/corpus_mat.npz")

In [3]:
# Quick visual check of the matrix contents (NumPy elides the middle of large arrays in the repr).
loaded.toarray()

array([[1, 1, 1, ..., 0, 0, 0],
       [0, 1, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [4]:
# Matrix dimensions as (rows, columns); presumably (documents, tokens) — TODO confirm.
loaded.shape

(1976, 2459)

In [5]:
# Dense NumPy copy of the corpus matrix; this is the feature matrix handed to
# DebugRule.initialize further down.
corpus_binary_dense = loaded.toarray()

In [6]:
# Read the JSON sidecar file and keep its 'input_columns' entry.
# NOTE(review): presumably the token name for each matrix column, in column order — it is
# indexed with drule_obj.good_token_idx when exporting; confirm against the preprocessing step.
with open('./mnli_government_travel/input_columns.json', 'r') as json_input:
    data = json.load(json_input)
    
input_columns = data['input_columns']

## Rule Generation

In [7]:
'''get BERT model prediction and ground truth'''
# Columns used later in this notebook: 'y_pred' (model prediction) and 'y_gt' (ground-truth label).
# NOTE(review): the preview shows an 'Unnamed: 0' column, i.e. the CSV carries a saved index;
# passing index_col=0 to read_csv would avoid it — confirm nothing relies on that column first.
model_output = pd.read_csv(filepath_or_buffer="./mnli_government_travel/model_output.csv")
model_output.head()

Unnamed: 0,y_pred,y_gt
0,1,1
1,0,2
2,2,2
3,0,0
4,1,1


In [8]:
# Per-example error indicator: 1 where the prediction differs from the ground truth, else 0.
# Kept both as a standalone array (passed to DebugRule) and as a column on the frame.
is_error = model_output['y_gt'].ne(model_output['y_pred']).to_numpy().astype(int)
model_output['is_error'] = is_error

In [9]:
# Class balance of the error indicator: counts of correct (0) vs. misclassified (1) examples.
np.unique(is_error, return_counts=True)

(array([0, 1]), array([1424,  552]))

In [10]:
# Sanity check before rule mining: the row count should equal len(is_error).
corpus_binary_dense.shape

(1976, 2459)

In [11]:
import debug_rule

In [12]:
# Thresholds forwarded to DebugRule.initialize.
# NOTE(review): presumably the minimum support and minimum error rate a rule must reach
# to be kept — confirm against debug_rule.
filter_threshold = dict(support=20, err_rate=.27)

drule_obj = debug_rule.DebugRule()

# Fluent chain: initialize with the features / error labels, then fit the surrogate
# random forest. The chain's return value is the cell's displayed output.
(
    drule_obj
    .initialize(corpus_binary_dense, is_error, filter_threshold, verbose=True)
    .train_surrogate_random_forest()
)

***** finish training surrogate random forest *****


<debug_rule.DebugRule at 0x7fcdb800a640>

In [13]:
# The three steps below are run in this order on the trained DebugRule object.
# NOTE(review): presumably each stores its results on drule_obj (drule_obj.rules is read
# in later cells) — confirm in debug_rule.

# discover error-prone subpopulations
drule_obj.extract_token_rule()

# calculate p-value of the error rate in the subpopulation
drule_obj.calculate_pval()

# calculate 95% confidence interval of the error rate in the subpopulation
drule_obj.calculate_ci()

tokens with importance > 0, 43


In [14]:
# Number of entries in drule_obj.rules (the rule list exported at the end of the notebook).
len(drule_obj.rules)

108

In [15]:
def output_rules(to_output, columns, good_cols, good_idx, dataname, base_dir="."):
    """Write the rule payload and column metadata as JSON under ``<base_dir>/<dataname>/``.

    Two files are produced (existing files are overwritten):

    * ``list.json`` -- ``to_output`` serialized as-is.
    * ``test.json`` -- an object with the keys ``columns``, ``good_cols`` and ``good_idx``.

    Args:
        to_output: JSON-serializable payload written to ``list.json``.
        columns: value stored under ``'columns'`` in ``test.json``.
        good_cols: value stored under ``'good_cols'`` in ``test.json``.
        good_idx: value stored under ``'good_idx'`` in ``test.json``.
        dataname: name of the output sub-directory.
        base_dir: directory in which the ``dataname`` folder is created. Defaults to
            the current working directory, matching the previous hard-coded ``"./"``.

    Raises:
        TypeError: propagated from ``json`` when a value is not JSON-serializable
            (e.g. a NumPy array that was not converted with ``.tolist()``).
        OSError: propagated if the directory or files cannot be created.
    """
    directory = os.path.join(base_dir, dataname)
    # exist_ok avoids the check-then-create race of `if not exists: makedirs`.
    os.makedirs(directory, exist_ok=True)

    with open(os.path.join(directory, "list.json"), 'w') as output:
        json.dump(to_output, output)

    with open(os.path.join(directory, "test.json"), 'w') as output:
        json.dump({
            'columns': columns,
            'good_cols': good_cols,
            'good_idx': good_idx,
        }, output)

In [16]:
def generate_histogram(rule_lists, num_bin=20):
    """Count rules per equal-width error-rate bin.

    Each rule's ``'err_rate'`` is scaled by ``num_bin`` and floored to get its bin,
    so bin ``i`` covers ``[i / num_bin, (i + 1) / num_bin)``; the last bin also
    includes the upper edge.

    Args:
        rule_lists: iterable of mappings, each providing a numeric ``'err_rate'``.
        num_bin: number of bins. Defaults to 20, the previously hard-coded value.

    Returns:
        1-D float ``numpy.ndarray`` of length ``num_bin`` holding the per-bin counts.

    Raises:
        ValueError: if ``num_bin`` is smaller than 1.
    """
    if num_bin < 1:
        raise ValueError("num_bin must be a positive integer")

    hist = np.zeros(num_bin)
    for rule in rule_lists:
        pos_bin = int(np.floor(rule['err_rate'] * num_bin))
        # Clamp into the valid range: a rate of exactly 1.0 lands on index num_bin,
        # and a negative rate would otherwise wrap around via negative indexing.
        pos_bin = min(max(pos_bin, 0), num_bin - 1)
        hist[pos_bin] += 1
    return hist
    

# Bin the rules by error rate, then bundle everything that goes into list.json.
hist = generate_histogram(drule_obj.rules)

to_output = {
    'rule_lists': drule_obj.rules,
    'target_names': ['correct', 'errors'],
    "top_list": drule_obj.top_token_list,
    'histogram': hist.tolist(),
}

# Translate the selected token indices into their column names.
good_cols = [input_columns[token_idx] for token_idx in drule_obj.good_token_idx]

output_rules(
    to_output,
    input_columns,
    good_cols,
    drule_obj.good_token_idx.tolist(),
    'mnli_government_travel_binary',
)