In [1]:
# Copyright (c) 2022, salesforce.com, inc.
# All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause

import pandas as pd
import numpy as np

import nltk
import spacy

import os
import json

import altair as alt

In [2]:
df = pd.read_json("./mnli_government_travel/doc.jsonl", lines=False, orient='records')

In [3]:
df.head()

Unnamed: 0,sentence1,sentence2,gold_label
0,Calcutta seems to be the only other production...,Most of Mrinal Sen's work can be found in Euro...,neutral
1,The most important directions are simply up an...,"Go downwards to one of the gates, all of which...",contradiction
2,The bhakti movement of the Tamils brought a ne...,The Tamils' bhakti movement froze the previous...,contradiction
3,Buffet and a la carte available.,It has a buffet.,entailment
4,Not quite as large is the Papal Crose commemor...,Pope John Paul II also visited in 1983.,neutral


In [4]:
df.shape

(1976, 3)

In [5]:
model_output = pd.read_csv(filepath_or_buffer = "./mnli_government_travel/model_output.csv", header = 0)
model_output

Unnamed: 0,y_pred,y_gt
0,1,1
1,0,2
2,2,2
3,0,0
4,1,1
...,...,...
1971,0,0
1972,2,1
1973,0,0
1974,0,0


In [6]:
df['y_pred'] = model_output['y_pred'].values
df['y_gt'] = model_output['y_gt'].values

In [7]:
df.head()

Unnamed: 0,sentence1,sentence2,gold_label,y_pred,y_gt
0,Calcutta seems to be the only other production...,Most of Mrinal Sen's work can be found in Euro...,neutral,1,1
1,The most important directions are simply up an...,"Go downwards to one of the gates, all of which...",contradiction,0,2
2,The bhakti movement of the Tamils brought a ne...,The Tamils' bhakti movement froze the previous...,contradiction,2,2
3,Buffet and a la carte available.,It has a buffet.,entailment,0,0
4,Not quite as large is the Papal Crose commemor...,Pope John Paul II also visited in 1983.,neutral,1,1


In [8]:
is_error = (df['y_gt'] != df['y_pred']).astype(int)
is_error

0       0
1       1
2       0
3       0
4       0
       ..
1971    0
1972    1
1973    0
1974    0
1975    0
Length: 1976, dtype: int64

In [9]:
is_error.sum()/is_error.shape[0]

0.2793522267206478

In [10]:
key_list = ['y_gt', 'y_pred']

In [11]:
to_save = {}
charts = []
for key in key_list:
    stat_df = pd.DataFrame()
    stat_df[key] = df[key]
    stat_df['is_error'] = is_error
    to_render = stat_df.groupby([key]).sum().reset_index()
    to_render['tot'] = stat_df.groupby([key]).count().reset_index()['is_error']
    charts.append(alt.Chart(to_render).mark_bar(opacity=.5).encode(
        x= key+':N',
        y= 'tot:Q'
    )+alt.Chart(to_render).mark_bar().encode(
        x= key+':N', 
        y= 'is_error:Q'
    ))
    
    to_save['by_'+key] = to_render.to_dict("index")

In [12]:
alt.hconcat(charts[0],charts[1]).resolve_scale(
    y='shared'
)

In [13]:
'''save to file'''
with open("./mnli_government_travel/model_stat.json", "w") as json_output:
    json_output.write(json.dumps(to_save))

## Extract high-level features:  percentage of noun/adj/adv/verb/num/pron.

In [None]:
'''tokenization'''
from gensim.utils import simple_preprocess

data_word_list = [simple_preprocess(x['sentence1']+" <S> "+x['sentence2']) for i,x in df.iterrows()]

In [15]:
premise = [x['sentence1'] for i,x in df.iterrows()]
hypothesis = [x['sentence2'] for i,x in df.iterrows()]

In [16]:
nlp = spacy.load('en_core_web_sm')
pssage_ready = []

for sent in data_word_list:
    # Parse the sentence using the loaded 'en' model object `nlp`. Extract the lemma for each token and join
    doc = nlp(" ".join(sent))
    pssage_ready.append([token.pos_ for token in doc])
    

In [17]:
p_count = np.unique(pd.DataFrame(pssage_ready)[0], return_counts=True)

In [18]:
allowed_postags=['ADJ', 'ADV', 'NOUN', 'NUM', 'VERB', 'PRON']


In [19]:
np.unique(pssage_ready[1], return_counts=True) 

(array(['ADJ', 'ADP', 'ADV', 'AUX', 'CCONJ', 'DET', 'NOUN', 'NUM', 'PRON',
        'VERB'], dtype='<U5'),
 array([3, 9, 8, 2, 3, 9, 8, 3, 1, 5]))

In [20]:
word_type_dict = {}
word_type_dlist = []

for i in range(len(pssage_ready)):
    p_count = np.unique(pssage_ready[i], return_counts=True)
    word_type_dict = {}
    for ix, word in enumerate(p_count[0]):
        if word in allowed_postags:
            word_type_dict[word] = p_count[1][ix]/len(pssage_ready[i])
    word_type_dlist.append(word_type_dict)  

In [21]:
pd.DataFrame(word_type_dlist)

Unnamed: 0,ADJ,ADV,NOUN,PRON,VERB,NUM
0,0.145455,0.090909,0.163636,0.036364,0.127273,
1,0.058824,0.156863,0.156863,0.019608,0.098039,0.058824
2,0.178571,0.035714,0.392857,,0.071429,
3,0.125000,,0.250000,0.125000,,
4,0.093750,0.125000,0.062500,,0.156250,0.062500
...,...,...,...,...,...,...
1971,,0.090909,0.363636,0.045455,0.090909,
1972,0.250000,,0.500000,,0.250000,
1973,0.105263,,0.368421,,0.157895,
1974,,0.055556,0.166667,,0.166667,


In [22]:
doc_pos_df = pd.DataFrame(word_type_dlist).fillna(0)
doc_pos_df.head()

Unnamed: 0,ADJ,ADV,NOUN,PRON,VERB,NUM
0,0.145455,0.090909,0.163636,0.036364,0.127273,0.0
1,0.058824,0.156863,0.156863,0.019608,0.098039,0.058824
2,0.178571,0.035714,0.392857,0.0,0.071429,0.0
3,0.125,0.0,0.25,0.125,0.0,0.0
4,0.09375,0.125,0.0625,0.0,0.15625,0.0625


## Extract high-level features: document length, overlap extent

In [23]:
from collections import Counter

def get_overlapping_rate(list1, list2):
    overlap = len(list((Counter(list1) & Counter(list2)).elements()))
    tot = np.min([len(list1), len(list2)])
    return overlap/tot

In [24]:
overlap_rate = []
doc_len = []
for i in range(len(data_word_list)):
    doc_len.append(len(data_word_list[i]))
    overlap_rate.append(get_overlapping_rate(premise[i], hypothesis[i]))


In [25]:
hfeat_df = pd.DataFrame(word_type_dlist).fillna(0)
hfeat_df['doc_len'] = doc_len
hfeat_df['overlap'] = overlap_rate
hfeat_df['pred'] = df['y_pred']

# use either pred or ground truth for rule generation
# hfeat_df['label'] = df['y_gt']

hfeat_df.head()

Unnamed: 0,ADJ,ADV,NOUN,PRON,VERB,NUM,doc_len,overlap,pred
0,0.145455,0.090909,0.163636,0.036364,0.127273,0.0,55,0.984127,1
1,0.058824,0.156863,0.156863,0.019608,0.098039,0.058824,51,0.9875,0
2,0.178571,0.035714,0.392857,0.0,0.071429,0.0,28,0.90411,2
3,0.125,0.0,0.25,0.125,0.0,0.0,8,0.8125,0
4,0.09375,0.125,0.0625,0.0,0.15625,0.0625,32,0.948718,1


In [26]:
hfeat_df.columns

Index(['ADJ', 'ADV', 'NOUN', 'PRON', 'VERB', 'NUM', 'doc_len', 'overlap',
       'pred'],
      dtype='object')

## Rule Extraction
features:
- doc length
- q/a overlapping
- pos tagging percentage
- labels
- model predictions

features may be included for other tasks/models/data:
- qtype

In [27]:
is_error.values

array([0, 1, 0, ..., 0, 0, 0])

In [28]:
np.unique(is_error, return_counts=True)

(array([0, 1]), array([1424,  552]))

In [29]:
def output_rules(to_output, columns, thresholds, dataname):
    filename = "./" + dataname + "/list.json"
    directory = os.path.dirname(filename)
    if not os.path.exists(directory):
        os.makedirs(directory)
    
    with open(filename, 'w') as output:
        output.write(json.dumps(to_output))
        
    filename = "./" + dataname + "/test.json"
    '''get column frequency'''

    with open(filename, 'w') as output:
        output.write(json.dumps({
            'columns': columns,
            'thresholds': thresholds,
        }))

In [30]:
import debug_rule

In [None]:
filter_threshold = {
    'support': 20,
    'err_rate': .27,
}

drule_obj = debug_rule.DebugRule()

drule_obj.initialize(hfeat_df.values, is_error.values, filter_threshold, verbose=True)\
.numerical2ordinal().extract_high_level_rule()

In [32]:
hfeat_df.values.shape

(1976, 9)

In [33]:
drule_obj.calculate_pval()
drule_obj.calculate_ci()

In [34]:
def generate_histogram(rule_lists):
    num_bin = 20
        
    hist = np.zeros(num_bin)
    for rule in rule_lists:
        pos_bin = int(np.floor(rule['err_rate'] * num_bin))
        if (pos_bin==num_bin):
            pos_bin = num_bin-1
        hist[pos_bin] += 1
    return hist

to_output = {'rule_lists': drule_obj.rules, 'target_names': ['correct', 'errors'], "top_list": drule_obj.top_hfeat_list}
hist = generate_histogram(drule_obj.rules)

to_output['histogram'] = hist.tolist()

output_rules(to_output, hfeat_df.columns.tolist(), drule_obj.thresholds.tolist(), 'mnli_government_travel_hfeat')

In [35]:
ordinal_df = pd.DataFrame(data=drule_obj.all, columns=hfeat_df.columns)

# add y_label (ground truth)
ordinal_df['label'] = model_output['y_gt']

# save to disk
ordinal_df.to_csv(path_or_buf="./mnli_government_travel/hfeat_stat.csv", index=None)