In [None]:
# Mount Google Drive at /content/gdrive; the dataset and output paths used
# later in this notebook are all under /content/gdrive/MyDrive.
from google.colab import drive
drive.mount('/content/gdrive')

# ! pip3 install dalex
# ! pip install lime
! pip install pyexplainer

In [None]:
import csv
import numpy as np
import pandas as pd
from numpy import where
from pyexplainer import pyexplainer_pyexplainer
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import KFold
from sklearn.utils import resample
np.random.seed(1)

def isfloat(value):
  """Return True if *value* can be converted with ``float()``, else False.

  Strings that do not parse as a number raise ``ValueError`` inside
  ``float()``; objects of an unsupported type (``None``, lists, ...) raise
  ``TypeError``. Both cases are reported as "not a float" instead of
  propagating to the caller.
  """
  try:
    float(value)
  except (TypeError, ValueError):
    return False
  return True

def get_rank_diff(list1, list2):
  """Return the mean absolute rank displacement of list1's items in list2.

  For every item of ``list1`` the displacement is the absolute difference
  between its position in ``list1`` and the position of its first occurrence
  in ``list2``. An item that does not appear in ``list2`` is charged the
  maximum penalty ``len(list1)``.

  Args:
    list1: reference ranking (sequence of comparable items).
    list2: ranking to compare against; must be a list (``list.index`` is used).

  Returns:
    The average displacement as a float, or 0.0 when ``list1`` is empty
    (there is nothing to compare, and dividing by zero is avoided).
  """
  if not list1:
    return 0.0

  missing_penalty = len(list1)
  total_diff = 0
  for position, item in enumerate(list1):
    try:
      total_diff += abs(list2.index(item) - position)
    except ValueError:
      # item is absent from list2
      total_diff += missing_penalty
  return total_diff / len(list1)

def get_hit_rate(list1, list2):
  """Return the fraction of list1's items that also occur in list2.

  Every element of ``list1`` is counted individually, so duplicates in
  ``list1`` each contribute to the numerator and the denominator.

  Args:
    list1: reference items.
    list2: container the reference items are looked up in.

  Returns:
    A float in [0, 1]; 0.0 when ``list1`` is empty (avoids dividing by zero).
  """
  if not list1:
    return 0.0
  hits = sum(1 for item in list1 if item in list2)
  return hits / len(list1)

def parse_rules(input_list, top_n):
  """Collect up to *top_n* distinct alphabetic tokens from rule strings.

  Each rule is split on whitespace and scanned left to right; rules are
  processed in the order given. A token is kept the first time it is seen and
  only if ``str.isalpha()`` is true for it, so operators and numeric
  thresholds are skipped.
  NOTE(review): names containing digits or underscores are skipped as well,
  because ``isalpha()`` rejects them -- confirm that suits the feature names.

  Args:
    input_list: iterable of rule strings.
    top_n: maximum number of tokens to return. ``None`` means no limit;
      a value <= 0 yields an empty list.

  Returns:
    List of unique tokens in first-seen order, at most ``top_n`` long.
  """
  unlimited = top_n is None
  if not unlimited and top_n <= 0:
    # A non-positive limit means "no features"; without this guard the
    # length check below could never trigger and every token was returned.
    return []

  parsed_rules = []
  for rule in input_list:
    for token in rule.split():
      if token.isalpha() and token not in parsed_rules:
        parsed_rules.append(token)
        # The length only changes after an append, so check the limit here.
        if not unlimited and len(parsed_rules) >= top_n:
          return parsed_rules
  return parsed_rules


# Experiment: for every dataset and every CV fold, train a logistic regression
# and a random forest on the same features, explain (with PyExplainer) each
# test instance that BOTH models predict as buggy, and compare the features
# named in the two explanations via hit rate and rank difference.
data_list = ['openstack', 'qt']

# Open the results file in a context manager so it is closed even if a fold
# raises part-way through. newline='' follows the csv module's guidance for
# files handed to csv.writer.
with open('/content/gdrive/MyDrive/XDP/out/pe_lr_vs_rf.csv', 'w', newline='') as f_write:
  csv_writer = csv.writer(f_write)
  csv_writer.writerow(['dataset',
                       'hit_rate',
                       'rank_diff',
                       'total',
                       'base_shape'])

  for data_files in data_list:
    input_df = pd.read_csv('/content/gdrive/MyDrive/XDP/dataset/' + data_files + '.csv')
    input_df = input_df.drop(['commit_id', 'author_date'], axis=1)
    n_fold = 10
    kfold = KFold(n_splits=n_fold, shuffle=True, random_state=1)

    ########## KFold Loop ##########
    for train, test in kfold.split(input_df):
      # KFold.split yields positional row indices, so select by position
      # (.iloc) rather than by label (.loc).
      train_data, test_data = input_df.iloc[train], input_df.iloc[test]

      #### Separate x and y for both train and test set ####
      train_x = train_data.drop(['bugcount', 'fixcount'], axis=1)
      # The label is binarised: any non-zero bugcount becomes True.
      train_real_y = train_data['bugcount'].astype('bool')
      feature_names = np.array(list(train_x.columns))

      test_x = test_data.drop(['bugcount', 'fixcount'], axis=1)
      test_real_y = test_data['bugcount'].astype('bool')

      #### Spearman-correlation based feature selection ####
      # For every pair of columns with |rho| > 0.7, drop the later column of
      # the pair. The filter is fitted on the training split only and then
      # applied to both splits.
      corr_mat = train_x.corr('spearman')
      corr_features = set()
      for cor_i in range(len(corr_mat.columns)):
        for cor_j in range(cor_i):
          if abs(corr_mat.iloc[cor_i, cor_j]) > 0.7:
            corr_features.add(corr_mat.columns[cor_i])

      train_x = train_x.drop(labels=list(corr_features), axis=1)
      test_x = test_x.drop(labels=list(corr_features), axis=1)

      # Maximum number of features taken from each explanation's rules.
      num_of_feature = 10

      #### Train classifiers (logistic regression and random forest) ####
      lr = LogisticRegression(random_state=0, solver='liblinear')
      lr_pred = lr.fit(train_x, train_real_y).predict(test_x)

      rf = RandomForestClassifier(random_state=0)
      rf_pred = rf.fit(train_x, train_real_y).predict(test_x)

      #### PyExplainer inputs ####
      # Attach each model's predictions to the test features, then reset the
      # index so rows can be addressed by position 0..n-1.
      lr_y_preds = pd.DataFrame(data={'bugcount': lr_pred}, index=test_real_y.index)
      rf_y_preds = pd.DataFrame(data={'bugcount': rf_pred}, index=test_real_y.index)

      lr_combined_testing_data = test_x.join(lr_y_preds)
      lr_combined_testing_data.reset_index(inplace=True)
      rf_combined_testing_data = test_x.join(rf_y_preds)
      rf_combined_testing_data.reset_index(inplace=True)

      # Column 0 is the old index added by reset_index; the last column is
      # the predicted label. Everything in between is a feature.
      lr_feature_cols = lr_combined_testing_data.iloc[:, 1:-1]
      lr_label_col = lr_combined_testing_data.iloc[:, -1]
      rf_feature_cols = rf_combined_testing_data.iloc[:, 1:-1]
      rf_label_col = rf_combined_testing_data.iloc[:, -1]

      # Positions of the test rows that both models predict as buggy.
      both_buggy = (lr_label_col.to_numpy(dtype=bool)
                    & rf_label_col.to_numpy(dtype=bool))
      buggy_positions = np.flatnonzero(both_buggy)
      buggy_cnt = len(buggy_positions)

      # If nothing is predicted as buggy by both models, skip this fold.
      if buggy_cnt == 0:
        n_fold -= 1
        continue

      lr_py_explainer = pyexplainer_pyexplainer.PyExplainer(X_train=train_x,
                                                            y_train=train_real_y,
                                                            indep=train_x.columns,
                                                            dep='bugcount',
                                                            blackbox_model=lr)

      rf_py_explainer = pyexplainer_pyexplainer.PyExplainer(X_train=train_x,
                                                            y_train=train_real_y,
                                                            indep=train_x.columns,
                                                            dep='bugcount',
                                                            blackbox_model=rf)

      avg_hit_rate = 0
      avg_rank_diff = 0
      # Number of instances for which both explanations yielded features.
      explained_cnt = 0
      for i in buggy_positions:
        X_explain = lr_feature_cols.iloc[[i]]
        X_explain_rf = rf_feature_cols.iloc[[i]]
        y_explain = lr_label_col.iloc[[i]]
        y_explain_rf = rf_label_col.iloc[[i]]

        rules = lr_py_explainer.explain(X_explain=X_explain,
                                        y_explain=y_explain,
                                        search_function='crossoverinterpolation')
        rules_rf = rf_py_explainer.explain(X_explain=X_explain_rf,
                                           y_explain=y_explain_rf,
                                           search_function='crossoverinterpolation')

        # Feature rankings extracted from each explanation's positive rules.
        rank_list_pe = parse_rules(rules['top_k_positive_rules']['rule'].tolist(), num_of_feature)
        rank_list_ds_pe = parse_rules(rules_rf['top_k_positive_rules']['rule'].tolist(), num_of_feature)

        # Nothing to compare if either explanation produced no features.
        if not rank_list_pe or not rank_list_ds_pe:
          continue

        avg_hit_rate += get_hit_rate(rank_list_pe, rank_list_ds_pe)
        avg_rank_diff += get_rank_diff(rank_list_pe, rank_list_ds_pe)
        explained_cnt += 1

      # Average only over the instances that were actually compared. Dividing
      # by buggy_cnt would count every skipped instance as hit rate 0 and
      # rank difference 0 (i.e. perfect rank agreement), biasing both metrics.
      if explained_cnt == 0:
        n_fold -= 1
        continue
      avg_hit_rate /= explained_cnt
      avg_rank_diff /= explained_cnt

      row = [data_files,
             avg_hit_rate,
             avg_rank_diff,
             buggy_cnt,
             train_x.shape]
      csv_writer.writerow(row)
      # Flush after every fold so partial results are on disk during long runs.
      f_write.flush()
      print(row)
    ########## End of KFold Loop ##########

print('Done!')