# RE19-classification: interpretable ML via RuleMatrix



## Install the necessary packages for rule-matrix



In [0]:
# Fetch the RuleMatrix sources and install the package from the local checkout.
# NOTE: re-running this cell makes `git clone` fail when the 'rule-matrix-py'
# directory already exists (see the captured output of this cell).
!git clone https://github.com/rulematrix/rule-matrix-py.git
!pip3 install rule-matrix-py/.
# Additional packages installed for this notebook; pysbrl is pinned to 0.4.2rc0.
!pip3 install mdlp-discretization
!pip3 install pysbrl==0.4.2rc0
!pip3 install fim

fatal: destination path 'rule-matrix-py' already exists and is not an empty directory.
[31mDirectory 'rule-matrix-py/.' is not installable. Neither 'setup.py' nor 'pyproject.toml' found.[0m


## Imports

In [0]:
import rulematrix
from rulematrix.surrogate import rule_surrogate
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline

from sklearn.model_selection import train_test_split
from sklearn.datasets import load_breast_cancer, load_iris

import pandas as pd

from sklearn.svm import SVC

import numpy as np


## Load the datasets, train the classifier, and fit the rule surrogate

In [0]:
def drop_descriptive_columns(dataset):
    """Return ``dataset`` without its descriptive (non-feature) columns.

    The columns 'RequirementText', 'Class' and 'ProjectID' are removed when
    present; any of them that is absent is silently skipped. The input frame
    is not modified.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame that may contain some or all of the descriptive columns.

    Returns
    -------
    pandas.DataFrame
        A new frame with the descriptive columns removed.
    """
    descriptive_columns = ['RequirementText', 'Class', 'ProjectID']
    # A single drop with errors='ignore' avoids re-copying the frame once per
    # column and also copes with duplicated column labels, where dropping the
    # same label a second time would raise KeyError.
    return dataset.drop(descriptive_columns, axis=1, errors='ignore')
  
def split_tr_te(dataset, target, to_drop):
    """Split ``dataset`` into train/test features and labels.

    The features are ``dataset`` without the ``to_drop`` columns and the
    labels are the ``target`` column. A fixed 75/25 split with
    ``random_state=42`` is used.

    Returns the tuple produced by ``train_test_split``:
    ``(train_x, test_x, train_y, test_y)``.
    """
    features = dataset.drop(to_drop, axis=1)
    labels = dataset[target]
    return train_test_split(features, labels, test_size=0.25, random_state=42)

def train_nn(neurons=(20,), **kwargs):
    """Train an MLP classifier on the notebook's current train/test split.

    Reads the notebook-level globals ``train_x``, ``train_y``, ``test_x``,
    ``test_y`` and ``is_categorical``. When ``is_categorical`` is not None the
    selected columns are one-hot encoded before being fed to the network and
    the remaining columns are passed through unchanged.

    Parameters
    ----------
    neurons : tuple of int
        Passed to ``MLPClassifier`` as ``hidden_layer_sizes``.
    **kwargs
        Forwarded to ``MLPClassifier``.

    Returns
    -------
    The fitted ``MLPClassifier``, or the fitted ``Pipeline`` wrapping it when
    one-hot encoding is applied. Training and test scores are printed.
    """
    model = MLPClassifier(hidden_layer_sizes=neurons, **kwargs)
    if is_categorical is not None:
        # OneHotEncoder's `categorical_features` argument was deprecated in
        # scikit-learn 0.20 and later removed; ColumnTransformer is the
        # documented replacement for encoding only a subset of the columns.
        from sklearn.compose import ColumnTransformer
        one_hot = ColumnTransformer(
            [('categorical', OneHotEncoder(), is_categorical)],
            remainder='passthrough',
        )
        model = Pipeline([
            ('one_hot', one_hot),
            ('mlp', model)
        ])
    model.fit(train_x, train_y)
    train_score = model.score(train_x, train_y)
    test_score = model.score(test_x, test_y)
    print('Training score:', train_score)
    print('Test score:', test_score)
    return model
  
def train_SVC(train_x, train_y):
    """Fit a linear-kernel SVC on the given training data and return it.

    The classifier is created with ``C=1``, ``random_state=0`` and
    ``probability=True``.

    Parameters
    ----------
    train_x : training features, passed to ``SVC.fit``.
    train_y : training labels, passed to ``SVC.fit``.

    Returns
    -------
    The fitted ``SVC`` instance.
    """
    model = SVC(kernel='linear', C=1, random_state=0, probability=True)
    model.fit(train_x, train_y)
    # The previous version also computed predictions and class probabilities
    # on the training data into unused (and misnamed) locals; that work has
    # been dropped because nothing consumed it.
    return model

def train_surrogate(model, sampling_rate=2.0, **kwargs):
    """Fit a RuleMatrix rule surrogate for ``model`` and report its fidelity.

    ``rule_surrogate`` is called with ``model.predict`` and the notebook-level
    global ``train_x``. The globals ``is_continuous``, ``is_categorical``,
    ``is_integer`` and ``feature_names`` describe the feature columns and are
    forwarded as well; ``test_x`` is used only for the fidelity report.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``.
    sampling_rate : float
        Forwarded to ``rule_surrogate``.
    **kwargs
        Forwarded to ``rule_surrogate`` (e.g. ``seed``).

    Returns
    -------
    The surrogate object returned by ``rule_surrogate``. Its score on the
    training and test features is printed as training/test fidelity.
    """
    surrogate = rule_surrogate(model.predict, train_x, sampling_rate=sampling_rate,
                               is_continuous=is_continuous,
                               is_categorical=is_categorical,
                               is_integer=is_integer,
                               rlargs={'feature_names': feature_names, 'verbose': 2},
                               **kwargs)

    train_fidelity = surrogate.score(train_x)
    test_fidelity = surrogate.score(test_x)
    print('Training fidelity:', train_fidelity)
    print('Test fidelity:', test_fidelity)
    return surrogate


# Experiment driver: for every (target, feature set, dataset) combination,
# load the CSV, train a linear SVC and fit a RuleMatrix surrogate on it.
folder_datasets = '../datasets/re19_ling_datasets/'

filenames = ['promise-reclass', 'INDcombined', '8combined']
targets = ['IsFunctional', 'IsQuality', 'OnlyFunctional', 'OnlyQuality']
feature_sets = ['sd', 'sdsb8sel02ext']
#UNCOMMENT AND SET THE FOLLOWING FOR THE VISUALIZATION (SEE COMMENT AT THE END OF THE FILE)
# target = 'IsFunctional'
# filename = 'promise-reclass-'
# appendix = 'ling-'+'allext'
# appendix = 'f'


for target in targets:
    for feature_set in feature_sets:
        for filename in filenames:
            print('======== Target '+target +' Feature set '+feature_set+' Dataset '+filename+ ' ========')
            appendix = '-ling-' + feature_set

            data = pd.read_csv(folder_datasets + filename + appendix + '.csv', engine='python')

            data = drop_descriptive_columns(data)
            # Drop the first remaining column (presumably a saved row index;
            # TODO confirm against the CSV files).
            data = data.drop(data.columns[0], axis=1)

            # Columns holding a single value carry no information; remove them.
            nunique = data.apply(pd.Series.nunique)
            cols_to_drop = nunique[nunique == 1].index
            data = data.drop(cols_to_drop, axis=1)

            # Both label columns are always excluded from the features.
            to_drop = ['IsFunctional', 'IsQuality']

            # Resolve the label column in a separate variable. Overwriting the
            # loop variable `target` here would leak into the remaining
            # feature-set/dataset iterations, which would then silently use
            # the plain 'IsQuality'/'IsFunctional' labels instead of the
            # 'Only...' variants.
            target_column = target
            if target == 'OnlyQuality':
                # Quality and not functional (bitwise ops on the 0/1 labels).
                data['IsQuality'] = ~data['IsFunctional'] & data['IsQuality']
                target_column = 'IsQuality'
            elif target == 'OnlyFunctional':
                # Functional and not quality.
                data['IsFunctional'] = data['IsFunctional'] & ~data['IsQuality']
                target_column = 'IsFunctional'

            # These names are read as globals by train_surrogate / train_nn.
            train_x, test_x, train_y, test_y = split_tr_te(data, target_column, to_drop)

            # Per-feature type flags forwarded to rule_surrogate.
            features = data.drop(to_drop, axis=1)
            is_integer = [d in ('int64', 'int32') for d in features.dtypes]
            is_continuous = [d in ('float64', 'float32') for d in features.dtypes]
            is_categorical = None
            #is_integer = None

            feature_names = list(features.columns)
            target_names = data[target_column].unique()

            print (is_continuous, is_categorical, is_integer, feature_names, target_names)

            print (data.head())

            # neural network
            # nn = train_nn((20, 20, 20), random_state=43, max_iter=250)

            #svm
            svc = train_SVC(train_x, train_y)

            #train rulematrix
            surrogate = train_surrogate(svc, 4, seed=44)
            #determine the rules
            rl = surrogate.student
            print(rl)

            # UNCOMMENT THIS TO SEE THE VISUALIZATION.
            # NOTE: THE VISUALIZATION DOES NOT WORK WITHIN A LOOP,
            # SO TO USE IT, SET THE VARIABLES BEFORE THE LOOP AND REMOVE THE LOOP

            # rulematrix.render(train_x.values.astype('float64'), train_y.values.astype('float64'), surrogate,feature_names=feature_names, target_names=target_names, is_categorical=is_categorical)


[False, False, False, False, False, False, False, False, False, False, False, False, False, False] None [True, True, True, True, True, True, True, True, True, True, True, True, True, True] ['Length', 'dobj', 'nummod', 'acl', 'amod', 'auxpass', 'advmod', 'nsubjpass', 'nsubj', 'nmod', 'aux', 'pobj', 'prep', 'det'] [1 0]
   IsFunctional  IsQuality  Length  dobj  nummod  acl  amod  auxpass  advmod  \
0             1          1      56     1       1    0     0        0       0   
1             0          1      98     1       0    1     0        0       0   
2             0          1     158     1       1    0     1        0       0   
3             0          1     197     1       1    0     1        0       1   
4             0          1     203     0       1    0     1        0       0   

   nsubjpass  nsubj  nmod  aux  pobj  prep  det  
0          0      1     0    1     0     0    1  
1          0      1     0    1     1     1    1  
2          0      1     1    1     1     1    1  

LinAlgError: singular matrix