In [None]:
###########import packages##########
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import sklearn
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, classification_report
from sklearn import ensemble
from sklearn import svm
from sklearn import neighbors
from sklearn import tree
from sklearn import ensemble
from sklearn.impute import SimpleImputer
from sklearn.model_selection import LeaveOneOut
import catboost
import xgboost
import lightgbm
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
import pickle

###########shared configuration##########
# Fixed seed consumed by the train/test split further down.
seed = 9239
# Leave-one-out splitter instance; not referenced again in the visible cells.
loo = LeaveOneOut()

In [None]:
###########wrapping classification metrics for later calls##########
def compute_classification_metrics(target, prediction):
    """Score a set of predictions against the true labels.

    Parameters
    ----------
    target : array-like
        Ground-truth labels.
    prediction : array-like
        Predicted values; the same object is handed to every metric,
        including ``roc_auc_score``.

    Returns
    -------
    tuple
        ``(accuracy, f1, auc)`` in that order.
    """
    scorers = (accuracy_score, f1_score, roc_auc_score)
    accuracy, f1, auc = (scorer(target, prediction) for scorer in scorers)
    return accuracy, f1, auc

def gridsearch(model, param, algorithm_name, X_train, y_train, X_test, y_test,
               scoring='f1', cv=10, n_jobs=-1):
    """Grid-search ``model`` and report metrics of the refitted best estimator.

    Parameters
    ----------
    model : estimator
        Unfitted estimator handed to ``GridSearchCV``.
    param : dict
        Parameter grid (``param_grid`` of ``GridSearchCV``).
    algorithm_name : str
        Label printed ahead of the metrics.
    X_train, y_train : array-like
        Data used for the cross-validated search and the final refit.
    X_test, y_test : array-like
        Held-out data, used only for reporting.
    scoring : str, default 'f1'
        Metric used to select the best parameter combination.
    cv : int or CV splitter, default 10
        Cross-validation strategy forwarded to ``GridSearchCV``.
    n_jobs : int, default -1
        Parallelism forwarded to ``GridSearchCV``.

    Returns
    -------
    tuple
        ``(best_model, score)`` where ``score`` is the plain average of the
        test-set accuracy, F1 and AUC.

    Notes
    -----
    The "All Data" figures mix training and test predictions, so they are
    partly in-sample. AUC is computed from ``predict`` output, not from
    probabilities.
    """
    # verbose must be >= 0: scikit-learn validates estimator parameters when
    # fit() is called and rejects the previous verbose=-1.
    grid = GridSearchCV(model, param_grid=param, scoring=scoring, cv=cv, n_jobs=n_jobs, verbose=0)
    grid.fit(X_train, y_train)
    best_model = grid.best_estimator_

    prediction_train = best_model.predict(X_train)
    prediction_test = best_model.predict(X_test)

    # Stack train and test so one more metric triple can be reported for the whole data set.
    combined_predictions = np.concatenate([prediction_train, prediction_test])
    combined_true_values = np.concatenate([y_train, y_test])

    accuracy_train, f1_train, auc_train = compute_classification_metrics(y_train, prediction_train)
    accuracy_test, f1_test, auc_test = compute_classification_metrics(y_test, prediction_test)
    accuracy_all, f1_all, auc_all = compute_classification_metrics(combined_true_values, combined_predictions)

    print(algorithm_name)
    print('Best Classifier:', grid.best_params_)
    print('--- Training Data ---')
    print('Accuracy:', accuracy_train, 'F1:', f1_train, 'AUC:', auc_train)
    print('--- Test Data ---')
    print('Accuracy:', accuracy_test, 'F1:', f1_test, 'AUC:', auc_test)
    print('--- All Data ---')
    print('Accuracy:', accuracy_all, 'F1:', f1_all, 'AUC:', auc_all)

    # Only the held-out metrics feed the score that later weights this model.
    return best_model, (accuracy_test+f1_test+auc_test)/3


In [None]:
# SECURITY NOTE: pickle.load can execute arbitrary code embedded in the file;
# only load a database pickle that comes from a trusted source.
# The context manager closes the handle once the object has been read.
with open(r'./database_full_ac.pkl', 'rb') as fl:
    database_full = pickle.load(fl)
##54 AC 55 ST
# NOTE(review): the note above presumably means column 54 holds the AC target
# and column 55 the ST target; only column 54 is used here - confirm.
# The first 54 columns (positions 0..53) are the model inputs.
data_input_full = database_full.iloc[:, 0:54]

# Convert the target variable to binary based on the threshold:
# 1 when the value in column 54 is below the threshold, otherwise 0.
threshold = 250
data_output_full = (database_full.iloc[:, 54] < threshold).astype(int)

# Stratified 90/10 split, reproducible through the shared seed.
X_train, X_test, y_train, y_test = train_test_split(data_input_full, data_output_full, test_size=0.1, random_state=seed, stratify=data_output_full)




In [None]:
###########Random Forest: base estimator and search grid##########
model_RandomForestClassifier = RandomForestClassifier(random_state=1, verbose=0)
# Every option lists a single value, so the search evaluates exactly one configuration.
param_RF = dict(
    criterion=['gini'],
    max_depth=[None],
    max_features=['log2'],
    n_estimators=[200],
)
RF_full, RF_full_score = gridsearch(
    model_RandomForestClassifier, param_RF, 'Random Forest',
    X_train, y_train, X_test, y_test,
)

In [None]:
###########LightGBM: base estimator and search grid##########
model_LGBMClassifier = LGBMClassifier(random_state=1, verbose=0)
# Every option lists a single value, so the search evaluates exactly one configuration.
param_lgbm = dict(
    boosting_type=['gbdt'],
    learning_rate=[0.18],
    max_depth=[11],
    n_estimators=[200],
    reg_alpha=[0.0001],
    reg_lambda=[0.001],
    subsample=[0.4],
)
LGBM_full, LGBM_full_score = gridsearch(
    model_LGBMClassifier, param_lgbm, 'LightGBM',
    X_train, y_train, X_test, y_test,
)

In [None]:
###########XGBoost: base estimator and search grid##########
model_XGClassifier = XGBClassifier(random_state=1)
# Every option lists a single value, so the search evaluates exactly one configuration.
param_XG = dict(
    booster=['gbtree'],
    learning_rate=[0.08],
    max_depth=[11],
    n_estimators=[200],
    reg_alpha=[1e-05],
    reg_lambda=[0],
    subsample=[0.7],
)
XG_full, XG_full_score = gridsearch(
    model_XGClassifier, param_XG, 'XGBoost',
    X_train, y_train, X_test, y_test,
)

In [None]:
import math
from pdpbox.pdp_calc_utils import _calc_ice_lines_inter
from pdpbox.pdp import pdp_isolate, PDPInteract
from pdpbox.utils import (_check_model, _check_dataset, _check_percentile_range, _check_feature,
                    _check_grid_type, _check_memory_limit, _make_list,
                    _calc_memory_usage, _get_grids, _get_grid_combos, _check_classes)
from joblib import Parallel, delayed

def pdp_multi_interact(model, dataset, model_features, features, 
                    num_grid_points=None, grid_types=None, percentile_ranges=None, grid_ranges=None, cust_grid_points=None, 
                    cust_grid_combos=None, use_custom_grid_combos=False,
                    memory_limit=0.9, n_jobs=8, predict_kwds=None, data_transformer=None):
    """Compute partial dependence over the joint grid of several features.

    Built on pdpbox helpers (several of them private), so it is tied to the
    pdpbox release that exposes those names.

    Parameters
    ----------
    model : fitted estimator
        Passed to pdpbox's ``_check_model``, ``pdp_isolate`` and
        ``_calc_ice_lines_inter``.
    dataset : pandas.DataFrame
        Data the predictions are computed on; a copy is used internally.
    model_features : sequence
        Forwarded unchanged to the pdpbox helpers.
    features : sequence
        Features whose joint effect is computed. Each entry is passed through
        ``_make_list`` when the flat feature list is built.
    num_grid_points, grid_types, percentile_ranges, grid_ranges, cust_grid_points : list or None
        Per-feature grid settings, indexed by position in ``features``.
        ``None`` expands to a list of defaults: 10 grid points,
        ``'percentile'`` grid type, and ``None`` for the remaining three.
        Ignored when ``use_custom_grid_combos`` is true.
    cust_grid_combos : iterable or None
        Grid combinations used as-is when ``use_custom_grid_combos`` is true.
    use_custom_grid_combos : bool
        When true, the per-feature ``pdp_isolate`` step is skipped and
        ``feature_grids`` / ``feature_types`` / ``pdp_isolate_outs`` stay empty.
    memory_limit, n_jobs, predict_kwds, data_transformer
        Forwarded to the pdpbox helpers; ``predict_kwds=None`` becomes ``{}``.

    Returns
    -------
    PDPInteract or list of PDPInteract
        A single object when ``n_classes <= 2``; otherwise one object per class.
    """

    def _expand_default(x, default, length):
        """Return ``x`` unchanged, or ``[default] * length`` when ``x`` is None."""
        if x is None:
            return [default] * length
        return x

    # NOTE: this local definition shadows the _get_grid_combos imported from
    # pdpbox.utils at the top of the cell.
    def _get_grid_combos(feature_grids, feature_types):
        """Build every combination of the per-feature grids as a 2-D array.

        Returns an array reshaped to ``(-1, len(grids))``, one row per combination.
        """
        # Grids are cast to float16.
        # NOTE(review): float16 cannot represent large or finely spaced values
        # exactly; presumably chosen to save memory - confirm it is acceptable.
        grids = [np.array(list(feature_grid),dtype=np.float16) for feature_grid in feature_grids]
        for i in range(len(feature_types)):
            if feature_types[i] == 'onehot':
                # One-hot features get the rows of an identity matrix as their grid.
                grids[i] = np.eye(len(grids[i])).astype(int).tolist()
        # NOTE(review): copy=bool passes the builtin type ``bool`` (a truthy
        # object), so this likely behaves like copy=True; probably a typo.
        return np.stack(np.meshgrid(*grids,copy=bool), -1).reshape(-1, len(grids))

    if predict_kwds is None:
        predict_kwds = dict()

    nr_feats = len(features)

    # check function inputs
    # (``predict`` is unpacked here but not referenced again in this function)
    n_classes, predict = _check_model(model=model)
    _check_dataset(df=dataset)
    _dataset = dataset.copy()

    # prepare the grid
    pdp_isolate_outs = []
    if use_custom_grid_combos:
        # Caller-supplied combinations: no per-feature grids are computed.
        grid_combos = cust_grid_combos
        feature_grids = []
        feature_types = []
    else:
        # Expand every per-feature option to a list of length nr_feats and validate it.
        num_grid_points = _expand_default(x=num_grid_points, default=10, length=nr_feats)
        grid_types = _expand_default(x=grid_types, default='percentile', length=nr_feats)
        for i in range(nr_feats):
            _check_grid_type(grid_type=grid_types[i])

        percentile_ranges = _expand_default(x=percentile_ranges, default=None, length=nr_feats)
        for i in range(nr_feats):
            _check_percentile_range(percentile_range=percentile_ranges[i])

        grid_ranges = _expand_default(x=grid_ranges, default=None, length=nr_feats)
        cust_grid_points = _expand_default(x=cust_grid_points, default=None, length=nr_feats)

        _check_memory_limit(memory_limit=memory_limit)

        # One single-feature pdp_isolate run per feature; its grids and feature
        # types are reused below. (Re-initialises the list created above.)
        pdp_isolate_outs = []
        for idx in range(nr_feats):
            pdp_isolate_out = pdp_isolate(
                model=model, dataset=_dataset, model_features=model_features, feature=features[idx],
                num_grid_points=num_grid_points[idx], grid_type=grid_types[idx], percentile_range=percentile_ranges[idx],
                grid_range=grid_ranges[idx], cust_grid_points=cust_grid_points[idx], memory_limit=memory_limit,
                n_jobs=n_jobs, predict_kwds=predict_kwds, data_transformer=data_transformer)
            pdp_isolate_outs.append(pdp_isolate_out)

        if n_classes > 2:
            # In this branch each pdp_isolate result is indexed per class; the
            # grids are read from the first entry.
            feature_grids = [pdp_isolate_outs[i][0].feature_grids for i in range(nr_feats)]
            feature_types = [pdp_isolate_outs[i][0].feature_type  for i in range(nr_feats)]
        else:
            feature_grids = [pdp_isolate_outs[i].feature_grids for i in range(nr_feats)]
            feature_types = [pdp_isolate_outs[i].feature_type  for i in range(nr_feats)]

        grid_combos = _get_grid_combos(feature_grids, feature_types)

    # Flatten the features into one list of names.
    feature_list = []
    for i in range(nr_feats):
        feature_list.extend(_make_list(features[i]))

    # Parallel calculate ICE lines
    true_n_jobs = _calc_memory_usage(
        df=_dataset, total_units=len(grid_combos), n_jobs=n_jobs, memory_limit=memory_limit)

    grid_results = Parallel(n_jobs=true_n_jobs)(delayed(_calc_ice_lines_inter)(
        grid_combo, data=_dataset, model=model, model_features=model_features, n_classes=n_classes,
        feature_list=feature_list, predict_kwds=predict_kwds, data_transformer=data_transformer)
                                                for grid_combo in grid_combos)

    # Stack the per-combination results and average within each grid combination.
    ice_lines = pd.concat(grid_results, axis=0).reset_index(drop=True)
    pdp = ice_lines.groupby(feature_list, as_index=False).mean()

    # combine the final results
    pdp_interact_params = {'n_classes': n_classes, 
                        'features': features, 
                        'feature_types': feature_types,
                        'feature_grids': feature_grids}
    if n_classes > 2:
        # One PDPInteract per class, each with its class column renamed to 'preds'.
        # NOTE(review): with use_custom_grid_combos=True, pdp_isolate_outs is
        # empty, so the indexing below would raise IndexError - confirm this
        # combination is never used.
        pdp_interact_out = []
        for n_class in range(n_classes):
            _pdp = pdp[feature_list + ['class_%d_preds' % n_class]].rename(
                columns={'class_%d_preds' % n_class: 'preds'})
            pdp_interact_out.append(
                PDPInteract(which_class=n_class,
                            pdp_isolate_outs=[pdp_isolate_outs[i][n_class] for i in range(nr_feats)],
                            pdp=_pdp, **pdp_interact_params))
    else:
        pdp_interact_out = PDPInteract(
            which_class=None, pdp_isolate_outs=pdp_isolate_outs, pdp=pdp, **pdp_interact_params)

    return pdp_interact_out
# Printed once the cell has finished defining the function.
print('ready')

In [None]:
def center(arr):
    """Return ``arr`` with its mean subtracted."""
    average = np.mean(arr)
    return arr - average
import itertools
def compute_f_vals(mdl, X, features, selectedfeatures, num_grid_points=10, use_data_grid=False):
    """Centred partial-dependence values for a feature set and all its proper subsets.

    Parameters
    ----------
    mdl : fitted estimator
        Model handed to ``pdp_multi_interact``.
    X : pandas.DataFrame
        Data the partial dependence is computed on.
    features : sequence
        All model features (``model_features`` of ``pdp_multi_interact``).
    selectedfeatures : sequence
        Features whose interaction is studied.
    num_grid_points : int, default 10
        Grid points per feature.
    use_data_grid : bool, default False
        When true, the observed rows of ``X`` restricted to the relevant
        columns are used as grid combinations.

    Returns
    -------
    dict
        Maps ``tuple(subset)`` to a centred array of partial-dependence values,
        one entry per row of the full grid. Keys are the full feature tuple
        plus every non-empty proper subset.
    """
    def _partial_dependence(feature_subset):
        # Always pass a list, and size num_grid_points by *this* subset. The
        # original sized it by the full feature set, which only worked because
        # the surplus entries were never indexed.
        subset = list(feature_subset)
        data_grid = X[subset].values if use_data_grid else None
        return pdp_multi_interact(mdl, X, features, subset,
                                  num_grid_points=[num_grid_points] * len(subset),
                                  cust_grid_combos=data_grid,
                                  use_custom_grid_combos=use_data_grid)

    f_vals = {}
    # Calculate partial dependencies for the full feature set.
    p_full = _partial_dependence(selectedfeatures)
    f_vals[tuple(selectedfeatures)] = center(p_full.pdp.preds.values)
    # Grid rows of the full set; every subset result is aligned to these rows.
    grid = p_full.pdp.drop('preds', axis=1)
    # Calculate partial dependencies for every subset of size 1 .. len-1.
    for n in range(1, len(selectedfeatures)):
        for subsetfeatures in itertools.combinations(selectedfeatures, n):
            p_partial = _partial_dependence(subsetfeatures)
            # Left merge on the shared feature columns broadcasts the subset's
            # predictions onto the full grid.
            p_joined = pd.merge(grid, p_partial.pdp, how='left')
            f_vals[tuple(subsetfeatures)] = center(p_joined.preds.values)
    return f_vals
def compute_h_val(f_vals, selectedfeatures):
    """H-statistic for the joint interaction of ``selectedfeatures``.

    The numerator starts from the full-set values and alternately subtracts
    and adds the values of ever smaller subsets (size ``len - 1`` first).
    Each subset key is printed as it is processed.

    Parameters
    ----------
    f_vals : dict
        Maps feature tuples to centred partial-dependence arrays, with an
        entry for the full tuple and every non-empty proper subset.
    selectedfeatures : sequence
        The features whose interaction strength is requested.

    Returns
    -------
    float
        ``sqrt(numer / denom)`` when ``numer < denom``, otherwise ``nan``.
    """
    full_key = tuple(selectedfeatures)
    denominator_terms = f_vals[full_key].copy()
    numerator_terms = f_vals[full_key].copy()

    coefficient = -1.0
    subset_size = len(selectedfeatures) - 1
    while subset_size > 0:
        for combo in itertools.combinations(selectedfeatures, subset_size):
            print(combo)
            numerator_terms += coefficient * f_vals[combo]
        # The sign flips between consecutive subset sizes.
        coefficient *= -1.0
        subset_size -= 1

    numer = np.sum(numerator_terms**2)
    denom = np.sum(denominator_terms**2)
    if numer < denom:
        return math.sqrt(numer/denom)
    return np.nan
def compute_h_val_any(f_vals, allfeatures, selectedfeature):
    """H-statistic of one feature against all remaining features together.

    Parameters
    ----------
    f_vals : dict
        Maps feature tuples to centred partial-dependence arrays. Must hold
        entries for ``tuple(allfeatures)``, ``(selectedfeature,)`` and the
        tuple of the remaining features in their original order.
    allfeatures : sequence
        Every feature under consideration.
    selectedfeature : hashable
        The feature singled out; must be an element of ``allfeatures``.

    Returns
    -------
    float
        ``sqrt(numer / denom)`` when ``numer < denom``, otherwise ``nan``.
    """
    remaining = list(allfeatures)
    remaining.remove(selectedfeature)

    joint_terms = f_vals[tuple(allfeatures)].copy()
    residual_terms = joint_terms.copy()
    # Remove the single-feature effect and the effect of all other features.
    for key in ((selectedfeature,), tuple(remaining)):
        residual_terms -= f_vals[key]

    numer = np.sum(residual_terms**2)
    denom = np.sum(joint_terms**2)
    if numer < denom:
        return math.sqrt(numer/denom)
    return np.nan
def compute_interactions(model,X_train,feature_all,feature_select_list):
    """Pairwise H-statistics for every unordered pair in ``feature_select_list``.

    Parameters
    ----------
    model : fitted estimator
        Model whose interactions are measured.
    X_train : pandas.DataFrame
        Data the partial dependence is computed on.
    feature_all : sequence
        All model features, forwarded to ``compute_f_vals``.
    feature_select_list : sequence
        Features to pair up; pairs are visited in list order.

    Returns
    -------
    dict
        Maps ``(feature_i, feature_j)`` with ``i < j`` to the H-statistic.
        A pair whose computation raises is stored as ``0``, so the run stays
        best-effort, but the failure is now reported.
    """
    result_dict = {}
    indexed = enumerate(feature_select_list)
    # combinations() yields each unordered pair exactly once, in the same
    # order as the previous nested i < j loops.
    for (i, first), (j, second) in itertools.combinations(indexed, 2):
        print(i, j)
        current_features = [first, second]
        try:
            f_vals = compute_f_vals(model, X_train, feature_all, current_features)
            result_dict[tuple(current_features)] = compute_h_val(f_vals, current_features)
        except Exception as exc:
            # Was a bare ``except:``, which also swallowed KeyboardInterrupt /
            # SystemExit and hid why a pair failed.
            print('interaction failed for', tuple(current_features), '->', repr(exc))
            result_dict[tuple(current_features)] = 0
        print(result_dict[tuple(current_features)])
    return result_dict

In [None]:
# Pairwise interaction strengths of the Random Forest model over every input column.
RF_DICT = compute_interactions(
    model=RF_full,
    X_train=data_input_full,
    feature_all=data_input_full.columns,
    feature_select_list=list(data_input_full.columns),
)

In [None]:
# Pairwise interaction strengths of the LightGBM model over every input column.
LGBM_DICT = compute_interactions(
    model=LGBM_full,
    X_train=data_input_full,
    feature_all=data_input_full.columns,
    feature_select_list=list(data_input_full.columns),
)

In [None]:
# Pairwise interaction strengths of the XGBoost model over every input column.
XG_DICT = compute_interactions(
    model=XG_full,
    X_train=data_input_full,
    feature_all=data_input_full.columns,
    feature_select_list=list(data_input_full.columns),
)

In [None]:
def construct_matrix_weighted(target_dict,target_score, feature_names=None):
    """Turn a pair -> strength mapping into a symmetric, score-weighted matrix.

    Parameters
    ----------
    target_dict : dict
        Maps ``(feature_a, feature_b)`` to an interaction strength.
    target_score : float
        Weight multiplied into every strength.
    feature_names : sequence, optional
        Row and column labels of the matrix. Defaults to the columns of the
        notebook-level ``data_input_full`` frame, as before.

    Returns
    -------
    pandas.DataFrame
        Square float matrix with ``strength * target_score`` at ``[a, b]`` and
        ``[b, a]``. Cells without a pair, including the diagonal, are NaN.
    """
    if feature_names is None:
        feature_names = data_input_full.columns
    # An explicit float dtype avoids the object-dtype frame an empty
    # constructor would produce.
    df = pd.DataFrame(index=feature_names, columns=feature_names, dtype=float)
    for pair, strength in target_dict.items():
        weighted = strength * target_score
        df.loc[pair[0], pair[1]] = weighted
        df.loc[pair[1], pair[0]] = weighted
    return df

In [None]:
# Weight each model's interaction matrix by that model's averaged test score.
RF_DF = construct_matrix_weighted(target_dict=RF_DICT, target_score=RF_full_score)
XG_DF = construct_matrix_weighted(target_dict=XG_DICT, target_score=XG_full_score)
LGBM_DF = construct_matrix_weighted(target_dict=LGBM_DICT, target_score=LGBM_full_score)

In [None]:
# Persist the three weighted matrices as CSV files in the working directory.
RF_DF.to_csv(path_or_buf='INTERACTION_RF_FULL.csv')
XG_DF.to_csv(path_or_buf='INTERACTION_XG_FULL.csv')
LGBM_DF.to_csv(path_or_buf='INTERACTION_LGBM_FULL.csv')

In [None]:
# Reload the saved matrices; the first CSV column becomes the row index.
# This replaces the in-memory frames of the same names.
RF_DF = pd.read_csv(filepath_or_buffer='INTERACTION_RF_FULL.csv', index_col=0)
XG_DF = pd.read_csv(filepath_or_buffer='INTERACTION_XG_FULL.csv', index_col=0)
LGBM_DF = pd.read_csv(filepath_or_buffer='INTERACTION_LGBM_FULL.csv', index_col=0)

In [None]:
# Score-weighted mean of the three (already score-multiplied) matrices;
# cells that are NaN after the sum are set to 0.
Weighted_Matrix = (
    (XG_DF + RF_DF + LGBM_DF)
    / (XG_full_score + RF_full_score + LGBM_full_score)
).fillna(0)
# Rescale so the largest entry equals 1.
Weighted_Matrix = Weighted_Matrix / Weighted_Matrix.max().max()

In [None]:
# Persist the combined, normalised interaction matrix.
Weighted_Matrix.to_csv(path_or_buf='INTERACTION_FULL_WEIGHTED.csv')

In [None]:
import seaborn as sns

# Heat map of the full weighted interaction matrix on a fixed 0..1 colour scale.
f, ax = plt.subplots(figsize=(16, 16))
sns.set(font_scale=1)
ax = sns.heatmap(
    Weighted_Matrix,
    annot=False,
    vmin=0,
    vmax=1,
    xticklabels=True,
    yticklabels=True,
    square=True,
    cmap="gist_heat_r",
    ax=ax,
)

In [None]:
# Split the matrix into two diagonal blocks by position: the first 36
# features and the features from position 36 onward (the slice end of 55 is
# clipped to the matrix size).
# NOTE(review): the names suggest element vs. synthesis features - confirm
# that position 36 is the intended boundary.
Element_M = Weighted_Matrix.iloc[:36, :36]
Synthesis_M = Weighted_Matrix.iloc[36:55, 36:55]

In [None]:
# Heat map of the Element_M block on the same 0..1 colour scale.
# Font scale is left as set by the earlier sns.set(font_scale=1) call.
f1, ax1 = plt.subplots(figsize=(16, 16))
ax1 = sns.heatmap(
    Element_M,
    annot=False,
    vmin=0,
    vmax=1,
    xticklabels=True,
    yticklabels=True,
    square=True,
    cmap="gist_heat_r",
    ax=ax1,
)

In [None]:
# Heat map of the Synthesis_M block on the same 0..1 colour scale.
# Font scale is left as set by the earlier sns.set(font_scale=1) call.
f2, ax2 = plt.subplots(figsize=(16, 16))
ax2 = sns.heatmap(
    Synthesis_M,
    annot=False,
    vmin=0,
    vmax=1,
    xticklabels=True,
    yticklabels=True,
    square=True,
    cmap="gist_heat_r",
    ax=ax2,
)