In [1]:
import os
import pickle

import matplotlib.pyplot as plt
import numpy as np
import optuna
import pandas as pd
from imblearn.over_sampling import SMOTE,ADASYN

#model
from sklearn.calibration import CalibratedClassifierCV
from sklearn.ensemble import AdaBoostClassifier,RandomForestClassifier,GradientBoostingClassifier
from sklearn.feature_selection import f_classif,mutual_info_classif,VarianceThreshold,SequentialFeatureSelector, SelectPercentile
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.metrics import accuracy_score,recall_score, roc_auc_score
from sklearn.model_selection import train_test_split,cross_val_score,StratifiedKFold
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler,MinMaxScaler,PowerTransformer
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Clinical tables: their indexes are used further down to filter the
# radiomics feature tables to patients that also have clinical data.
clinico_ich, clinico_gemelli = (
    pd.read_excel(clinical_path, index_col=0)
    for clinical_path in ('DB ICH CLINICO.xlsx', 'Database def Gemelli.xlsx')
)

In [3]:
# ICH

# The core table is read first: it fixes both the patient subset and the
# feature columns that are then applied to the other three ICH tables.
epato_core_ich = pd.read_excel('fc_es_core.xlsx', index_col=0)

# Patients present in both the core feature table and the ICH clinical table.
indexes = epato_core_ich.index.intersection(clinico_ich.index)

# Keep only the columns between the first 32 and the last 32.
# NOTE(review): presumably the outer columns are non-feature metadata - confirm.
selected_cols = epato_core_ich.columns[32:-32].tolist()

epato_core_ich = epato_core_ich[selected_cols].loc[indexes]
epato_ring_ich, portale_core_ich, portale_ring_ich = (
    pd.read_excel(ich_path, index_col=0)[selected_cols].loc[indexes]
    for ich_path in ('fc_es_ring.xlsx', 'fc_p_core.xlsx', 'fc_p_ring.xlsx')
)

# Binary target shared by the four ICH tables: 1 when the clinical TRG value
# is <= 3, otherwise 0 (a NaN compares False and therefore also maps to 0).
trg_ich = [int(trg_value <= 3) for trg_value in clinico_ich.loc[indexes, 'TRG']]
for ich_table in (epato_core_ich, epato_ring_ich, portale_core_ich, portale_ring_ich):
    ich_table['TRG'] = trg_ich


# Gemelli

# These workbooks already carry the outcome as 'TRG GRUPPI'; it is renamed to
# 'TRG' so the column name matches the ICH tables.
epato_core_gemelli, epato_ring_gemelli, portale_core_gemelli, portale_ring_gemelli = (
    pd.read_excel(gemelli_path, index_col=0).rename(columns={'TRG GRUPPI': 'TRG'})
    for gemelli_path in (
        'Features_DPI_core_FPUG+Outcome.xlsx',
        'Features_DPI_ring_FPUG+Outcome.xlsx',
        'Features_Portale_core_FPUG+Outcome.xlsx',
        'Features_Portale_ring_FPUG+Outcome.xlsx',
    )
)

# Patients present in both the core feature table and the Gemelli clinical table.
# NOTE: this rebinds `indexes`, which held the ICH patients until here.
indexes = epato_core_gemelli.index.intersection(clinico_gemelli.index)

epato_core_gemelli, epato_ring_gemelli, portale_core_gemelli, portale_ring_gemelli = (
    gemelli_table.loc[indexes]
    for gemelli_table in (
        epato_core_gemelli,
        epato_ring_gemelli,
        portale_core_gemelli,
        portale_ring_gemelli,
    )
)



In [4]:
# ICH

# Index of the rows in the ICH core workbook whose 'Magnetic Field' equals 3.
# NOTE(review): pt_3t_to_drop is not used anywhere in the code visible here -
# confirm whether these rows were meant to be removed from the datasets.
pt_3t = pd.read_excel('fc_es_core.xlsx', index_col=0)
is_field_3 = pt_3t['Magnetic Field'] == 3
pt_3t_to_drop = pt_3t.loc[is_field_3].index


def _join_features(left, right, suffix):
    """Join `right` (minus its 'TRG' column) onto `left`; clashing right-hand names get `suffix`."""
    return left.join(right.drop(columns=['TRG']), rsuffix=suffix)


# Stack the ICH and Gemelli rows for each of the four feature tables.
epato_core = pd.concat([epato_core_ich, epato_core_gemelli])
epato_ring = pd.concat([epato_ring_ich, epato_ring_gemelli])
portale_core = pd.concat([portale_core_ich, portale_core_gemelli])
portale_ring = pd.concat([portale_ring_ich, portale_ring_gemelli])

# Pairwise combinations; 'TRG' is always taken from the left-hand table.
epato = _join_features(epato_core, epato_ring, '_ring')
portale = _join_features(portale_core, portale_ring, '_ring')

ring = _join_features(epato_ring, portale_ring, '_portale')
core = _join_features(epato_core, portale_core, '_portale')

# All four tables side by side, each later table tagged with its own suffix.
totale = epato_core
for other_table, other_suffix in (
    (epato_ring, '_ering'),
    (portale_core, '_pcore'),
    (portale_ring, '_pring'),
):
    totale = _join_features(totale, other_table, other_suffix)



In [5]:
from sklearn.base import BaseEstimator,TransformerMixin

class MyDecorrelator(BaseEstimator, TransformerMixin):
    """Transformer that drops features highly correlated with an earlier feature.

    During ``fit`` the pairwise correlation matrix of ``X`` is computed with
    ``DataFrame.corr()``. A column is marked for removal when its absolute
    correlation with at least one column positioned before it is strictly
    greater than ``threshold``. ``transform`` drops the marked columns.

    Parameters
    ----------
    threshold : float
        Absolute correlation above which the later column of a pair is dropped.

    Attributes
    ----------
    correlated_features : set
        Labels of the columns marked for removal; set by ``fit``.
    """

    def __init__(self, threshold):
        self.threshold = threshold
        # Never assigned anywhere else in this class; kept only so that code
        # reading this attribute keeps working. The fitted result is stored
        # in `correlated_features`.
        self.correlated_columns = None

    def fit(self, X, y=None):
        """Find the columns to drop.

        Parameters
        ----------
        X : array-like or DataFrame
            Feature matrix; wrapped in a DataFrame before computing correlations.
        y : ignored

        Returns
        -------
        self
        """
        corr_matrix = pd.DataFrame(X).corr()
        abs_corr = np.abs(corr_matrix.to_numpy())
        # Strictly-lower triangle: entry (i, j) with j < i compares column i
        # with an earlier column j. NaN correlations compare False, so they
        # never mark a column (same outcome as an element-by-element scan).
        above_threshold = np.tril(abs_corr > self.threshold, k=-1)
        marked = above_threshold.any(axis=1)
        self.correlated_features = set(corr_matrix.columns[marked].tolist())
        return self

    def transform(self, X, y=None, **kwargs):
        """Return ``X`` as a DataFrame without the columns marked in ``fit``.

        The columns are dropped by label, so ``X`` must carry the same column
        labels as the data passed to ``fit`` (positional integers when arrays
        are passed to both).
        """
        return (pd.DataFrame(X)).drop(labels=self.correlated_features, axis=1)

In [6]:
# Working copy of the combined `epato_core` table.
# NOTE(review): the name `dataset` is rebound by the
# `for i, dataset in enumerate(list_datasets)` loop later in the notebook,
# so this copy looks unused - confirm before removing.
dataset = epato_core.copy()

# train / test / external-validation split
from sklearn.model_selection import RepeatedKFold
def dataset_split(dataset, target='TRG', test_size=0.33, random_state=42):
    """Split a dataset into an internal (training) part and a held-out part.

    The split is stratified on the target column so both parts keep the same
    class proportions.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Feature table that contains the target column.
    target : str, default 'TRG'
        Name of the label column; it is removed from the returned features.
    test_size : float, default 0.33
        Fraction of rows assigned to the held-out part.
    random_state : int, default 42
        Seed passed to ``train_test_split`` so the split is reproducible.

    Returns
    -------
    X_interno, y_interno, X_esterno, y_esterno
        Internal features and labels, then held-out features and labels.
    """
    labels = dataset[target]
    features = dataset.drop(columns=[target])
    X_interno, X_esterno, y_interno, y_esterno = train_test_split(
        features,
        labels,
        test_size=test_size,
        random_state=random_state,
        stratify=labels,
    )
    return X_interno,y_interno,X_esterno,y_esterno

# Optimization module

def objective(trial,X,y):
    """Optuna objective: cross-validated ROC AUC of one candidate pipeline.

    The pipeline is scaler -> MyDecorrelator -> feature selector -> classifier,
    with every component and hyper-parameter sampled from ``trial``. The
    parameter names registered on the trial ('standard', 'dec_perc',
    'feature_selection', 'percentile_fs', 'model', 'loss_sgd', ...) are the
    keys later read back from ``study.best_params``.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the configuration.
    X : array-like or DataFrame
        Training features.
    y : array-like
        Class labels (used for stratification and ROC AUC scoring).

    Returns
    -------
    float
        Mean ROC AUC over 3-fold stratified cross-validation repeated twice.
    """
    # --- Scaling -----------------------------------------------------------
    scalers = {
        'StandardScaler': StandardScaler,
        'MinMaxScaler': MinMaxScaler,
        'PowerTransformer': PowerTransformer,
    }
    std = trial.suggest_categorical('standard', list(scalers))
    scaler = scalers[std]()

    # --- Decorrelation -----------------------------------------------------
    # The upper bound is 0.98 (not 0.99) so the range is an exact multiple of
    # the step; Optuna would otherwise warn and truncate it to 0.98 itself.
    dec_perc = trial.suggest_float('dec_perc',0.8,0.98, step = 0.02)
    dec = MyDecorrelator(dec_perc)

    # --- Feature selection -------------------------------------------------
    # Only 'SelectPercentile' is currently offered to the sampler; the other
    # branches are kept so that they can be re-enabled by extending the list.
    fs = trial.suggest_categorical('feature_selection', [  'SelectPercentile'])

    if fs=='VarianceThreshold':
        threshold_fs = trial.suggest_float('threshold_fs',0,1, step = 0.05)
        selector = VarianceThreshold(threshold_fs)

    elif fs=='SequentialFeatureSelector':
        estim = LogisticRegression(n_jobs=-1)
        n_features_to_select_fs = trial.suggest_int('n_features_to_select_fs',4,20)
        direction_fs = trial.suggest_categorical('direction_fs', ['forward'])
        selector = SequentialFeatureSelector(estim,
                                             n_features_to_select = n_features_to_select_fs,
                                             direction = direction_fs,scoring = 'roc_auc',n_jobs = -1)

    else:
        percentile_fs = trial.suggest_float('percentile_fs',5,15, step = 1)
        selector = SelectPercentile(mutual_info_classif,percentile = percentile_fs)

    # --- Model ---------------------------------------------------------------
    # Only 'SGDClassifier' is currently offered to the sampler.
    model_name = trial.suggest_categorical('model', ['SGDClassifier'])

    if model_name == 'SGDClassifier':
        # Historical names: the estimator built here is a liblinear
        # LogisticRegression and 'loss_sgd' is its inverse regularisation C.
        loss_sgd = trial.suggest_float('loss_sgd',1e-2,1e2, step = 0.001)
        model = LogisticRegression(C=loss_sgd,solver = 'liblinear',class_weight='balanced')
    elif model_name == 'LinearSVC':
        model = LinearSVC(class_weight='balanced')
    elif model_name == 'AdaBoostClassifier':
        criterion_ada = trial.suggest_categorical('criterion_ada', ['gini', 'entropy'])
        max_depth_ada = trial.suggest_int('max_depth_ada',2,5)
        base_estimator = DecisionTreeClassifier(criterion = criterion_ada,
                                                max_depth=max_depth_ada,class_weight='balanced')
        n_estimators_ada = trial.suggest_int('n_estimators_ada',3,10)
        learning_rate_ada = trial.suggest_float('learning_rate_ada',0.1,10)
        model = AdaBoostClassifier(estimator = base_estimator, n_estimators=n_estimators_ada,
                                   learning_rate=learning_rate_ada,random_state=4)
    elif model_name == 'RandomForestClassifier':
        n_estimators_rf =trial.suggest_int('n_estimators_rf',300,500)
        criterion=trial.suggest_categorical('criterion_rf', ['gini', 'entropy'])
        max_depth=trial.suggest_int('max_depth_rf',2,5)
        # The next three are sampled (and therefore recorded in best_params)
        # but are not passed to the forest below.
        min_samples_split=trial.suggest_int('min_samples_split_rf',2,10)
        min_samples_leaf=trial.suggest_int('min_samples_leaf_rf',2,10)
        max_features=trial.suggest_float('max_features_rf',0.3,0.6)
        max_samples=trial.suggest_float('max_samples_rf',0.5,0.9)
        model = RandomForestClassifier(n_estimators=n_estimators_rf,
                                       criterion=criterion,
                                       max_depth=max_depth,
                                       max_features=max_features,
                                       n_jobs=-1)
    else:
        model = GradientBoostingClassifier()

    pipe = Pipeline([('scaler',scaler),('decor',dec),('selector',selector),('model',model)])

    # Stratified folds guarantee both classes in every test fold (ROC AUC is
    # undefined otherwise and error_score='raise' would abort the study).
    # The fixed seed scores every trial on the same splits, so trial scores
    # are directly comparable and the search is repeatable.
    cv_splitter = RepeatedStratifiedKFold(n_splits=3, n_repeats=2, random_state=42)
    auc_scores = cross_val_score(pipe,X,y, scoring='roc_auc',cv = cv_splitter, n_jobs=-1,error_score='raise')

    return np.mean(auc_scores)

# 3. Create a study object and optimize the objective function.
def train(X,y,n_trials=100,seed=None):
    """Run an Optuna study that maximises `objective` on the given data.

    Parameters
    ----------
    X : array-like or DataFrame
        Training features forwarded to `objective`.
    y : array-like
        Training labels forwarded to `objective`.
    n_trials : int, default 100
        Number of trials to run.
    seed : int or None, default None
        Seed for the TPE sampler. ``None`` keeps the sampler unseeded, so
        repeated runs may explore different configurations.

    Returns
    -------
    optuna.study.Study
        The finished study (``study.best_params`` holds the best configuration).
    """
    # Silence per-trial INFO logs; warnings and errors are still shown.
    optuna.logging.set_verbosity(optuna.logging.WARNING)
    sampler = optuna.samplers.TPESampler(seed=seed)
    study = optuna.create_study(direction='maximize',sampler=sampler)
    study.optimize(lambda trial: objective(trial, X, y), n_trials=n_trials)
    return study

In [7]:
def evaluate(best_params,X_interno,y_interno,X_esterno,y_esterno):
    """Rebuild the pipeline described by `best_params`, fit it and report metrics.

    The pipeline (scaler -> MyDecorrelator -> feature selector -> classifier)
    is fitted on the internal data. Accuracy, sensitivity (recall of class 1),
    specificity (recall of class 0) and ROC AUC are printed for the internal
    data ("Train") and for the held-out data ("Test").

    Parameters
    ----------
    best_params : dict
        Configuration as produced by the Optuna study (``study.best_params``).
    X_interno, y_interno
        Features and labels used to fit the pipeline.
    X_esterno, y_esterno
        Held-out features and labels used only for the "Test" report.

    Returns
    -------
    tuple
        ``(pipe, model)``: the fitted ``Pipeline`` and the estimator instance
        placed in its 'model' step. No probability calibration is applied.
    """
    # --- Scaling -----------------------------------------------------------
    scalers = {'StandardScaler': StandardScaler, 'MinMaxScaler': MinMaxScaler}
    # Any other value falls back to PowerTransformer.
    scaler = scalers.get(best_params['standard'], PowerTransformer)()

    # --- Decorrelation -----------------------------------------------------
    dec = MyDecorrelator(best_params['dec_perc'])

    # --- Feature selection -------------------------------------------------
    fs = best_params['feature_selection']

    if fs=='VarianceThreshold':
        selector = VarianceThreshold(best_params['threshold_fs'])
    elif fs=='SequentialFeatureSelector':
        estim = LogisticRegression(n_jobs=-1)
        selector = SequentialFeatureSelector(estim,
                                             n_features_to_select = best_params['n_features_to_select_fs'],
                                             direction = best_params['direction_fs'],
                                             scoring = 'roc_auc',n_jobs = -1)
    else:
        selector = SelectPercentile(mutual_info_classif,percentile = best_params['percentile_fs'])

    # --- Model ---------------------------------------------------------------
    model_name = best_params['model']

    if model_name == 'SGDClassifier':
        # Historical names: a liblinear LogisticRegression with C = 'loss_sgd'.
        # n_jobs is not passed: it has no effect with the liblinear solver
        # and the tuned estimator in `objective` does not set it either.
        model = LogisticRegression(C=best_params['loss_sgd'],solver = 'liblinear',class_weight='balanced')
    elif model_name == 'LinearSVC':
        # NOTE(review): LinearSVC has no predict_proba, so the reports below
        # would fail for this choice - confirm before enabling it in the search.
        model = LinearSVC(class_weight='balanced')
    elif model_name == 'AdaBoostClassifier':
        # class_weight='balanced' matches the base tree tuned in `objective`.
        base_estimator = DecisionTreeClassifier(criterion = best_params['criterion_ada'],
                                                max_depth=best_params['max_depth_ada'],
                                                class_weight='balanced')
        model = AdaBoostClassifier(estimator = base_estimator, n_estimators=best_params['n_estimators_ada'],
                                   learning_rate=best_params['learning_rate_ada'],random_state=4)
    elif model_name == 'RandomForestClassifier':
        model = RandomForestClassifier(n_estimators=best_params['n_estimators_rf'],
                                       criterion=best_params['criterion_rf'],
                                       max_depth=best_params['max_depth_rf'],
                                       max_features=best_params['max_features_rf'],
                                       n_jobs=-1)
    else:
        model = GradientBoostingClassifier()

    pipe = Pipeline([('scaler',scaler),('decor',dec),('selector',selector),('model',model)])

    def _report(features, labels):
        """Print accuracy, sensitivity, specificity and AUC of `pipe` on one split."""
        predicted = pipe.predict(features)
        predicted_proba = pipe.predict_proba(features)
        acc = accuracy_score(labels,predicted)
        sens = recall_score(labels,predicted)
        spec = recall_score(labels,predicted,pos_label=0)
        auc = roc_auc_score(labels,predicted_proba[:,1])
        print('Acc = ',round(acc,3),' Sens = ',round(sens,3), ' Spec = ',round(spec,3),' AUC = ',round(auc,3))

    pipe.fit(X_interno,y_interno)

    print('Train\n')
    _report(X_interno, y_interno)
    print('\n\n')

    print('Test\n')
    _report(X_esterno, y_esterno)

    return pipe,model

In [10]:
list_datasets = [epato_core,epato_ring,portale_core,portale_ring, core, ring, epato, portale, totale]
# Display names, in the same order as list_datasets.
dataset_names = ['Epato_core', 'Epato_ring', 'Portale_core', 'Portale_ring',
                 'Core totale', 'Ring totale', 'Epato totale', 'Portale totale', 'Totale']
# Only the datasets named here are analysed; the others are skipped.
datasets_to_run = {'Totale'}
dict_results = {}

for t, dataset in zip(dataset_names, list_datasets):
    if t not in datasets_to_run:
        continue

    print('-- Dataset Analysis: ',t)
    X_interno,y_interno,X_esterno,y_esterno = dataset_split(dataset)
    study = train(X_interno,y_interno)
    display(study.best_params)
    cvc, m = evaluate(study.best_params,X_interno,y_interno,X_esterno,y_esterno)

    # NOTE(review): both files receive the same object (the first value
    # returned by `evaluate`); `m` is not saved. Confirm whether the second
    # file was meant to hold a different object.
    for pickle_suffix in ('_CalibratedClassifier_Radiomics.pkl', '_Classifier_Radiomics.pkl'):
        # `with` closes the file even if pickling fails.
        with open(t+pickle_suffix, 'wb') as pickle_file:
            pickle.dump(cvc,pickle_file)

    print('\n ')

-- Dataset Analysis:  Totale




{'standard': 'MinMaxScaler',
 'dec_perc': 0.88,
 'feature_selection': 'SelectPercentile',
 'percentile_fs': 10.0,
 'model': 'SGDClassifier',
 'loss_sgd': 2.1039999999999996}

Train

Acc =  0.752  Sens =  0.717  Spec =  0.782  AUC =  0.806



Test

Acc =  0.686  Sens =  0.652  Spec =  0.714  AUC =  0.691

 


