In [8]:
import os
import pandas as pd
import numpy as np
import seaborn as sns
import joblib
import shap
import matplotlib.pyplot as plt

from combat.pycombat import pycombat
from sklearn.model_selection import GroupShuffleSplit
from xgboost import XGBClassifier

# from sklearn.pipeline import Pipeline
from imblearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import make_scorer, precision_score, recall_score, classification_report
from imblearn.over_sampling import SMOTE
from skopt import BayesSearchCV
from skopt.space import Real, Integer, Categorical

from sklearn.model_selection import LearningCurveDisplay, learning_curve
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import f1_score, make_scorer

from scipy import stats

import math
from sklearn.cluster import AgglomerativeClustering

import requests
import logging

In [9]:
# Show the working directory: every path used in this notebook ('../Dataset', '../Results', '../store') is relative to it.
os.getcwd()

'/app/script'

<h2>Preparazione dei dati</h2>

In [None]:
# Load the merged dataset CSV; the cells below rely on its 'SampleID' and 'Label' columns.
dataset = pd.read_csv('../Dataset/MergedDatasetFullCombat_symbol.csv')
sampleID = dataset['SampleID']
# Text before the first '-' of every SampleID (presumably the source-dataset identifier — TODO confirm).
datasetID = dataset['SampleID'].apply(lambda x: x.split('-')[0]).values
indicator = dataset['Label']

def getPatientID(sampleID):
    """Build a patient identifier from a sample identifier.

    The result joins the text before the first '-' with everything that
    follows the first '_' inside the second '-'-separated field, e.g.
    'GSE1-Control_P7' -> 'GSE1-P7'.

    Raises:
        IndexError: if the id contains no '-' or its second field has no '_'.
    """
    fields = sampleID.split('-')
    prefix = fields[0]
    patient_part = fields[1].split('_', 1)[1]
    return f"{prefix}-{patient_part}"

# Add the derived PatientID as the second column (position 1).
# DataFrame.insert mutates `dataset` in place and raises ValueError when the
# column already exists, so the guard keeps this cell safe to re-run.
if 'PatientID' not in dataset.columns:
    dataset.insert(1, 'PatientID', dataset['SampleID'].apply(getPatientID))
dataset

<h2>Analisi di correlazione</h2>

In [None]:
# Work on a copy so the `dataset` frame itself is not modified.
current = dataset.copy()
# Keep only the feature columns: drop the two identifier columns and the label.
data = current.drop(['SampleID', 'PatientID', 'Label'], axis=1)
# Pairwise column-by-column correlation matrix (pandas default method).
dataCorr = data.corr()

# Persist the matrix so it can be reloaded without re-running corr().
joblib.dump(dataCorr, '../Results/dataFullCorr.pkl')

In [None]:
# Reload the correlation matrix saved to '../Results/dataFullCorr.pkl' so corr() does not have to be re-run.
# NOTE: joblib.load unpickles the file — only load files you trust.
dataCorr = joblib.load('../Results/dataFullCorr.pkl')
dataCorr

In [None]:
# For every gene (row of the correlation matrix) collect the genes whose
# absolute correlation with it reaches the threshold. The gene itself is always
# included (self-correlation is 1), so only clusters with more than one member
# carry information and are kept.
CORR_THRESHOLD = 0.9

# Compute the boolean mask once for the whole matrix instead of doing one
# scalar .loc lookup per (row, column) pair inside a Python double loop.
high_corr = dataCorr.abs().to_numpy() >= CORR_THRESHOLD
gene_names = dataCorr.columns.to_numpy()

clusters = []
for row_mask in high_corr:
    cluster = gene_names[row_mask].tolist()
    if len(cluster) > 1:
        clusters.append(cluster)

joblib.dump(clusters, "../Results/clustersFull.pkl")

In [None]:
# Reload the clusters computed above.
# NOTE: joblib.load unpickles the file — only load files you trust.
clusters = joblib.load("../Results/clustersFull.pkl")
print(clusters)

# Start from every column of the dataset (identifier and label columns included).
gene_list = list(dataset.columns)
print(len(gene_list))

# Drop every gene that belongs to at least one cluster. A gene usually shows up
# in several clusters, so the names are collected in a set once and the column
# list is filtered in a single pass (instead of repeated list.remove calls).
clustered_genes = {gene for cluster in clusters for gene in cluster}
gene_list = [column for column in gene_list if column not in clustered_genes]

print(len(gene_list))

In [None]:
# Pick one representative gene per cluster (the member with the highest
# variance that is not already representing another cluster), then discard
# every cluster that is contained in a longer-or-equal cluster.
current = dataset.copy()
data = current.drop(['SampleID', 'PatientID', 'Label'], axis=1)
representative = {}

for cluster in clusters:
    cluster_key = tuple(cluster)
    # Placeholder entry: registers the cluster before its representative is known.
    representative[cluster_key] = ""

    # Candidates ordered by ascending variance, so pop() yields the largest first.
    variances = {gene: data[gene].var() for gene in cluster}
    candidates = sorted(variances.items(), key=lambda pair: pair[1])

    reprs = candidates.pop()[0]
    # Walk down the ranking while the current pick is already taken; if every
    # member is taken, the last (lowest-variance) candidate is used anyway.
    while candidates and reprs in representative.values():
        reprs = candidates.pop()[0]
    representative[cluster_key] = reprs

# Shortest clusters first, so each one only has to be compared with the
# clusters that come after it.
sorted_repr = dict(sorted(representative.items(), key=lambda entry: len(entry[0])))
cluster_sorted = list(sorted_repr.keys())
unique_clusters = {}

total = len(cluster_sorted)
for position, candidate in enumerate(cluster_sorted):
    members = set(candidate)
    contained_in_later = any(
        members.issubset(set(cluster_sorted[other]))
        for other in range(position + 1, total)
    )
    if not contained_in_later:
        unique_clusters[candidate] = sorted_repr[candidate]

print(unique_clusters)

In [None]:
# Put the representative of every surviving cluster back into the column list,
# skipping names that are already present.
for representative_gene in list(unique_clusters.values()):
    if representative_gene in gene_list:
        continue
    gene_list.append(representative_gene)

print(len(gene_list))

In [None]:
# Keep only the retained columns (non-clustered columns plus one representative per cluster).
datasetDeclustered = dataset[gene_list]
# NOTE(review): the export below is disabled and uses a `path` variable that no visible cell defines,
# yet the next section reads '../Dataset/MergedDatasetFullCombatDeclustered_symbol.csv' from disk —
# confirm how that file is produced before relying on a fresh run.
# datasetDeclustered.to_csv(path+"Dataset/MergedDatasetFullCombatDeclustered_symbol.csv", index=False)
datasetDeclustered

<h2>Divisione in train e test</h2>

In [10]:
# Load the de-clustered dataset from disk. This rebinds `dataset`, replacing the full merged table loaded earlier.
dataset = pd.read_csv('../Dataset/MergedDatasetFullCombatDeclustered_symbol.csv')

# One group per PatientID value; consumed by sanity_check.
gruppi = dataset.groupby('PatientID')

def sanity_check(gruppi):
    """Report patient groups that mix 'Control' and non-'Control' samples.

    Args:
        gruppi: iterable of (group name, group frame) pairs, e.g. a pandas
            GroupBy; every frame must expose a 'SampleID' column.

    The first SampleID of a group decides whether the group is expected to be
    all-control or all-non-control (substring test on 'Control'). A message
    "Errore in gruppo: <name>" is printed once for each group that contains a
    sample disagreeing with the first one. Nothing is returned.
    """
    for group_name, group_data in gruppi:
        sample_ids = group_data['SampleID']
        expected = 'Control' in sample_ids.iloc[0]
        # any() stops at the first mismatch, so each group is reported at most once.
        if any(('Control' in sample_id) != expected for sample_id in sample_ids):
            print("Errore in gruppo:", group_name)

# Print a message for every patient whose samples mix control and non-control ids.
sanity_check(gruppi)

# Group-aware split: all rows sharing a PatientID end up on the same side of the split.
# NOTE(review): n_splits=2 is requested but only the first split is consumed via next().
splitter = GroupShuffleSplit(n_splits=2, test_size=0.25, random_state = 42)
split = splitter.split(dataset, groups=dataset['PatientID'])
train_inds, test_inds = next(split)

# Shuffle the rows of each partition with a fixed seed.
train = dataset.iloc[train_inds].sample(frac=1, random_state=42)
test = dataset.iloc[test_inds].sample(frac=1, random_state=42)

# Shape and class counts: Label == 1 is reported as "malati" (diseased), Label == 0 as "sani" (healthy).
print("Dataset di train:")
print(train.shape)
print("I malati sono: ", sum(train['Label'] == 1))
print("I sani sono: ", sum(train['Label'] == 0))

print("\nDataset di test:")
print(test.shape)
print("I malati sono: ", sum(test['Label'] == 1))
print("I sani sono: ", sum(test['Label'] == 0))

# Separate target and features; identifier columns are not used as features.
y_train = train['Label']
x_train = train.drop(columns=['SampleID', 'Label', 'PatientID'])

y_test = test['Label']
x_test = test.drop(columns=['SampleID', 'Label', 'PatientID'])

Dataset di train:
(772, 6671)
I malati sono:  510
I sani sono:  262

Dataset di test:
(266, 6671)
I malati sono:  166
I sani sono:  100


<h1>Addestramento modello</h1>

In [11]:
def prettyPrint(model, name, test=False):
    """Print a summary of a fitted hyper-parameter search.

    Args:
        model: fitted search object exposing best_params_, best_score_,
            best_index_, cv_results_, score() and predict(). cv_results_ must
            contain the 'mean_test_<metric>' entries for the metrics
            'precision 0', 'precision 1', 'recall 0', 'recall 1', 'Accuracy'
            and 'f1'.
        name: title printed before the report.
        test: when True, also print the score and the classification report
            on the held-out test set.

    Reads the module-level x_train, y_train (and x_test, y_test when
    ``test`` is True). Returns None.
    """
    print(name + ":")
    print("Iperparametri: ", model.best_params_)
    print("Train f1: ", model.score(x_train, y_train))
    print("Mean f1 cross-validated: ", model.best_score_)

    chosen = model.best_index_

    def mean_of(metric):
        # Cross-validated mean of `metric` for the selected candidate.
        return model.cv_results_['mean_test_' + metric][chosen]

    print("\t\t precision \t\t recall \t\t f1-score")
    print(f"0 \t\t {mean_of('precision 0'):.2f} \t\t\t {mean_of('recall 0'):.2f}")
    print(f"1 \t\t {mean_of('precision 1'):.2f} \t\t\t {mean_of('recall 1'):.2f}")
    print(f"Accuracy \t\t\t\t\t\t\t {mean_of('Accuracy'):.2f}")

    # Macro averages are the plain mean of the two per-class values.
    macro_precision = (mean_of('precision 0') + mean_of('precision 1')) / 2
    macro_recall = (mean_of('recall 0') + mean_of('recall 1')) / 2
    print(f"macro avg \t {macro_precision:.2f} \t\t\t {macro_recall:.2f} \t\t\t {mean_of('f1'):.2f}")

    if not test:
        return
    print("\nTest f1 score: ", model.score(x_test, y_test))
    print(classification_report(y_test, model.predict(x_test)), "\n\n")

def randomSearch(pipeline, hyperparameters, iteration, scorer, njobs, x_train, y_train, cv=5, groups=None):
    """Run a randomized hyper-parameter search and return the fitted search.

    Args:
        pipeline: estimator (pipeline) to tune.
        hyperparameters: parameter distributions / lists to sample from.
        iteration: number of sampled candidates (n_iter).
        scorer: scoring dict; it must contain an 'f1' entry, used for refit.
        njobs: number of parallel jobs.
        x_train, y_train: training features and labels.
        cv: cross-validation strategy; defaults to 5 (the previous hard-coded
            value). Pass a group-aware splitter to keep all samples of a
            patient in the same fold.
        groups: optional group labels forwarded to the splitter. They are only
            used by group-aware splitters; leave None with an integer ``cv``.

    Returns:
        The fitted RandomizedSearchCV object (refit on the best 'f1' candidate).
    """
    randomSearchResult = RandomizedSearchCV(
        pipeline,
        param_distributions=hyperparameters,
        n_iter=iteration,
        scoring=scorer,
        cv=cv,
        verbose=1,
        random_state=42,  # fixed seed so the sampled candidates are reproducible
        n_jobs=njobs,
        refit='f1',
    )

    randomSearchResult.fit(x_train, y_train, groups=groups)
    print("Best model:", randomSearchResult.best_params_)

    return randomSearchResult

def trainModel(pipeline, hyperparameters, scorer, njobs, x_train, y_train, cv=5, groups=None):
    """Run an exhaustive grid search and return the fitted search.

    Args:
        pipeline: estimator (pipeline) to tune.
        hyperparameters: parameter grid.
        scorer: scoring dict; it must contain an 'f1' entry, used for refit.
        njobs: number of parallel jobs.
        x_train, y_train: training features and labels.
        cv: cross-validation strategy; defaults to 5 (the previous hard-coded
            value). Pass a group-aware splitter to keep all samples of a
            patient in the same fold.
        groups: optional group labels forwarded to the splitter. They are only
            used by group-aware splitters; leave None with an integer ``cv``.

    Returns:
        The fitted GridSearchCV object (refit on the best 'f1' candidate).
        Any failing fit raises immediately (error_score='raise').
    """
    gridSearch = GridSearchCV(
        pipeline,
        param_grid=hyperparameters,
        cv=cv,
        return_train_score=True,
        scoring=scorer,
        refit='f1',
        n_jobs=njobs,
        verbose=1,
        error_score='raise',
    )
    gridSearch.fit(x_train, y_train, groups=groups)
    print("Best model:", gridSearch.best_params_)
    return gridSearch

def bayesianOpt(pipeline, hyperparameters, iteration, scorer, njobs, x_train, y_train, cv=5, groups=None, random_state=42):
    """Run a Bayesian hyper-parameter search and return the fitted search.

    Args:
        pipeline: estimator (pipeline) to tune.
        hyperparameters: skopt search spaces.
        iteration: number of evaluated candidates (n_iter).
        scorer: scoring dict; it must contain an 'f1' entry, used for refit.
        njobs: number of parallel jobs.
        x_train, y_train: training features and labels.
        cv: cross-validation strategy; defaults to 5 (the previous hard-coded
            value). Pass a group-aware splitter to keep all samples of a
            patient in the same fold.
        groups: optional group labels forwarded to the splitter. They are only
            used by group-aware splitters; leave None with an integer ``cv``.
        random_state: seed of the optimizer. The search was previously
            unseeded, so repeated runs could return different candidates; the
            default matches the seed used by randomSearch.

    Returns:
        The fitted BayesSearchCV object (refit on the best 'f1' candidate).
    """
    bayesianSearchResult = BayesSearchCV(
        estimator=pipeline,
        search_spaces=hyperparameters,
        cv=cv,
        n_iter=iteration,
        return_train_score=True,
        refit='f1',
        scoring=scorer,
        n_jobs=njobs,
        verbose=1,
        random_state=random_state,
    )
    bayesianSearchResult.fit(x_train, y_train, groups=groups)
    print("Iperparametri:", bayesianSearchResult.best_params_)
    return bayesianSearchResult

In [12]:
# Three-step pipeline: min-max scaling -> SMOTE oversampling -> XGBoost classifier.
# `Pipeline` is the imblearn one (imported in place of sklearn's), which accepts a sampler step such as SMOTE.
# NOTE(review): sampling_strategy=400/510 presumably targets 400 minority samples per 510 majority samples
# (510 matches the printed count of Label == 1 in the training set) — confirm.
pipeline = Pipeline(steps=[('scaling', MinMaxScaler()), ('smote', SMOTE(random_state=42, sampling_strategy=400/510, k_neighbors=5)), ('classifier', XGBClassifier(random_state=42))])

def precision_class_0(y_true, y_pred):
    """Return the precision of label 0.

    labels=[0, 1] pins the order and length of the per-class array, so index 0
    is label 0 even when a fold contains or predicts a single class.
    """
    return precision_score(y_true, y_pred, labels=[0, 1], average=None)[0]

def precision_class_1(y_true, y_pred):
    """Return the precision of label 1.

    labels=[0, 1] pins the order and length of the per-class array, so index 1
    always exists and is label 1 even when a fold contains or predicts a single
    class (without it the lookup could raise IndexError).
    """
    return precision_score(y_true, y_pred, labels=[0, 1], average=None)[1]

def recall_class_0(y_true, y_pred):
    """Return the recall of label 0.

    labels=[0, 1] pins the order and length of the per-class array, so index 0
    is label 0 even when a fold contains or predicts a single class.
    """
    return recall_score(y_true, y_pred, labels=[0, 1], average=None)[0]

def recall_class_1(y_true, y_pred):
    """Return the recall of label 1.

    labels=[0, 1] pins the order and length of the per-class array, so index 1
    always exists and is label 1 even when a fold contains or predicts a single
    class (without it the lookup could raise IndexError).
    """
    return recall_score(y_true, y_pred, labels=[0, 1], average=None)[1]

# Multi-metric scoring passed to the search helpers.
# The keys matter: prettyPrint reads 'mean_test_<key>' from cv_results_, and the searches refit on 'f1'.
scorer = {
    'Accuracy': 'accuracy',
    'precision 0': make_scorer(precision_class_0),
    'precision 1': make_scorer(precision_class_1),
    'recall 0': make_scorer(recall_class_0),
    'recall 1': make_scorer(recall_class_1),
    'f1': make_scorer(f1_score, average='macro')
}

In [None]:
# Candidate values for the randomized search; the 'classifier__' prefix targets the XGBClassifier step of the pipeline.
param_dist = {
    'classifier__n_estimators': np.linspace(50, 500, 7, dtype=int),  # Number of trees
    'classifier__max_depth': np.arange(2, 13),  # Tree depth
    'classifier__learning_rate': np.linspace(0.01, 0.7, 15),  # Learning rate
    'classifier__gamma': [0, 0.1, 0.3, 0.5, 0.7, 1],  # Penalty on tree complexity
    'classifier__min_child_weight': [1, 2, 3, 4], 
    'classifier__scale_pos_weight': [1, 400/510],
    'classifier__reg_alpha': [0, 0.1, 0.5, 1, 5, 10],  # L1 regularization
    'classifier__reg_lambda': [0.1, 1, 10, 20, 50, 100]  # L2 regularization
}

# 5000 sampled candidates, evaluated with 100 parallel jobs.
randomSearchModel = randomSearch(pipeline, param_dist, 5000, scorer, 100, x_train, y_train)

# NOTE(review): the dump is disabled, and its file name differs from the one loaded by the next cell
# ("randomSearchSingleModelFull_new.pkl") — confirm which file holds this search.
# joblib.dump(randomSearchModel, "../store/randomSeachModelFull_new.pkl")

In [21]:
# Load a previously stored random-search result and print its cross-validated summary.
# NOTE(review): no visible cell writes this file name — confirm its origin. joblib.load unpickles; only load trusted files.
randomSearchModelNew = joblib.load("../store/randomSearchSingleModelFull_new.pkl")
prettyPrint(randomSearchModelNew, "Gradient Boosting with random search new")

Gradient Boosting with random search new:
Iperparametri:  {'classifier__scale_pos_weight': 0.7843137254901961, 'classifier__reg_lambda': 1, 'classifier__reg_alpha': 0, 'classifier__n_estimators': 200, 'classifier__min_child_weight': 1, 'classifier__max_depth': 4, 'classifier__learning_rate': 0.20714285714285713, 'classifier__gamma': 0.1}
Train f1:  1.0
Mean f1 cross-validated:  0.7784030896953151
		 precision 		 recall 		 f1-score
0 		 0.77 			 0.63
1 		 0.83 			 0.90
Accuracy 							 0.81
macro avg 	 0.80 			 0.77 			 0.78


In [None]:
# Grid 1: exhaustive search over the values below (trainModel wraps GridSearchCV).
params = {
   "classifier__max_depth": [2, 3, 4, 5, 6],
    "classifier__n_estimators":[160, 180, 200, 220, 240, 260],
    "classifier__learning_rate": [0.15, 0.2, 0.25, 0.3],
    'classifier__scale_pos_weight': [1, 400/510],
    'classifier__reg_lambda': [1, 5, 10, 15, 20, 25],
    'classifier__reg_alpha': [0, 0.4, 0.8, 1.2],
    'classifier__gamma': [0]
}

# 100 parallel jobs.
gradientBoostingBasedOnRandom = trainModel(pipeline, params, scorer, 100, x_train, y_train)
# NOTE(review): the dump is disabled although a later cell loads this file — confirm the stored file is current.
# joblib.dump(gradientBoostingBasedOnRandom, "../store/gradientBoostingGridBasedOnRandom_new.pkl")

In [None]:
# Grid 2: scale_pos_weight fixed to 400/510; 5 * 5 * 5 * 1 * 5 * 5 * 2 = 6250 candidates.
params2 = {
    "classifier__max_depth": [3, 4, 5, 6, 7],
    "classifier__n_estimators":[100, 130, 160, 190, 220],
    "classifier__learning_rate": [0.05, 0.1, 0.15, 0.2, 0.25],
    'classifier__scale_pos_weight': [400/510],
    'classifier__reg_lambda': [10, 15, 20, 25, 30],
    'classifier__reg_alpha': [0, 0.2, 0.4, 0.8, 1],
    'classifier__gamma': [0, 0.1]
}

# 100 parallel jobs.
gradientBoostingBasedOnRandom2 = trainModel(pipeline, params2, scorer, 100, x_train, y_train)
# NOTE(review): the dump is disabled although a later cell loads this file — confirm the stored file is current.
# joblib.dump(gradientBoostingBasedOnRandom2, "../store/gradientBoostingGridBasedOnRandom2_new.pkl")

Fitting 5 folds for each of 6250 candidates, totalling 31250 fits


In [None]:
# Grid 3: scale_pos_weight fixed to 400/510; 3 * 5 * 4 * 1 * 4 * 5 * 3 = 3600 candidates.
# NOTE(review): the output captured under this cell reports 7200 candidates, which does not match this grid —
# the output was presumably produced by an earlier version of the cell; confirm.
params3 = {
    "classifier__max_depth": [4, 5, 6],
    "classifier__n_estimators":[130, 145, 160, 175, 190],
    "classifier__learning_rate": [0.08, 0.1, 0.13, 0.15],
    'classifier__scale_pos_weight': [400/510],
    'classifier__reg_lambda': [1, 10, 20, 30],
    'classifier__reg_alpha': [0, 0.2, 0.4, 0.6, 0.8],
    'classifier__gamma': [0, 0.1, 0.4]
}

# 100 parallel jobs; unlike the first two grids, this result is saved to disk.
gradientBoostingBasedOnRandom3 = trainModel(pipeline, params3, scorer, 100, x_train, y_train)
joblib.dump(gradientBoostingBasedOnRandom3, "../store/gradientBoostingGridBasedOnRandom3_new.pkl")

Fitting 5 folds for each of 7200 candidates, totalling 36000 fits


In [22]:
# Load the three stored grid-search results and print the cross-validated summary of each.
# NOTE: joblib.load unpickles the files — only load files you trust.
gradientBoostingBasedOnRandom = joblib.load("../store/gradientBoostingGridBasedOnRandom_new.pkl")
prettyPrint(gradientBoostingBasedOnRandom, "Gradient Boosting Grid 1")

# The leading "\n\n" in the title separates this report from the previous one.
gradientBoostingBasedOnRandom2 = joblib.load("../store/gradientBoostingGridBasedOnRandom2_new.pkl")
prettyPrint(gradientBoostingBasedOnRandom2, "\n\nGradient Boosting Grid 2")

gradientBoostingBasedOnRandom3 = joblib.load("../store/gradientBoostingGridBasedOnRandom3_new.pkl")
prettyPrint(gradientBoostingBasedOnRandom3, "\n\nGradient Boosting Grid 3")

Gradient Boosting Grid 1:
Iperparametri:  {'classifier__gamma': 0, 'classifier__learning_rate': 0.15, 'classifier__max_depth': 6, 'classifier__n_estimators': 160, 'classifier__reg_alpha': 0.4, 'classifier__reg_lambda': 1, 'classifier__scale_pos_weight': 0.7843137254901961}
Train f1:  1.0
Mean f1 cross-validated:  0.7845222232100724
		 precision 		 recall 		 f1-score
0 		 0.81 			 0.62
1 		 0.82 			 0.92
Accuracy 							 0.82
macro avg 	 0.81 			 0.77 			 0.78


Gradient Boosting Grid 2:
Iperparametri:  {'classifier__gamma': 0.1, 'classifier__learning_rate': 0.1, 'classifier__max_depth': 6, 'classifier__n_estimators': 220, 'classifier__reg_alpha': 0.2, 'classifier__reg_lambda': 20, 'classifier__scale_pos_weight': 0.7843137254901961}
Train f1:  1.0
Mean f1 cross-validated:  0.7749279736084163
		 precision 		 recall 		 f1-score
0 		 0.79 			 0.61
1 		 0.82 			 0.92
Accuracy 							 0.81
macro avg 	 0.80 			 0.76 			 0.77


Gradient Boosting Grid 3:
Iperparametri:  {'classifier__gamma': 0,

<h2>Bayesian Hyperparameter Optimization</h2>

In [None]:
# skopt search spaces for the Bayesian optimisation.
# NOTE: this rebinds `param_dist`, which the random-search cell also uses for its own candidate lists.
param_dist = {
    'classifier__n_estimators': Integer(50, 600),  # Number of trees
    'classifier__max_depth': Integer(2, 15),  # Tree depth
    'classifier__learning_rate': Real(0.001, 0.7, prior='log-uniform'),  # Learning rate
    'classifier__gamma': Real(0.0001, 1, prior='log-uniform'),  # Minimum loss reduction
    'classifier__min_child_weight': Integer(1, 8), 
    'classifier__scale_pos_weight': Categorical([1, 400/510]),
    'classifier__reg_alpha': Real(0.0001, 100, prior='log-uniform'),  # L1 regularization
    'classifier__reg_lambda': Real(0.0001, 100, prior='log-uniform')  # L2 regularization
}

# 400 iterations, 10 parallel jobs; the fitted search is saved to disk.
bayesianOptResult = bayesianOpt(pipeline, param_dist, 400, scorer, 10, x_train, y_train)
joblib.dump(bayesianOptResult, "../store/bayesianOptResult.pkl")

In [14]:
# Reload the stored Bayesian search so this cell also works on a fresh kernel,
# like the other reporting cells, instead of depending on a variable left over
# from an earlier run of the optimisation cell.
# NOTE: joblib.load unpickles the file — only load files you trust.
bayesianOptResult = joblib.load("../store/bayesianOptResult.pkl")
prettyPrint(bayesianOptResult, "Bayesian Hyperparameter")

Bayesian Hyperparameter:
Iperparametri:  OrderedDict([('classifier__gamma', 0.0001), ('classifier__learning_rate', 0.1758788979260332), ('classifier__max_depth', 2), ('classifier__min_child_weight', 1), ('classifier__n_estimators', 268), ('classifier__reg_alpha', 0.0011507871601593655), ('classifier__reg_lambda', 0.0001), ('classifier__scale_pos_weight', 0.7843137254901961)])
Train f1:  1.0
Mean f1 cross-validated:  0.7789211248960214
		 precision 		 recall 		 f1-score
0 		 0.75 			 0.66
1 		 0.84 			 0.88
Accuracy 							 0.81
macro avg 	 0.79 			 0.77 			 0.78


In [None]:
# Smaller grid (3 * 3 * 3 * 3 * 3 * 4 = 972 candidates); scale_pos_weight is not searched here.
# NOTE: this rebinds `params`, which the first grid-search cell also uses.
params = {
    "classifier__max_depth": [3, 4, 5],
    "classifier__n_estimators":[70, 100, 130],
    "classifier__learning_rate": [0.05, 0.1, 0.15],
    'classifier__reg_lambda': [10, 20, 30],
    'classifier__reg_alpha': [0.6, 0.8, 1],
    'classifier__gamma': [0.1, 0.5, 1, 5]
}

# 80 parallel jobs; the fitted search is saved to disk.
gradientBoostingGridIndependentSampling = trainModel(pipeline, params, scorer, 80, x_train, y_train)
joblib.dump(gradientBoostingGridIndependentSampling, "../store/gradientBoostingGridIndependent_newSampling.pkl")

In [None]:
# Load a stored grid-search result and print its summary.
# NOTE(review): no visible cell writes "gradientBoostingGridIndependent_new.pkl" — confirm its origin.
# NOTE(review): the printed title says "Bayes" although the object name suggests a grid search — confirm the label.
gradientBoostingGridIndependent = joblib.load("../store/gradientBoostingGridIndependent_new.pkl")
prettyPrint(gradientBoostingGridIndependent, "Bayes Hyperparameter Optimization")

In [None]:
# Load the grid search saved as "gradientBoostingGridIndependent_newSampling.pkl" and print its summary.
# NOTE: this reuses the variable name `gradientBoostingGridIndependent` and the same printed title as the previous cell.
gradientBoostingGridIndependent = joblib.load("../store/gradientBoostingGridIndependent_newSampling.pkl")
prettyPrint(gradientBoostingGridIndependent, "Bayes Hyperparameter Optimization")