In [3]:
import os
import sys
import pandas as pd
import numpy as np
import warnings
from sklearn.svm import SVC
from sklearn.model_selection import cross_validate, StratifiedKFold, train_test_split
from sklearn.preprocessing import StandardScaler

# Configurações globais
def disable_warnings():
    if not sys.warnoptions:
        warnings.simplefilter("ignore")
disable_warnings()

In [58]:
# Configuração de caminhos de arquivos
data_path = "C:/Users/renan/Desktop/UFRGS/Mestrado/GNN/data/data_final_v2"
features_file = os.path.join(data_path, "IREF_features_omics.tsv")
labels_file = os.path.join(data_path, "IREF_labels_semisupervised.tsv")

# Carregar dados
features = pd.read_csv(features_file, sep='\t', index_col='gene')
labels = pd.read_csv(labels_file, sep='\t', index_col='gene')

# Processamento dos dados
labels["label"].replace({False: 0, True: 1}, inplace=True)

In [None]:
# Definição dos grupos de features
groups = {
    "CNA": list(range(0, 16)),
    "Gene_Expression": list(range(16, 32)),
    "DNA_Methylation": list(range(32, 48)),
    "Mutations": list(range(48, 64)),
    "Multiomics": list(range(0, 64))
}

def select_features(features, selected_group):
    if selected_group in groups:
        return features.iloc[:, groups[selected_group]]
    else:
        raise ValueError(f"Grupo de features inválido: {selected_group}")

selected_group = "Mutations"  # Modifique conforme necessário
features = select_features(features, selected_group)

features

Unnamed: 0_level_0,MF: BLCA,MF: BRCA,MF: CESC,MF: COAD,MF: ESCA,MF: HNSC,MF: KIRC,MF: KIRP,MF: LIHC,MF: LUAD,MF: LUSC,MF: PRAD,MF: READ,MF: STAD,MF: THCA,MF: UCEC
gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
A1BG,0.000000,0.010629,0.000000,0.005446,0.000000,0.005403,0.012838,0.040970,0.050114,0.000000,0.000000,0.022774,0.028288,0.013054,0.000000,0.007712
A1CF,0.033728,0.031240,0.021465,0.000000,0.000000,0.005363,0.012698,0.000000,0.024672,0.117901,0.000000,0.090341,0.027762,0.011448,0.000000,0.010251
A2M,0.036284,0.010608,0.000000,0.016204,0.027028,0.016204,0.012836,0.040877,0.000000,0.122328,0.109769,0.022773,0.000000,0.038998,0.003601,0.019196
AAAS,0.000000,0.005306,0.000000,0.000000,0.000000,0.016425,0.025661,0.040872,0.000000,0.000000,0.000000,0.000000,0.000000,0.012746,0.003604,0.003829
AACS,0.000000,0.000000,0.000000,0.000000,0.000000,0.016118,0.000000,0.000000,0.000000,0.020504,0.000000,0.000000,0.028397,0.038616,0.003602,0.011557
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ZYG11A,0.000000,0.005189,0.000000,0.015970,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.007645
ZYG11B,0.000000,0.000000,0.000000,0.010832,0.000000,0.000000,0.012818,0.081662,0.025013,0.000000,0.000000,0.000000,0.000000,0.038673,0.000000,0.000000
ZYX,0.000000,0.000000,0.000000,0.010835,0.000000,0.000000,0.012815,0.000000,0.000000,0.020495,0.000000,0.000000,0.000000,0.012873,0.000000,0.022981
ZZEF1,0.178783,0.010602,0.000000,0.010820,0.027013,0.016445,0.038417,0.204110,0.000000,0.059756,0.000000,0.068216,0.028239,0.050702,0.003603,0.041296


In [60]:
dataset = features.merge(labels, left_index=True, right_index=True).dropna()

# Separação de variáveis
X = dataset.drop(columns=['label'])
y = dataset['label'].values

In [61]:
# Divisão entre treino e teste
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Configuração de validação cruzada
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scoring = ['accuracy', 'precision', 'recall', 'roc_auc', 'average_precision']

In [64]:
print("\nTesting SVM")

from sklearn.svm import SVC

clf = SVC()
clf_t = SVC(class_weight="balanced")

results = cross_validate(clf, X, y, cv=5, scoring=scoring, return_train_score=True)
results_t = cross_validate(clf_t, X, y, cv=5, scoring=scoring, return_train_score=True)

print('Train/Test Scores default\n')

print("Training accuracy scores", results['train_accuracy'])          
print("Test accuracy scores", results['test_accuracy'])
print("Test accuracy mean:", results['test_accuracy'].mean())

print("\nTraining precision scores", results['train_precision'])        
print("Test precision scores", results['test_precision'])
print("Test precision mean:", results['test_precision'].mean())

print("\nTraining recall scores", results['train_recall'])          
print("Test recall scores", results['test_recall'])
print("Test recall mean:", results['test_recall'].mean())

print("\nTraining auc_roc scores", results['train_roc_auc'])          
print("Test auc_roc scores", results['test_roc_auc'])
print("Test auc_roc mean:", results['test_roc_auc'].mean())

print("\nTraining auc_pr scores", results['train_average_precision'])          
print("Test auc_pr scores", results['test_average_precision'])
print("Test auc_pr mean:", results['test_average_precision'].mean())

#---------------------------------------------------------------------
print('\nTrain/Test Scores tuning\n')

print("Training accuracy scores", results_t['train_accuracy'])          
print("Test accuracy scores", results_t['test_accuracy'])
print("Test accuracy mean:", results_t['test_accuracy'].mean())

print("\nTraining precision scores", results_t['train_precision'])        
print("Test precision scores", results_t['test_precision'])
print("Test precision mean:", results_t['test_precision'].mean())

print("\nTraining recall scores", results_t['train_recall'])          
print("Test recall scores", results_t['test_recall'])
print("Test recall mean:", results_t['test_recall'].mean())

print("\nTraining auc_roc scores", results_t['train_roc_auc'])          
print("Test auc_roc scores", results_t['test_roc_auc'])
print("Test auc_roc mean:", results_t['test_roc_auc'].mean())

print("\nTraining auc_pr scores", results_t['train_average_precision'])          
print("Test auc_pr scores", results_t['test_average_precision'])
print("Test auc_pr mean:", results_t['test_average_precision'].mean())




Testing SVM
Train/Test Scores default

Training accuracy scores [0.92341357 0.92354228 0.92548263 0.92342342 0.92406692]
Test accuracy scores [0.91816778 0.91610911 0.90988671 0.91555098 0.91452111]
Test accuracy mean: 0.9148471393520694

Training precision scores [1. 1. 1. 1. 1.]
Test precision scores [0.94444444 0.73076923 0.48387097 0.78947368 0.69565217]
Test precision mean: 0.7288421002158361

Training recall scores [0.14878398 0.15021459 0.17285714 0.14878398 0.15593705]
Test recall scores [0.09714286 0.10857143 0.0862069  0.08571429 0.09142857]
Test recall mean: 0.0938128078817734

Training auc_roc scores [0.75548662 0.761391   0.74574348 0.75989195 0.76726859]
Test auc_roc scores [0.60492889 0.5432967  0.59075941 0.54412159 0.57420648]
Test auc_roc mean: 0.5714626180073521

Training auc_pr scores [0.54881231 0.55404313 0.54223936 0.56239579 0.5678939 ]
Test auc_pr scores [0.32604388 0.26779256 0.20701873 0.26952414 0.22880744]
Test auc_pr mean: 0.2598373513597549

Train/Test S

In [37]:
print("\nTesting RF")

from sklearn.ensemble import RandomForestClassifier as RF

clf = RF()
clf_t = RF(class_weight="balanced")

results = cross_validate(clf, X, y, cv=5, scoring=scoring, return_train_score=True)
results_t = cross_validate(clf_t, X, y, cv=5, scoring=scoring, return_train_score=True)

print('Train/Test Scores default\n')

print("Training accuracy scores", results['train_accuracy'])          
print("Test accuracy scores", results['test_accuracy'])
print("Test accuracy mean:", results['test_accuracy'].mean())

print("\nTraining precision scores", results['train_precision'])        
print("Test precision scores", results['test_precision'])
print("Test precision mean:", results['test_precision'].mean())

print("\nTraining recall scores", results['train_recall'])          
print("Test recall scores", results['test_recall'])
print("Test recall mean:", results['test_recall'].mean())

print("\nTraining auc_roc scores", results['train_roc_auc'])          
print("Test auc_roc scores", results['test_roc_auc'])
print("Test auc_roc mean:", results['test_roc_auc'].mean())

print("\nTraining auc_pr scores", results['train_average_precision'])          
print("Test auc_pr scores", results['test_average_precision'])
print("Test auc_pr mean:", results['test_average_precision'].mean())

#---------------------------------------------------------------------
print('\nTrain/Test Scores tuning\n')

print("Training accuracy scores", results_t['train_accuracy'])          
print("Test accuracy scores", results_t['test_accuracy'])
print("Test accuracy mean:", results_t['test_accuracy'].mean())

print("\nTraining precision scores", results_t['train_precision'])        
print("Test precision scores", results_t['test_precision'])
print("Test precision mean:", results_t['test_precision'].mean())

print("\nTraining recall scores", results_t['train_recall'])          
print("Test recall scores", results_t['test_recall'])
print("Test recall mean:", results_t['test_recall'].mean())

print("\nTraining auc_roc scores", results_t['train_roc_auc'])          
print("Test auc_roc scores", results_t['test_roc_auc'])
print("Test auc_roc mean:", results_t['test_roc_auc'].mean())

print("\nTraining auc_pr scores", results_t['train_average_precision'])          
print("Test auc_pr scores", results_t['test_average_precision'])
print("Test auc_pr mean:", results_t['test_average_precision'].mean())


Testing RF
Train/Test Scores default

Training accuracy scores [0.9953662  0.99420775 0.9957529  0.9951094  0.99485199]
Test accuracy scores [0.91765311 0.91250643 0.90988671 0.91606591 0.91349125]
Test accuracy mean: 0.9139206838777454

Training precision scores [1. 1. 1. 1. 1.]
Test precision scores [0.82608696 0.57575758 0.48717949 0.77272727 0.61290323]
Test precision mean: 0.6549309035985054

Training recall scores [0.94849785 0.93562232 0.95285714 0.94563662 0.94277539]
Test recall scores [0.10857143 0.10857143 0.1091954  0.09714286 0.10857143]
Test recall mean: 0.10641050903119868

Training auc_roc scores [0.98444373 0.98154264 0.98259982 0.98385475 0.98287551]
Test auc_roc scores [0.73559308 0.6936894  0.6826663  0.72924084 0.7011836 ]
Test auc_roc mean: 0.708474646607257

Training auc_pr scores [0.95967451 0.94986282 0.96170907 0.95740133 0.95507174]
Test auc_pr scores [0.36292622 0.27013294 0.22855944 0.34238288 0.260857  ]
Test auc_pr mean: 0.292971694050729

Train/Test Sco

In [35]:
print("\nTesting GBC")

from sklearn.ensemble import GradientBoostingClassifier as GBC

clf = GBC()
results = cross_validate(clf, X, y, cv=5, scoring=scoring, return_train_score=True)

print("Training accuracy scores", results['train_accuracy'])          
print("Test accuracy scores", results['test_accuracy'])
print("Test accuracy mean:", results['test_accuracy'].mean())

print("\nTraining precision scores", results['train_precision'])        
print("Test precision scores", results['test_precision'])
print("Test precision mean:", results['test_precision'].mean())

print("\nTraining recall scores", results['train_recall'])          
print("Test recall scores", results['test_recall'])
print("Test recall mean:", results['test_recall'].mean())

print("\nTraining auc_roc scores", results['train_roc_auc'])          
print("Test auc_roc scores", results['test_roc_auc'])
print("Test auc_roc mean:", results['test_roc_auc'].mean())

print("\nTraining auc_pr scores", results['train_average_precision'])          
print("Test auc_pr scores", results['test_average_precision'])
print("Test auc_pr mean:", results['test_average_precision'].mean())
      


Testing GBC
Training accuracy scores [0.92791865 0.93023555 0.93024453 0.92741313 0.93101673]
Test accuracy scores [0.92074112 0.91250643 0.90834192 0.91194645 0.91040165]
Test accuracy mean: 0.9127875131250951

Training precision scores [1.         0.98757764 0.99375    0.96551724 0.98802395]
Test precision scores [0.81818182 0.57142857 0.45       0.5625     0.51428571]
Test precision mean: 0.5832792207792208

Training recall scores [0.19885551 0.22746781 0.22714286 0.20028612 0.2360515 ]
Test recall scores [0.15428571 0.11428571 0.10344828 0.10285714 0.10285714]
Test recall mean: 0.11554679802955665

Training auc_roc scores [0.77960432 0.78407232 0.79876137 0.78675994 0.790382  ]
Test auc_roc scores [0.74802359 0.73874919 0.67129232 0.75227423 0.71225321]
Test auc_roc mean: 0.7245185113968489

Training auc_pr scores [0.50470148 0.51482006 0.53418869 0.51481985 0.53861064]
Test auc_pr scores [0.3774207  0.28809231 0.25397971 0.34131743 0.28318978]
Test auc_pr mean: 0.3087999848691183

In [36]:
print("\nTesting MLP")

from sklearn.neural_network import MLPClassifier

clf = MLPClassifier(hidden_layer_sizes=(64,64), max_iter=300)
results = cross_validate(clf, X, y, cv=5, scoring=scoring, return_train_score=True)

print("Training accuracy scores", results['train_accuracy'])          
print("Test accuracy scores", results['test_accuracy'])
print("Test accuracy mean:", results['test_accuracy'].mean())

print("\nTraining precision scores", results['train_precision'])        
print("Test precision scores", results['test_precision'])
print("Test precision mean:", results['test_precision'].mean())

print("\nTraining recall scores", results['train_recall'])          
print("Test recall scores", results['test_recall'])
print("Test recall mean:", results['test_recall'].mean())

print("\nTraining auc_roc scores", results['train_roc_auc'])          
print("Test auc_roc scores", results['test_roc_auc'])
print("Test auc_roc mean:", results['test_roc_auc'].mean())

print("\nTraining auc_pr scores", results['train_average_precision'])          
print("Test auc_pr scores", results['test_average_precision'])
print("Test auc_pr mean:", results['test_average_precision'].mean())


Testing MLP
Training accuracy scores [0.91929463 0.92264127 0.92303732 0.92599743 0.92689833]
Test accuracy scores [0.91713845 0.9114771  0.90782698 0.91040165 0.91091658]
Test accuracy mean: 0.9115521508194672

Training precision scores [0.86       0.90833333 0.86428571 0.83695652 0.91194969]
Test precision scores [0.79166667 0.54285714 0.43589744 0.50943396 0.52941176]
Test precision mean: 0.5618533944782558

Training recall scores [0.1230329  0.15593705 0.17285714 0.22031474 0.2074392 ]
Test recall scores [0.10857143 0.10857143 0.09770115 0.15428571 0.10285714]
Test recall mean: 0.11439737274220033

Training auc_roc scores [0.73361764 0.75885373 0.75433876 0.75895359 0.76314447]
Test auc_roc scores [0.73938268 0.68329024 0.66669755 0.72100897 0.69038079]
Test auc_roc mean: 0.7001520455547487

Training auc_pr scores [0.36229735 0.4336355  0.40795923 0.43486298 0.45279006]
Test auc_pr scores [0.37159037 0.28201507 0.25021122 0.31854361 0.26820246]
Test auc_pr mean: 0.2981125466448067