# 1. Carpetas base e instalación de librerías

In [1]:
DATASET_FOLDER = '/content/drive/MyDrive/Lung_Dataset'
IMAGES_FOLDER = '/content/drive/MyDrive/Lung_Dataset/Imagenes'
IMAGES_NRRD_FOLDER = '/content/drive/MyDrive/Lung_Dataset/Imagenes_NRRD'
IMAGES_PRED_NRRD_FOLDER = '/content/drive/MyDrive/Lung_Dataset/Imagenes_Pred_NRRD'
SEGMENTATION_CODE_FOLDER = '/content/drive/MyDrive/Codigo_TFM/Lung/Codigo_segunda_entrega/Segmentacion'
MODEL_FOLDER = '/content/drive/MyDrive/Codigo_TFM/Lung/Codigo_segunda_entrega/Model'

In [2]:
from google.colab import drive
drive.mount('/content/drive')
!pip install scikeras
!pip install SimpleITK
!pip install -r '/content/drive/MyDrive/Codigo_TFM/Lung/Codigo_segunda_entrega/Segmentacion/requirements.txt'
!pip install pydicom
!pip install pyplastimatch
!pip install dcm2niix
!pip install pydicom_seg
!pip install dicom2nifti
!pip install pyradiomics

Mounted at /content/drive
Collecting scikeras
  Downloading scikeras-0.12.0-py3-none-any.whl (27 kB)
Installing collected packages: scikeras
Successfully installed scikeras-0.12.0
Collecting SimpleITK
  Downloading SimpleITK-2.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (52.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m52.7/52.7 MB[0m [31m32.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: SimpleITK
Successfully installed SimpleITK-2.3.1
Collecting pydicom>=1.3.0 (from -r /content/drive/MyDrive/Codigo_TFM/Lung/Codigo_segunda_entrega/Segmentacion/requirements.txt (line 7))
  Downloading pydicom-2.4.3-py3-none-any.whl (1.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m19.3 MB/s[0m eta [36m0:00:00[0m
Collecting statannot>=0.2.3 (from -r /content/drive/MyDrive/Codigo_TFM/Lung/Codigo_segunda_entrega/Segmentacion/requirements.txt (line 16))
  Downloading statannot-0.2.3-py3-none-any.

# 2. Importación de módulos

In [3]:
#Librerías basicas
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import gzip
import csv
from PIL import Image
import random
import scikeras

#Librerías Keras y Tensorflow
import tensorflow as tf
from tensorflow import keras
from keras.models import Model
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Dropout, LSTM
from tensorflow.keras import layers, models

#Librería Scikit-Learn para preprocesar datos
import sklearn
from sklearn.decomposition import PCA
from sklearn import preprocessing
from sklearn.covariance import EllipticEnvelope
from sklearn.preprocessing import MinMaxScaler

#Librería Scikit-Learn para crear y entrenar modelos
from sklearn.model_selection import train_test_split, cross_validate, KFold, cross_val_predict, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from keras.utils import plot_model
from keras import regularizers

#Librería Scikit-Learn para optimizar hiperparámetros
from sklearn.model_selection import GridSearchCV
from scikeras.wrappers import KerasClassifier
from sklearn.model_selection import StratifiedKFold

#Librerías para leer imágenes médicas y extraer características radiómicas
import SimpleITK as sitk

from radiomics import featureextractor
import radiomics
from radiomics import firstorder, glcm, glrlm, glszm, gldm, ngtdm, shape, shape2D

import pywt
import pydicom
from pydicom import dcmread
import pyplastimatch
import dcm2niix
import pydicom_seg
import dicom2nifti

#Librería importada para segmentar tumor
import sys
sys.path.insert(0,SEGMENTATION_CODE_FOLDER)
from TheDuneAI import ContourPilot as cp

# 3. Funciones para preprocesar los datos

## 3.1 Para los datos genómicos

In [4]:
def eliminaOutliers(df):
	# Usamos el algoritmo de envolvente elíptica para detectar los outliers. Eliminaremos un 10% de los valores anómalos
	enveliptica = EllipticEnvelope(contamination=0.10, support_fraction=0.8, random_state=42)
	df_outliers = enveliptica.fit_predict(df)
	df = df[df_outliers==1]
	return df

def eliminaAtributosCorrelacionados(df):
	matriz_corr = df.corr().abs()
	triang_superior = matriz_corr.where(np.triu(np.ones(matriz_corr.shape), k=1).astype(bool))
	columnas_correlacionadas = [column for column in triang_superior.columns if any(triang_superior[column] > 0.90)]
	df.drop(columnas_correlacionadas, axis=1, inplace=True)
	return df

def aplicaPCA(df):
	index_names = df.index.tolist()
	data_genoma = df.values
	pca = PCA(n_components=0.98)
	genoma_pca = pca.fit_transform(data_genoma)
	pca_df = pd.DataFrame(data=genoma_pca, index=index_names)

	return pca_df

In [5]:
def preprocesaDatosGenomicos(path_dataset):
	print("Preprocesando datos genómicos")
	path_genoma_txt = os.path.join(path_dataset, 'GSE103584_R01_NSCLC_RNAseq.txt.gz')
	df_genoma = pd.read_csv(gzip.open(path_genoma_txt), sep='\t', index_col=0).fillna(0)
	df_genoma = (df_genoma.T)
	#---------- PREPROCESADO DE LOS DATOS ----------#
	#---------- 1. Estimación de valores ausentes ----------#
	#Sustituimos los valores ausentes por la media
	df_genoma.fillna(df_genoma.mean(), inplace=True)
	#---------- 2. Normalización ----------#
	#Normalizamos los datos
	scaler = preprocessing.MinMaxScaler()
	pacientes_id = df_genoma.index.values
	df_genoma = pd.DataFrame(scaler.fit_transform(df_genoma),columns = df_genoma.columns, index = pacientes_id)

	#---------- 3. Reducción de dimensionalidad ----------#
	#Reducimos el número de columnas para evitar descompensar el dataset final
	display(df_genoma)
	df_genoma = aplicaPCA(df_genoma)

	#---------- 4. Tratamiento de outliers ------------#
	display(df_genoma)
	df_genoma = eliminaOutliers(df_genoma)
	display(df_genoma)
	return df_genoma

## 3.2 Para los los datos clínicos

In [6]:
def preprocesarFechas (df_pacientes):
	#Para las fechas eliminamos los labels Not Collected y las convertimos a valor numérico timestamp
	#Para Date of Recurrence
	df_pacientes["Date of Recurrence"].replace("Not Collected",np.nan, inplace=True)
	df_pacientes["Date of Recurrence"].fillna("5/5/1995", inplace=True)
	df_pacientes["Date of Recurrence"] = pd.to_datetime(df_pacientes['Date of Recurrence']).astype(int)

	#Para CT Date
	df_pacientes["CT Date"].replace("Not Collected",np.nan, inplace=True)
	df_pacientes["CT Date"].fillna("5/5/1995", inplace=True)
	df_pacientes["CT Date"] = pd.to_datetime(df_pacientes['CT Date']).astype(int)

	#Para PET Date
	df_pacientes["PET Date"].replace("Not Collected",np.nan, inplace=True)
	df_pacientes["PET Date"].fillna("5/5/1995", inplace=True)
	df_pacientes["PET Date"] = pd.to_datetime(df_pacientes['PET Date']).astype(int)

	#Para Date of Death
	df_pacientes["Date of Death"].replace("Not Collected",np.nan, inplace=True)
	df_pacientes["Date of Death"].fillna("5/5/1995", inplace=True)
	df_pacientes["Date of Death"] = pd.to_datetime(df_pacientes['Date of Death']).astype(int)

	#Para Date of Last Known Alive
	df_pacientes["Date of Last Known Alive"].replace("Not Collected",np.nan, inplace=True)
	df_pacientes["Date of Last Known Alive"].fillna("5/5/1995", inplace=True)
	df_pacientes["Date of Last Known Alive"] = pd.to_datetime(df_pacientes['Date of Last Known Alive']).astype(int)
	return df_pacientes

def preprocesaDatosClinicos(path_dataset):
	print("Preprocesando datos clínicos")
	path_csv = os.path.join(path_dataset, 'NSCLCR01Radiogenomic_DATA_LABELS_2018-05-22_1500-shifted.csv')
	df_pacientes = pd.read_csv(path_csv)
	df_pacientes.set_index('Case ID', inplace=True)
	display(df_pacientes)

	#----------PREPROCESADO DE LOS DATOS----------#
	#Los valores númericos que estan en String los convertimos a números
	df_pacientes["Weight (lbs)"] = df_pacientes["Weight (lbs)"].apply(pd.to_numeric, errors = 'coerce')
	df_pacientes["Pack Years"] = df_pacientes["Pack Years"].apply(pd.to_numeric, errors = 'coerce')

	#Preprocesamos las fechas
	df_pacientes = preprocesarFechas(df_pacientes)

	#3.Codificamos las variables categóricas en números y normalizamos las variables numéricas
	for column in df_pacientes:
		if (df_pacientes[column].dtype == np.float64) or (df_pacientes[column].dtype == np.int64):
			#Sustituimos los valores NaN (valores ausentes) por la media
			df_pacientes[column].fillna(df_pacientes[column].mean(), inplace=True)
			df_pacientes[column] = (df_pacientes[column] - df_pacientes[column].min())/(df_pacientes[column].max() - df_pacientes[column].min())
		else:
			df_pacientes = pd.get_dummies(data=df_pacientes, columns=[column], prefix=[column], prefix_sep= " | ")
	#Elimanos atributos correlacionados
	df_pacientes = eliminaAtributosCorrelacionados(df_pacientes)
	#Relacionamos las datos genómicos con los clínicos con un join
	df_pacientes.fillna(df_pacientes.mean(), inplace=True)
	display(df_pacientes)

	return df_pacientes

## 3.3. Para los datos radiómicos

### 3.3.1 Funciones para guardar las imágenes en NRRD

In [7]:
def contiene_valores(group):
    return ('CT' in group['Modality'].values) and ('SEG' in group['Modality'].values)

def obtenPathCorrecto(path):
	string_nslsc = r".\NSCLC Radiogenomics" + "\\"
	path_correcto = path.replace(string_nslsc, '').replace("\\","/")
	return path_correcto

def escribirCTNRRD(subject_id, path_nttd, path_ct_original):
    path_ct_original = os.path.join(IMAGES_FOLDER, path_ct_original)
    path_subject_nrrd = os.path.join(path_nttd, subject_id)

    if not os.path.exists(path_subject_nrrd):
        os.makedirs(path_subject_nrrd)

    path_ct_nrrd = os.path.join(path_subject_nrrd, "image.nrrd")
    if not os.path.exists(path_ct_nrrd):
        reader = sitk.ImageSeriesReader()
        dicom_names = reader.GetGDCMSeriesFileNames(path_ct_original)
        reader.SetFileNames(dicom_names)
        dicom_image = reader.Execute()
        sitk.WriteImage(dicom_image, path_ct_nrrd)

def escribirSegmentacionNRRD(subject_id, path_nttd, path_seg_original):
    path_seg_original = os.path.join(IMAGES_FOLDER, path_seg_original)
    archivo_seg = os.listdir(path_seg_original)[0]
    path_seg_original = os.path.join(path_seg_original, archivo_seg)

    path_subject_nrrd = os.path.join(path_nttd, subject_id)
    if not os.path.exists(path_subject_nrrd):
        os.makedirs(path_subject_nrrd)

    path_seg_nrrd = os.path.join(path_subject_nrrd, "mask.nrrd")
    if not os.path.exists(path_seg_nrrd):
        dcm_imagen_seg = pydicom.dcmread(path_seg_original)

        reader = pydicom_seg.MultiClassReader()
        result = reader.read(dcm_imagen_seg)

        image_data = result.data
        image = result.image
        sitk.WriteImage(image, path_seg_nrrd, True)

def guardarImagenesConSegmentacionNRRD(path_dataset):
	path_imagenes = os.path.join(path_dataset, 'Imagenes')

	df_metadata = pd.read_csv(os.path.join(path_dataset, 'metadata.csv'))
	df_metadata = df_metadata.reset_index()

	for subject_id in df_metadata['Subject ID'].unique():
		df_filtered_by_subject = df_metadata[df_metadata['Subject ID'] == subject_id]
		df_grouped_by_subject = df_filtered_by_subject.groupby('Subject ID')

		filtered = df_grouped_by_subject.filter(contiene_valores)
		if not filtered.empty:
			print("------ " + subject_id + " ------")
			#Filtramos el dataframe para solo obtener las filas que sean segmentaciones
			row_where_seg = df_filtered_by_subject[df_filtered_by_subject['Modality'] == 'SEG']
            #Obtenemos el valor del path a esa segmentacion
			path_mask = row_where_seg['File Location'].values[0]
			path_carpeta_general = path_mask.rsplit('\\', 1)[0]

            #Obtenemos el path a la imagen ct de la segmentación
			rows_where_ct = df_filtered_by_subject[df_filtered_by_subject['Modality'] == 'CT']
			row_where_ct = rows_where_ct[rows_where_ct['File Location'].str.startswith(path_carpeta_general)]
			path_ct = row_where_ct['File Location'].values[0]

			escribirCTNRRD(subject_id, IMAGES_NRRD_FOLDER, obtenPathCorrecto(path_ct))
			escribirSegmentacionNRRD(subject_id, IMAGES_NRRD_FOLDER, obtenPathCorrecto(path_mask))

def guardarImagenesSinSegmentacionNRRD(path_dataset):
	path_imagenes = os.path.join(path_dataset, 'Imagenes')

	df_metadata = pd.read_csv(os.path.join(path_dataset, 'metadata.csv'))
	df_metadata = df_metadata.reset_index()

	for subject_id in df_metadata['Subject ID'].unique():
		df_filtered_by_subject = df_metadata[df_metadata['Subject ID'] == subject_id]
		df_grouped_by_subject = df_filtered_by_subject.groupby('Subject ID')

		filtered = df_grouped_by_subject.filter(contiene_valores)
		if filtered.empty:
			print("NOT SEG ------ " + subject_id + " ------")

			rows_where_ct = df_filtered_by_subject[df_filtered_by_subject['Modality'] == 'CT']
			rows_with_no_pet = rows_where_ct[~rows_where_ct['File Location'].str.contains('PET')]
			max_num_files = rows_with_no_pet['Number of Images'].max()
			row_max_num_files = rows_with_no_pet[rows_with_no_pet['Number of Images'] == max_num_files]
			num_rows = row_max_num_files.shape
			num_rows, num_columns = row_max_num_files.shape
			if num_rows > 0:
				path_ct = row_max_num_files['File Location'].values[0]
				escribirCTNRRD(subject_id, IMAGES_PRED_NRRD_FOLDER, obtenPathCorrecto(path_ct))


### 3.3.2 Funciones para predecir las segmentaciones restantes

In [8]:
def predecirSegmentacionNRRD(path_dataset):
	path_imagenes = os.path.join(path_dataset, 'Imagenes')
	path_modelo = os.path.join(SEGMENTATION_CODE_FOLDER, 'Modelo')
	model = cp(path_modelo,IMAGES_PRED_NRRD_FOLDER,IMAGES_PRED_NRRD_FOLDER,verbosity=True)
	model.segment()

def guardarImagenesYPredecirSegmentacionNRRD(path_dataset):
	guardarImagenesSinSegmentacionNRRD(path_dataset)
	guardarImagenesConSegmentacionNRRD(path_dataset)
	predecirSegmentacionNRRD(path_dataset)

### 3.3.3 Funciones para extrar y guardar los datos radiómicos

In [9]:
def extraerDatosRadiomicosImagen(extractor, path_nrrd):
    path_nrrd_ct = os.path.join(path_nrrd, 'image.nrrd')
    path_nrrd_seg = os.path.join(path_nrrd, 'mask.nrrd')

    image_nrrd_ct = sitk.ReadImage(path_nrrd_ct)
    image_nrrd_seg = sitk.ReadImage(path_nrrd_seg)
    try:
        firstOrderFeatures = firstorder.RadiomicsFirstOrder(image_nrrd_ct, image_nrrd_seg)
        glcmFeatures = glcm.RadiomicsGLCM(image_nrrd_ct, image_nrrd_seg)
        glrlmFeatures = glrlm.RadiomicsGLRLM(image_nrrd_ct, image_nrrd_seg)
        ngtdmFeatures = ngtdm.RadiomicsNGTDM(image_nrrd_ct, image_nrrd_seg)
        gldmFeatures = gldm.RadiomicsGLDM(image_nrrd_ct, image_nrrd_seg)
    except Exception as e:
        return np.empty(0)
    else:
        firstOrderFeatures = np.array([v for _, v in firstOrderFeatures.execute().items()])
        glcmFeatures = np.array([v for _, v in glcmFeatures.execute().items()])
        glrlmFeatures = np.array([v for _, v in glrlmFeatures.execute().items()])
        ngtdmFeatures = np.array([v for _, v in ngtdmFeatures.execute().items()])
        gldmFeatures = np.array([v for _, v in gldmFeatures.execute().items()])
        caract_radiomics = np.hstack((firstOrderFeatures, glcmFeatures, glrlmFeatures, ngtdmFeatures, gldmFeatures))
        return caract_radiomics

def extraerDatosRadiomicosCarpeta(path_carpeta, path_caract_radiomicas_parciales):
    lista_caract_radiomicas = {}
    lista_sujetos = {}
    if os.path.exists(path_caract_radiomicas_parciales):
        df_radiomicas_parciales = pd.read_csv(path_caract_radiomicas_parciales, index_col=0, header=0)
        for index, row in df_radiomicas_parciales.iterrows():
            lista_caract_radiomicas[index] = row.values
        lista_sujetos = df_radiomicas_parciales.index.tolist()

    extractor = featureextractor.RadiomicsFeatureExtractor()

    for nombre_subcarpeta in os.listdir(path_carpeta):
        if nombre_subcarpeta not in lista_sujetos:
            print("   paciente " + nombre_subcarpeta)
            path_subcarpeta= os.path.join(path_carpeta, nombre_subcarpeta)
            datos_radiomicos_imagen = extraerDatosRadiomicosImagen(extractor, path_subcarpeta)

            if datos_radiomicos_imagen.size != 0:
                lista_caract_radiomicas[nombre_subcarpeta] = datos_radiomicos_imagen
                df_radiomicas_parciales = pd.DataFrame.from_dict(lista_caract_radiomicas, orient='index')
                df_radiomicas_parciales.fillna(0, inplace=True)
                df_radiomicas_parciales.index.name='Case ID'
                df_radiomicas_parciales.to_csv(path_caract_radiomicas_parciales, index=True, header=True)
    return lista_caract_radiomicas

def procesaDatosRadiomicos(lista_caract_radiomicas):
    df_imagenes = pd.DataFrame.from_dict(lista_caract_radiomicas, orient='index')
    df_imagenes.fillna(0, inplace=True)
    scaler = preprocessing.MinMaxScaler()
    pacientes_id = df_imagenes.index.values
    caract_imagenes = pd.DataFrame(scaler.fit_transform(df_imagenes),columns = df_imagenes.columns, index = pacientes_id)
    caract_imagenes.index.name='Case ID'
    return caract_imagenes

def extraerDatosRadiomicos(path_dataset):
    print("Extrayendo datos radiómicos...")

    path_caract_radiomicas_parciales = os.path.join(path_dataset, 'Caracteristicas_extraidas/datos_radiomicos_parciales.csv')

    lista_caract_radiomicas_seg_orig = extraerDatosRadiomicosCarpeta(IMAGES_NRRD_FOLDER, path_caract_radiomicas_parciales)
    lista_caract_radiomicas = extraerDatosRadiomicosCarpeta(IMAGES_PRED_NRRD_FOLDER, path_caract_radiomicas_parciales)

    return procesaDatosRadiomicos(lista_caract_radiomicas)


# 4. Extracción de las características

## 4.1 Guardar características

In [10]:
def preprocesaYGuardaCaracteristicas(path_dataset):
    #Si las características de los datos clínicos no habían sido extraidas y guardadas, las guardamos
    path_caract_clinicas = os.path.join(path_dataset, 'Caracteristicas_extraidas/datos_clinicos.csv')
    if not os.path.isfile(path_caract_clinicas):
        caract_clinicas = preprocesaDatosClinicos(path_dataset)
        caract_clinicas.to_csv(path_caract_clinicas, index=True, header=True)

    #Si las características de los datos genómicos no habían sido extraidas y guardadas, las guardamos
    path_caract_genomicas = os.path.join(path_dataset, 'Caracteristicas_extraidas/datos_genomicos.csv')
    if not os.path.isfile(path_caract_genomicas):
        caract_genomicas = preprocesaDatosGenomicos(path_dataset)
        caract_genomicas.to_csv(path_caract_genomicas, index=True, header=True)

    #Si las características de los datos radiómicos no habían sido extraidas y guardadas, las guardamos
    path_caract_radiomicas = os.path.join(path_dataset, 'Caracteristicas_extraidas/datos_radiomicos.csv')
    if not os.path.isfile(path_caract_radiomicas):
        caract_radiomicas = extraerDatosRadiomicos(path_dataset)
        caract_radiomicas.to_csv(path_caract_radiomicas, index=True, header=True)

## 4.2 Extraer características

In [11]:
def extraeCaracteristicasCompletas(path_caract_1, path_caract_2, path_caract_3):
    caract_1 = pd.read_csv(path_caract_1, index_col=0, header=0)
    caract_2 = pd.read_csv(path_caract_2, index_col=0, header=0)
    caract_3 = pd.read_csv(path_caract_3, index_col=0, header=0)

    #Concatenamos las características
    caract_merged = pd.merge(caract_1, caract_2, left_index=True, right_index=True, how="outer")
    caract = pd.merge(caract_merged, caract_3, left_index=True, right_index=True, how="outer")
    caract.fillna(caract.mean(), inplace=True)
    return caract

def extraeDosCaracteristicas(es_clinica, path_caract_1, path_caract_2):
    caract_1 = pd.read_csv(path_caract_1, index_col=0, header=0)
    caract_2 = pd.read_csv(path_caract_2, index_col=0, header=0)

    #Concatenamos las características
    caract = pd.merge(caract_1, caract_2, left_index=True, right_index=True, how="outer")
    caract.fillna(caract.mean(), inplace=True)
    if not es_clinica:
        path_caract_clinicas = path_caract_clinicas = os.path.join(DATASET_FOLDER, 'Caracteristicas_extraidas/datos_clinicos.csv')
        caract_clinicas = pd.read_csv(path_caract_clinicas, index_col=0, header=0)
        survival_status_col = caract_clinicas['Survival Status | Alive']
        df_survival = survival_status_col.to_frame(name='Survival Status | Alive')
        caract = pd.merge(caract, df_survival, left_index=True, right_index=True, how="inner")
    return caract

def extraeUnaCaracteristica(es_clinica, path_caract):
    caract = pd.read_csv(path_caract, index_col=0, header=0)
    if not es_clinica:
        path_caract_clinicas = os.path.join(DATASET_FOLDER, 'Caracteristicas_extraidas/datos_clinicos.csv')
        caract_clinicas = pd.read_csv(path_caract_clinicas, index_col=0, header=0)
        survival_status_col = caract_clinicas['Survival Status | Alive']
        df_survival = survival_status_col.to_frame(name='Survival Status | Alive')
        caract = pd.merge(caract, df_survival, left_index=True, right_index=True, how="inner")
    return caract

def extraeCaractSegunInput(clinicos, genomicos, radiomicos):
    path_caract_clinicas = os.path.join(DATASET_FOLDER, 'Caracteristicas_extraidas/datos_clinicos.csv')
    path_caract_genomicas = os.path.join(DATASET_FOLDER, 'Caracteristicas_extraidas/datos_genomicos.csv')
    path_caract_radiomicas = os.path.join(DATASET_FOLDER, 'Caracteristicas_extraidas/datos_radiomicos.csv')
    df_caract = {}
    if clinicos and genomicos and radiomicos:
        df_caract = extraeCaracteristicasCompletas(path_caract_clinicas, path_caract_genomicas, path_caract_radiomicas)
    elif clinicos and genomicos and not radiomicos:
        df_caract = extraeDosCaracteristicas(clinicos, path_caract_clinicas, path_caract_genomicas)
    elif clinicos and not genomicos and radiomicos:
        df_caract = extraeDosCaracteristicas(clinicos, path_caract_clinicas, path_caract_radiomicas)
    elif not clinicos and genomicos and radiomicos:
        df_caract = extraeDosCaracteristicas(clinicos, path_caract_genomicas, path_caract_radiomicas)
    elif clinicos and not genomicos and not radiomicos:
        df_caract = extraeUnaCaracteristica(clinicos, path_caract_clinicas)
    elif not clinicos and genomicos and not radiomicos:
        df_caract = extraeUnaCaracteristica(clinicos, path_caract_genomicas)
    elif not clinicos and not genomicos and radiomicos:
        df_caract = extraeUnaCaracteristica(clinicos, path_caract_radiomicas)
    df_caract = df_caract._get_numeric_data()
    return df_caract

# 5. Implementación modelo

## 5.1 Funciones auxiliares

In [30]:
#Función para mostrar curva de aprendizaje
def mostrarCurvaAprendizaje(H, num_epocas):
    plt.style.use("ggplot")
    plt.figure()
    #plt.plot(range(num_epocas), H.history["loss"], label="train_loss")
    #plt.plot(range(num_epocas), H.history["val_loss"], label="val_loss")
    plt.plot(range(num_epocas), H.history["accuracy"], label="train_acc")
    plt.plot(range(num_epocas), H.history["val_accuracy"], label="val_acc")
    plt.title("Training Loss and Accuracy")
    plt.xlabel("Epoch #")
    plt.ylabel("Loss/Accuracy")
    plt.legend()
    plt.show()

#Función para obtener y mostrar métricas
def obtenMetricas(y_test, y_pred):
    binary_predictions = (y_pred > 0.5).astype(int)

    accuracy = accuracy_score(y_test, binary_predictions)
    precision = precision_score(y_test, binary_predictions)
    recall = recall_score(y_test, binary_predictions)
    f1 = f1_score(y_test, binary_predictions)
    print('Métricas: {')
    print('     Accuracy:  %.4f' % accuracy)
    print('     Precision: %.4f' % precision)
    print('     Recall: %.4f' % recall)
    print('     F1:   %.4f' % f1)
    print('}')

#Función para optimizar los hiperparámetros del modelo de red neuronal creado
def optimizaHiperparametros(x_train, y_train):
    input_dim = x_train.shape[1]
    modelo = KerasClassifier(build_fn=creaModeloMultimodal, input_dim=input_dim, epochs=10, batch_size=32, dropout_rate=0.2,
                             neuronas_capa_1=512, neuronas_capa_2=512, neuronas_capa_3=256, learning_rate=0.001, verbose=0)
    parametros = {
        'neuronas_capa_1': [256, 512],
        'neuronas_capa_2': [256, 512],
        'neuronas_capa_3': [128, 256],
        'dropout_rate': [0.2, 0.3],
        'learning_rate': [0.001, 0.01],
        'batch_size': [16, 32],
        'epochs': [60, 80]
    }
    #Usamos la validación cruzada
    kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    grid = GridSearchCV(estimator=modelo, param_grid=parametros, cv=kfold, scoring='accuracy')
    grid_result = grid.fit(x_train, y_train)
    print("Mejores parámetros encontrados: ", grid_result.best_params_)
    print("Mejor precisión encontrada: ", grid_result.best_score_)

def defineHiperparametrosSegunDatos():
    param_datos = {}
    param_datos["Datos clínicos"] = {'batch_size': 16, 'dropout_rate': 0.3, 'epochs': 60, 'learning_rate': 0.01, 'neuronas_capa_1': 256, 'neuronas_capa_2': 256, 'neuronas_capa_3': 256}
    param_datos["Datos genómicos"] = {'batch_size': 16, 'dropout_rate': 0.3, 'epochs': 80, 'learning_rate': 0.001, 'neuronas_capa_1': 256, 'neuronas_capa_2': 256, 'neuronas_capa_3': 128}
    param_datos["Datos radiómicos"] = {'batch_size': 32, 'dropout_rate': 0.2, 'epochs': 80, 'learning_rate': 0.01, 'neuronas_capa_1': 256, 'neuronas_capa_2': 256, 'neuronas_capa_3': 256}
    param_datos["Datos clínicos y genómicos"] = {'batch_size': 32, 'dropout_rate': 0.2, 'epochs': 60, 'learning_rate': 0.001, 'neuronas_capa_1': 256, 'neuronas_capa_2': 512, 'neuronas_capa_3': 128}
    param_datos["Datos clínicos y radiómicos"] = {'batch_size': 16, 'dropout_rate': 0.2, 'epochs': 80, 'learning_rate': 0.01, 'neuronas_capa_1': 512, 'neuronas_capa_2': 256, 'neuronas_capa_3': 128}
    param_datos["Datos genómicos y radiómicos"] = {'batch_size': 16, 'dropout_rate': 0.3, 'epochs': 80, 'learning_rate': 0.01, 'neuronas_capa_1': 512, 'neuronas_capa_2': 256, 'neuronas_capa_3': 256}
    param_datos["Datos clínicos, genómicos y radiómicos"] = {'batch_size': 32, 'dropout_rate': 0.2, 'epochs': 60, 'learning_rate': 0.01, 'neuronas_capa_1': 512, 'neuronas_capa_2': 512, 'neuronas_capa_3': 256}
    return param_datos

def obtenMetricasValTrainRedNeuronal(modelo, num_epocas, batch_size, x_train, x_val, y_train, y_val):
    H = modelo.fit(x_train, y_train, epochs=num_epocas, batch_size=batch_size, validation_data=(x_val, y_val), verbose=0)
    last_validation_accuracy = H.history['val_accuracy'][-1]
    last_train_accuracy = H.history['accuracy'][-1]
    print("Val_accuracy:", last_validation_accuracy)
    print("Train_accuracy:", last_train_accuracy)

def obtenerPrecisionValTrainConjuntoModelos(x, y, comb_datos, objetivo):
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state=42)
    x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size = 0.2, random_state=42)
    conjunto_modelos = {}

    input_dim = x_train.shape[1]
    conjunto_modelos["Red neuronal propia"] = creaModeloMultimodal(input_dim, comb_datos['neuronas_capa_1'], comb_datos['neuronas_capa_2'], comb_datos['neuronas_capa_3'], comb_datos['dropout_rate'], comb_datos['learning_rate'])
    conjunto_modelos["Clasificador MLP"] = MLPClassifier(solver='adam', random_state=42)
    conjunto_modelos["Regresión logística"] = LogisticRegression(random_state=42)
    conjunto_modelos["K vecinos"] = KNeighborsClassifier(n_neighbors=6,  metric="euclidean")
    print("-----------------------")
    for i, modelo in conjunto_modelos.items():
        print("Modelo:  " + i)
        if i == "Red neuronal propia":
            y_pred = obtenMetricasValTrainRedNeuronal(modelo, comb_datos['epochs'], comb_datos['batch_size'], x_train, x_val, y_train, y_val)
        else:
            #Creamos 5 paquetes para la validación cruzada
            kf = KFold(n_splits=5)
            modelo = modelo.fit(x_train, y_train)
            val_scores = cross_val_score(modelo, x_train, y_train, cv=kf, scoring="accuracy")
            print("Val_accuracy:", val_scores.mean())

            y_train_pred = modelo.predict(x_train)
            accuracy = accuracy_score(y_train, y_train_pred)
            print('Test accuracy:  %.4f' % accuracy)
        print("\n")
    print("-----------------------\n")

## 5.2 Funciones para crear modelos y obtener métricas

In [33]:
import warnings
warnings.filterwarnings("ignore")

def creaModeloMultimodal(input_dim, neuronas_capa_1, neuronas_capa_2, neuronas_capa_3, dropout_rate, learning_rate):
    model = Sequential()
    model.add(Dense(neuronas_capa_1, input_dim=input_dim, activation="relu"))
    model.add(Dense(neuronas_capa_2, activation="relu"))
    model.add(Dense(neuronas_capa_3, activation="relu"))
    model.add(Dropout(dropout_rate))
    model.add(Dense(1, activation="sigmoid"))

    optimizer = keras.optimizers.Adam(learning_rate=learning_rate)
    model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])
    return model

def prediceSupervivenciaRedNeuronal(modelo, num_epocas, batch_size, x_train, x_val, y_train, y_val, x_test):
    H = modelo.fit(x_train, y_train, epochs=num_epocas, batch_size=batch_size, validation_data=(x_val, y_val), verbose=0)
    predictions = modelo.predict(x_test, verbose=0)
    return predictions

def predecirSupervivenciaConjuntoModelos(x, y, comb_datos):
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state=42)
    x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size = 0.2, random_state=42)
    conjunto_modelos = {}

    input_dim = x_train.shape[1]
    conjunto_modelos["Red neuronal propia"] = creaModeloMultimodal(input_dim, comb_datos['neuronas_capa_1'], comb_datos['neuronas_capa_2'], comb_datos['neuronas_capa_3'], comb_datos['dropout_rate'], comb_datos['learning_rate'])
    conjunto_modelos["Clasificador MLP"] = MLPClassifier(solver='adam', random_state=42)
    conjunto_modelos["Regresión logística"] = LogisticRegression(random_state=42)
    conjunto_modelos["K vecinos"] = KNeighborsClassifier(n_neighbors=6,  metric="euclidean")
    print("-----------------------")
    for i, modelo in conjunto_modelos.items():
        print("Modelo:  " + i)
        if i == "Red neuronal propia":
            y_pred = prediceSupervivenciaRedNeuronal(modelo, comb_datos['epochs'], comb_datos['batch_size'], x_train, x_val, y_train, y_val, x_test)
        else:
            modelo = modelo.fit(x_train, y_train)
            y_pred = modelo.predict(x_test)
        obtenMetricas(y_test, y_pred)
    print("-----------------------\n")

def muestraCombinacionesDatosModelos():
    lista_name_comb_datos = ["Datos clínicos", "Datos genómicos", "Datos radiómicos", "Datos clínicos y genómicos",
                             "Datos clínicos y radiómicos", "Datos genómicos y radiómicos", "Datos clínicos, genómicos y radiómicos"]
    lista_bool_comb_datos = [[True, False, False], [False, True, False], [False, False, True], [True, True, False],
     [True, False, True], [False, True, True], [True, True, True]]
    comb_datos = defineHiperparametrosSegunDatos()
    for i, comb_bool_datos in enumerate(lista_bool_comb_datos):
        df_caract = extraeCaractSegunInput(clinicos=comb_bool_datos[0], genomicos=comb_bool_datos[1], radiomicos=comb_bool_datos[2])
        y = df_caract.pop('Survival Status | Alive').values.astype(int)
        x = df_caract.values.astype(int)
        print("Tipos de datos: ", lista_name_comb_datos[i])
        #print("Tamaño datos: ", x.shape)
        predecirSupervivenciaConjuntoModelos(x, y, comb_datos[lista_name_comb_datos[i]])

muestraCombinacionesDatosModelos()

Tipos de datos:  Datos clínicos
-----------------------
Modelo:  Red neuronal propia
Métricas: {
     Accuracy:  0.6279
     Precision: 0.7273
     Recall: 0.7742
     F1:   0.7500
}
Modelo:  Clasificador MLP
Métricas: {
     Accuracy:  0.6279
     Precision: 0.7419
     Recall: 0.7419
     F1:   0.7419
}
Modelo:  Regresión logística
Métricas: {
     Accuracy:  0.6977
     Precision: 0.7647
     Recall: 0.8387
     F1:   0.8000
}
Modelo:  K vecinos
Métricas: {
     Accuracy:  0.6047
     Precision: 0.7333
     Recall: 0.7097
     F1:   0.7213
}
-----------------------

Tipos de datos:  Datos genómicos
-----------------------
Modelo:  Red neuronal propia
Métricas: {
     Accuracy:  0.7273
     Precision: 0.7059
     Recall: 0.9231
     F1:   0.8000
}
Modelo:  Clasificador MLP
Métricas: {
     Accuracy:  0.5909
     Precision: 0.6250
     Recall: 0.7692
     F1:   0.6897
}
Modelo:  Regresión logística
Métricas: {
     Accuracy:  0.7727
     Precision: 0.7500
     Recall: 0.9231
     F1: 