In [1]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.ensemble import IsolationForest
from sklearn.cluster import DBSCAN
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler

In [2]:
# Read the training set from disk; later cells derive the correlation matrix from it.
train_csv_path = 'train_radiomics_hipocamp_mod.csv'
data_train = pd.read_csv(train_csv_path)

In [3]:
# Separate the label column from the predictors so that only the predictors
# take part in the scaling / correlation steps below.
target_feature = 'Transition'
data_train_mod = data_train.drop([target_feature], axis="columns")

In [4]:
# Dataset normalisation: min-max scale every predictor column, then wrap the
# resulting array back into a DataFrame that carries the original column names.
scaler = MinMaxScaler()
scaler.fit(data_train_mod)

normalized_train_data = scaler.transform(data_train_mod)
normalized_train_df = pd.DataFrame(
    data=normalized_train_data,
    columns=data_train_mod.columns,
)

In [5]:
# Pairwise correlation between all scaled predictor columns.
correlation_method = "pearson"
corr_matrix = normalized_train_df.corr(method=correlation_method)

In [6]:
# Collect the highly correlated feature pairs.
#
# Result: a list of (col1, col2) tuples of column labels, one entry for every
# ORDERED pair of distinct columns whose absolute correlation is strictly
# greater than CORR_THRESHOLD. Each unordered pair therefore appears twice,
# as (a, b) and as (b, a), in row-major order of `corr_matrix`.
#
# The mask is computed on the underlying NumPy array in one shot instead of
# doing one scalar `corr_matrix.loc[...]` lookup per pair in a Python loop.
CORR_THRESHOLD = 0.9

abs_corr = corr_matrix.abs().to_numpy()
# NaN correlations compare as False, so they are never selected.
strong_mask = abs_corr > CORR_THRESHOLD
# A column is trivially correlated with itself; exclude the diagonal.
np.fill_diagonal(strong_mask, False)

row_positions, col_positions = np.nonzero(strong_mask)
feature_names = corr_matrix.columns
high_corr_pairs = [
    (feature_names[i], feature_names[j])
    for i, j in zip(row_positions, col_positions)
]

In [7]:
# Load the CSV holding the per-feature importance scores and preview the
# first rows (columns "Feature" and "Importance", as used further down).
importance_csv_path = "feature_importances.csv"
importance_df = pd.read_csv(importance_csv_path)
print(importance_df.head(5))

                                             Feature  Importance
0  wavelet-LLH_glszm_SizeZoneNonUniformityNormalized    0.018075
1                wavelet-LLH_glszm_SmallAreaEmphasis    0.016668
2                   lbp-3D-m1_glrlm_ShortRunEmphasis    0.011822
3  log-sigma-2-0-mm-3D_firstorder_InterquartileRange    0.011612
4                 wavelet-LLL_glrlm_ShortRunEmphasis    0.011541


In [8]:
# For every highly correlated pair, mark the member with the lower importance
# score (taken from `importance_df`) for removal.
#
# Result: `features_to_remove`, a sorted list of unique feature names.
#
# NOTE(review): the decision is made pair by pair, so a feature can be dropped
# in favour of a partner that is itself dropped because of a third feature.

# Index the importances once so each pair costs two dict lookups instead of
# repeated scans of the "Feature" column. If a feature is listed more than
# once, its first row wins.
importance_by_feature = (
    importance_df.drop_duplicates(subset="Feature", keep="first")
    .set_index("Feature")["Importance"]
    .to_dict()
)

selected_for_removal = set()

for col1, col2 in high_corr_pairs:
    # Both features must have a known importance to be comparable.
    if col1 not in importance_by_feature or col2 not in importance_by_feature:
        print(f"Feature ausente no CSV de importâncias: {col1} ou {col2}")
        continue

    importance_col1 = importance_by_feature[col1]
    importance_col2 = importance_by_feature[col2]

    # Decide which feature to remove.
    if importance_col1 < importance_col2:
        selected_for_removal.add(col1)
    elif importance_col2 < importance_col1:
        selected_for_removal.add(col2)
    else:
        # Tie (or non-comparable values such as NaN): pick by name so the
        # outcome does not depend on the order of the pair. Without this, a
        # pair listed as both (a, b) and (b, a) would lose BOTH features.
        selected_for_removal.add(max(col1, col2, key=str))

# Sorted so the list (and its printout) is identical from run to run.
features_to_remove = sorted(selected_for_removal, key=str)
print("Features a serem removidas:", features_to_remove)



Features a serem removidas: ['logarithm_glcm_JointEnergy', 'wavelet-HHL_firstorder_Range', 'wavelet-LHH_glcm_Idm', 'square_firstorder_Variance', 'log-sigma-1-0-mm-3D_glszm_HighGrayLevelZoneEmphasis', 'lbp-3D-m2_gldm_LargeDependenceHighGrayLevelEmphasis', 'exponential_glrlm_LongRunLowGrayLevelEmphasis', 'wavelet-HHH_glrlm_LowGrayLevelRunEmphasis', 'lbp-3D-m2_firstorder_Variance', 'wavelet-HHH_glcm_ClusterTendency', 'square_glcm_SumAverage', 'logarithm_gldm_GrayLevelVariance', 'exponential_firstorder_RobustMeanAbsoluteDeviation', 'wavelet-LLH_glcm_SumSquares', 'log-sigma-1-0-mm-3D_firstorder_TotalEnergy', 'lbp-3D-m2_glrlm_RunLengthNonUniformityNormalized', 'square_glrlm_RunVariance', 'wavelet-HHL_glcm_JointAverage', 'original_glrlm_GrayLevelNonUniformity', 'logarithm_glszm_LargeAreaEmphasis', 'exponential_firstorder_Energy', 'log-sigma-3-0-mm-3D_glcm_DifferenceVariance', 'lbp-3D-k_glrlm_RunEntropy', 'wavelet-LLL_glcm_ClusterTendency', 'wavelet-LHL_glcm_Autocorrelation', 'wavelet-LHH_firs

In [9]:
# Load the train / test splits from which the redundant features will be dropped.
# NOTE(review): presumably these files already hold an importance-based
# feature subset — confirm against the notebook that writes them.
df_train, df_test = (
    pd.read_csv(csv_name)
    for csv_name in ("train_important_features.csv", "test_important_features.csv")
)

In [12]:
# Remove the features flagged as redundant from both splits and report the
# resulting shapes. Names in `features_to_remove` that are not columns of a
# given frame are skipped silently (errors='ignore').
df_train_reduced = df_train.drop(features_to_remove, axis="columns", errors='ignore')
train_shape = df_train_reduced.shape
print("Shape do novo DataFrame de treino:", train_shape)

df_test_reduced = df_test.drop(features_to_remove, axis="columns", errors='ignore')
test_shape = df_test_reduced.shape
print("Shape do novo DataFrame de teste:", test_shape)


Shape do novo DataFrame de treino: (305, 545)
Shape do novo DataFrame de teste: (100, 544)


In [13]:
# Quick visual check of the last rows of the reduced training frame.
df_train_reduced.tail(n=5)

Unnamed: 0,original_shape_Elongation,original_shape_Flatness,original_shape_LeastAxisLength,original_shape_Maximum2DDiameterRow,original_shape_Maximum2DDiameterSlice,original_shape_MinorAxisLength,original_shape_Sphericity,original_glcm_JointEnergy,original_gldm_LargeDependenceHighGrayLevelEmphasis,original_gldm_SmallDependenceEmphasis,...,lbp-3D-k_glrlm_ShortRunLowGrayLevelEmphasis,lbp-3D-k_glszm_GrayLevelNonUniformityNormalized,lbp-3D-k_glszm_LargeAreaEmphasis,lbp-3D-k_glszm_SizeZoneNonUniformity,lbp-3D-k_glszm_SmallAreaLowGrayLevelEmphasis,lbp-3D-k_glszm_ZoneEntropy,lbp-3D-k_glszm_ZonePercentage,Sex,Age,Transition
300,0.401673,0.101371,10.835627,77.233412,42.720019,42.935157,0.390228,0.267523,2536.320402,0.021376,...,0.32458,0.949401,279885.961039,23.415584,0.142034,2.747529,0.010593,1,72.2,CN-CN
301,0.445362,0.090595,8.986151,70.007142,45.486262,44.175383,0.379693,0.289005,2530.842159,0.020571,...,0.34342,0.895062,376917.916667,11.333333,0.143174,2.497136,0.00617,0,75.0,CN-CN
302,0.437854,0.106585,11.2547,82.219219,45.099889,46.234417,0.366989,0.220869,2303.168718,0.020472,...,0.335548,0.942907,300979.308824,17.823529,0.128431,2.57171,0.009848,1,79.6,MCI-AD
303,0.463489,0.095407,9.694175,74.027022,46.400431,47.094617,0.377038,0.299312,1373.925554,0.017092,...,0.315389,0.961919,216591.330097,30.320388,0.139708,2.688709,0.014174,0,62.4,MCI-MCI
304,0.422987,0.084201,8.420713,74.060786,40.804412,42.30189,0.394758,0.26122,1268.704879,0.016126,...,0.305772,0.945988,259604.361111,19.861111,0.135206,2.751118,0.010775,0,73.2,CN-CN


In [14]:
# Persist both reduced splits as CSV files, without writing the row index.
for reduced_frame, output_csv in (
    (df_train_reduced, "train_important_features_correlated.csv"),
    (df_test_reduced, "test_important_features_correlated.csv"),
):
    reduced_frame.to_csv(output_csv, index=False)