In [None]:
# Importação das bibliotecas necessárias
import pandas as pd
import pickle
from sklearn.model_selection import train_test_split
from sklearn.manifold import TSNE
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.decomposition import PCA

# Name of the clinical column that the downstream models should predict.
DF_CLINICAL_FEATURE_OF_INTEREST = '3-Gene classifier subtype'

print("Reading file")

# Load the previously preprocessed dataset from its pickle file.
# CSV alternative: pd.read_csv('dataset_preprocessed.csv', header=0, index_col=0)
df = pd.read_pickle('dataset_preprocessed.pkl')

# Drop every row that holds at least one missing value and report the outcome.
has_missing_values = df.isna().values.any()
if not has_missing_values:
    print("No NaN values found.")
else:
    print("NaN values found. Removing...")
    df.dropna(inplace=True)
    print("NaN values removed.")
    
# Clinical patient columns to discard: the full clinical column list minus the
# prediction target, which has to stay in df so it can become y.
df_clinical_patient_cols = [
    column
    for column in (
        'Lymph nodes examined positive',
        'Nottingham prognostic index',
        'Cellularity',
        'Chemotherapy',
        'Cohort',
        'ER status measured by IHC',
        'HER2 status measured by SNP6',
        'Hormone Therapy',
        'Inferred Menopausal State',
        'Sex',
        'Integrative Cluster',
        'Age at Diagnosis',
        'Overall Survival (Months)',
        'Overall Survival Status',
        'Pam50 + Claudin-low subtype',
        '3-Gene classifier subtype',
        "Patient's Vital Status",
        'Primary Tumor Laterality',
        'Radio Therapy',
        'Tumor Other Histologic Subtype',
        'Type of Breast Surgery',
        'Relapse Free Status',
        'Relapse Free Status (Months)',
    )
    if column != DF_CLINICAL_FEATURE_OF_INTEREST
]

# Remove those clinical columns from the dataset in place.
df.drop(columns=df_clinical_patient_cols, inplace=True)

print("Saving X and y")

# Target vector: an independent copy of the column to predict.
y = df[DF_CLINICAL_FEATURE_OF_INTEREST].copy()

# Feature matrix: every other column of df, in its original order.
feature_columns = [column for column in df.columns if column != DF_CLINICAL_FEATURE_OF_INTEREST]
X = df[feature_columns]
print(X.columns)

# Hold out 20% of the samples as the test set; the seed is fixed so the same
# split is produced on every run.
split = train_test_split(X, y, test_size=0.2, random_state=0)
X_train_validation, X_test, y_train_validation, y_test = split

# Código para visualização TSNE 
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
from mpl_toolkits.mplot3d import Axes3D
import plotly.express as px


X_embedded = TSNE(n_components=3, learning_rate='auto', init='random', perplexity=3).fit_transform(X)
X_embedded.shape
import seaborn as sns
sns.set(rc={'figure.figsize':(11.7,8.27)})
palette = sns.color_palette("bright", 4)

sns.scatterplot(x=X_embedded[:,0], y=X_embedded[:,1], hue=y, legend='full', palette=palette)
fig = px.scatter_3d(x=X_embedded[:, 0], y=X_embedded[:, 1], z=X_embedded[:, 2], color=y, opacity=0.8)
fig.show()

# Univariate feature selection: keep the 1400 features with the highest
# f_classif score, fitted on the training/validation split only.
modelo = SelectKBest(score_func=f_classif, k=1400)
modelo.fit(X_train_validation, y_train_validation)

# Reduce the training/validation matrix to the selected features.
X_train_validation_f_value = modelo.transform(X_train_validation)


# Rank the features kept by the selector by F-value and write them to a file.

# Column positions of the k features retained by SelectKBest.
selected_indices = modelo.get_support(indices=True)

# Names and F-values of *all* input features. Both come from the data and the
# selector used in the fit, so their positions always line up with
# selected_indices and no bounds adjustment is needed.
feature_names_f_value = X_train_validation.columns.to_numpy()
f_values = modelo.scores_

# Restrict names and scores to the selected features.
selected_feature_names = feature_names_f_value[selected_indices]
selected_f_values = f_values[selected_indices]

# Order from the highest F-value to the lowest.
sorted_indices = selected_f_values.argsort()[::-1]
sorted_feature_names = selected_feature_names[sorted_indices]
sorted_f_values = selected_f_values[sorted_indices]

# One "<feature name>\t<F-value>" line per selected feature.
with open('sorted_features_f_value2.txt', 'w') as f:
    for name, score in zip(sorted_feature_names, sorted_f_values):
        f.write(f"{name}\t{score}\n")

        

# Persist the f-value-selected data. The test split must go through the same
# fitted selector as the training split so both files share the same columns.
X_test_f_value = modelo.transform(X_test)

with open('X_train_validation_f-valuecopy.pkl','wb') as f:
    # Transformed matrix, not the original X_train_validation.
    pickle.dump(X_train_validation_f_value, f)
with open('y_train_validation_f-valuecopy.pkl','wb') as f:
    pickle.dump(y_train_validation, f)
with open('X_test_f-valuecopy.pkl','wb') as f:
    pickle.dump(X_test_f_value, f)
with open('y_test_f-valuecopy.pkl','wb') as f:
    pickle.dump(y_test, f)

# Dimensionality reduction with PCA: fit 1202 components on the
# training/validation split, then project both splits with the fitted model.
pca = PCA(n_components=1202)
pca.fit(X_train_validation, y_train_validation)
X_train_validation_pca = pca.transform(X_train_validation)
X_test_pca = pca.transform(X_test)

# Write one "Component <n>\t<vector>" line per principal component.
# NOTE(review): the vector is formatted with NumPy's default str(), which
# abbreviates long arrays with "..." - confirm whether the full loadings are
# expected in this file.
with open('features_pca.txt', 'w') as f:
    f.writelines(
        f"Component {number}\t{component}\n"
        for number, component in enumerate(pca.components_, start=1)
    )

# Quick diagnostics: first component, shape of the component matrix and the
# total fraction of variance explained by the retained components.
print(pca.components_[0])
print(pca.components_.shape)
print(sum(pca.explained_variance_ratio_))

# Persist the PCA-projected splits. The projected matrices are saved, not the
# original X_train_validation / X_test, so the files match their "pca" names.
with open('X_train_validation_pcacopy.pkl','wb') as f:
    pickle.dump(X_train_validation_pca, f)
with open('y_train_validation_pcacopy.pkl','wb') as f:
    pickle.dump(y_train_validation, f)
with open('X_test_pcacopy.pkl','wb') as f:
    pickle.dump(X_test_pca, f)
with open('y_test_pcacopy.pkl','wb') as f:
    pickle.dump(y_test, f)

#model.generation_scores_