In [None]:
# ==============================
# Pré-processamento dos dados
# ============================== 

# Importar bibliotecas essenciais
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

# ------------------------------
# 1. Load the dataset
# ------------------------------
data_path = "../data/star_dataset.csv"
df = pd.read_csv(filepath_or_buffer=data_path)

# Preview the first rows to check the initial structure
df.head(n=5)



Unnamed: 0,Name,Distance (ly),Luminosity (L/Lo),Radius (R/Ro),Temperature (K),Spectral Class
0,Altair,16.594171,9.979192,1.63265,7509.294247,A7V
1,Deneb,2600.490723,196002.627856,202.970526,8503.284796,A2Ia
2,Barnard's Star,6.052616,4.893716,0.222711,3165.959639,M4Ve
3,Polaris,322.601002,2196.241934,37.546813,6048.326915,F7Ib
4,Barnard's Star,5.902392,-1.496486,0.192359,3130.602069,M4Ve


In [None]:
# ---------------------------------------------------------------
# 2. Numeric feature selection - attributes of interest
# ---------------------------------------------------------------

# Variables used for clustering, in the column order X will have.
features = ["Temperature (K)", "Luminosity (L/Lo)", "Radius (R/Ro)", "Distance (ly)"]

# Independent copy, so cleaning X later never writes back into df.
X = df.loc[:, features].copy()

# Leading character of the spectral class, kept aside (NOT used for training).
# It is stored on df as a new column and also exposed as its own Series.
df["Spectral_Class_Main"] = df["Spectral Class"].str.get(0)
spectral_class = df["Spectral_Class_Main"]

In [None]:
# --------------------------------------
# 3. Handling invalid values
# --------------------------------------

# Infinite values are treated as missing.
X = X.replace([np.inf, -np.inf], np.nan)

# The dataset is synthetic and some luminosities are negative. Every selected
# feature (temperature in K, luminosity, radius, distance) is a physical
# magnitude that cannot be negative, so negative entries are marked as missing
# too. Previously they slipped through and were fed to the scaler.
# NaN < 0 evaluates to False, so entries that are already missing stay NaN.
X = X.mask(X < 0)

# Drop every row that has at least one missing/invalid feature.
X = X.dropna()

# Keep the spectral class labels aligned with the surviving rows.
spectral_class = spectral_class.loc[X.index]

assert spectral_class.index.equals(X.index), "labels are not aligned with X"


In [None]:
# ------------------------------
# 4. Data normalization
# ------------------------------

# Essential for clustering (distance-based methods in particular).
scaler = StandardScaler()

# Fit and transform in one pass, wrapping the resulting array straight back
# into a DataFrame that keeps X's row labels and the feature column names.
X_scaled = pd.DataFrame(
    scaler.fit_transform(X),
    index=X.index,
    columns=features,
)

X_scaled.head()

Unnamed: 0,Temperature (K),Luminosity (L/Lo),Radius (R/Ro),Distance (ly)
0,-0.313069,-0.465255,-0.399209,-0.51535
1,-0.187296,4.178847,0.542754,4.258967
2,-0.862648,-0.465376,-0.405805,-0.534828
3,-0.497931,-0.413451,-0.231184,0.050065
4,-0.867122,-0.465527,-0.405947,-0.535105


In [None]:
# ----------------------------------
# 5. Save processed data
# ----------------------------------

# to_csv does not create missing parent directories, so make sure the output
# folder exists first; otherwise this cell fails on a fresh checkout.
results_dir = Path("../results")
results_dir.mkdir(parents=True, exist_ok=True)

# index=True keeps the original row labels in both files, so the scaled
# features and the spectral classes can be joined back together later.
X_scaled.to_csv(results_dir / "X_scaled.csv", index=True)
spectral_class.to_csv(results_dir / "spectral_class.csv", index=True)

print("Pré-processamento finalizado.")

Pré-processamento finalizado.
