# Analises de amostras 

## Bibliotecas

In [1]:
import pandas as pd
import glob
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.feature_selection import RFECV
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
import numpy as np

## Passo 1: Carregar e Preparar os Dados

Vamos carregar a tabela e preparar os dados para o modelo.

In [2]:
path_base = '/home/superuser/Dados/mapbiomas/dadosCol10/ROIsv2/ROIsv2'
pathFeaturesBase = '/home/superuser/Dados/mapbiomas/dadosCol10/fileFeatSelect'
lstfiles = glob.glob(path_base + '/*')
# print(lstfiles)
lstpathfiles = []
for cc, npath in enumerate(lstfiles):
    if 'rois_grade' not in npath:
        print(f"#{cc} >> {npath}")
        lstpathfiles.append(npath)

#0 >> /home/superuser/Dados/mapbiomas/dadosCol10/ROIsv2/ROIsv2/7619.csv
#3 >> /home/superuser/Dados/mapbiomas/dadosCol10/ROIsv2/ROIsv2/7712.csv
#4 >> /home/superuser/Dados/mapbiomas/dadosCol10/ROIsv2/ROIsv2/765.csv
#6 >> /home/superuser/Dados/mapbiomas/dadosCol10/ROIsv2/ROIsv2/7746.csv
#10 >> /home/superuser/Dados/mapbiomas/dadosCol10/ROIsv2/ROIsv2/7615.csv
#11 >> /home/superuser/Dados/mapbiomas/dadosCol10/ROIsv2/ROIsv2/773.csv


In [6]:
# Supondo que a tabela esteja em um arquivo CSV
df = pd.read_csv(lstpathfiles[2])

# Colunas de features (índices espectrais)
columns_features = [
    "afvi_median", "afvi_median_dry", "afvi_median_wet", "avi_median", "avi_median_dry", "avi_median_wet",
    "awei_median", "awei_median_dry", "awei_median_wet", "blue_median", "blue_median_dry",
    "blue_median_wet", "blue_stdDev", "brba_median", "brba_median_dry", "brba_median_wet",
    "brightness_median", "brightness_median_dry", "brightness_median_wet", "bsi_median", "bsi_median_1",
    "bsi_median_2", "cvi_median", "cvi_median_dry", "cvi_median_wet",
    "dswi5_median", "dswi5_median_dry", "dswi5_median_wet", "evi_median", "evi_median_dry",
    "evi_median_wet", "gcvi_median", "gcvi_median_dry", "gcvi_median_wet", "gemi_median",
    "gemi_median_dry", "gemi_median_wet", "gli_median", "gli_median_dry", "gli_median_wet",
    "green_median", "green_median_dry", "green_median_wet", "green_stdDev", "gvmi_median",
    "gvmi_median_dry", "gvmi_median_wet", "hillshade", "iia_median", "iia_median_dry",
    "iia_median_wet", "lswi_median", "lswi_median_dry", "lswi_median_wet", "mbi_median",
    "mbi_median_dry", "mbi_median_wet", "nddi_median", "nddi_median_dry", "nddi_median_wet",
    "ndvi_median", "ndvi_median_dry", "ndvi_median_wet", "ndwi_median", "ndwi_median_dry",
    "ndwi_median_wet", "nir_median", "nir_median_contrast", "nir_median_dry", "nir_median_dry_contrast",
    "nir_median_wet", "nir_stdDev", "osavi_median", "osavi_median_dry", "osavi_median_wet",
    "ratio_median", "ratio_median_dry", "ratio_median_wet", "red_median", "red_median_contrast",
    "red_median_dry", "red_median_dry_contrast", "red_median_wet", "red_stdDev", "ri_median",
    "ri_median_dry", "ri_median_wet", "rvi_median", "rvi_median_1", "rvi_median_wet",
    "shape_median", "shape_median_dry", "shape_median_wet", "solpe", "swir1_median",
    "swir1_median_dry", "swir1_median_wet", "swir1_stdDev", "swir2_median", "swir2_median_dry",
    "swir2_median_wet", "swir2_stdDev", "ui_median", "ui_median_dry", "ui_median_wet",
    "wetness_median", "wetness_median_dry", "wetness_median_wet",
]

# Coluna alvo (classes)
target_column = "class"

# Dividir os dados em treino e teste
X = df[columns_features]
y = df[target_column]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42, stratify=y)
print(f"we have shape of X_train {X_train.shape} ")

we have shape of X_train (56725, 108) 


## Passo 2: Seleção de Features com RFECV

Agora, vamos aplicar o RFECV para selecionar as melhores features.

In [7]:
# Criar o modelo base (Gradient Boosting)
gb_model = GradientBoostingClassifier(random_state=42)

# Configurar o RFECV
rfecv = RFECV(
    estimator=gb_model,
    step=1,
    cv=StratifiedKFold(3),  # Validação cruzada estratificada
    scoring='accuracy',
    min_features_to_select=10,  # Número mínimo de features a serem selecionadas
    n_jobs=-1  # Paralelizar o processo
)

# Aplicar o RFECV aos dados de treino
rfecv.fit(X_train, y_train)

# Mostrar as features selecionadas
selected_features = X_train.columns[rfecv.support_]
print("Features selecionadas:", selected_features.tolist())
print("Número ótimo de features:", rfecv.n_features_)

KeyboardInterrupt: 

## Passo 3: Extrair Hiperparâmetros do Gradient Boosting

Agora que temos as features selecionadas, vamos treinar o modelo Gradient Boosting com esses dados e extrair os hiperparâmetros otimizados.

In [None]:
# Filtrar as features selecionadas
X_train_selected = X_train[selected_features]
X_test_selected = X_test[selected_features]

# Treinar o modelo com as features selecionadas
gb_model.fit(X_train_selected, y_train)

# Avaliar o modelo
y_pred = gb_model.predict(X_test_selected)
accuracy = accuracy_score(y_test, y_pred)
print(f"Acurácia do modelo: {accuracy:.4f}")

# Extrair os hiperparâmetros do modelo
hyperparameters = gb_model.get_params()
print("Hiperparâmetros do modelo:")
for param, value in hyperparameters.items():
    print(f"{param}: {value}")

## Passo 4: Salvar os Resultados

Você pode salvar as features selecionadas e os hiperparâmetros em arquivos para uso futuro.

In [None]:
# Salvar as features selecionadas
selected_features_df = pd.DataFrame({'selected_features': selected_features})
selected_features_df.to_csv('selected_features.csv', index=False)

# Salvar os hiperparâmetros
hyperparameters_df = pd.DataFrame(list(hyperparameters.items()), columns=['parameter', 'value'])
hyperparameters_df.to_csv('hyperparameters.csv', index=False)