In [None]:
import pandas as pd
import numpy as np
import geopandas as gpd
from scipy import stats
from pathlib import Path

# Configuration
# Columns that are always treated as categorical, even when stored as numbers.
CATEGORICAL_VARIABLES = [
    'burn_perio',
    'land_use',
    'zp_link',
    'year',
    'fuel_model',
    'landform',
]

# Input shapefile to summarise
SHP_FILE = r'../../Data/Processed/PT-FireSprd_v3.0/L2_FireBehavior/PT-FireSprd_v3.0_L2_model.shp'

# Directory where the summary CSV is written
OUTPUT_DIR = r'../../Data/Data_Exploration'


In [29]:
# Load the shapefile configured in SHP_FILE into `df` and report its
# column names and row count so the schema can be checked at a glance.
print(f"Carregando shapefile: {SHP_FILE}...")
df = gpd.read_file(SHP_FILE)
print(f"Colunas: {list(df.columns)}")
print(f"Número de linhas: {len(df)}")


Carregando shapefile: ../../Data/Processed/PT-FireSprd_v2.1/L2_FireBehavior/PT-FireProg_v2.1_L2_short.shp...
Colunas: ['fid', 'fname', 'year', 'id', 'type', 'sdate', 'edate', 'inidoy', 'enddoy', 'source', 'zp_link', 'burn_perio', 'area', 'growth_rat', 'ros_i', 'ros_p', 'spdir_i', 'spdir_p', 'int_i', 'int_p', 'duration_i', 'duration_p', 'qc', 'elev_av', 'aspect_av', 'landform', 'land_use', 'land_use_d', '1_3y_fir_p', '3_8y_fir_p', '8_ny_fir_p', 'fuel_model', 'f_load_av', 'sW_1m_av', 'sW_3m_av', 'sW_7_av', 'sW_28_av', 'sW_100_av', 'sW_289_av', 't_2m_C_av', 'd_2m_C_av', 'rh_2m_av', 'VPD_Pa_av', 'sP_hPa_av', 'gp_m2s2_av', 'dfmc_av', 'HDW_av', 'Haines_av', 'FWI_12h_av', 'DC_12h_av', 'FFMC_12h_a', 'wv10_kh_av', 'wdir10_av', 'wv100_k_av', 'wdir100_av', 'Recirc', 'CircVar', 't_950_av', 't_850_av', 't_700_av', 't_500_av', 't_300_av', 'rh_950_av', 'rh_850_av', 'rh_700_av', 'rh_500_av', 'rh_300_av', 'wv_950_av', 'wv_850_av', 'wv_700_av', 'wv_500_av', 'wv_300_av', 'wdi_950_av', 'wdi_850_av', 'wdi_

In [30]:
# Convert text columns that hold numbers into numeric dtype (in place on `df`).
# Columns listed in CATEGORICAL_VARIABLES are left untouched.
text_cols = [
    name
    for name in df.columns
    if df[name].dtype == 'object' and name not in CATEGORICAL_VARIABLES
]

for name in text_cols:
    as_number = pd.to_numeric(df[name], errors='coerce')
    # Keep the conversion only if at least one value parsed as a number.
    # NOTE(review): in a column that is only partly numeric, the values that
    # fail to parse are replaced by NaN without any warning.
    if as_number.notna().any():
        df[name] = as_number


In [31]:
# Split the variables into categorical and numeric groups.
# Categorical: the configured names that actually exist in the data.
available = set(df.columns)
categorical_cols = [name for name in CATEGORICAL_VARIABLES if name in available]

# Numeric: every numeric-dtype column except 'sdate' and the categorical ones.
excluded = ['sdate', *categorical_cols]
numeric_cols = (
    df.select_dtypes(include=['number'])
    .columns
    .difference(excluded)
    .tolist()
)

print(f"Colunas numéricas: {numeric_cols}")
print(f"Colunas categóricas: {categorical_cols}")


Colunas numéricas: ['1_3y_fir_p', '3_8y_fir_p', '8_ny_fir_p', 'BLH_m_av', 'BLH_m_rt', 'CBH_m_av', 'CCL_hPa_av', 'CMLG_av', 'Cape_av', 'Cin_av', 'CircVar', 'DC_12h_av', 'EL_m_av', 'FFMC_12h_a', 'FWI_12h_av', 'HDW_av', 'HigCC_p_av', 'LCL_hPa_av', 'LFC_hPa_av', 'LiftIdx_av', 'LowCC_p_av', 'MidCC_p_av', 'Recirc', 'TotCC_p_av', 'VPD_Pa_av', 'VentIdx_av', 'area', 'aspect_av', 'd_2m_C_av', 'dfmc_av', 'duration_i', 'duration_p', 'elev_av', 'enddoy', 'f_load_av', 'f_start', 'fid', 'gT_5_3_av', 'gT_7_5_av', 'gT_8_7_av', 'gT_9_8_av', 'gT_s_9_av', 'gp_300_av', 'gp_500_av', 'gp_700_av', 'gp_850_av', 'gp_950_av', 'gp_m2s2_av', 'growth_rat', 'id', 'inidoy', 'int_i', 'int_p', 'qc', 'rh_2m_av', 'rh_300_av', 'rh_500_av', 'rh_700_av', 'rh_850_av', 'rh_950_av', 'ros_i', 'ros_p', 'ros_p_lg1', 'sP_hPa_av', 'sW_100_av', 'sW_1m_av', 'sW_289_av', 'sW_28_av', 'sW_3m_av', 'sW_7_av', 'spdir_i', 'spdir_p', 't_2m_C_av', 't_300_av', 't_500_av', 't_700_av', 't_850_av', 't_950_av', 'vwv_300_av', 'vwv_500_av', 'vwv_700

In [32]:
# Descriptive statistics for the numeric variables: one row per variable,
# one column per statistic.
numeric_stats = pd.DataFrame(index=numeric_cols)

# Coerce to numeric so any value that cannot be parsed becomes NaN
df_numeric = df[numeric_cols].apply(pd.to_numeric, errors='coerce')

# Basic statistics (percentages are relative to the total number of rows)
numeric_stats['n'] = df_numeric.count()
numeric_stats['n_total'] = len(df)
numeric_stats['n_missing'] = df_numeric.isna().sum()
numeric_stats['pct_missing'] = (numeric_stats['n_missing'] / len(df)) * 100
numeric_stats['pct_complete'] = (numeric_stats['n'] / len(df)) * 100
numeric_stats['mean'] = df_numeric.mean()
numeric_stats['median'] = df_numeric.median()
numeric_stats['std'] = df_numeric.std()
numeric_stats['var'] = df_numeric.var()
numeric_stats['sem'] = df_numeric.sem()
# Coefficient of variation in percent; infinite/NaN when the mean is zero.
numeric_stats['cv'] = (numeric_stats['std'] / numeric_stats['mean']) * 100
numeric_stats['min'] = df_numeric.min()
numeric_stats['max'] = df_numeric.max()
numeric_stats['range'] = numeric_stats['max'] - numeric_stats['min']
numeric_stats['q25'] = df_numeric.quantile(0.25)
numeric_stats['q75'] = df_numeric.quantile(0.75)
# Interquartile range (the column keeps its original name 'AIQ')
numeric_stats['AIQ'] = numeric_stats['q75'] - numeric_stats['q25']
numeric_stats['skewness'] = df_numeric.skew()
numeric_stats['kurtosis'] = df_numeric.kurtosis()
numeric_stats['n_zeros'] = (df_numeric == 0).sum()
numeric_stats['pct_zeros'] = (numeric_stats['n_zeros'] / len(df)) * 100
numeric_stats['n_negative'] = (df_numeric < 0).sum()
numeric_stats['pct_negative'] = (numeric_stats['n_negative'] / len(df)) * 100
numeric_stats['n_positive'] = (df_numeric > 0).sum()
numeric_stats['pct_positive'] = (numeric_stats['n_positive'] / len(df)) * 100

# Normality tests
# scipy's skewness test (part of normaltest) needs at least 8 observations.
MIN_N_NORMALITY = 8
# scipy documents that Shapiro-Wilk p-values may be inaccurate for N > 5000.
MAX_N_SHAPIRO = 5000

# Start from NaN everywhere; only successful tests overwrite it.
for stat_col in ['normality_stat', 'normality_p', 'shapiro_stat', 'shapiro_p']:
    numeric_stats[stat_col] = np.nan

for col in numeric_cols:
    col_series = df_numeric[col].dropna()
    if len(col_series) < MIN_N_NORMALITY:
        continue

    # Each test is guarded on its own so that a failure in one does not
    # discard a valid result from the other. `Exception` keeps the
    # best-effort behaviour without swallowing KeyboardInterrupt/SystemExit.
    try:
        stat, p = stats.normaltest(col_series)
    except Exception:
        stat, p = np.nan, np.nan
    numeric_stats.loc[col, 'normality_stat'] = stat
    numeric_stats.loc[col, 'normality_p'] = p

    if len(col_series) <= MAX_N_SHAPIRO:
        try:
            shapiro_stat, shapiro_p = stats.shapiro(col_series)
        except Exception:
            shapiro_stat, shapiro_p = np.nan, np.nan
        numeric_stats.loc[col, 'shapiro_stat'] = shapiro_stat
        numeric_stats.loc[col, 'shapiro_p'] = shapiro_p


In [33]:
# Descriptive statistics for the categorical variables: one row per variable.
# Rows are collected as dicts and the frame is built once, so columns that mix
# numbers and strings (e.g. 'mode') get a suitable dtype from the start instead
# of being enlarged cell by cell.
categorical_stat_columns = [
    'n_total', 'n', 'n_missing', 'pct_missing', 'pct_complete',
    'n_unique', 'pct_unique',
    'mode', 'mode_freq', 'mode_pct',
    'mode2', 'mode2_freq', 'mode2_pct',
    'entropy', 'max_entropy', 'rel_entropy',
]

categorical_records = []
for col in categorical_cols:
    col_values = df[col]
    value_counts = col_values.value_counts()  # NaN excluded, most frequent first
    n_valid = int(col_values.notna().sum())
    n_missing = int(col_values.isna().sum())
    n_unique = col_values.nunique()

    record = {
        'n_total': len(df),
        'n': n_valid,
        'n_missing': n_missing,
        'pct_missing': (n_missing / len(df)) * 100,
        'pct_complete': (n_valid / len(df)) * 100,
        'n_unique': n_unique,
        # Undefined when the column has no valid values
        'pct_unique': (n_unique / n_valid) * 100 if n_valid else np.nan,
        'mode': np.nan, 'mode_freq': np.nan, 'mode_pct': np.nan,
        'mode2': np.nan, 'mode2_freq': np.nan, 'mode2_pct': np.nan,
    }

    if not value_counts.empty:
        # Series.mode() returns tied modes sorted; the first one is reported.
        mode_value = col_values.mode().iloc[0]
        record['mode'] = mode_value
        record['mode_freq'] = value_counts.iloc[0]
        record['mode_pct'] = (value_counts.iloc[0] / n_valid) * 100

        # Second most frequent category: drop the reported mode first so that,
        # when several categories tie for the top count, 'mode2' can never be
        # the same category as 'mode'.
        runners_up = value_counts.drop(mode_value)
        if not runners_up.empty:
            record['mode2'] = runners_up.index[0]
            record['mode2_freq'] = runners_up.iloc[0]
            record['mode2_pct'] = (runners_up.iloc[0] / n_valid) * 100

    # Shannon entropy (bits) of the category distribution, its maximum for the
    # observed number of categories, and the ratio between the two.
    probs = value_counts / value_counts.sum()
    entropy = -np.sum(probs * np.log2(probs))
    max_entropy = np.log2(n_unique) if n_unique > 0 else 0
    record['entropy'] = entropy
    record['max_entropy'] = max_entropy
    record['rel_entropy'] = entropy / max_entropy if max_entropy > 0 else 0

    categorical_records.append(record)

categorical_stats = pd.DataFrame(
    categorical_records,
    index=categorical_cols,
    columns=categorical_stat_columns,
)


In [34]:
def sign_log(series):
    """Signed log transform: sign(x) * log(1 + |x|).

    The input is first coerced to numeric; values that cannot be parsed
    become NaN and stay NaN in the result. Zero maps to zero and the sign
    of each value is preserved.
    """
    numeric = pd.to_numeric(series, errors='coerce')
    magnitude = np.log1p(np.abs(numeric))
    return np.sign(numeric) * magnitude

# Strength of association between every variable and ros_p.
# Result: `ros_corrs`, one row per variable (numeric first, then categorical).
all_cols = numeric_cols + categorical_cols
ros_corrs = pd.DataFrame(index=all_cols)

if 'ros_p' in df.columns:
    ros_series = pd.to_numeric(df['ros_p'], errors='coerce')
    ros_log_series = sign_log(df['ros_p'])

    # Numeric variables: correlation (Series.corr) with the raw target and
    # with its signed-log transform, plus the squared correlation.
    targets = (
        (ros_series, 'corr_ros_p', 'R2_ros_p'),
        (ros_log_series, 'corr_log_ros_p', 'R2_log_ros_p'),
    )
    for predictor in numeric_cols:
        if predictor == 'ros_p':
            continue  # skip the target itself
        x = pd.to_numeric(df[predictor], errors='coerce')
        for target, corr_label, r2_label in targets:
            # Use only rows where both target and predictor are present
            both_present = target.notna() & x.notna()
            if both_present.sum() > 1:
                r = target[both_present].corr(x[both_present])
                ros_corrs.loc[predictor, corr_label] = r
                ros_corrs.loc[predictor, r2_label] = r**2

    # Categorical variables: eta squared (between-group sum of squares over
    # total sum of squares), stored as R2; its square root is stored as corr.
    for factor in categorical_cols:
        usable = ros_series.notna() & df[factor].notna()
        if usable.sum() <= 1 or df.loc[usable, factor].nunique() < 2:
            continue  # need at least two rows and two distinct groups
        labels = df.loc[usable, factor]
        y = ros_series[usable]
        grand_mean = y.mean()
        by_group = y.groupby(labels)
        # Builtin sum is kept so the accumulation order is unchanged.
        ss_between = sum(by_group.size() * (by_group.mean() - grand_mean) ** 2)
        ss_total = sum((y - grand_mean) ** 2)
        if ss_total > 0:
            eta_sq = ss_between / ss_total
            ros_corrs.loc[factor, 'R2_ros_p'] = eta_sq
            ros_corrs.loc[factor, 'corr_ros_p'] = np.sqrt(eta_sq)


In [35]:
# Merge the numeric, categorical and association tables into a single frame
# (one row per variable, union of all statistic columns).
desc_stats = numeric_stats.combine_first(categorical_stats).combine_first(ros_corrs)

# 'VIF' column: 1 / (1 - R2_ros_p), left as NaN when R2 is missing or >= 1.
# NOTE(review): R2_ros_p is the squared association of each variable with the
# target ros_p, so this is not the usual variance inflation factor (which is
# based on regressing each predictor on the other predictors) — confirm intent.
if 'R2_ros_p' in desc_stats.columns:
    r2_by_variable = desc_stats['R2_ros_p']
else:
    r2_by_variable = pd.Series(np.nan, index=desc_stats.index)

for variable, r2 in r2_by_variable.items():
    is_usable = pd.notna(r2) and r2 < 1
    desc_stats.loc[variable, 'VIF'] = 1 / (1 - r2) if is_usable else np.nan


In [36]:
# Label every row with its variable type
categorical_names = set(categorical_cols)
desc_stats['var_type'] = [
    'categorical' if variable in categorical_names else 'numeric'
    for variable in desc_stats.index
]

# Make sure the output directory exists
output_dir = Path(OUTPUT_DIR)
output_dir.mkdir(parents=True, exist_ok=True)

# The output file name is derived from the shapefile name (without extension)
shp_name = Path(SHP_FILE).stem
output_file = output_dir / f'Descriptive_stats_{shp_name}.csv'

# Write the CSV
desc_stats.to_csv(output_file)
print(f"Estatísticas salvas em: {output_file}")

Estatísticas salvas em: ..\..\Data\Data_Exploration\Descriptive_stats_PT-FireProg_v2.1_L2_short.csv
