In [9]:
import pandas as pd
import os
import glob

# Define o path pattern para todos os CSVs
path_pattern = '../../Data/Data_Exploration/descriptive_stats_*.csv'

# Encontrar todos os CSVs
csv_files = glob.glob(path_pattern)

# Garantir que o CSV model_sqrt esteja incluído
sqrt_file = '../../Data/Data_Exploration/descriptive_stats_PT-FireProg_v2.1_L2_model_sqrt.csv'
if sqrt_file not in csv_files:
    csv_files.append(sqrt_file)

# Dicionário para armazenar as correlações por variável
correlations_data = {}

# Processar cada CSV
for file_path in csv_files:
    try:
        df = pd.read_csv(file_path)

        if 'corr_ros_p' in df.columns:  # Usar corr_ros_p em vez de R2_ros_p
            filename = os.path.basename(file_path).lower()
            
            # Determinar tipo de transformação
            if 'model' in filename and 'log' in filename:
                col_suffix = 'model_log'
            elif 'model' in filename and 'sqrt' in filename:
                col_suffix = 'model_sqrt'
            elif 'model' in filename:
                col_suffix = 'model'
            else:
                print(f"Warning: Could not determine type for {filename}")
                continue

            # Extrair nomes das variáveis e valores de corr_ros_p
            for idx, row in df.iterrows():
                if pd.notna(row['corr_ros_p']):
                    variable_name = df.iloc[idx, 0]  # Primeira coluna tem os nomes das variáveis
                    
                    if variable_name not in correlations_data:
                        correlations_data[variable_name] = {'variable': variable_name}
                    
                    correlations_data[variable_name][f'corr_ros_p_{col_suffix}'] = row['corr_ros_p']

        else:
            print(f"Warning: corr_ros_p column not found in {file_path}")

    except Exception as e:
        print(f"Error processing {file_path}: {str(e)}")

# Criar DataFrame consolidado
if correlations_data:
    correlations_df = pd.DataFrame(correlations_data.values())

    # Colunas desejadas
    desired_columns = [
        'variable',
        'corr_ros_p_model',
        'corr_ros_p_model_log',
        'corr_ros_p_model_sqrt'
    ]

    # Adicionar colunas ausentes com NaN
    for col in desired_columns:
        if col not in correlations_df.columns:
            correlations_df[col] = None

    correlations_df = correlations_df[desired_columns]

    # Salvar CSV final
    output_file = '../../Data/Data_Exploration/all_ros_p_correlations.csv'
    correlations_df.to_csv(output_file, index=False)

    print(f"Successfully created {output_file}")
    print(f"Total variables: {len(correlations_df)}")
    print(f"Files processed: {len(csv_files)}")
    print(f"\nDataFrame structure:")
    print(correlations_df.head())

else:
    print("No correlation data found in any of the files")


Successfully created ../../Data/Data_Exploration/all_ros_p_correlations.csv
Total variables: 106
Files processed: 4

DataFrame structure:
     variable  corr_ros_p_model  corr_ros_p_model_log  corr_ros_p_model_sqrt
0  1_3y_fir_p         -0.041748              0.008214              -0.010580
1  3_8y_fir_p          0.026505              0.132478               0.094862
2  8_ny_fir_p         -0.051356              0.041549              -0.005699
3    BLH_m_av          0.169148              0.248752               0.229735
4    BLH_m_rt          0.050939              0.051698               0.043778


In [11]:
# Calcular top 15 |correlation| para cada tipo de transformação
top_n = 15

for col in ['corr_ros_p_model', 'corr_ros_p_model_sqrt', 'corr_ros_p_model_log']:
    if col in correlations_df.columns:
        # Ordenar por valor absoluto decrescente
        top_vars = correlations_df.reindex(
            correlations_df[col].abs().sort_values(ascending=False).index
        ).head(top_n)
        
        print(f"\nTop {top_n} |correlation| for {col}:")
        print(top_vars[['variable', col]])



Top 15 |correlation| for corr_ros_p_model:
       variable  corr_ros_p_model
50    ros_p_lg1          0.593037
52    ros_p_lg2          0.510589
53    ros_p_lg3          0.488578
54    ros_p_lg4          0.384327
55    ros_p_lg5          0.329380
93     wv100_av          0.325578
100   wv_850_av          0.320757
94   wv100_k_av          0.320109
51   ros_p_lg10          0.306571
56    ros_p_lg6          0.305461
12       HDW_av          0.301808
27   duration_p         -0.294387
82     wSv_1_av          0.291103
101   wv_950_av          0.287039
57    ros_p_lg7          0.282740

Top 15 |correlation| for corr_ros_p_model_sqrt:
       variable  corr_ros_p_model_sqrt
50    ros_p_lg1               0.597953
52    ros_p_lg2               0.556599
53    ros_p_lg3               0.513295
54    ros_p_lg4               0.456006
55    ros_p_lg5               0.387761
56    ros_p_lg6               0.364889
27   duration_p              -0.361356
12       HDW_av               0.356880
93     wv100

In [10]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from adjustText import adjust_text

# Lendo o arquivo CSV
df = pd.read_csv('..\\..\\Data\\Data_Exploration\\all_ros_p_correlations.csv')

# Ordenando pelos maiores R² para cada modelo
top15_linear = df.nlargest(15, 'R2_ros_p_model')[['variable', 'R2_ros_p_model']]
top15_log = df.nlargest(15, 'R2_ros_p_model_log')[['variable', 'R2_ros_p_model_log']]

# Juntando todos os candidatos únicos
all_top_vars = set(top15_linear['variable']).union(set(top15_log['variable']))
comparison_data = df[df['variable'].isin(all_top_vars)]

# Classificando os pontos
comparison_data = comparison_data.copy()
comparison_data['category'] = 'Other'
comparison_data.loc[comparison_data['variable'].isin(top15_linear['variable']), 'category'] = 'Top Linear'
comparison_data.loc[comparison_data['variable'].isin(top15_log['variable']), 'category'] = 'Top Log'
comparison_data.loc[comparison_data['variable'].isin(top15_linear['variable']) & 
                   comparison_data['variable'].isin(top15_log['variable']), 'category'] = 'Top Both'

# Criando scatter plot
fig, ax = plt.subplots(figsize=(12, 8))

# Definindo cores e marcadores para cada categoria
colors = {'Top Linear': 'blue', 'Top Log': 'red', 'Top Both': 'purple', 'Other': 'gray'}
markers = {'Top Linear': 'o', 'Top Log': 's', 'Top Both': 'D', 'Other': 'o'}
sizes = {'Top Linear': 80, 'Top Log': 80, 'Top Both': 100, 'Other': 60}

# Plotando cada categoria separadamente
for category in ['Other', 'Top Linear', 'Top Log', 'Top Both']:
    cat_data = comparison_data[comparison_data['category'] == category]
    ax.scatter(cat_data['R2_ros_p_model'], 
               cat_data['R2_ros_p_model_log'], 
               alpha=0.8, s=sizes[category],
               c=colors[category], marker=markers[category],
               label=category, edgecolors='black', linewidth=0.5)

# Linha de igualdade
max_val = max(comparison_data[['R2_ros_p_model', 'R2_ros_p_model_log']].max().max(), 0.15)
ax.plot([0, max_val], [0, max_val], 'k--', alpha=0.5, label='y = x')

ax.set_xlabel('R² - Modelo Linear', fontsize=12, fontweight='bold')
ax.set_ylabel('R² - Modelo Log', fontsize=12, fontweight='bold')
ax.set_title('Comparação R²: Modelo Linear vs Log (Top 15 cada)', fontsize=14, fontweight='bold')
ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
ax.grid(alpha=0.3)

# SOLUÇÃO: Usar adjust_text sem arrowprops ou com configurações específicas
texts = []
for i, row in comparison_data.iterrows():
    if row['category'] in ['Top Linear', 'Top Log', 'Top Both']:
        text = ax.annotate(row['variable'], 
                          (row['R2_ros_p_model'], row['R2_ros_p_model_log']),
                          fontsize=8,
                          bbox=dict(boxstyle="round,pad=0.2", facecolor='white', alpha=0.8, edgecolor='gray'))
        texts.append(text)

# Opção 1: Sem arrowprops (mais simples)
adjust_text(texts,
            expand_points=(1.5, 1.5),
            expand_text=(1.2, 1.2), 
            force_points=0.5,
            force_text=0.5,
            lim=1000)

plt.xlim(0, max_val + 0.02)
plt.ylim(0, max_val + 0.02)
plt.tight_layout()
plt.show()

KeyError: 'R2_ros_p_model'