In [6]:
import pandas as pd
import os
import glob

# Collect the R2_ros_p column from every descriptive-stats CSV into one wide
# table (one row per variable, one column per file type) and save it as CSV.

# Glob pattern matching the per-type descriptive statistics files
path_pattern = '../../Data/Data_Exploration/descriptive_stats_*.csv'

# glob returns matches in arbitrary, filesystem-dependent order; sorting keeps
# the output row order (first-seen variable order) reproducible between runs.
csv_files = sorted(glob.glob(path_pattern))

# variable name -> {'variable': name, 'R2_ros_p_<type>': value, ...}
correlations_data = {}


def classify_stats_file(filename):
    """Return the column suffix encoded in a stats file name.

    The name is matched case-insensitively. 'model' takes precedence over
    'final', and the presence of 'log' selects the '_log' variant.

    Args:
        filename: Base name of the CSV file (no directory part).

    Returns:
        One of 'model', 'model_log', 'final', 'final_log', or None when the
        name contains neither 'model' nor 'final'.
    """
    lowered = filename.lower()
    for dataset in ('model', 'final'):
        if dataset in lowered:
            return f'{dataset}_log' if 'log' in lowered else dataset
    return None


def extract_r2_ros_p(df):
    """Return (variable_name, value) pairs for rows with a non-missing R2_ros_p.

    The variable name is read from the first column by position, so the
    result does not depend on how the frame is indexed.

    Args:
        df: DataFrame that contains an 'R2_ros_p' column.

    Returns:
        List of (first-column value, R2_ros_p value) tuples in row order.
    """
    valid_rows = df[df['R2_ros_p'].notna()]
    return list(zip(valid_rows.iloc[:, 0], valid_rows['R2_ros_p']))


# Number of files that actually contributed a column to the result
files_processed = 0
# column suffix -> name of the file that most recently provided it
suffix_sources = {}

for file_path in csv_files:
    # Best-effort per file: a broken file is reported and skipped so that the
    # remaining files are still merged.
    try:
        df = pd.read_csv(file_path)

        if 'R2_ros_p' not in df.columns:
            print(f"Warning: R2_ros_p column not found in {file_path}")
            continue

        filename = os.path.basename(file_path)
        col_suffix = classify_stats_file(filename)
        if col_suffix is None:
            print(f"Warning: Could not determine type for {filename}")
            continue

        # Two files resolving to the same type would otherwise overwrite each
        # other's values without any trace.
        if col_suffix in suffix_sources:
            print(f"Warning: {filename} has the same type '{col_suffix}' as "
                  f"{suffix_sources[col_suffix]}; overlapping values are overwritten")
        suffix_sources[col_suffix] = filename

        column_name = f'R2_ros_p_{col_suffix}'
        for variable_name, r2_value in extract_r2_ros_p(df):
            entry = correlations_data.setdefault(
                variable_name, {'variable': variable_name}
            )
            entry[column_name] = r2_value

        files_processed += 1

    except Exception as e:
        print(f"Error processing {file_path}: {e}")

# Build and save the wide table
if correlations_data:
    desired_columns = ['variable', 'R2_ros_p_model', 'R2_ros_p_model_log',
                       'R2_ros_p_final', 'R2_ros_p_final_log']

    # reindex both orders the columns and creates any missing one filled with
    # NaN, so absent types stay numeric instead of becoming object/None.
    correlations_df = pd.DataFrame(list(correlations_data.values())).reindex(
        columns=desired_columns
    )

    # Save to new CSV file
    output_file = '../../Data/Data_Exploration/all_ros_p_correlations.csv'
    correlations_df.to_csv(output_file, index=False)

    print(f"Successfully created {output_file}")
    print(f"Total variables: {len(correlations_df)}")
    # Count only the files that were merged, not every file the glob matched
    print(f"Files processed: {files_processed}")
    print("\nDataFrame structure:")
    print(correlations_df.head())

else:
    print("No correlation data found in any of the files")

Successfully created ../../Data/Data_Exploration/all_ros_p_correlations.csv
Total variables: 120
Files processed: 4

DataFrame structure:
     variable  R2_ros_p_model  R2_ros_p_model_log  R2_ros_p_final  \
0  1_3y_fir_p        0.002981            0.112848        0.001223   
1  3_8y_fir_p        0.000166            0.032336        0.000627   
2  8_ny_fir_p        0.002431            0.002648        0.002349   
3    BLH_m_av        0.027352            0.068334        0.028763   
4    BLH_m_rt        0.002414            0.001685        0.014172   

   R2_ros_p_final_log  
0            0.109417  
1            0.035671  
2            0.006493  
3            0.061676  
4            0.002779  
