#### Script Description
This script loads a pre-processed dataset and computes the Pearson correlation coefficient (PCC) between the features and predictors.

*File Name:* 02_02_Feature_PCC_Analysis.ipynb

*Date:* 2024

*Created by:* Rob Alamgir  

*Version:* 1.0

*References:*

#### Import the relevant packages

In [None]:
# Import relevant packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from scipy.stats import pearsonr
import plotly.graph_objects as go
import plotly.express as px

### Step 1: Load Data & prep the dataset 

In [None]:
# ---------------------------------------------------------------------------
# Read the pre-processed CSV, derive helper columns, drop rows without a
# SWCT_1_015 value and compute the Pearson correlation matrix used below.
# ---------------------------------------------------------------------------

# Input file. Uncomment one of the alternatives to analyse a seasonal subset instead.
data_path = "C:/Data_MSc_Thesis/Pre_Processed_Data/Pre_Processed_Data_All_Locations_Updated_6.csv"
#data_path = "C:/Data_MSc_Thesis/Pre_Processed_Data/Pre_Processed_Data_All_Locations_Updated_6_Summer_Data.csv"    # Summer data
#data_path = "C:/Data_MSc_Thesis/Pre_Processed_Data/Pre_Processed_Data_All_Locations_Updated_6_Winter_Data.csv"    # Winter data
Complete_Data = pd.read_csv(data_path)

# 'Date' is stored as YYYY-MM-DD text; convert it to real timestamps.
Complete_Data['Date'] = pd.to_datetime(Complete_Data['Date'], format='%Y-%m-%d')

# Integer id per distinct 'Source' value: the pandas category code shifted up by one.
source_codes = Complete_Data['Source'].astype('category').cat.codes
Complete_Data['Source_ID'] = source_codes + 1
print(f"Rows and columns before removing NaNs: {Complete_Data.shape}")

# Work on an independent copy that only holds rows where SWCT_1_015 is present.
filtered_df = Complete_Data.dropna(subset=['SWCT_1_015']).copy()
bofek_column = 'BOFEK_2020_Physical Units'
filtered_df[bofek_column] = filtered_df[bofek_column].astype('category')
print(f"Rows and columns after removing NaNs: {filtered_df.shape}")
#print(filtered_df.columns.tolist())

# Columns that enter the correlation matrix (the order here is the row/column order of the matrix).
vars_to_correlate = [
    "SWCT_1_015", "S1_VSM", "Planet_SWC",
    "S1_Backscatter", "S2_NDVI", "S2_EVI", "S2_NDMI",
    "L8_9_LST", "MODIS_LAI",
    "STMP_1_015", "ATMP_f", "PAIR_f", "WTMP_f", "WLEV_f", "WIND_f",
    "WINS_f", "RHUM_f", "RAIN_f", "VPD_f", "PET", "ET0",
    "NEE_CO2_kg_day_ha_DAv_NT", "NEE_CH4_kg_day_ha_DAv_NT",
    "Available_soil_storage_mm",
    "SOM_2023_0_5_values", "BD_0_5_values", "Clay_0_5_values",
    "Peat_Thickness_2022", "BOFEK_2020_Physical Units",
]

# Subset to the selected columns, then compute the pairwise Pearson coefficients.
all_variables_df = filtered_df[vars_to_correlate]
cor_results = all_variables_df.corr(method='pearson')

## Plot Pearson Correlation Matrices

### Option 1 for plotting

In [None]:
# Lower-triangle Pearson correlation heatmap with the coefficient printed in every cell.
plt.figure(figsize=(16, 14))

# Boolean matrix that is True strictly above the main diagonal; seaborn hides those cells.
mask = np.triu(np.ones_like(cor_results, dtype=bool), k=1)

ax = sns.heatmap(
    cor_results,
    mask=mask,
    annot=True,
    cmap="RdYlGn",
    center=0,
    linewidths=0.5,
    fmt=".2f",
    cbar_kws={'label': 'Correlation', 'shrink': 0.94},
    square=True,
)

#plt.title('Pearson Correlation Matrix')

# Ticks only on the bottom and left edges; x labels vertical, y labels horizontal.
heat_ax = plt.gca()
heat_ax.xaxis.set_ticks_position('bottom')
heat_ax.tick_params(axis='x', bottom=True, top=False)
heat_ax.tick_params(axis='y', right=False)
heat_ax.yaxis.set_ticks_position('left')
plt.xticks(rotation=90)
plt.yticks(rotation=0)

labels = heat_ax.get_xticklabels() + heat_ax.get_yticklabels()

# Variable names grouped by the colour their tick label should get.
blue_labels = ["S1_VSM", "S1_Backscatter", "Planet_SWC",
               "S2_NDVI", "S2_EVI", "S2_NDMI", "L8_9_LST", "MODIS_LAI"]
brown_labels = ["SWCT_1_015", "STMP_1_015", "WTMP_f", "WLEV_f", "ATMP_f", "PAIR_f",
                "VPD_f", "WIND_f", "WINS_f", "RAIN_f", "RHUM_f", "NDVI_f", "PET", "ET0",
                "NEE_CO2_kg_day_ha_DAv_NT", "NEE_CH4_kg_day_ha_DAv_NT"]
purple_labels = ["Available_soil_storage_mm",
                 "SOM_2023_0_5_values", "BD_0_5_values", "Clay_0_5_values",
                 "Peat_Thickness_2022", "BOFEK_2020_Physical Units"]

# Groups are checked in this order and the first match wins; unlisted labels keep their default colour.
colour_groups = (('blue', blue_labels), ('brown', brown_labels), ('purple', purple_labels))
for label in labels:
    label_text = label.get_text()
    for colour_name, group_members in colour_groups:
        if label_text in group_members:
            label.set_color(colour_name)
            break

plt.show()

#save_path = "C:/Users/robdu/OneDrive - Radboud Universiteit/Master Thesis/Infographics and Graphics/Figures/PCC_Matrix_with_num_Winter.png"  
#ax.figure.savefig(save_path, dpi=600, bbox_inches='tight')
#print(f"Figure saved successfully at: {save_path}")

### PCC matrix formatted to fit an A4 page

In [None]:
# Compact lower-triangle Pearson correlation heatmap for an A4 page:
# no per-cell numbers and 8 pt tick labels.
plt.figure(figsize=(8, 10))

# True strictly above the main diagonal -> those cells are hidden by seaborn.
mask = np.triu(np.ones_like(cor_results, dtype=bool), k=1)

ax = sns.heatmap(cor_results, mask=mask, annot=False, cmap="RdYlGn",
                 center=0, linewidths=0.5,
                 fmt=".2f", annot_kws={'size': 8},  # only used if annot is switched to True
                 cbar_kws={'label': 'Correlation', 'shrink': 0.67, 'aspect': 25, 'pad': 0.02},
                 square=True)

plt.title('Pearson Correlation Matrix', fontsize=12)

# Ticks on the bottom/left edges only, with small labels.
ax.xaxis.set_ticks_position('bottom')
ax.tick_params(axis='x', bottom=True, top=False, labelsize=8)
ax.tick_params(axis='y', right=False, labelsize=8)
ax.yaxis.set_ticks_position('left')
plt.xticks(rotation=90)
plt.yticks(rotation=0)

labels = ax.get_xticklabels() + ax.get_yticklabels()

# Tick-label colour groups. Each variable belongs to exactly one group.
# "Available_soil_storage_mm" used to be listed under blue as well as purple, which
# made the purple entry unreachable; it is now purple only, matching plotting
# option 1 and the "Hybrid Sensed Data" group of the bar plots.
blue_labels = ["S1_VSM", "S1_Backscatter", "Planet_SWC",
               "S2_NDVI", "S2_EVI", "S2_NDMI", "L8_9_LST", "MODIS_LAI"]
brown_labels = ["SWCT_1_015", "STMP_1_015", "WTMP_f", "WLEV_f", "ATMP_f", "PAIR_f",
                "VPD_f", "WIND_f", "WINS_f", "RAIN_f", "RHUM_f", "NDVI_f", "PET", "ET0",
                "NEE_CO2_kg_day_ha_DAv_NT", "NEE_CH4_kg_day_ha_DAv_NT"]
purple_labels = ["Available_soil_storage_mm",
                 "SOM_2023_0_5_values", "BD_0_5_values", "Clay_0_5_values",
                 "Peat_Thickness_2022", "BOFEK_2020_Physical Units"]

# One lookup table instead of an if/elif chain; labels not listed keep the default colour.
label_colours = {
    **dict.fromkeys(blue_labels, 'blue'),
    **dict.fromkeys(brown_labels, 'brown'),
    **dict.fromkeys(purple_labels, 'purple'),
}
for label in labels:
    colour = label_colours.get(label.get_text())
    if colour is not None:
        label.set_color(colour)
plt.show()

#save_path = "C:/Users/robdu/OneDrive - Radboud Universiteit/Master Thesis/Infographics and Graphics/Figures/PCC_Matrix.png"  
#ax.figure.savefig(save_path, dpi=600, bbox_inches='tight')
#print(f"Figure saved successfully at: {save_path}")

#### Option 2 for plotting (Plotly)

In [None]:
# Interactive (Plotly) lower-triangle Pearson correlation heatmap.

# Recompute the matrix so this cell only depends on `all_variables_df`.
cor_results = all_variables_df.corr(method='pearson')

# Blank out everything strictly above the main diagonal; NaN cells are drawn as gaps.
mask = np.triu(np.ones_like(cor_results, dtype=bool), k=1)
masked_cor_results = cor_results.copy()
masked_cor_results[mask] = np.nan

axis_labels = cor_results.columns
tick_positions = list(range(len(axis_labels)))

# Create the heatmap
fig = go.Figure(data=go.Heatmap(
    z=masked_cor_results.values,
    x=axis_labels,
    y=axis_labels,
    colorscale='RdYlGn',
    colorbar=dict(title='Pearson Correlation', title_side='right'),  # title was misspelled "Preason"
    zmin=-1, zmax=1,  # fixed colour range
    showscale=True,
    hoverongaps=False,  # no hover box on the masked (NaN) cells
    hovertemplate='<b>PCC:</b> %{z}<br><b>X-axis:</b> %{x}<br><b>Y-axis:</b> %{y}<extra></extra>'  # Hover showing PCC and axis labels
))

# Layout: centred bold title, untitled axes, one tick per variable.
# This call stays the last expression of the cell so the notebook renders the returned figure.
fig.update_layout(
    title={'text': 'Pearson Correlation Matrix', 'x': 0.5, 'xanchor': 'center',
           'font': {'size': 16, 'family': 'Arial', 'weight': 'bold'}},
    xaxis=dict(tickangle=90, title=None, tickvals=tick_positions, ticktext=axis_labels),
    yaxis=dict(title=None, tickvals=tick_positions, ticktext=axis_labels),
    autosize=True,
    font=dict(size=10),
    template='plotly_white',
    width=1000,
    height=1000
)

#### Option 3: PCC matrix with both triangles and annotated values

In [None]:
# Full (both triangles) Pearson correlation heatmap with the coefficient printed in every cell.
plt.figure(figsize=(16, 14))
sns.heatmap(cor_results, annot=True, cmap="RdYlGn", center=0, linewidths=0.5, fmt=".2f",
            cbar_kws={'label': 'Correlation', 'shrink': 0.94}, square=True)

plt.title('Pearson Correlation Matrix')
plt.xlabel('Variables')
plt.ylabel('Variables')

heat_ax = plt.gca()
heat_ax.xaxis.set_ticks_position('top')  # x tick labels above the matrix
plt.xticks(rotation=90)
plt.yticks(rotation=0)

# Black frame around the whole heatmap (unit rectangle in axes coordinates).
heat_ax.add_patch(plt.Rectangle((0, 0), 1, 1, transform=heat_ax.transAxes,
                                color='black', linewidth=3, fill=False))

labels = heat_ax.get_xticklabels() + heat_ax.get_yticklabels()

# Tick-label colour groups. Each variable belongs to exactly one group.
# "Available_soil_storage_mm" used to be listed under blue as well as purple, which
# made the purple entry unreachable; it is now purple only, matching plotting
# option 1 and the "Hybrid Sensed Data" group of the bar plots.
blue_labels = ["S1_VSM", "S1_Backscatter", "Planet_SWC",
               "S2_NDVI", "S2_EVI", "S2_NDMI", "L8_9_LST", "MODIS_LAI"]
brown_labels = ["SWCT_1_015", "STMP_1_015", "WTMP_f", "WLEV_f", "ATMP_f", "PAIR_f",
                "VPD_f", "WIND_f", "WINS_f", "RAIN_f", "RHUM_f", "NDVI_f", "PET", "ET0",
                "NEE_CO2_kg_day_ha_DAv_NT", "NEE_CH4_kg_day_ha_DAv_NT"]
purple_labels = ["Available_soil_storage_mm",
                 "SOM_2023_0_5_values", "BD_0_5_values", "Clay_0_5_values",
                 "Peat_Thickness_2022", "BOFEK_2020_Physical Units"]

# One lookup table instead of an if/elif chain; labels not listed keep the default colour.
label_colours = {
    **dict.fromkeys(blue_labels, 'blue'),
    **dict.fromkeys(brown_labels, 'brown'),
    **dict.fromkeys(purple_labels, 'purple'),
}
for label in labels:
    colour = label_colours.get(label.get_text())
    if colour is not None:
        label.set_color(colour)
plt.show()

#### Option 4: PCC matrix with both triangles (no annotated values)

In [None]:
# Full (both triangles) Pearson correlation heatmap without per-cell numbers.
plt.figure(figsize=(14, 12))
sns.heatmap(cor_results, annot=False, cmap="RdYlGn", center=0, linewidths=0.5,
            fmt=".2f",  # only used if annot is switched to True
            cbar_kws={'label': 'Correlation', 'shrink': 0.94}, square=True)

plt.title('Heatmap of Pairwise Correlations')
plt.xlabel('Variables')
plt.ylabel('Variables')

heat_ax = plt.gca()
heat_ax.xaxis.set_ticks_position('top')  # x tick labels above the matrix
plt.xticks(rotation=90)
plt.yticks(rotation=0)

# Black frame around the whole heatmap (unit rectangle in axes coordinates).
heat_ax.add_patch(plt.Rectangle((0, 0), 1, 1, transform=heat_ax.transAxes,
                                color='black', linewidth=3, fill=False))

labels = heat_ax.get_xticklabels() + heat_ax.get_yticklabels()

# Tick-label colour groups. Each variable belongs to exactly one group.
# "Available_soil_storage_mm" used to be listed under blue as well as purple, which
# made the purple entry unreachable; it is now purple only, matching plotting
# option 1 and the "Hybrid Sensed Data" group of the bar plots.
blue_labels = ["S1_VSM", "S1_Backscatter", "Planet_SWC",
               "S2_NDVI", "S2_EVI", "S2_NDMI", "L8_9_LST", "MODIS_LAI"]
brown_labels = ["SWCT_1_015", "STMP_1_015", "WTMP_f", "WLEV_f", "ATMP_f", "PAIR_f",
                "VPD_f", "WIND_f", "WINS_f", "RAIN_f", "RHUM_f", "NDVI_f", "PET", "ET0",
                "NEE_CO2_kg_day_ha_DAv_NT", "NEE_CH4_kg_day_ha_DAv_NT"]
purple_labels = ["Available_soil_storage_mm",
                 "SOM_2023_0_5_values", "BD_0_5_values", "Clay_0_5_values",
                 "Peat_Thickness_2022", "BOFEK_2020_Physical Units"]

# One lookup table instead of an if/elif chain; labels not listed keep the default colour.
label_colours = {
    **dict.fromkeys(blue_labels, 'blue'),
    **dict.fromkeys(brown_labels, 'brown'),
    **dict.fromkeys(purple_labels, 'purple'),
}
for label in labels:
    colour = label_colours.get(label.get_text())
    if colour is not None:
        label.set_color(colour)
plt.show()

## Plot PCC bar graphs

### Plot variant 1: PCC of "SWCT_1_015" against all other variables (coloured by data source categories)

In [None]:
# Horizontal bar chart of the Pearson correlation between SWCT_1_015 and every other
# variable, sorted from highest to lowest and coloured by data-source group.

# Full list of analysed columns. It is not read by the code in this cell; the loop
# below walks `categories` instead.
variables = [
    "SWCT_1_015", "S1_VSM", "Planet_SWC",
    "S1_Backscatter", "S2_NDVI", "S2_EVI", "S2_NDMI",
    "L8_9_LST", "MODIS_LAI",
    "STMP_1_015", "ATMP_f", "PAIR_f", "WTMP_f", "WLEV_f", "WIND_f",
    "WINS_f", "RHUM_f", "RAIN_f", "VPD_f", "PET", "ET0",
    "NEE_CO2_kg_day_ha_DAv_NT", "NEE_CH4_kg_day_ha_DAv_NT",
    "Available_soil_storage_mm",
    "SOM_2023_0_5_values", "BD_0_5_values", "Clay_0_5_values",
    "Peat_Thickness_2022", "BOFEK_2020_Physical Units",
]

# Group name -> member columns. The group name becomes the bar colour (hue).
categories = {
    "Remotely Sensed Data": [
        "SWCT_1_015", "S1_VSM", "Planet_SWC",
        "S1_Backscatter", "S2_NDVI", "S2_EVI", "S2_NDMI", "L8_9_LST", "MODIS_LAI",
    ],
    "Hybrid Sensed Data": [
        "Available_soil_storage_mm", "SOM_2023_0_5_values", "BD_0_5_values", "Clay_0_5_values",
        "Peat_Thickness_2022", "BOFEK_2020_Physical Units",
    ],
    "Ground Sensed Data": [
        "STMP_1_015", "ATMP_f", "PAIR_f", "WTMP_f", "WLEV_f", "WIND_f",
        "WINS_f", "RHUM_f", "RAIN_f", "VPD_f", "PET", "ET0",
        "NEE_CO2_kg_day_ha_DAv_NT", "NEE_CH4_kg_day_ha_DAv_NT",
    ],
}

target_variable = "SWCT_1_015"
correlation_data = []

# One record per (feature, group); the target itself is skipped.
for category, vars_in_category in categories.items():
    for variable in vars_in_category:
        if variable == target_variable:
            continue
        # 2x2 correlation matrix of [target, feature]; the off-diagonal entry is the coefficient.
        pair_matrix = filtered_df[[target_variable, variable]].corr()
        correlation_data.append({
            "Feature": variable,
            "Correlation": pair_matrix.iloc[0, 1],
            "Datatype": category})

# Tabulate and order from the most positive to the most negative correlation.
correlation_df = pd.DataFrame(correlation_data).sort_values('Correlation', ascending=False)

# Draw the bars. dodge=False keeps every bar at its own feature row instead of
# offsetting bars per hue level.
plt.figure(figsize=(6.5, 8))
source_palette = {"Remotely Sensed Data": "blue", "Hybrid Sensed Data": "purple", "Ground Sensed Data": "brown"}
ax1 = sns.barplot(
    data=correlation_df,
    x="Correlation",
    y="Feature",
    hue="Datatype",
    dodge=False,
    palette=source_palette,
    width=0.8,
)

# Faint dashed guide line at every x tick, plus a darker one at zero.
for tick in plt.xticks()[0]:
    plt.axvline(x=tick, linestyle="--", color="gray", alpha=0.3)
plt.axvline(0, color='black', linestyle='--', linewidth=0.8)

#plt.title("Pearson Correlation with Target Variable (SWCT_1_015)",fontsize=14, loc='right')
plt.xlabel("Pearson Correlation", fontsize=14)
plt.ylabel("Features", fontsize=18, labelpad=20)
plt.legend(title="Datatype", loc='lower right', fontsize=7)
plt.tight_layout()
plt.show()

#save_path = "C:/Users/robdu/OneDrive - Radboud Universiteit/Master Thesis/Infographics and Graphics/Figures/PCC_Bar_Plot_1_Winter.png"  
#ax1.figure.savefig(save_path, dpi=600, bbox_inches='tight')
#print(f"Figure saved successfully at: {save_path}")

### Plot variant 2: PCC of "SWCT_1_015" against all other variables (coloured by physical data categories)

In [None]:
# Horizontal bar chart of the Pearson correlation between SWCT_1_015 and every other
# variable, sorted from highest to lowest and coloured by physical data category.

# Full list of analysed columns. It is not read by the code in this cell; the loop
# below walks `categories` instead.
variables = ["SWCT_1_015","S1_VSM","Planet_SWC", 
             "S1_Backscatter", "S2_NDVI","S2_EVI", "S2_NDMI", 
             "L8_9_LST", "MODIS_LAI", 
             "STMP_1_015", "ATMP_f","PAIR_f","WTMP_f","WLEV_f","WIND_f",
             "WINS_f","RHUM_f","RAIN_f", "VPD_f", "PET","ET0", 
             "NEE_CO2_kg_day_ha_DAv_NT", "NEE_CH4_kg_day_ha_DAv_NT", 
             "Available_soil_storage_mm", 
             "SOM_2023_0_5_values","BD_0_5_values","Clay_0_5_values", 
             "Peat_Thickness_2022","BOFEK_2020_Physical Units"]

# Category name -> member columns. Every feature must appear in exactly one category,
# otherwise it produces two rows and two overlapping bars.
# NOTE(review): "S2_NDMI" used to be listed under both "Hydrological Data" and
# "Vegetative Index". It is kept under "Hydrological Data" here; move it if the
# vegetative grouping was the intended one.
categories = {"Hydrological Data": ["S1_VSM", "S1_Backscatter", "Planet_SWC", "Available_soil_storage_mm",
                                    "WTMP_f", "WLEV_f", "S2_NDMI"],
              "Vegetative Index": ["S2_NDVI", "S2_EVI", "MODIS_LAI"],
              "Meteorological Data": ["L8_9_LST", "ATMP_f", "PAIR_f", "WIND_f", "WINS_f", "RHUM_f", "RAIN_f",
                                      "VPD_f", "PET", "ET0", "NEE_CO2_kg_day_ha_DAv_NT", "NEE_CH4_kg_day_ha_DAv_NT"],
              "Soil Data": ["STMP_1_015", "SOM_2023_0_5_values", "BD_0_5_values","Clay_0_5_values",
                            "BOFEK_2020_Physical Units", "Peat_Thickness_2022"]}

target_variable = "SWCT_1_015"
correlation_data = []

# One record per feature; the target itself is skipped.
for category, vars_in_category in categories.items():
    for variable in vars_in_category:
        if variable != target_variable:
            # 2x2 correlation matrix of [target, feature]; the off-diagonal entry is the coefficient.
            correlation = filtered_df[[target_variable, variable]].corr().iloc[0, 1]
            correlation_data.append({
                "Feature": variable,
                "Correlation": correlation,
                "Datatype": category})

correlation_df = pd.DataFrame(correlation_data)
# Safety net: if a feature is ever listed in several categories again, keep only
# its first category so that it still gets a single bar.
correlation_df = correlation_df.drop_duplicates(subset="Feature", keep="first")
correlation_df = correlation_df.sort_values('Correlation', ascending=False)

# Draw the bars. dodge=False keeps every bar at its own feature row instead of
# offsetting bars per hue level.
plt.figure(figsize=(6, 8))
ax2 = sns.barplot(data=correlation_df, x="Correlation", y="Feature",
                  hue="Datatype",
                  dodge=False,
                  palette={"Hydrological Data": "blue", "Vegetative Index": "green",
                           "Meteorological Data": "purple","Soil Data": "brown"},
                  width=0.8)

# Faint dashed guide line at every x tick, plus a darker one at zero.
xticks = plt.xticks()[0]
for tick in xticks:
    plt.axvline(x=tick, linestyle="--", color="gray", alpha=0.3)

plt.axvline(0, color='black', linestyle='--', linewidth=0.6)
#plt.title("Pearson Correlation with Target Variable (SWCT_1_015)", fontsize=14, loc='right')
plt.xlabel("Pearson Correlation", fontsize=14)
plt.ylabel("Features", fontsize=14)
plt.legend(title="Datatype",loc='lower right', fontsize=7)
plt.tight_layout()
plt.show()

#save_path = "C:/Users/robdu/OneDrive - Radboud Universiteit/Master Thesis/Infographics and Graphics/Figures/PCC_Bar_Plot_2.png"  
#ax2.figure.savefig(save_path, dpi=600, bbox_inches='tight')
#print(f"Figure saved successfully at: {save_path}")

### Plot variant 3: PCC of "S1_VSM" against all other variables (coloured by data source categories)

In [None]:
# Horizontal bar chart of the Pearson correlation between S1_VSM and every other
# variable, sorted from highest to lowest and coloured by data-source group.

# Full list of analysed columns. It is not read by the code in this cell; the loop
# below walks `categories` instead.
variables = ["SWCT_1_015","S1_VSM","Planet_SWC", 
             "S1_Backscatter", "S2_NDVI", "S2_EVI", "S2_NDMI", 
             "L8_9_LST", "MODIS_LAI", 
             "STMP_1_015", "ATMP_f","PAIR_f","WTMP_f","WLEV_f","WIND_f",
             "WINS_f","RHUM_f","RAIN_f", "VPD_f", "PET","ET0", 
             "NEE_CO2_kg_day_ha_DAv_NT", "NEE_CH4_kg_day_ha_DAv_NT", 
             "Available_soil_storage_mm", 
             "SOM_2023_0_5_values","BD_0_5_values","Clay_0_5_values", 
             "Peat_Thickness_2022","BOFEK_2020_Physical Units"]

# Group name -> member columns. The group name becomes the bar colour (hue).
# "SWCT_1_015" used to sit under "Remotely Sensed Data", so its bar was drawn blue
# here although the correlation heatmaps colour it brown together with the other
# ground variables. It now belongs to "Ground Sensed Data".
categories = {"Remotely Sensed Data": ["S1_VSM","Planet_SWC", 
                                       "S1_Backscatter", "S2_NDVI", "S2_EVI", "S2_NDMI", "L8_9_LST", "MODIS_LAI"],
              "Ground Sensed Data": ["SWCT_1_015", "STMP_1_015", "ATMP_f","PAIR_f","WTMP_f","WLEV_f","WIND_f", 
                                     "WINS_f","RHUM_f","RAIN_f", "VPD_f", "PET","ET0", "NEE_CO2_kg_day_ha_DAv_NT", "NEE_CH4_kg_day_ha_DAv_NT"],
              "Hybrid Sensed Data": ["Available_soil_storage_mm", "SOM_2023_0_5_values","BD_0_5_values","Clay_0_5_values", 
                                     "Peat_Thickness_2022","BOFEK_2020_Physical Units"]}

target_variable = "S1_VSM"
correlation_data = []

# One record per feature; the target itself is skipped.
for category, vars_in_category in categories.items():
    for variable in vars_in_category:
        if variable != target_variable:
            # 2x2 correlation matrix of [target, feature]; the off-diagonal entry is the coefficient.
            correlation = filtered_df[[target_variable, variable]].corr().iloc[0, 1]
            correlation_data.append({
                "Feature": variable,
                "Correlation": correlation,
                "Datatype": category})

correlation_df = pd.DataFrame(correlation_data)
correlation_df = correlation_df.sort_values('Correlation', ascending=False)  # most positive first

# Draw the bars. dodge=False keeps every bar at its own feature row instead of
# offsetting bars per hue level.
plt.figure(figsize=(6, 8))
ax3 = sns.barplot(data=correlation_df, x="Correlation", y="Feature",
                  hue="Datatype", dodge=False,
                  palette={"Remotely Sensed Data": "blue", "Ground Sensed Data": "brown", "Hybrid Sensed Data": "purple"},
                  width=0.8)

# Faint dashed guide line at every x tick, plus a darker one at zero.
xticks = plt.xticks()[0]
for tick in xticks:
    plt.axvline(x=tick, linestyle="--", color="gray", alpha=0.3)

plt.axvline(0, color='black', linestyle='--', linewidth=0.8)
#plt.title("Pearson Correlation with Target Variable (S1_VSM)",fontsize=14, loc='right')
plt.xlabel("Pearson Correlation", fontsize=14)
plt.ylabel("Features", fontsize=14)
plt.legend(title="Datatype",loc='lower right', fontsize=7)
plt.tight_layout()
plt.show()

# NOTE(review): unlike the other plot cells, the export below is active and writes to a
# hard-coded, user-specific folder; the folder must already exist for savefig to succeed.
save_path = "C:/Users/robdu/OneDrive - Radboud Universiteit/Master Thesis/Infographics and Graphics/Figures/PCC_Bar_Plot_3.png"  
ax3.figure.savefig(save_path, dpi=600, bbox_inches='tight')
print(f"Figure saved successfully at: {save_path}")

### Plot variant 4: PCC of "S1_VSM" against all other variables (coloured by physical data categories)

In [None]:
# Horizontal bar chart of the Pearson correlation between S1_VSM and every other
# variable, sorted from highest to lowest and coloured by physical data category.

# Full list of analysed columns. It is not read by the code in this cell; the loop
# below walks `categories` instead.
variables = ["SWCT_1_015","S1_VSM","Planet_SWC", 
             "S1_Backscatter", "S2_NDVI", "S2_EVI", "S2_NDMI", 
             "L8_9_LST", "MODIS_LAI", 
             "STMP_1_015", "ATMP_f","PAIR_f","WTMP_f","WLEV_f","WIND_f",
             "WINS_f","RHUM_f","RAIN_f", "VPD_f", "PET","ET0", 
             "NEE_CO2_kg_day_ha_DAv_NT", "NEE_CH4_kg_day_ha_DAv_NT", 
             "Available_soil_storage_mm", 
             "SOM_2023_0_5_values","BD_0_5_values","Clay_0_5_values", 
             "Peat_Thickness_2022","BOFEK_2020_Physical Units"]

# Category name -> member columns. Every feature must appear in exactly one category,
# otherwise it produces two rows and two overlapping bars.
# NOTE(review): two changes compared with the previous version of this mapping:
#   * "SWCT_1_015" was missing altogether, so it had no bar although this plot is meant
#     to cover all other variables. It is added to "Hydrological Data"; confirm the category.
#   * "S2_NDMI" was listed under both "Hydrological Data" and "Vegetative Index". It is
#     kept under "Hydrological Data"; move it if the vegetative grouping was intended.
categories = {"Hydrological Data": ["SWCT_1_015", "S1_VSM", "S1_Backscatter", "Planet_SWC",
                                    "Available_soil_storage_mm", "WTMP_f", "WLEV_f", "S2_NDMI"],
              "Vegetative Index": ["S2_NDVI", "S2_EVI", "MODIS_LAI"],
              "Meteorological Data": ["L8_9_LST", "ATMP_f", "PAIR_f", "WIND_f", "WINS_f", "RHUM_f", "RAIN_f",
                                      "VPD_f", "PET", "ET0", "NEE_CO2_kg_day_ha_DAv_NT", "NEE_CH4_kg_day_ha_DAv_NT"],
              "Soil Data": ["STMP_1_015", "SOM_2023_0_5_values", "BD_0_5_values","Clay_0_5_values",
                            "BOFEK_2020_Physical Units", "Peat_Thickness_2022"]}

target_variable = "S1_VSM"
correlation_data = []

# One record per feature; the target itself is skipped.
for category, vars_in_category in categories.items():
    for variable in vars_in_category:
        if variable != target_variable:
            # 2x2 correlation matrix of [target, feature]; the off-diagonal entry is the coefficient.
            correlation = filtered_df[[target_variable, variable]].corr().iloc[0, 1]
            correlation_data.append({
                "Feature": variable,
                "Correlation": correlation,
                "Datatype": category})

correlation_df = pd.DataFrame(correlation_data)
# Safety net: if a feature is ever listed in several categories again, keep only
# its first category so that it still gets a single bar.
correlation_df = correlation_df.drop_duplicates(subset="Feature", keep="first")
correlation_df = correlation_df.sort_values('Correlation', ascending=False)

# Draw the bars. dodge=False keeps every bar at its own feature row instead of
# offsetting bars per hue level.
plt.figure(figsize=(6, 8))
ax4 = sns.barplot(data=correlation_df, x="Correlation", y="Feature",
                  hue="Datatype",
                  dodge=False,
                  palette={"Hydrological Data": "blue", "Vegetative Index": "green",
                           "Meteorological Data": "purple","Soil Data": "brown"},
                  width=0.8)

# Faint dashed guide line at every x tick, plus a darker one at zero.
xticks = plt.xticks()[0]
for tick in xticks:
    plt.axvline(x=tick, linestyle="--", color="gray", alpha=0.3)

plt.axvline(0, color='black', linestyle='--', linewidth=0.6)
#plt.title("Pearson Correlation with Target Variable (S1_VSM)", fontsize=14, loc='right')
plt.xlabel("Pearson Correlation", fontsize=14)
plt.ylabel("Features", fontsize=14)
plt.legend(title="Datatype",loc='lower right', fontsize=7)
plt.tight_layout()
plt.show()

#save_path = "C:/Users/robdu/OneDrive - Radboud Universiteit/Master Thesis/Infographics and Graphics/Figures/PCC_Bar_Plot_4.png"  
#ax4.figure.savefig(save_path, dpi=600, bbox_inches='tight')
#print(f"Figure saved successfully at: {save_path}")

## Tests normality of each numerical column using the Shapiro-Wilk test.
#### ✅ Visualizes distributions using histograms and Q-Q plots.
#### ✅ Handles missing & infinite values properly.
#### ✅ Stores and prints results in a DataFrame for easy analysis.

In [None]:
# Normality check for every float64/int64 column of `filtered_df`:
# run the Shapiro-Wilk test, draw a histogram and a normal Q-Q plot per column,
# then print one summary table with the test statistic and p-value.

# The Shapiro-Wilk test is not defined for fewer than three observations
# (depending on the SciPy version it raises or returns NaN), so such columns are skipped.
MIN_SHAPIRO_SAMPLES = 3

numerical_columns = filtered_df.select_dtypes(include=['float64', 'int64']).columns
results = []  # one dict per tested column

for column in numerical_columns:
    # Treat +/-inf like missing values and drop both before testing.
    finite_data = filtered_df[column].replace([np.inf, -np.inf], np.nan).dropna()

    if len(finite_data) == 0:
        print(f"Skipping {column}: no valid data after cleaning.")
        continue
    if len(finite_data) < MIN_SHAPIRO_SAMPLES:
        print(f"Skipping {column}: only {len(finite_data)} valid value(s) after cleaning; "
              f"Shapiro-Wilk needs at least {MIN_SHAPIRO_SAMPLES}.")
        continue

    # NOTE: SciPy documents that the p-value may be inaccurate for more than 5000 samples.
    stat, p = stats.shapiro(finite_data)
    results.append({'Column': column, 'Shapiro-Wilk Statistic': stat, 'p-value': p})

    # Left panel: histogram. Right panel: Q-Q plot against the normal distribution.
    fig, axes = plt.subplots(1, 2, figsize=(12, 5))
    axes[0].hist(finite_data, bins=30, alpha=0.7, color='blue')
    axes[0].set_title(f"Histogram for {column}")
    axes[0].set_xlabel(column)
    axes[0].set_ylabel("Frequency")
    stats.probplot(finite_data, dist="norm", plot=axes[1])
    axes[1].set_title(f"Q-Q Plot for {column}")
    plt.tight_layout()
    plt.show()

results_df = pd.DataFrame(results)
print(results_df)