In [23]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.backends.backend_pdf import PdfPages

In [2]:
# Read the CSV at ./Output/NEES_summary_with_heliquest.csv into a DataFrame
df = pd.read_csv(
    filepath_or_buffer='./Output/NEES_summary_with_heliquest.csv',
)

In [4]:
# Keep only the six columns used below: 'Median' plus the five columns
# it is correlated against.
df_extract = df.loc[
    :,
    [
        'Median',
        'Length',
        'Hydrophobicity',
        'H_moment',
        'Netcharge',
        'Dfactor',
    ],
]

In [5]:
# Pairwise correlation between every pair of columns in df_extract
# (DataFrame.corr with its default settings)
correlation_matrix = df_extract.corr()

# Take the 'Median' column of that matrix, i.e. the correlation of each
# column with 'Median', ordered from the largest coefficient to the smallest
correlation_with_median = (
    correlation_matrix
    .loc[:, 'Median']
    .sort_values(ascending=False)
)

# Last expression of the cell: shows the ranked coefficients as the output
correlation_with_median

Median            1.000000
Hydrophobicity    0.284061
Netcharge         0.048262
Dfactor           0.025718
Length            0.021208
H_moment         -0.138342
Name: Median, dtype: float64

In [25]:
# Function to plot each graph
def plot_correlation(df, x, y, ax):
    """Scatter-plot ``df[x]`` against ``df[y]`` on ``ax`` and annotate the correlation.

    Parameters
    ----------
    df : pandas.DataFrame
        Table that contains both columns.
    x : str
        Name of the column shown on the horizontal axis.
    y : str
        Name of the column shown on the vertical axis.
    ax : matplotlib.axes.Axes
        Axes that receives the scatter plot, the axis labels, the title and
        the correlation annotation.

    Returns
    -------
    None
        The function only draws on ``ax``.
    """
    correlation_coefficient = df[x].corr(df[y])

    sns.scatterplot(data=df, x=x, y=y, ax=ax)
    ax.set_xlabel(x)
    ax.set_ylabel(y)
    ax.set_title(f'Correlation between {x} and {y}')

    # Annotate the correlation coefficient.
    # Draw on the axes that was passed in (not pyplot's "current" axes) and
    # position the text in axes-fraction coordinates, so it sits in the
    # top-left corner regardless of the data range of the plotted columns.
    ax.text(
        0.05,
        0.95,
        f"Correlation coefficient: {round(correlation_coefficient, 3)}",
        transform=ax.transAxes,
        ha='left',
        va='top',
    )

# Columns plotted against 'Median'; each one becomes its own page in the PDF
x_columns = ['Length', 'Hydrophobicity', 'H_moment', 'Netcharge', 'Dfactor']
y_column = 'Median'

# Generate the plots and save them to a single PDF file
with PdfPages('correlation_plots.pdf') as pdf:
    for x in x_columns:
        fig, ax = plt.subplots(figsize=(6, 6))
        try:
            plot_correlation(df_extract, x, y_column, ax)
            pdf.savefig(fig)  # Save the current figure as a new page of the PDF
        finally:
            # Close the figure even if plotting or saving raised, so a failed
            # page does not leave an open figure behind.
            plt.close(fig)