In [3]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.backends.backend_pdf import PdfPages

In [4]:
# Load the NEES summary table; the relative path is resolved against the
# notebook's current working directory.
df = pd.read_csv('./Output/NEES_summary_with_heliquest.csv')

In [5]:
# Show the column labels to check which columns the CSV provides.
print(df.columns)

Index(['AA_seq', 'AH', 'Median', 'Mean', 'Organelle', 'NEES_binned', 'phil_A',
       'phob_A', 'phil_G', 'phob_G', 'phil_V', 'phob_V', 'phil_L', 'phob_L',
       'phil_I', 'phob_I', 'phil_F', 'phob_F', 'phil_W', 'phob_W', 'phil_M',
       'phob_M', 'phil_Y', 'phob_Y', 'phil_C', 'phob_C', 'phil_S', 'phob_S',
       'phil_T', 'phob_T', 'phil_R', 'phob_R', 'phil_K', 'phob_K', 'phil_N',
       'phob_N', 'phil_Q', 'phob_Q', 'phil_D', 'phob_D', 'phil_E', 'phob_E',
       'phil_H', 'phob_H', 'phil_P', 'phob_P', 'Length', 'Hydrophobicity',
       'H_moment', 'Netcharge', 'Dfactor'],
      dtype='object')


In [6]:
# column definitions
# Columns are selected by name/prefix instead of by position, so the groups
# stay correct if the CSV gains, loses, or reorders columns.
heliquest_names = ['Length', 'Hydrophobicity', 'H_moment', 'Netcharge', 'Dfactor']
missing_names = [name for name in heliquest_names if name not in df.columns]
assert not missing_names, f'missing expected columns: {missing_names}'

# NOTE: the variable name keeps its original spelling ("heliquet") because
# later cells refer to it.
columns_heliquet = pd.Index(heliquest_names)
print(columns_heliquet)

# Amino-acid composition columns carry a 'phil_' or 'phob_' prefix.
is_phil = df.columns.str.startswith('phil_')
is_phob = df.columns.str.startswith('phob_')

columns_AAcomp = df.columns[is_phil | is_phob]
print(columns_AAcomp)
columns_AAcomp_phil = df.columns[is_phil]
print(columns_AAcomp_phil)
columns_AAcomp_phob = df.columns[is_phob]
print(columns_AAcomp_phob)

Index(['Length', 'Hydrophobicity', 'H_moment', 'Netcharge', 'Dfactor'], dtype='object')
Index(['phil_A', 'phob_A', 'phil_G', 'phob_G', 'phil_V', 'phob_V', 'phil_L',
       'phob_L', 'phil_I', 'phob_I', 'phil_F', 'phob_F', 'phil_W', 'phob_W',
       'phil_M', 'phob_M', 'phil_Y', 'phob_Y', 'phil_C', 'phob_C', 'phil_S',
       'phob_S', 'phil_T', 'phob_T', 'phil_R', 'phob_R', 'phil_K', 'phob_K',
       'phil_N', 'phob_N', 'phil_Q', 'phob_Q', 'phil_D', 'phob_D', 'phil_E',
       'phob_E', 'phil_H', 'phob_H', 'phil_P', 'phob_P'],
      dtype='object')
Index(['phil_A', 'phil_G', 'phil_V', 'phil_L', 'phil_I', 'phil_F', 'phil_W',
       'phil_M', 'phil_Y', 'phil_C', 'phil_S', 'phil_T', 'phil_R', 'phil_K',
       'phil_N', 'phil_Q', 'phil_D', 'phil_E', 'phil_H', 'phil_P'],
      dtype='object')
Index(['phob_A', 'phob_G', 'phob_V', 'phob_L', 'phob_I', 'phob_F', 'phob_W',
       'phob_M', 'phob_Y', 'phob_C', 'phob_S', 'phob_T', 'phob_R', 'phob_K',
       'phob_N', 'phob_Q', 'phob_D', 'phob_E', 'p

In [7]:
# Function to plot each graph
def plot_correlation(df, x, y, ax, method='pearson'):
    """Scatter-plot two columns of ``df`` on ``ax`` and annotate their correlation.

    Parameters
    ----------
    df : pandas.DataFrame
        Table that contains both columns.
    x, y : str
        Labels of the columns drawn on the horizontal and vertical axis.
        They are also used for the axis labels and the title.
    ax : matplotlib.axes.Axes
        Axes to draw into.
    method : str or callable, default 'pearson'
        Forwarded to ``Series.corr``. The default is the same method pandas
        uses when none is given, so existing calls are unaffected.

    Returns
    -------
    matplotlib.axes.Axes
        The same ``ax`` that was passed in, to allow further customisation.
    """
    correlation_coefficient = df[x].corr(df[y], method=method)

    sns.scatterplot(x=df[x], y=df[y], ax=ax)
    ax.set_xlabel(x)
    ax.set_ylabel(y)
    ax.set_title(f'Correlation between {x} and {y}')

    # Annotate the correlation coefficient. The position is given in axes
    # coordinates (transform=ax.transAxes), so it does not depend on the data range.
    ax.text(0.95, 0.7, f'Corr. Coeff.: {correlation_coefficient:.2f}',
            transform=ax.transAxes, fontsize=12, verticalalignment='bottom',
            horizontalalignment='right', bbox=dict(facecolor='white', alpha=0.5))
    return ax

In [8]:
# Plot 'Median' against every column listed in columns_heliquet.
x_columns = columns_heliquet
y_column = 'Median'

# All plots go into one PDF; each page is a 2x3 grid, i.e. up to 6 graphs.
n_rows, n_cols = 2, 3
plots_per_page = n_rows * n_cols

with PdfPages('./Output/correlation_plots_helixparameters.pdf') as pdf:
    for page_start in range(0, len(x_columns), plots_per_page):
        page_columns = x_columns[page_start:page_start + plots_per_page]
        fig, axes = plt.subplots(n_rows, n_cols, figsize=(20, 12))
        panels = axes.flatten()  # 1-D view order: row by row

        for panel, column in zip(panels, page_columns):
            plot_correlation(df, column, y_column, panel)

        # A partially filled last page leaves empty grid cells; drop them.
        for unused_panel in panels[len(page_columns):]:
            unused_panel.remove()

        pdf.savefig(fig)  # append this page to the PDF
        plt.close(fig)  # release the figure's memory

In [9]:
# Plot 'Median' against every column listed in columns_AAcomp_phil.
x_columns = columns_AAcomp_phil
y_column = 'Median'

# All plots go into one PDF; each page is a 2x3 grid, i.e. up to 6 graphs.
n_rows, n_cols = 2, 3
plots_per_page = n_rows * n_cols

with PdfPages('./Output/correlation_plots_phil.pdf') as pdf:
    for page_start in range(0, len(x_columns), plots_per_page):
        page_columns = x_columns[page_start:page_start + plots_per_page]
        fig, axes = plt.subplots(n_rows, n_cols, figsize=(20, 12))
        panels = axes.flatten()  # 1-D view order: row by row

        for panel, column in zip(panels, page_columns):
            plot_correlation(df, column, y_column, panel)

        # A partially filled last page leaves empty grid cells; drop them.
        for unused_panel in panels[len(page_columns):]:
            unused_panel.remove()

        pdf.savefig(fig)  # append this page to the PDF
        plt.close(fig)  # release the figure's memory

In [10]:
# Plot 'Median' against every column listed in columns_AAcomp_phob.
x_columns = columns_AAcomp_phob
y_column = 'Median'

# All plots go into one PDF; each page is a 2x3 grid, i.e. up to 6 graphs.
n_rows, n_cols = 2, 3
plots_per_page = n_rows * n_cols

with PdfPages('./Output/correlation_plots_phob.pdf') as pdf:
    for page_start in range(0, len(x_columns), plots_per_page):
        page_columns = x_columns[page_start:page_start + plots_per_page]
        fig, axes = plt.subplots(n_rows, n_cols, figsize=(20, 12))
        panels = axes.flatten()  # 1-D view order: row by row

        for panel, column in zip(panels, page_columns):
            plot_correlation(df, column, y_column, panel)

        # A partially filled last page leaves empty grid cells; drop them.
        for unused_panel in panels[len(page_columns):]:
            unused_panel.remove()

        pdf.savefig(fig)  # append this page to the PDF
        plt.close(fig)  # release the figure's memory

#### Helix parameters, excluding long helices (Length > 30)

In [11]:
x_columns = columns_heliquet
y_column = 'Median'

# Keep the filtered rows under their own name. Rebinding `df` here would make
# every cell that reads `df` plot only the filtered rows when it is re-run
# after this cell, while still writing to the unfiltered output file names.
max_helix_length = 30  # rows with Length above this value are excluded
df_short_helices = df[df['Length'] <= max_helix_length]

# Generate the plots and save them to a single PDF file with 6 graphs per page
n_rows, n_cols = 2, 3
plots_per_page = n_rows * n_cols

with PdfPages('./Output/correlation_plots_helixparameters_excludingLongHelics.pdf') as pdf:
    for page_start in range(0, len(x_columns), plots_per_page):
        page_columns = x_columns[page_start:page_start + plots_per_page]
        fig, axes = plt.subplots(n_rows, n_cols, figsize=(20, 12))  # Create a 2x3 grid
        axes = axes.flatten()  # Flatten the 2D array of axes
        for ax, x in zip(axes, page_columns):
            plot_correlation(df_short_helices, x, y_column, ax)
        for ax in axes[len(page_columns):]:
            ax.remove()  # Remove any unused axes
        pdf.savefig(fig)  # Save the current figure to the PDF
        plt.close(fig)  # Close the figure to free memory