In [1]:
import pandas as pd
import module_heliquest_like
from module_heliquest_like import amino_acids

In [14]:
# Read the three input tables used below: the AH sequence sheet (Excel),
# the NEES summary and the per-AH organelle category table (both CSV).
df_AH = pd.read_excel(io='SourceData/AH_sequences.xlsx')
df_NEES = pd.read_csv(filepath_or_buffer='Output/NEES_summary.csv')
df_organelle = pd.read_csv(
    filepath_or_buffer='SourceData/AH_organelle_category.csv',
)

In [15]:
# Build the join key for every helix: "<Protein_Name>-<AH#>".
df_AH['AH_name'] = df_AH['Protein_Name'] + '-' + df_AH['AH#'].astype(str)
# Strip the suffix of helix #1 only (presumably the other tables name the
# first helix without a number -- TODO confirm). The pattern is anchored to
# the end of the string: an unanchored replace would also rewrite "x-10" to
# "x0" and remove any "-1" that occurs inside the protein name itself.
df_AH['AH_name'] = df_AH['AH_name'].str.replace(r'-1$', '', regex=True)
# Keys are compared in lowercase on every side of the joins below.
df_AH['AH_name'] = df_AH['AH_name'].str.lower()

df_NEES['AH'] = df_NEES['AH'].str.lower()

# Inner-join the sequences with the NEES table, then with the organelle table.
df = pd.merge(df_AH, df_NEES, left_on='AH_name', right_on='AH', how='inner')
# The organelle key is lowercased too (on a copy, df_organelle itself is left
# untouched); otherwise rows whose key differs only in letter case would be
# silently dropped by the inner join.
organelle_lower = df_organelle.assign(AH=df_organelle['AH'].str.lower())
df = pd.merge(df, organelle_lower, left_on='AH_name', right_on='AH', how='inner')

# Remove the helper/key columns and keep a single 'AH' column. 'AH_x'/'AH_y'
# are the suffixed names pandas gives to an 'AH' column present on both sides
# of a merge.
df = df.drop(columns=['AH_name', 'Protein_Name', 'AH#', 'AH_y'])
df = df.rename(columns={'AH_x': 'AH'})

# Keep only rows that have a Median value and renumber them 0..n-1.
df = df.dropna(subset=['Median'])
df = df.reset_index(drop=True)

In [16]:
# Bin NEES median values by arbitrary thresholds
def categorize_NEES(value, thresholds=(0.65, 0.8)):
    """Classify a NEES median into one of three binding categories.

    Parameters
    ----------
    value : float
        The NEES median to classify.
    thresholds : sequence of two floats, optional
        ``(low, high)`` cut-offs; both are upper-inclusive. Defaults to
        ``(0.65, 0.8)``.

    Returns
    -------
    str or None
        ``'No Binding'`` when ``value <= low``, ``'Subtle Binding'`` when
        ``low < value <= high``, ``'Strong Binding'`` when ``value > high``,
        and ``None`` when ``value`` is missing (None/NaN/NA).
    """
    # NaN compares False against every threshold, so without this guard a
    # missing value would fall through to the last branch and be labelled
    # 'Strong Binding'.
    if pd.isna(value):
        return None

    low, high = thresholds
    if value <= low:
        return 'No Binding'
    if value <= high:
        return 'Subtle Binding'
    return 'Strong Binding'

# Label every row with the binding category of its Median value.
median_values = df['Median']
df['NEES_binned'] = median_values.map(categorize_NEES)

In [17]:
# AA composition analysis
def _face_fraction(face, residue):
    """Return the share of `face` made up of `residue`, rounded to 2 decimals.

    An empty face yields NaN instead of raising ZeroDivisionError.
    """
    if len(face) == 0:
        return float('nan')
    return round(face.count(residue) / len(face), 2)


# Collect one feature record per sequence (same order as the rows of df) and
# attach them in a single step, rather than growing df one cell at a time.
feature_rows = []
for SEQ in df['AA_seq']:
    aa_seq = module_heliquest_like.AA_seq(SEQ)

    # Run the calculations before reading their result attributes below.
    aa_seq.calculate_hydrophobic_moment()
    aa_seq.extract_face_sequences()
    aa_seq.calculate_hydrophobicity()
    aa_seq.calculate_netcharge()
    aa_seq.calculate_dfactor()

    row = {}
    for AA in amino_acids:
        # Each face is normalised by its OWN length (the phobic fraction was
        # previously divided by the length of the philic face).
        row['phil_' + AA] = _face_fraction(aa_seq.hydro_philic_face, AA)
        row['phob_' + AA] = _face_fraction(aa_seq.hydro_phobic_face, AA)

    row['Length'] = len(SEQ)
    row['Hydrophobicity'] = round(aa_seq.mean_hydrophobicity, 3)
    row['H_moment'] = aa_seq.mean_hydrophobic_moment
    row['Netcharge'] = round(aa_seq.netcharge, 3)
    row['Dfactor'] = round(aa_seq.dfactor, 3)
    feature_rows.append(row)

# Reuse df's own index so the features line up with their rows whatever the
# index labels are; cast to float so every feature column has one numeric dtype.
features = pd.DataFrame(feature_rows, index=df.index).astype(float)
# Drop feature columns left over from a previous run of this cell so that
# re-running it does not create duplicate columns.
df = pd.concat(
    [df.drop(columns=features.columns, errors='ignore'), features],
    axis=1,
)

In [18]:
# Pool pairs of amino acids: for each face prefix ('phil_' / 'phob_') add a
# column '<prefix>X_Y' holding the sum of the '<prefix>X' and '<prefix>Y'
# columns.
residue_pairs = [
    ('S', 'T'),
    ('R', 'K'),
    ('L', 'I'),
    ('F', 'W'),
    ('M', 'C'),
    ('D', 'E'),
    ('N', 'Q'),
]

for face_prefix in ('phil_', 'phob_'):
    for first, second in residue_pairs:
        pooled_column = face_prefix + first + '_' + second
        df[pooled_column] = df[face_prefix + first] + df[face_prefix + second]

In [19]:
# Write the final table to CSV, leaving out the row index.
df.to_csv(
    path_or_buf='Output/NEES_summary_with_heliquest.csv',
    index=False,
)