In [None]:
import pandas as pd
import module_heliquest_like
from module_heliquest_like import amino_acids

In [82]:
# Read the three input tables used by the rest of the notebook
# (two CSV files and one Excel workbook; paths are relative to the notebook).
df_NEES = pd.read_csv(filepath_or_buffer='Output/NEES_summary.csv')
df_organelle = pd.read_csv(filepath_or_buffer='SourceData/AH_organelle_category.csv')
df_AH = pd.read_excel(io='SourceData/AH_sequences.xlsx')

In [83]:
# Build the join key for every row of df_AH: "<Protein_Name>-<AH#>".
df_AH['AH_name'] = df_AH['Protein_Name'] + '-' + df_AH['AH#'].astype(str)
# Strip the "-1" suffix so helix #1 is keyed by the bare protein name
# (presumably how the other tables name it -- TODO confirm).
# The pattern is anchored to the end of the string: an unanchored replace
# would also turn "x-10" into "x0" and remove any "-1" inside a protein name.
df_AH['AH_name'] = df_AH['AH_name'].str.replace(r'-1$', '', regex=True)
# Lowercase both keys so the df_AH <-> df_NEES join is case-insensitive.
df_AH['AH_name'] = df_AH['AH_name'].str.lower()
df_NEES['AH'] = df_NEES['AH'].str.lower()

# Inner joins: a helix must be present in all three tables to be kept.
# NOTE(review): df_organelle['AH'] is matched as-is (not lowercased) --
# confirm that file already uses lowercase names, otherwise rows are
# silently dropped by the inner join.
df = pd.merge(df_AH, df_NEES, left_on='AH_name', right_on='AH', how='inner')
df = pd.merge(df, df_organelle, left_on='AH_name', right_on='AH', how='inner')

# The second merge leaves two suffixed name columns ('AH_x', 'AH_y');
# keep one as 'AH' and drop the helper/key columns.
df = df.drop(columns=['AH_name', 'Protein_Name', 'AH#', 'AH_y'])
df = df.rename(columns={'AH_x': 'AH'})

# Keep only rows that have a Median value and renumber them 0..n-1.
df = df.dropna(subset=['Median']).reset_index(drop=True)

In [84]:
# Map a numeric response value onto one of four labelled bins
def categorize_response(value):
    """Return the response category label for a numeric value.

    Bins (lower bound inclusive, upper bound exclusive):
        value < 0.6         -> "No Response"
        0.6 <= value < 0.7  -> "Mild"
        0.7 <= value < 1.0  -> "High"
        otherwise           -> "Very High"

    Any value for which every comparison is False (e.g. NaN) also ends up
    in the final "Very High" branch.
    """
    # Each guard only needs the upper bound: the lower bound is implied by
    # the previous guards having failed.
    if value < 0.6:
        return "No Response"
    if value < 0.7:
        return "Mild"
    if value < 1.0:
        return "High"
    return "Very High"

# Label every row by running its Median value through categorize_response
median_values = df['Median']
df['Median_Response'] = median_values.map(categorize_response)

In [85]:
# AA composition analysis
# For every sequence in df['AA_seq'], write per-face amino-acid fractions
# ('phil_<AA>' / 'phob_<AA>') and the scalar helix descriptors into df.
# Rows are addressed by their actual index label, so the loop does not rely
# on df having a 0..n-1 index.
for i, SEQ in zip(df.index, df['AA_seq'].to_list()):
    aa_seq = module_heliquest_like.AA_seq(SEQ)

    # Run the calculations before reading any result attribute below
    # (call order kept exactly as in the original cell).
    aa_seq.calculate_hydrophobic_moment()
    aa_seq.extract_face_sequences()
    aa_seq.calculate_hydrophobicity()
    aa_seq.calculate_netcharge()
    aa_seq.calculate_dfactor()

    philic_face = aa_seq.hydro_philic_face
    phobic_face = aa_seq.hydro_phobic_face
    n_philic = len(philic_face)
    n_phobic = len(phobic_face)

    for AA in amino_acids:
        # Each fraction is relative to the size of its OWN face (the
        # hydrophobic fraction was previously divided by the hydrophilic
        # face length). An empty face yields NaN instead of raising
        # ZeroDivisionError.
        df.loc[i, 'phil_' + AA] = (
            round(philic_face.count(AA) / n_philic, 2) if n_philic else float('nan')
        )
        df.loc[i, 'phob_' + AA] = (
            round(phobic_face.count(AA) / n_phobic, 2) if n_phobic else float('nan')
        )

    df.loc[i, 'Length'] = len(SEQ)
    df.loc[i, 'Hydrophobicity'] = round(aa_seq.mean_hydrophobicity, 3)
    # H_moment is stored unrounded, as in the original cell.
    df.loc[i, 'H_moment'] = aa_seq.mean_hydrophobic_moment
    df.loc[i, 'Netcharge'] = round(aa_seq.netcharge, 3)
    df.loc[i, 'Dfactor'] = round(aa_seq.dfactor, 3)

In [86]:
# Persist the merged, annotated table as CSV (row index not written).
df.to_csv(path_or_buf='Output/NEES_summary_with_heliquest.csv', index=False)