In [2]:
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import SelectFromModel

In [3]:
# Load the NEES summary table (with HeliQuest descriptors) from the Output folder
NEES_SUMMARY_CSV = './Output/NEES_summary_with_heliquest.csv'
df = pd.read_csv(NEES_SUMMARY_CSV)

In [4]:
# Inspect the available columns; as the cell's last expression the Index is
# displayed by the notebook itself, so no print() wrapper is needed.
df.columns

Index(['AA_seq', 'AH', 'Median', 'Mean', 'Organelle', 'Median_Response',
       'phil_A', 'phob_A', 'phil_G', 'phob_G', 'phil_V', 'phob_V', 'phil_L',
       'phob_L', 'phil_I', 'phob_I', 'phil_F', 'phob_F', 'phil_W', 'phob_W',
       'phil_M', 'phob_M', 'phil_Y', 'phob_Y', 'phil_C', 'phob_C', 'phil_S',
       'phob_S', 'phil_T', 'phob_T', 'phil_R', 'phob_R', 'phil_K', 'phob_K',
       'phil_N', 'phob_N', 'phil_Q', 'phob_Q', 'phil_D', 'phob_D', 'phil_E',
       'phob_E', 'phil_H', 'phob_H', 'phil_P', 'phob_P', 'Length',
       'Hydrophobicity', 'H_moment', 'Netcharge', 'Dfactor'],
      dtype='object')


In [5]:
# column definitions
# Column groups are selected by name instead of by position: other cells of
# this notebook append columns to df (e.g. 'NEES_category', 'phil_S_T'), so
# positional slices such as df.columns[-5:] would return different columns
# if this cell were re-run after them.
columns_heliquet = df.columns[df.columns.isin(
    ['Length', 'Hydrophobicity', 'H_moment', 'Netcharge', 'Dfactor'])]
# Per-residue composition columns: 'phil_X' / 'phob_X' where X is a single
# upper-case letter. The anchored pattern excludes summed columns such as
# 'phil_S_T'. Order follows df.columns.
columns_AAcomp = df.columns[df.columns.str.match(r'^(?:phil|phob)_[A-Z]$')]
columns_AAcomp_phil = columns_AAcomp[columns_AAcomp.str.startswith('phil_')]
columns_AAcomp_phob = columns_AAcomp[columns_AAcomp.str.startswith('phob_')]

In [10]:
# Define the thresholds for binning y values based on the user's description
def categorize_y(value,
                 thresholds=(0.6, 0.7, 1.0),
                 labels=('Low Response', 'Moderate Response',
                         'High Response', 'Very High Response')):
    """Map a numeric value to a named response category.

    Bins are right-inclusive: the result is ``labels[i]`` for the first
    ``thresholds[i]`` such that ``value <= thresholds[i]``, and ``labels[-1]``
    when ``value`` is greater than every threshold.

    Parameters
    ----------
    value : number
        The value to categorize.
    thresholds : sequence of numbers, optional
        Upper bounds of the bins, in ascending order. Defaults to
        ``(0.6, 0.7, 1.0)``.
    labels : sequence of str, optional
        Category names; must contain exactly one more entry than
        ``thresholds`` (the last one names the open-ended top bin).

    Returns
    -------
    str
        The label of the bin that ``value`` falls into.

    Raises
    ------
    ValueError
        If ``len(labels) != len(thresholds) + 1``.

    Notes
    -----
    NaN compares as False against every threshold, so a NaN ``value`` ends up
    in the last category ('Very High Response' with the defaults). Filter out
    missing values before calling if that is not wanted.
    """
    if len(labels) != len(thresholds) + 1:
        raise ValueError(
            'labels must contain exactly one more entry than thresholds')
    # Thresholds are ascending, so the first upper bound that is not exceeded
    # identifies the bin.
    for upper_bound, label in zip(thresholds, labels):
        if value <= upper_bound:
            return label
    return labels[-1]

In [11]:
# Derive the categorical target: one category label per row, computed from
# that row's 'Median' value
df['NEES_category'] = [categorize_y(median) for median in df['Median']]

In [12]:
# Sum the listed pairs of phil_* columns into combined predictors.
# assign() returns a new frame with the summed columns appended in pair order.
_phil_pairs = [('S', 'T'), ('R', 'K'), ('L', 'I'), ('F', 'W'),
               ('M', 'C'), ('D', 'E'), ('N', 'Q')]
_phil_sums = {
    f'phil_{a}_{b}': df[f'phil_{a}'] + df[f'phil_{b}']
    for a, b in _phil_pairs
}
df = df.assign(**_phil_sums)

# Predictor set: the pair sums first, then the phil_* columns used on their own
x_columns = [*_phil_sums,
             'phil_A', 'phil_G', 'phil_V', 'phil_Y', 'phil_H', 'phil_P']

# Feature matrix and integer-encoded category labels
X_train = df[x_columns]
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(df['NEES_category'])

# Fit the random forest on the combined predictors (fit() returns the estimator)
rf_classifier_combined = RandomForestClassifier(
    n_estimators=100,
    random_state=0,
).fit(X_train, y_encoded)

# Feature importances, labelled by predictor and ranked from highest to lowest
importances_combined = rf_classifier_combined.feature_importances_
feature_importance_combined = pd.Series(
    importances_combined, index=x_columns
).sort_values(ascending=False)

# Keep only the predictors whose importance is at least 0.01
significant_features_combined = feature_importance_combined.loc[
    lambda importance: importance >= 0.01
]

significant_features_combined

phil_S_T    0.162055
phil_R_K    0.153369
phil_N_Q    0.118514
phil_H      0.096578
phil_D_E    0.094712
phil_L_I    0.074860
phil_A      0.066143
phil_G      0.063987
phil_F_W    0.063121
phil_M_C    0.041780
phil_Y      0.026601
phil_P      0.021974
phil_V      0.016305
dtype: float64

In [13]:
# Sum the listed pairs of phob_* columns into combined predictors.
# assign() returns a new frame with the summed columns appended in pair order.
_phob_pairs = [('S', 'T'), ('R', 'K'), ('L', 'I'), ('F', 'W'),
               ('M', 'C'), ('D', 'E'), ('N', 'Q')]
_phob_sums = {
    f'phob_{a}_{b}': df[f'phob_{a}'] + df[f'phob_{b}']
    for a, b in _phob_pairs
}
df = df.assign(**_phob_sums)

# Predictor set: the pair sums first, then the phob_* columns used on their own
x_columns = [*_phob_sums,
             'phob_A', 'phob_G', 'phob_V', 'phob_Y', 'phob_H', 'phob_P']

# Feature matrix and integer-encoded category labels
X_train = df[x_columns]
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(df['NEES_category'])

# Fit the random forest on the combined predictors (fit() returns the estimator)
rf_classifier_combined = RandomForestClassifier(
    n_estimators=100,
    random_state=0,
).fit(X_train, y_encoded)

# Feature importances, labelled by predictor and ranked from highest to lowest
importances_combined = rf_classifier_combined.feature_importances_
feature_importance_combined = pd.Series(
    importances_combined, index=x_columns
).sort_values(ascending=False)

# Keep only the predictors whose importance is at least 0.01
significant_features_combined = feature_importance_combined.loc[
    lambda importance: importance >= 0.01
]

significant_features_combined

phob_L_I    0.167795
phob_S_T    0.137514
phob_F_W    0.133867
phob_V      0.116836
phob_H      0.075788
phob_M_C    0.068868
phob_Y      0.059446
phob_A      0.054709
phob_N_Q    0.050492
phob_R_K    0.046706
phob_G      0.043765
phob_P      0.025185
phob_D_E    0.019030
dtype: float64