In [1]:
import pandas as pd

In [2]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder

In [3]:
# Read the NEES summary table (with HeliQuest columns, per the file name) into `df`
nees_summary_path = './Output/NEES_summary_with_heliquest.csv'
df = pd.read_csv(nees_summary_path)

In [4]:
# List every column label of the loaded table; later cells select predictors by these names
print(df.columns)

Index(['AA_seq', 'AH', 'Median', 'Mean', 'Organelle', 'NEES_binned', 'phil_A',
       'phob_A', 'phil_G', 'phob_G', 'phil_V', 'phob_V', 'phil_L', 'phob_L',
       'phil_I', 'phob_I', 'phil_F', 'phob_F', 'phil_W', 'phob_W', 'phil_M',
       'phob_M', 'phil_Y', 'phob_Y', 'phil_C', 'phob_C', 'phil_S', 'phob_S',
       'phil_T', 'phob_T', 'phil_R', 'phob_R', 'phil_K', 'phob_K', 'phil_N',
       'phob_N', 'phil_Q', 'phob_Q', 'phil_D', 'phob_D', 'phil_E', 'phob_E',
       'phil_H', 'phob_H', 'phil_P', 'phob_P', 'Length', 'Hydrophobicity',
       'H_moment', 'Netcharge', 'Dfactor', 'phil_S_T', 'phil_R_K', 'phil_L_I',
       'phil_F_W', 'phil_M_C', 'phil_D_E', 'phil_N_Q', 'phob_S_T', 'phob_R_K',
       'phob_L_I', 'phob_F_W', 'phob_M_C', 'phob_D_E', 'phob_N_Q'],
      dtype='object')


In [5]:
# column definitions
# Columns are selected by NAME rather than by position: the grouped composition
# columns (phil_S_T ... phob_N_Q) sit after the HeliQuest descriptors in the table,
# so positional slices such as df.columns[-5:] no longer point at the HeliQuest block.

# HeliQuest descriptor columns
columns_heliquet = pd.Index(['Length', 'Hydrophobicity', 'H_moment', 'Netcharge', 'Dfactor'])

# Single-residue composition columns for both faces (phil_A, phob_A, ..., phil_P, phob_P),
# kept in table order; the anchored pattern excludes grouped columns such as phil_S_T
columns_AAcomp = df.columns[df.columns.str.match(r'^(?:phil|phob)_[A-Z]$', na=False)]

# Per-face subsets of the single-residue composition columns
columns_AAcomp_phil = columns_AAcomp[columns_AAcomp.str.startswith('phil_')]
columns_AAcomp_phob = columns_AAcomp[columns_AAcomp.str.startswith('phob_')]

#### Random Forest Classifier: separately for each face or combined

In [None]:
# Hydrophilic-face model: predictors are the phil_* composition columns listed below
x_columns = [
    'phil_S_T', 'phil_R_K', 'phil_L_I', 'phil_F_W', 'phil_M_C', 'phil_D_E', 'phil_N_Q',
    'phil_A', 'phil_G', 'phil_V', 'phil_Y', 'phil_H', 'phil_P',
]

# Feature matrix plus integer-encoded NEES_binned labels as the target
X_train = df.loc[:, x_columns]
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(df['NEES_binned'])

# Fit the forest on the hydrophilic-face predictors only (fit() returns the estimator itself)
rf_classifier_combined = RandomForestClassifier(n_estimators=100, random_state=0).fit(X_train, y_encoded)

# Label each importance with its column name and rank from highest to lowest
importances_combined = rf_classifier_combined.feature_importances_
feature_importance_combined = pd.Series(data=importances_combined, index=x_columns).sort_values(ascending=False)

# Keep only the features whose importance is at least 0.01
significant_features_combined = feature_importance_combined.loc[lambda scores: scores >= 0.01]

significant_features_combined

phil_S_T    0.156512
phil_N_Q    0.114198
phil_R_K    0.107212
phil_D_E    0.099923
phil_H      0.094943
phil_A      0.081300
phil_G      0.078142
phil_L_I    0.071797
phil_F_W    0.046522
phil_Y      0.045027
phil_V      0.038576
phil_P      0.038414
phil_M_C    0.027434
dtype: float64

In [9]:
# Hydrophobic-face model: predictors are the phob_* composition columns listed below
x_columns = [
    'phob_S_T', 'phob_R_K', 'phob_L_I', 'phob_F_W', 'phob_M_C', 'phob_D_E', 'phob_N_Q',
    'phob_A', 'phob_G', 'phob_V', 'phob_Y', 'phob_H', 'phob_P',
]

# Feature matrix plus integer-encoded NEES_binned labels as the target
X_train = df.loc[:, x_columns]
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(df['NEES_binned'])

# Fit the forest on the hydrophobic-face predictors only (fit() returns the estimator itself)
rf_classifier_combined = RandomForestClassifier(n_estimators=100, random_state=0).fit(X_train, y_encoded)

# Label each importance with its column name and rank from highest to lowest
importances_combined = rf_classifier_combined.feature_importances_
feature_importance_combined = pd.Series(data=importances_combined, index=x_columns).sort_values(ascending=False)

# Keep only the features whose importance is at least 0.01
significant_features_combined = feature_importance_combined.loc[lambda scores: scores >= 0.01]

significant_features_combined

phob_L_I    0.152346
phob_F_W    0.136498
phob_V      0.109968
phob_S_T    0.108366
phob_M_C    0.083550
phob_R_K    0.071281
phob_G      0.064753
phob_D_E    0.055836
phob_A      0.053663
phob_N_Q    0.046958
phob_H      0.043855
phob_P      0.038031
phob_Y      0.034895
dtype: float64

In [13]:
# Both faces: the phil_* predictors followed by the phob_* predictors
x_columns = [
    'phil_S_T', 'phil_R_K', 'phil_L_I', 'phil_F_W', 'phil_M_C', 'phil_D_E', 'phil_N_Q',
    'phil_A', 'phil_G', 'phil_V', 'phil_Y', 'phil_H', 'phil_P',
    'phob_S_T', 'phob_R_K', 'phob_L_I', 'phob_F_W', 'phob_M_C', 'phob_D_E', 'phob_N_Q',
    'phob_A', 'phob_G', 'phob_V', 'phob_Y', 'phob_H', 'phob_P',
]

# Feature matrix plus integer-encoded NEES_binned labels as the target
X_train = df.loc[:, x_columns]
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(df['NEES_binned'])

# Fit the forest on the predictors from both faces (fit() returns the estimator itself)
rf_classifier_combined = RandomForestClassifier(n_estimators=100, random_state=0).fit(X_train, y_encoded)

# Label each importance with its column name and rank from highest to lowest
importances_combined = rf_classifier_combined.feature_importances_
feature_importance_combined = pd.Series(data=importances_combined, index=x_columns).sort_values(ascending=False)

# Keep only the features whose importance is at least 0.01, then show them
significant_features_combined = feature_importance_combined.loc[lambda scores: scores >= 0.01]
print(significant_features_combined)

# Save the retained importances; the feature names are written as the index column (to_csv default)
significant_features_combined.to_csv('Output/RF_FeatureSignificance.csv')

phil_S_T    0.103356
phob_L_I    0.087421
phob_V      0.067793
phob_F_W    0.066133
phil_H      0.057881
phil_N_Q    0.053936
phil_R_K    0.050912
phob_S_T    0.049045
phil_G      0.041748
phil_A      0.040160
phil_D_E    0.039091
phob_M_C    0.034465
phil_Y      0.033398
phil_L_I    0.031741
phob_R_K    0.031367
phob_D_E    0.027215
phil_M_C    0.023773
phob_G      0.023555
phil_F_W    0.021939
phob_Y      0.018513
phob_A      0.018326
phob_P      0.017678
phob_N_Q    0.017135
phil_V      0.016096
phil_P      0.014476
phob_H      0.012846
dtype: float64
