In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
# from sklearn.feature_selection import SelectFromModel

In [2]:
# Load the summary table from the Output directory into a DataFrame.
df = pd.read_csv(
    filepath_or_buffer='./Output/NEES_summary_with_heliquest_Median_Transformed.csv',
)

In [3]:
# Show the column labels of the loaded table so the predictor/target names used below can be checked.
df.columns

Index(['AA_seq', 'AH', 'Median', 'Mean', 'NEES_binned', 'phil_A', 'phob_A',
       'phil_G', 'phob_G', 'phil_V', 'phob_V', 'phil_L', 'phob_L', 'phil_I',
       'phob_I', 'phil_F', 'phob_F', 'phil_W', 'phob_W', 'phil_M', 'phob_M',
       'phil_Y', 'phob_Y', 'phil_C', 'phob_C', 'phil_S', 'phob_S', 'phil_T',
       'phob_T', 'phil_R', 'phob_R', 'phil_K', 'phob_K', 'phil_N', 'phob_N',
       'phil_Q', 'phob_Q', 'phil_D', 'phob_D', 'phil_E', 'phob_E', 'phil_H',
       'phob_H', 'phil_P', 'phob_P', 'Length', 'Hydrophobicity', 'H_moment',
       'Netcharge', 'Dfactor', 'whole_A', 'whole_G', 'whole_V', 'whole_L',
       'whole_I', 'whole_F', 'whole_W', 'whole_M', 'whole_Y', 'whole_C',
       'whole_S', 'whole_T', 'whole_R', 'whole_K', 'whole_N', 'whole_Q',
       'whole_D', 'whole_E', 'whole_H', 'whole_P', 'whole_S_T', 'whole_R_K',
       'whole_L_I', 'whole_F_W', 'whole_M_C', 'whole_D_E', 'whole_N_Q',
       'phil_S_T', 'phil_R_K', 'phil_L_I', 'phil_F_W', 'phil_M_C', 'phil_D_E',
       'phil_

In [8]:
# Whole AA compositions
# Predictor columns: the paired-residue columns (e.g. whole_S_T) come first,
# followed by the single-residue columns.
x_columns = [
    'whole_S_T', 'whole_R_K', 'whole_L_I', 'whole_F_W',
    'whole_M_C', 'whole_D_E', 'whole_N_Q',
    'whole_A', 'whole_G', 'whole_V', 'whole_Y', 'whole_H', 'whole_P',
]

# Name of the target column.
y_column = 'Median_boxcox'

# Pull the target and the design matrix out of the loaded table.
y = df[y_column]
X_train = df.loc[:, x_columns]

In [9]:
# Fit a 100-tree random forest (fixed seed) on the predictors in `X_train`
# against the target `y` (the `Median_boxcox` column).
# `fit` returns the estimator itself, so construction and fitting are chained.
rf_regressor_complete = RandomForestRegressor(
    n_estimators=100,
    random_state=0,
).fit(X_train, y)

# Label each importance with its predictor name and rank from most to least important.
importances_complete = rf_regressor_complete.feature_importances_
feature_importance_complete = (
    pd.Series(importances_complete, index=x_columns)
    .sort_values(ascending=False)
)

# Last expression of the cell: rendered as the cell output.
feature_importance_complete

whole_M_C    0.158557
whole_S_T    0.128616
whole_R_K    0.123751
whole_H      0.110747
whole_F_W    0.080525
whole_A      0.071069
whole_D_E    0.070232
whole_L_I    0.063009
whole_N_Q    0.062974
whole_G      0.043543
whole_V      0.040594
whole_P      0.023363
whole_Y      0.023020
dtype: float64

In [6]:
# AA compositions by faces
# Predictor columns: the phil_* columns first, then the phob_* columns, each
# listed as paired-residue columns followed by single-residue columns.
x_columns = [
    # phil_* columns
    'phil_S_T', 'phil_R_K', 'phil_L_I', 'phil_F_W', 'phil_M_C', 'phil_D_E', 'phil_N_Q',
    'phil_A', 'phil_G', 'phil_V', 'phil_Y', 'phil_H', 'phil_P',
    # phob_* columns
    'phob_S_T', 'phob_R_K', 'phob_L_I', 'phob_F_W', 'phob_M_C', 'phob_D_E', 'phob_N_Q',
    'phob_A', 'phob_G', 'phob_V', 'phob_Y', 'phob_H', 'phob_P',
]

# Name of the target column.
y_column = 'Median_boxcox'

# Pull the target and the design matrix out of the loaded table.
y = df[y_column]
X_train = df.loc[:, x_columns]

In [7]:
# Train a Random Forest Regressor on the face-specific (phil_*/phob_*) predictors
# currently held in `X_train`, against the target `y` (the `Median_boxcox` column).
#
# The results are stored under *_faces names on purpose: the whole-composition
# model uses `rf_regressor_complete` / `feature_importance_complete`, and a later
# cell writes `feature_importance_complete` to RFReg_feature_importance_wholeAAs.csv.
# Reusing those names here would make a top-to-bottom run export the face-based
# importances under the whole-AA filename.
rf_regressor_faces = RandomForestRegressor(n_estimators=100, random_state=0)
rf_regressor_faces.fit(X_train, y)

# Get feature importances from the regressor, labelled by predictor name and
# sorted from most to least important.
importances_faces = rf_regressor_faces.feature_importances_
feature_importance_faces = pd.Series(importances_faces, index=x_columns).sort_values(ascending=False)

# Display the importances
feature_importance_faces

phil_S_T    0.152270
phob_F_W    0.094255
phil_R_K    0.074969
phob_L_I    0.069569
phil_N_Q    0.060904
phil_D_E    0.058722
phob_M_C    0.056581
phil_G      0.049410
phil_F_W    0.044180
phil_A      0.042935
phil_L_I    0.039271
phil_H      0.037708
phob_V      0.034185
phob_Y      0.032258
phob_S_T    0.028549
phob_G      0.017520
phil_Y      0.016024
phil_M_C    0.015815
phob_H      0.014430
phob_D_E    0.012443
phob_P      0.011509
phob_N_Q    0.010417
phob_A      0.008889
phil_P      0.007684
phob_R_K    0.005564
phil_V      0.003938
dtype: float64

In [10]:
# Write the ranked importances held in `feature_importance_complete` to CSV,
# keeping the predictor names (the Series index) as the first column.
# NOTE(review): the filename says "wholeAAs" - confirm that the most recently
# fitted model assigned to `feature_importance_complete` used the whole_* predictors.
feature_importance_complete.to_csv(
    path_or_buf='./Output/RFReg_feature_importance_wholeAAs.csv',
    index=True,
)

In [11]:
from sklearn.preprocessing import StandardScaler

# Whole-sequence composition predictors, spelled out in this cell so the model
# does not depend on whichever earlier cell last assigned `x_columns`.
# Order matches the whole-composition cell above.
whole_aa_columns = ['whole_S_T', 'whole_R_K', 'whole_L_I', 'whole_F_W', 'whole_M_C', 'whole_D_E', 'whole_N_Q',
                    'whole_A', 'whole_G', 'whole_V', 'whole_Y', 'whole_H', 'whole_P']

# Scaling
# NOTE(review): tree ensembles split on thresholds, so standardising is not
# expected to change the importances; kept to preserve the original pipeline.
AH_properties = ['Length', 'Hydrophobicity', 'H_moment', 'Netcharge', 'Dfactor']
scaler = StandardScaler()
scaled_data = pd.DataFrame(
    scaler.fit_transform(df[AH_properties]),
    columns=AH_properties,
    # Reuse df's row labels: the column assignment below aligns on the index,
    # so a fresh 0..n-1 index would misalign (or produce NaN) if df is ever
    # filtered or re-indexed.
    index=df.index,
)

# Combine the scaled additional predictors with the other predictors
x_columns_combined = whole_aa_columns + AH_properties
X_combined_scaled = df[x_columns_combined].copy()
for col in AH_properties:
    X_combined_scaled[col] = scaled_data[col]

# Define data
y_column = 'Median_boxcox'
X_train = X_combined_scaled
y = df[y_column]

# Train a Random Forest Regressor with the combined set of predictors
rf_regressor_combined_scaled = RandomForestRegressor(n_estimators=100, random_state=0)
rf_regressor_combined_scaled.fit(X_train, y)

# Get feature importances from the regressor, labelled by predictor name and
# sorted from most to least important.
importances_combined_scaled = rf_regressor_combined_scaled.feature_importances_
feature_importance_combined_scaled = pd.Series(importances_combined_scaled, index=X_combined_scaled.columns).sort_values(ascending=False)

# Display and save the importances
feature_importance_combined_scaled.to_csv("./Output/RFReg_feature_importance_allproperties.csv", index=True)
feature_importance_combined_scaled

whole_M_C         0.125007
Hydrophobicity    0.119609
whole_S_T         0.099575
whole_H           0.093730
H_moment          0.074005
whole_R_K         0.073249
Length            0.073189
Dfactor           0.048700
whole_A           0.046670
whole_N_Q         0.041726
whole_F_W         0.041366
whole_L_I         0.034300
whole_D_E         0.029712
whole_V           0.027675
whole_G           0.018838
whole_Y           0.018012
whole_P           0.017393
Netcharge         0.017243
dtype: float64