In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
# from sklearn.feature_selection import SelectFromModel

In [4]:
# Load the transformed summary table into `df`; the path is resolved relative
# to the notebook's working directory.
df = pd.read_csv(
    '../Output/NEES_summary_with_heliquest_Median_Transformed.csv'
)

In [5]:
# Show every column name in the loaded table so the predictor and target
# columns used in the following cells can be chosen by name.
df.columns

Index(['AA_seq', 'Category', 'AH', 'Median', 'Mean', 'phil_A', 'phob_A',
       'phil_G', 'phob_G', 'phil_V', 'phob_V', 'phil_L', 'phob_L', 'phil_I',
       'phob_I', 'phil_F', 'phob_F', 'phil_W', 'phob_W', 'phil_M', 'phob_M',
       'phil_Y', 'phob_Y', 'phil_C', 'phob_C', 'phil_S', 'phob_S', 'phil_T',
       'phob_T', 'phil_R', 'phob_R', 'phil_K', 'phob_K', 'phil_N', 'phob_N',
       'phil_Q', 'phob_Q', 'phil_D', 'phob_D', 'phil_E', 'phob_E', 'phil_H',
       'phob_H', 'phil_P', 'phob_P', 'Length', 'Hydrophobicity',
       'Hydrophobic_density', 'H_moment', 'Netcharge', 'Dfactor', 'phil_S_T',
       'phil_R_K', 'phil_L_I', 'phil_F_W_Y', 'phil_M_C', 'phil_D_E',
       'phil_N_Q', 'phob_S_T', 'phob_R_K', 'phob_L_I', 'phob_F_W_Y',
       'phob_M_C', 'phob_D_E', 'phob_N_Q', 'STRK_ratio', 'STRK_ratio_Log1p',
       'Median_Log1p', 'Median_boxcox'],
      dtype='object')

In [12]:
# Regression target and the predictor columns fed to the model.
y_column = 'Median_boxcox'
x_columns = [
    'Length',
    'Hydrophobicity',
    'Hydrophobic_density',
    'H_moment',
    'Netcharge',
    'Dfactor',
    'STRK_ratio_Log1p',
]

# Keep only rows that have a target value. The surviving rows retain their
# original index labels, so the index may contain gaps after this step.
df = df[df[y_column].notna()]

# Design matrix and target vector taken from the filtered frame.
X_train = df.loc[:, x_columns]
y = df.loc[:, y_column]

In [13]:
# Fit a 100-tree random forest (fixed seed for repeatability) on the full
# predictor set against the Box-Cox transformed target. `fit` returns the
# estimator itself, so construction and fitting are chained.
rf_regressor_complete = RandomForestRegressor(
    random_state=0,
    n_estimators=100,
).fit(X_train, y)

# Pull the per-feature importances off the fitted forest and rank them from
# most to least important, labelled by predictor name.
importances_complete = rf_regressor_complete.feature_importances_
feature_importance_complete = (
    pd.Series(data=importances_complete, index=x_columns)
    .sort_values(ascending=False)
)

# Leave the ranked Series as the cell's last expression so it is displayed.
feature_importance_complete

Hydrophobic_density    0.223205
Hydrophobicity         0.183410
Length                 0.163062
H_moment               0.149603
Dfactor                0.120742
STRK_ratio_Log1p       0.118277
Netcharge              0.041702
dtype: float64

In [7]:
# Write the ranked importances to disk; index=True keeps the feature names as
# the first CSV column.
# NOTE(review): this writes under './Output/' while the input CSV is read from
# '../Output/' -- confirm which directory is intended.
feature_importance_complete.to_csv(
    './Output/RFReg_feature_importance.csv',
    index=True,
)

In [16]:
from sklearn.preprocessing import StandardScaler

# Standardise the predictor columns (zero mean, unit variance) and refit the
# random forest on the scaled design matrix.
AH_properties = x_columns
scaler = StandardScaler()

# fit_transform returns a plain NumPy array, so the frame rebuilt from it must
# be given df's own index explicitly. df had rows removed by dropna earlier, so
# its index can contain gaps; with a default 0..n-1 RangeIndex the column
# assignment below (which aligns on index labels) would attach scaled values to
# the wrong samples and leave NaN in rows whose label has no match.
scaled_data = pd.DataFrame(
    scaler.fit_transform(df[AH_properties]),
    columns=AH_properties,
    index=df.index,
)

# Combine the scaled additional predictors with the other predictors
# (currently every predictor is scaled, so the two column lists coincide).
x_columns_combined = AH_properties
X_combined_scaled = df[x_columns_combined].copy()
for col in AH_properties:
    # Index-aligned assignment; safe because scaled_data shares df's index.
    X_combined_scaled[col] = scaled_data[col]

# Define data
y_column = 'Median_boxcox'
X_train = X_combined_scaled
y = df[y_column]

# Train a Random Forest Regressor with the combined set of predictors
# (same hyper-parameters and seed as the unscaled model, for comparability).
rf_regressor_combined_scaled = RandomForestRegressor(n_estimators=100, random_state=0)
rf_regressor_combined_scaled.fit(X_train, y)

# Get feature importances from the regressor, ranked high to low.
# NOTE(review): tree splits depend only on the ordering of feature values, so
# standardising is expected to leave these importances essentially unchanged
# relative to the unscaled model -- confirm this is the intended comparison.
importances_combined_scaled = rf_regressor_combined_scaled.feature_importances_
feature_importance_combined_scaled = pd.Series(importances_combined_scaled, index=X_combined_scaled.columns).sort_values(ascending=False)

# Display and save the importances
# feature_importance_combined_scaled.to_csv("./Output/RFReg_feature_importance_allproperties.csv", index=True)
feature_importance_combined_scaled

Hydrophobic_density    0.223205
Hydrophobicity         0.183410
Length                 0.163062
H_moment               0.149603
Dfactor                0.120742
STRK_ratio_Log1p       0.118277
Netcharge              0.041702
dtype: float64