In [14]:
import pandas as pd
import numpy as np
import ppscore as pps

# Folder that holds the input CSV. The location is written once here and the
# file name is derived from it below, so the two cannot drift apart.
path = r"C:\Users\Roman\Documents\CleanData\NormByWell"

# Columns discarded immediately after loading, before any scoring or modelling.
# NOTE(review): these look like the saved index, plate-layout identifiers and a
# duplicated measurement -- confirm against the export that produced the file.
columns_to_drop = ['Unnamed: 0', 'Row', 'Column', 'PlateName', 'Nucleus Width [µm] (2)']

# Load the profiles table, remove the columns above and rename 'Compound' to 'y',
# the name the later cells use for the prediction target. The chain returns a new
# frame at each step instead of mutating `df` in place.
df = (
    pd.read_csv(path + r"\tubesProfiles.csv")
    .drop(columns=columns_to_drop)
    .rename(columns={'Compound': 'y'})
)

In [15]:
# Score every column of `df` as a single-feature predictor of the target 'y'.
# The result is a frame with one row per feature (column 'x') and its 'ppscore'.
ppscores = pps.predictors(df, y="y")

# Preview the first 20 rows of the score table.
ppscores.head(n=20)

Unnamed: 0,x,y,ppscore,case,is_valid_score,metric,baseline_score,model_score,model
0,Cell 488 Threshold Compactness 50% SER-Hole,y,0.197921,classification,True,weighted F1,0.050452,0.238388,DecisionTreeClassifier()
1,Nucleus Width [µm],y,0.196459,classification,True,weighted F1,0.050452,0.236999,DecisionTreeClassifier()
2,Cell 488 Threshold Compactness 40% SER-Hole,y,0.181294,classification,True,weighted F1,0.050452,0.2226,DecisionTreeClassifier()
3,Cell Mito Threshold Compactness 50% SER-Hole,y,0.179839,classification,True,weighted F1,0.050452,0.221218,DecisionTreeClassifier()
4,Cell 488 Threshold Compactness 30% SER-Hole,y,0.175212,classification,True,weighted F1,0.050452,0.216824,DecisionTreeClassifier()
5,Cell Mito Threshold Compactness 60% SER-Hole,y,0.174113,classification,True,weighted F1,0.050452,0.215781,DecisionTreeClassifier()
6,Nucleus 33342 Threshold Compactness 40% SER-Hole,y,0.171281,classification,True,weighted F1,0.050452,0.213092,DecisionTreeClassifier()
7,Cell 568 Threshold Compactness 50% SER-Hole,y,0.16957,classification,True,weighted F1,0.050452,0.211467,DecisionTreeClassifier()
8,Nucleus 33342 Threshold Compactness 30% SER-Hole,y,0.168557,classification,True,weighted F1,0.050452,0.210505,DecisionTreeClassifier()
9,Cell 488 Threshold Compactness 60% SER-Valley,y,0.167489,classification,True,weighted F1,0.050452,0.209491,DecisionTreeClassifier()


In [28]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix

# Minimum predictive power score a feature needs in order to be kept.
PPS_THRESHOLD = 0.05

# Keep the features whose ppscore exceeds the threshold, plus the target column 'y'.
# pd.concat replaces Series.append, which is deprecated and no longer exists in
# pandas 2.x; the resulting Series holds the same values in the same order.
selected_features = ppscores.loc[ppscores['ppscore'] > PPS_THRESHOLD, 'x']
features_to_keep = pd.concat([selected_features, pd.Series(['y'])])
df_filt = df[features_to_keep]

# Predictors exclude the target and 'Concentration'. errors='ignore' keeps this
# cell working when 'Concentration' did not pass the ppscore filter and is
# therefore already absent from df_filt.
# NOTE(review): presumably 'Concentration' is an experimental-design column rather
# than a measured feature -- confirm that excluding it is intended.
X = df_filt.drop(columns=['y', 'Concentration'], errors='ignore')
y = df_filt['y']

# Stratified 80/20 split so the class proportions of `y` are kept in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, stratify=y, test_size=0.2
)

# Shallow random forest (depth capped at 5) with a fixed seed for repeatability.
rf = RandomForestClassifier(n_estimators=500, random_state=42, max_depth=5)
rf.fit(X_train, y_train)

# Hard class predictions and per-class probabilities on the held-out split.
y_pred = pd.Series(rf.predict(X_test))
y_pred_proba = rf.predict_proba(X_test)

print('Accuracy of random forest classifier on test set: {:.2f}'.format(rf.score(X_test, y_test)))
# One-vs-rest ROC AUC computed from the predicted class probabilities.
print('ROC AUC score of random forest classifier on test set: {:.2f}'.format(roc_auc_score(y_test, y_pred_proba, multi_class='ovr')))

# Confusion matrix of true vs. predicted labels; stored here, not displayed.
conf_matrix = confusion_matrix(y_test, y_pred)

# Impurity-based feature importances of the fitted forest, largest first.
feature_importance = (
    pd.DataFrame({'feature': X_train.columns, 'importance': rf.feature_importances_})
    .sort_values(by='importance', ascending=False)
    .reset_index(drop=True)
)
feature_importance

  features_to_keep = ppscores.loc[ppscores['ppscore'] > 0.05, 'x'].append(pd.Series('y'))


Accuracy of random forest classifier on test set: 0.37
ROC AUC score of random forest classifier on test set: 0.88


Unnamed: 0,feature,importance
0,Cytoplasm 488 Profile 4/5,0.045289
1,Intensity Nucleus Alexa 555 Mean,0.043754
2,Nucleus Alexa 488 SER Hole 0 px,0.036987
3,Cytoplasm 488 Profile 5/5,0.032426
4,Intensity Ring Region Alexa 488 Mean,0.032341
...,...,...
124,Cell 568 Threshold Compactness 60% SER-Edge,0.001261
125,Cell Mito Threshold Compactness 50% SER-Saddle,0.001241
126,Nucleus 33342 Threshold Compactness 40% SER-Dark,0.001127
127,Cell Mito Threshold Compactness 40% SER-Saddle,0.001014


In [34]:
from sklearn.inspection import permutation_importance

# Permutation importance of the fitted forest `rf`, using 10 shuffles per feature,
# a fixed seed and two worker processes.
# NOTE(review): this is scored on the training split (X_train / y_train); scoring
# on X_test / y_test would instead show which features help generalisation --
# confirm which of the two is wanted.
result = permutation_importance(
    estimator=rf,
    X=X_train,
    y=y_train,
    n_repeats=10,
    n_jobs=2,
    random_state=42,
)


{'importances_mean': array([ 1.49253731e-03,  2.46268657e-03,  1.86567164e-03,  2.42537313e-03,
         2.16417910e-03,  6.23134328e-03,  1.06343284e-02,  4.10447761e-03,
         2.98507463e-03, -7.46268657e-05,  5.33582090e-03,  5.41044776e-03,
         1.71641791e-03,  1.79104478e-03,  1.11940299e-03,  4.85074627e-04,
         4.21641791e-03,  1.34328358e-03,  5.22388060e-04,  1.26865672e-03,
         2.83582090e-03,  1.28358209e-02,  7.83582090e-03,  1.26865672e-03,
         1.60447761e-03,  1.41791045e-03,  1.97761194e-03,  1.75373134e-03,
         4.47761194e-04,  1.41791045e-03,  1.64179104e-03,  6.64179104e-03,
         7.42537313e-03,  7.01492537e-03,  1.23134328e-03,  1.41791045e-03,
         2.91044776e-03,  5.59701493e-03,  3.54477612e-03,  2.91044776e-03,
         2.42537313e-03,  1.60447761e-03,  2.94776119e-03,  3.61940299e-03,
         2.20149254e-03,  0.00000000e+00,  1.11940299e-03,  1.82835821e-03,
         5.97014925e-04,  8.95522388e-04,  4.10447761e-04,  1.641791

In [37]:
# Mean permutation importance per feature, averaged over the repeats; entries
# follow the column order of the frame given to permutation_importance.
# (A dangling, incomplete `rf.` expression was removed: it made the whole cell
# a SyntaxError.)
result.importances_mean


array([ 1.49253731e-03,  2.46268657e-03,  1.86567164e-03,  2.42537313e-03,
        2.16417910e-03,  6.23134328e-03,  1.06343284e-02,  4.10447761e-03,
        2.98507463e-03, -7.46268657e-05,  5.33582090e-03,  5.41044776e-03,
        1.71641791e-03,  1.79104478e-03,  1.11940299e-03,  4.85074627e-04,
        4.21641791e-03,  1.34328358e-03,  5.22388060e-04,  1.26865672e-03,
        2.83582090e-03,  1.28358209e-02,  7.83582090e-03,  1.26865672e-03,
        1.60447761e-03,  1.41791045e-03,  1.97761194e-03,  1.75373134e-03,
        4.47761194e-04,  1.41791045e-03,  1.64179104e-03,  6.64179104e-03,
        7.42537313e-03,  7.01492537e-03,  1.23134328e-03,  1.41791045e-03,
        2.91044776e-03,  5.59701493e-03,  3.54477612e-03,  2.91044776e-03,
        2.42537313e-03,  1.60447761e-03,  2.94776119e-03,  3.61940299e-03,
        2.20149254e-03,  0.00000000e+00,  1.11940299e-03,  1.82835821e-03,
        5.97014925e-04,  8.95522388e-04,  4.10447761e-04,  1.64179104e-03,
        2.38805970e-03,  