In [None]:
import pandas as pd
import itertools
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.inspection import permutation_importance
from sklearn.metrics import classification_report, confusion_matrix

import matplotlib.pyplot as plt
import numpy as np

# Load the processed per-protein data (labels) and the matching fingerprint
# archives. The inputs are pickles: only unpickle files from a trusted source.
f1, f2, f3 = (
    pd.read_pickle(pickle_path)
    for pickle_path in (
        '../processed_data/TMPRSS11D_processed.pkl',
        '../processed_data/ST14_processed.pkl',
        '../processed_data/TMPRSS6_processed.pkl',
    )
)
fp1, fp2, fp3 = (
    np.load(npz_path)
    for npz_path in (
        '../dumps/deprecated/TMPRSS11D_morgan_fingerprints.npz',
        '../dumps/deprecated/ST14_morgan_fingerprints.npz',
        '../dumps/deprecated/TMPRSS6_morgan_fingerprints.npz',
    )
)
# Naively pool the data for all three proteins into one combined object.
features = pd.concat([f1, f2, f3])

In [None]:
# Inspect the data. A bare `features.head(5)` is only rendered when it is the
# last expression of a notebook cell, which it is not here, so print it.
print(features.head(5))
# Empty lists that will hold the merged fingerprints, the 'acvalue' entries
# and the 0/1 activity labels, in matching order.
fps_merged = []
ac_merged = []
activity = []
# Function to merge npz arrays with their labels.
def merge_keys(in_xarray, in_yarray, in_y2array, in_fp, in_data):
    """Append the fingerprint and labels for every key of ``in_fp`` to the lists.

    Each key of ``in_fp`` is converted with ``int`` and compared against
    ``in_data['cid']``; the first matching row supplies the labels.

    Parameters
    ----------
    in_xarray : list
        Receives ``in_fp[key]`` for each key (mutated in place).
    in_yarray : list
        Receives the matching ``'acvalue'`` entry (mutated in place).
    in_y2array : list
        Receives 1 when the matching ``'activity'`` entry equals ``'Active'``
        and 0 otherwise (mutated in place).
    in_fp : mapping-like
        Object offering ``keys()`` and item access (e.g. a loaded ``.npz``
        archive); every key must be convertible with ``int``.
    in_data : pandas.DataFrame
        Table with ``'cid'``, ``'acvalue'`` and ``'activity'`` columns.

    Returns
    -------
    tuple
        ``(in_xarray, in_yarray, in_y2array)`` -- the same list objects that
        were passed in, after appending.

    Raises
    ------
    IndexError
        If a key has no row with a matching ``'cid'`` (``.iloc[0]`` on an
        empty selection).
    """
    # Materialise the key list once up front rather than on every iteration.
    for key in list(in_fp.keys()):
        # One boolean mask per key, shared by both column lookups.
        match = in_data['cid'] == int(key)
        acval = in_data.loc[match, 'acvalue'].iloc[0]
        active = in_data.loc[match, 'activity'].iloc[0]
        in_yarray.append(acval)
        in_xarray.append(in_fp[key])
        in_y2array.append(1 if active == 'Active' else 0)
    return in_xarray, in_yarray, in_y2array
            

In [None]:
# Start the first merge from fresh lists so that re-running this cell rebuilds
# the merged data instead of appending every record again to lists that are
# still populated from a previous run.
fps_merged, ac_merged, activity = merge_keys([], [], [], fp1, features)
fps_merged, ac_merged, activity = merge_keys(fps_merged, ac_merged, activity, fp2, features)
fps_merged, ac_merged, activity = merge_keys(fps_merged, ac_merged, activity, fp3, features)

In [None]:
# UNUSED / NOT RUN
def chars_to_ints(char_array):
    """Convert every second element of ``char_array`` into an integer array.

    Elements are taken starting at index 1, stepping by two, and stopping
    before the last index; each one is passed through ``int``. The result is
    returned as a numpy array.
    """
    stop = len(char_array) - 1
    picked = itertools.islice(char_array, 1, stop, 2)
    return np.asarray(list(map(int, picked)))
# NOT RUN

In [None]:
# Hold out 33% of the samples for testing (roughly 2/3 train, 1/3 test).
# Integer random_state values keep the split and the forest reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    fps_merged,
    activity,
    test_size=0.33,
    random_state=39,
)
# Fit a 50-tree random forest on the training portion (this can take a while).
rf = RandomForestClassifier(n_estimators=50, random_state=111, verbose=2)
rf.fit(np.asarray(X_train), np.asarray(y_train))

In [None]:
# Predict labels for the held-out samples.
y_pred = rf.predict(X_test)
# Optional inspection (left disabled):
#   print(y_pred)
#   print(rf.predict_proba(X_test))  # show probabilities
# Report the classifier's score on the test split.
test_score = rf.score(X_test, y_test)
print(test_score)

In [None]:
# Rank the fingerprint features by the fitted forest's feature_importances_
# and plot the 25 highest-ranked ones.
importances = rf.feature_importances_
# String labels for every feature index, zero-padded to at least two digits.
featurenums = np.array(['%02d' % x for x in range(len(importances))])
# argsort is ascending, so reverse it before keeping the first 25 entries.
indices = np.argsort(importances)[::-1][:25]
bar_positions = range(len(indices))
plt.bar(bar_positions, importances[indices], align='center')
plt.xticks(bar_positions, featurenums[indices], rotation=90)
plt.ylabel('Relative Importance')
plt.title('Feature Importances (train set)')
plt.show()
# Output metrics for the test-set predictions.
print(classification_report(y_test, y_pred))
