In [None]:
import pandas as pd
import itertools
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.inspection import permutation_importance
from sklearn.metrics import classification_report, confusion_matrix

import matplotlib.pyplot as plt
import numpy as np

#load data labels and RDkit fingerprints
#naively assign data for all proteins to one giant array
#NOTE: unpickling can execute arbitrary code -- only load pickle files from a trusted source
f_truth = pd.read_pickle('../processed_data/TMPRSS2_processed.pkl')
f_tmpss11d = pd.read_pickle('../processed_data/TMPRSS11D_processed.pkl')
f_st14 = pd.read_pickle('../processed_data/ST14_processed.pkl')
f_tmprss6 = pd.read_pickle('../processed_data/TMPRSS6_processed.pkl')
f_klkb1 = pd.read_pickle('../processed_data/KLKB1_processed.pkl')
#only the 'fps' array of the dark-chemical-matter archive is used, so read it and close the file
with np.load('../dumps/deprecated/DarkChemicalMatter_morgan_fingerprints.npz') as dark_npz:
    fp_dark = dark_npz['fps']
#the per-protein archives are kept open: merge_keys looks fingerprints up by key later on
fp_tmprss11d = np.load('../dumps/deprecated/TMPRSS11D_morgan_fingerprints.npz')
fp_st14 = np.load('../dumps/deprecated/ST14_morgan_fingerprints.npz')
fp_tmprss6= np.load('../dumps/deprecated/TMPRSS6_morgan_fingerprints.npz')
fp_klkb1=np.load('../dumps/deprecated/KLKB1_morgan_fingerprints.npz')
fp_truth=np.load('../dumps/deprecated/TMPRSS2_morgan_fingerprints.npz')
#take the cids from the fingerprint keys
#NOTE(review): relies on fp_truth keys being in the same order as the rows of f_truth -- confirm
f_truth= f_truth.assign(cid= [int(lkey) for lkey in fp_truth.keys()]) #reassign 0 cids
#TODO: use combined dataset for training with xcorr vals
#method 1: try to concat all datasets together
features=pd.concat([f_tmprss6, f_klkb1, f_st14, f_tmpss11d, f_truth]) #concat datasets in order of closeness to tmprss2, from Doug's analysis
#drop_duplicates returns a new frame; without the assignment the duplicates were never removed
features=features.drop_duplicates(subset='cid', keep='last') #prioritize activity from tmprss2 dataset

In [None]:
#fp_dark is the 'fps' array taken out of the npz archive (not the archive itself),
#so it has no .keys(); show its dimensions instead
print(fp_dark.shape)

In [None]:
features.head(5) #inspect data
#empty accumulators handed to merge_keys: fingerprints, activity values,
#binary activity flags, compound ids and per-row source labels
fps_merged, ac_merged, activity, cids, datalabels = [], [], [], [], []
#function to merge npz arrays
def merge_keys(in_xarray, in_yarray, in_y2array, in_fp, in_data, in_cids,datalabel):
    """Append every not-yet-merged compound of one fingerprint archive to the accumulators.

    Parameters
    ----------
    in_xarray : list
        Receives the fingerprint stored in ``in_fp`` for each newly added compound.
    in_yarray : list
        Receives the 'acvalue' of the first ``in_data`` row with that cid.
    in_y2array : list
        Receives 1 when that row's 'activity' equals 'Active', otherwise 0.
    in_fp : mapping
        Fingerprints keyed by cid; keys must be accepted by ``int()`` and the
        object must support ``.keys()`` and ``[key]``.
    in_data : pandas.DataFrame
        Must provide the columns 'cid', 'acvalue' and 'activity'.
    in_cids : list of int
        cids merged so far. Keys whose integer value is already in it are
        skipped; newly added cids are appended.
    datalabel : object
        Appended once per added compound to the module-level ``datalabels`` list.

    Returns
    -------
    tuple
        ``(in_xarray, in_yarray, in_y2array, in_cids, datalabels)`` -- the same
        list objects that were passed in (plus the global ``datalabels``),
        extended in place.

    Raises
    ------
    IndexError
        If a new cid has no row in ``in_data``. The lookup happens before any
        list is touched, so the accumulators stay aligned with each other.
    """
    # set copy of the known cids: O(1) membership tests instead of scanning the list
    seen = set(in_cids)
    # materialise the keys once instead of rebuilding the list on every access
    for key in list(in_fp.keys()):
        cid = int(key)
        if cid in seen:
            continue
        # first matching row wins, so the row order of in_data decides between duplicates
        rows = in_data.loc[in_data['cid'] == cid]
        acval = rows['acvalue'].iloc[0]
        active = rows['activity'].iloc[0]
        seen.add(cid)
        in_cids.append(cid)
        in_yarray.append(acval)
        in_xarray.append(in_fp[key])
        in_y2array.append(1 if active == 'Active' else 0)
        # `datalabels` is the notebook-level list, not a parameter
        datalabels.append(datalabel)
    return in_xarray, in_yarray, in_y2array, in_cids, datalabels
            

In [None]:
#merge in order of similarity: TMPRSS2 first, so its compounds occupy the leading rows
merge_order = [
    (fp_truth, 'tmprss2'),
    (fp_tmprss11d, 'tmprss11d'),
    (fp_st14, 'st14'),
    (fp_klkb1, 'klkb1'),
    (fp_tmprss6, 'tmprss6'),
]
for fp_source, source_label in merge_order:
    fps_merged, ac_merged, activity, cids, datalabels = merge_keys(
        fps_merged, ac_merged, activity, fp_source, features, cids, datalabel=source_label)

In [None]:
#turn the merged python lists into numpy arrays
fps_merged, activity = np.array(fps_merged), np.array(activity)
#the regression target is the negative log10 of the activity value
ac_merged = -np.log10(np.array(ac_merged))

In [None]:
#split train and test 
#half of tmprss2 active compounds to each set
import random
#private, seeded generator: the split is now the same on every Restart & Run All
#(same seed value as the random_state used for the forests)
split_rng = random.Random(111)
#TMPRSS2 was merged first, so its compounds are the leading rows of the merged arrays.
#NOTE(review): assumes one merged row per fp_truth key (this count replaces the hard-coded 92) -- confirm
n_tmprss2 = len(fp_truth.keys())
s=np.arange(n_tmprss2)
split_rng.shuffle(s)
cut=41 #number of TMPRSS2 compounds held out for testing
test=s[0:cut]
train=s[cut::]

#add dark data. here we generate 1 test set that includes only data from the TMPRSS2 dataset + negative examples,
#and training data culled from all protein datasets + negative examples that are not in the test set.
#the TMPRSS2 samples and the negative examples are split randomly; about 50 percent of the negatives go to each set.
s=np.arange(len(fp_dark))
split_rng.shuffle(s)
cut=round(len(fp_dark)/2)
test_dark=s[0:cut]
train_dark=s[cut::]
#rows from n_tmprss2 onwards belong to the other proteins and are used for training only
X_test=np.concatenate((fps_merged[test], fp_dark[test_dark]))
X_train=np.concatenate((fps_merged[train], fps_merged[n_tmprss2:], fp_dark[train_dark]))
#dark compounds get label 0 (classifier) and target 0 (regressor)
y_train=np.concatenate((activity[train], activity[n_tmprss2:], np.zeros([len(train_dark)])))
y2_train=np.concatenate((ac_merged[train], ac_merged[n_tmprss2:], np.zeros([len(train_dark)])))
y_test=np.concatenate((activity[test], np.zeros([len(test_dark)])))
y2_test=np.concatenate((ac_merged[test],np.zeros([len(test_dark)])))

In [None]:
#an earlier variant split the data with sklearn instead of the manual split:
#X_train, X_test, y_train, y_test = train_test_split(fps_merged,activity, test_size=0.33, random_state=39)
#(about 2/3 training, 1/3 test; integer random_state for reproducibility)
#regressor counterpart, currently disabled:
#rg=RandomForestRegressor(verbose=2, n_estimators=100, random_state=111)
#train the classification forest (this can take a while)
rf = RandomForestClassifier(n_estimators=100, random_state=111, verbose=2)
rf.fit(np.asarray(X_train), np.asarray(y_train))

In [None]:
#predicted class for every test compound
y_pred = rf.predict(X_test)
#print(y_pred)
#show probabilities; print(rf.predict_proba(X_test))
#score of the classifier on the test set
test_score = rf.score(X_test, y_test)
print(test_score)

In [None]:
#rank the fingerprint features by the importance the classifier assigned to them
importances = rf.feature_importances_
#zero-padded feature index, used as tick label
featurenums = np.array(["{:02d}".format(bit) for bit in range(len(importances))])
indices = np.argsort(importances)[::-1][:25] #the 25 most important features, largest first
bar_positions = range(len(indices))
plt.bar(bar_positions, importances[indices], align='center')
plt.xticks(bar_positions, featurenums[indices], rotation=90)
plt.ylabel('Relative Importance')
plt.title('Feature Importances (train set)')
plt.show()
#output metrics
print(classification_report(y_test, y_pred))



In [None]:
#keep only the training rows whose regression target is not NaN
nan_array = np.isnan(y2_train)
has_target = np.logical_not(nan_array)
X_train2, y2_train2 = X_train[has_target], y2_train[has_target]

In [None]:
#now train the regressor, which is to predict the activity value itself
#takes about 1 hour to run
#create the regressor here: before this cell `rg` only exists in a commented-out line,
#so on a fresh kernel the fit call failed with NameError
#(note: a later cell rebinds `rg` to a second regressor)
rg=RandomForestRegressor(verbose=2, n_estimators=100, random_state=111)
rg.fit(np.asarray(X_train2), np.asarray(y2_train2))

In [None]:
#regression data without the dark-matter negatives:
#test = held-out TMPRSS2 rows, train = remaining TMPRSS2 rows + every other protein
#TMPRSS2 was merged first, so the other proteins start right after its rows.
#NOTE(review): assumes one merged row per fp_truth key (this count replaces the hard-coded 92) -- confirm
n_tmprss2 = len(fp_truth.keys())
X_test_reg=fps_merged[test]
X_train_reg=np.concatenate((fps_merged[train], fps_merged[n_tmprss2:]))
y2_train_reg=np.concatenate((ac_merged[train], ac_merged[n_tmprss2:]))
y2_test_reg=ac_merged[test]
#rows with a NaN target cannot be used for fitting
nan_array=np.isnan(y2_train_reg)
X_train_reg=X_train_reg[~nan_array]
y2_train_reg=y2_train_reg[~nan_array]
rg=RandomForestRegressor(verbose=2, n_estimators=50, random_state=111)
rg.fit(np.asarray(X_train_reg), np.asarray(y2_train_reg))

In [None]:
from sklearn.metrics import r2_score
#R^2 of the regressor on the full test set (X_test / y2_test)
y_pred_regr = rg.predict(X_test)
r2_full = r2_score(y2_test, y_pred_regr)
print(r2_full)

In [None]:
#R^2 of the regressor on X_test_reg / y2_test_reg only
y_pred_reg_noneg = rg.predict(X_test_reg)
r2_noneg = r2_score(y2_test_reg, y_pred_reg_noneg)
print(r2_noneg)

In [None]:
#same ranking plot as for the classifier, now with the regressor's importances
importances = rg.feature_importances_
#zero-padded feature index, used as tick label
featurenums = np.array(["{:02d}".format(bit) for bit in range(len(importances))])
indices = np.argsort(importances)[::-1][:25] #the 25 most important features, largest first
bar_positions = range(len(indices))
plt.bar(bar_positions, importances[indices], align='center')
plt.xticks(bar_positions, featurenums[indices], rotation=90)
plt.ylabel('Relative Importance')
plt.title('Feature Importances (train set)')
plt.show()

In [None]:
# Plot the actual regression targets of the test compounds (x axis = position in the test set)
plt.plot(y2_test_reg, 'b.', label = 'actual')
# Plot the predicted values
plt.plot(y_pred_reg_noneg, 'ro', label = 'prediction')
# rotation has to be a number; the string '60' is rejected by current matplotlib
plt.xticks(rotation = 60); 
# the two series carry labels, so draw the legend that shows them
plt.legend()

# Graph labels
plt.ylabel('activity value'); plt.title('Actual and Predicted Values');