# Combine Results
Author: Doug Klink (dklink@stanford.edu)

Here we combine the results generated by KNN, Random Forest, and SVM.  We create a final output table containing the score each method gave to each compound (NaN where a method did not score the compound) and a `combined_acvalue` column holding the weighted average of the available methods' scores.  Each method's weight is the inverse of the RMSE it achieved in its regression validation; a method with no score for a compound is left out of that compound's average.

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read the KNN predictions, using the first CSV column as the index and then
# discarding it in favour of a fresh 0..n-1 index.
knn = pd.read_csv('../results/knn_results.csv', index_col=0)
knn = knn.reset_index(drop=True)
knn.head(10)

Unnamed: 0,source,name,predicted_acvalue(log10),RMSE
0,reframe,NAFAMOSTAT,-4.57386,0.716535
1,broad,CTS21166,-3.261067,0.716535
2,drugbank,IPAMORELIN,-3.25877,0.716535
3,drugbank,"1-[(4S)-4-AMINO-5-(1,3-BENZOTHIAZOL-2-YL)-5-OX...",-3.230251,0.716535
4,drugbank,L-N(OMEGA)-NITROARGININE-(4R)-AMINO-L-PROLINE ...,-3.226611,0.716535
5,drugbank,RWJ-56423,-3.215892,0.716535
6,drugbank,ANGIOTENSIN II,-3.199554,0.716535
7,drugbank,THYMOPENTIN,-3.121986,0.716535
8,drugbank,BIO-11006,-3.066794,0.716535
9,drugbank,PLECANATIDE,-3.064554,0.716535


In [3]:
# Read the random-forest predictions; the CSV's first column is used as the
# index and then replaced by a fresh 0..n-1 index.
RF_SCORE_COLUMN = 'predicted_activity(log10)'
rf = (
    pd.read_csv('../results/random_forest_results.csv', index_col=0)
    .reset_index(drop=True)
)
# Flip the sign of the random-forest score.
# NOTE(review): presumably the file stores this value with the opposite sign
# convention to the KNN/SVM tables -- confirm against the notebook that wrote it.
rf[RF_SCORE_COLUMN] = -rf[RF_SCORE_COLUMN]
rf.head(10)

Unnamed: 0,name,smiles,predicted_activity(log10),RMSE
0,NAFAMOSTAT,C1=CC(=CC=C1C(=O)OC2=CC3=C(C=C2)C=C(C=C3)C(=N)...,-2.797827,0.348182
1,RWJ-51084,[H]N([H])C(=NCCC[C@H](N([H])C(=O)C1CCCC1)C(=O)...,-2.49919,0.348182
2,RWJ-56423,[H]N([H])C(=NCCC[C@H](N([H])C(=O)[C@@H]1C[C@@H...,-2.365654,0.348182
3,PATAMOSTAT,C1CC(=O)N(C1=O)CCSC2=CC=C(C=C2)OC(=O)C3=CC=C(C...,-2.248543,0.348182
4,CAMOSTAT,CN(C)C(=O)COC(=O)CC1=CC=C(C=C1)OC(=O)C2=CC=C(C...,-2.212964,0.348182
5,"1-[(4S)-4-AMINO-5-(1,3-BENZOTHIAZOL-2-YL)-5-OX...",N[C@@H](CCCNC(N)=N)C(=O)C1=NC2=CC=CC=C2S1,-1.974841,0.348182
6,RWJ-58643,CC(=O)N1C[C@H](O)C[C@H]1C(=O)NC(CCCNC(N)=N)C(=...,-1.959521,0.348182
7,PGL5001,N#CC(c1nc2ccccc2s1)c1ccnc(OCc2ccc(CN3CCOCC3)cc...,-1.639983,0.348182
8,BMS-605339,CC(C)(C)[C@@H](C(=O)N1C[C@@H](C[C@H]1C(=O)N[C@...,-1.638343,0.348182
9,SEPIMOSTAT MESILATE,[H]/N=C(/C1=CC=C2C=C(C=CC2=C1)OC(=O)C3=CC=C(C=...,-1.637788,0.348182


In [4]:
# Read the SVM predictions; the CSV's first column is used as the index and
# then replaced by a fresh 0..n-1 index.
svm = (
    pd.read_csv('../results/svm_screening_results_no_duplicate_names.csv', index_col=0)
    .reset_index(drop=True)
)
# Upper-case the compound names before they are used as the join key.
# NOTE(review): names that differ only by case would collide after this step,
# even though the input file is labelled "no_duplicate_names" -- verify.
svm['name'] = svm['name'].str.upper()
svm.head(10)

Unnamed: 0,source,name,pred_value,RMSE
0,reframe,NAFAMOSTAT,-4.573851,0.750939
1,broad,SUBSTANCE-P,-3.270026,0.750939
2,reframe,CAMOSTAT,-3.063672,0.750939
3,broad,"[SAR9,MET(O2)11]-SUBSTANCE-P",-2.878899,0.750939
4,drugbank,"SAR9, MET (O2)11-SUBSTANCE P",-2.878899,0.750939
5,reframe,RWJ-58643,-2.231559,0.750939
6,reframe,SEPIMOSTAT MESILATE,-2.119949,0.750939
7,drugbank,RWJ-56423,-2.05168,0.750939
8,broad,TELAPREVIR,-2.013527,0.750939
9,reframe,ATECEGATRAN,-1.981752,0.750939


In [5]:
# Take each method's RMSE from the row labelled 0 of its results table.
# NOTE(review): this assumes the RMSE column holds one constant value per
# table, as the previews suggest -- confirm for the full files.
knn_RMSE = knn.loc[0, 'RMSE']
rf_RMSE = rf.loc[0, 'RMSE']
svm_RMSE = svm.loc[0, 'RMSE']

# Summary table: each method's RMSE and the weight (1/RMSE) it receives.
report_lines = [
    'Method          RMSE     Weight (1/RMSE)',
    f'knn            {knn_RMSE: .2f}    {1/knn_RMSE: .2f}',
    f'random forest  {rf_RMSE: .2f}    {1/rf_RMSE: .2f}',
    f'svm            {svm_RMSE: .2f}    {1/svm_RMSE: .2f}',
]
print('\n'.join(report_lines))

Method          RMSE     Weight (1/RMSE)
knn             0.72     1.40
random forest   0.35     2.87
svm             0.75     1.33


In [6]:
# Remove the columns that are not carried into the combined table, leaving
# only the compound name and each method's score.
knn = knn.drop(columns=['source', 'RMSE'])
rf = rf.drop(columns=['smiles', 'RMSE'])
svm = svm.drop(columns=['source', 'RMSE'])

In [7]:
# Give each method's score column a distinct, method-specific name so the
# three tables can be merged without column-name clashes.
knn = knn.rename({'predicted_acvalue(log10)': 'knn_acvalue'}, axis='columns')
rf = rf.rename({'predicted_activity(log10)': 'random_forest_acvalue'}, axis='columns')
svm = svm.rename({'pred_value': 'svm_acvalue'}, axis='columns')

In [8]:
# Outer-join the three score tables on compound name, so a compound scored by
# only some methods is kept and gets NaN for the methods that did not score it.
# NOTE(review): a name that appears more than once in any table would be
# multiplied by the join -- confirm names are unique in all three inputs.
rf_and_svm = pd.merge(rf, svm, on='name', how='outer')
combined = pd.merge(rf_and_svm, knn, on='name', how='outer')

In [9]:
# Preview the first ten rows of the merged table (NaN marks a missing score).
combined.head(n=10)

Unnamed: 0,name,random_forest_acvalue,svm_acvalue,knn_acvalue
0,NAFAMOSTAT,-2.797827,-4.573851,-4.57386
1,RWJ-51084,-2.49919,-1.887822,-3.00353
2,RWJ-56423,-2.365654,-2.05168,-3.215892
3,PATAMOSTAT,-2.248543,,
4,CAMOSTAT,-2.212964,-3.063672,
5,"1-[(4S)-4-AMINO-5-(1,3-BENZOTHIAZOL-2-YL)-5-OX...",-1.974841,,-3.230251
6,RWJ-58643,-1.959521,-2.231559,-2.847431
7,PGL5001,-1.639983,,
8,BMS-605339,-1.638343,,
9,SEPIMOSTAT MESILATE,-1.637788,-2.119949,


In [10]:
# Weighted average of the three methods' scores, with weight 1/RMSE per method.
# Column order and weight order must match: knn, svm, random forest.
method_columns = ['knn_acvalue', 'svm_acvalue', 'random_forest_acvalue']
method_weights = np.array([1/knn_RMSE, 1/svm_RMSE, 1/rf_RMSE])

# One row per compound, one column per method.
scores = combined[method_columns].to_numpy()
# A method with no score for a compound gets weight 0 for that compound, so
# the average is taken only over the methods that actually scored it.
row_weights = np.where(np.isnan(scores), 0, method_weights)
# nansum skips the NaN products left by missing scores; the denominator is
# the sum of the weights that remain for each row.
combined['combined_acvalue'] = np.nansum(scores*row_weights, axis=1) / np.sum(row_weights, axis=1)

In [11]:
# Order compounds by ascending combined score, renumber the rows to match the
# new order, and write the table to disk.
combined = (
    combined
    .sort_values(by='combined_acvalue')
    .reset_index(drop=True)
)
combined.to_csv('../results/combined_results.csv')

In [12]:
# Set the display row limit to 50 so that all 50 requested rows are rendered.
pd.options.display.max_rows = 50
combined.head(50)

Unnamed: 0,name,random_forest_acvalue,svm_acvalue,knn_acvalue,combined_acvalue
0,NAFAMOSTAT,-2.797827,-4.573851,-4.57386,-3.662878
1,RWJ-56423,-2.365654,-2.05168,-3.215892,-2.5029
2,CAMOSTAT,-2.212964,-3.063672,,-2.482453
3,RWJ-51084,-2.49919,-1.887822,-3.00353,-2.479495
4,"1-[(4S)-4-AMINO-5-(1,3-BENZOTHIAZOL-2-YL)-5-OX...",-1.974841,,-3.230251,-2.385383
5,PATAMOSTAT,-2.248543,,,-2.248543
6,RWJ-58643,-1.959521,-2.231559,-2.847431,-2.245526
7,SEPIMOSTAT MESILATE,-1.637788,-2.119949,,-1.790528
8,SUBSTANCE-P,-1.214972,-3.270026,-1.560627,-1.789869
9,U-75799E,-1.156762,,-2.929955,-1.736628
