# Combine Results
Author: Doug Klink (dklink@stanford.edu)

Herein we combine the results generated by KNN, Random Forest, and SVM. We create a final output table which has the score each method gave to each compound (or NaN if the compound was not scored by that method), and a "combined_acvalue" column which is the weighted average of the scores available for that compound, the weights being the inverse of the RMSE each method achieved in its regression validation.

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the KNN predictions. The first CSV column is the saved row index;
# it is read as the index and then replaced by a fresh 0..n-1 index.
knn = (
    pd.read_csv('../results/knn_results.csv', index_col=0)
    .reset_index(drop=True)
)
knn.head(n=10)

Unnamed: 0,source,name,predicted_acvalue(log10),RMSE
0,reframe,NAFAMOSTAT,-4.57386,0.716535
1,broad,CTS21166,-3.261067,0.716535
2,drugbank,IPAMORELIN,-3.25877,0.716535
3,drugbank,"1-[(4S)-4-AMINO-5-(1,3-BENZOTHIAZOL-2-YL)-5-OX...",-3.230251,0.716535
4,drugbank,L-N(OMEGA)-NITROARGININE-(4R)-AMINO-L-PROLINE ...,-3.226611,0.716535
5,drugbank,RWJ-56423,-3.215892,0.716535
6,drugbank,ANGIOTENSIN II,-3.199554,0.716535
7,drugbank,THYMOPENTIN,-3.121986,0.716535
8,drugbank,BIO-11006,-3.066794,0.716535
9,drugbank,PLECANATIDE,-3.064554,0.716535


In [3]:
# Load the random forest predictions and negate the predicted activity.
# NOTE(review): the sign flip presumably aligns this file's sign convention
# with the KNN/SVM scores (which are negative) -- confirm against the
# notebook that produced random_forest_results.csv.
rf = (
    pd.read_csv('../results/random_forest_results.csv', index_col=0)
    .reset_index(drop=True)
    .assign(**{'predicted_activity(log10)': lambda df: -df['predicted_activity(log10)']})
)
rf.head(n=10)

Unnamed: 0,name,smiles,predicted_activity(log10),RMSE
0,NAFAMOSTAT,NC(N)=Nc1ccc(cc1)C(=O)Oc1cc2ccc(cc2cc1)C(N)=N,-2.797827,0.590069
1,RWJ-51084,NC(N)=NCCC[C@H](NC(=O)C1CCCC1)C(=O)c1[n]c2cccc...,-2.49919,0.590069
2,RWJ-56423,CC(=O)N1C[C@H](O)C[C@H]1C(=O)N[C@@H](CCCN=C(N)...,-2.365654,0.590069
3,CAMOSTAT,CN(C)C(=O)COC(=O)Cc1ccc(cc1)OC(=O)c1ccc(cc1)N=...,-2.212964,0.590069
4,"1-[(4S)-4-AMINO-5-(1,3-BENZOTHIAZOL-2-YL)-5-OX...",NC(=N)NCCC[C@H](N)C(=O)c1[n]c2ccccc2[s]1,-1.974841,0.590069
5,RWJ-58643,CC(=O)N1C[C@H](O)C[C@H]1C(=O)NC(CCCNC(N)=N)C(=...,-1.959521,0.590069
6,N-(1-ADAMANTYL)-N'-(4-GUANIDINOBENZYL)UREA,NC(N)=Nc1ccc(CNC(=O)NC23CC4CC(C2)CC(C3)C4)cc1,-1.32094,0.590069
7,CAMOSTAT-MESILATE,CN(C)C(=O)COC(=O)Cc1ccc(cc1)OC(=O)c1ccc(cc1)NC...,-1.237724,0.590069
8,TO-195,NC(=N)c1cc2ccc(cc2cc1)OC(=O)c1ccc(CNC(=O)CCCC(...,-1.235729,0.590069
9,GEPON,CC(C)[C@@H](NC(=O)C(NC(=O)[C@@H](CCC(O)=O)NC(=...,-1.23237,0.590069


In [4]:
# Load the SVM predictions and upper-case the compound names, since `name`
# is the key used later to join the three result tables.
svm = (
    pd.read_csv('../results/svm_screening_results_no_duplicate_names.csv', index_col=0)
    .reset_index(drop=True)
    .assign(name=lambda df: df['name'].str.upper())
)
svm.head(n=10)

Unnamed: 0,source,name,pred_value,RMSE
0,reframe,NAFAMOSTAT,-4.573851,0.750939
1,broad,SUBSTANCE-P,-3.270026,0.750939
2,reframe,CAMOSTAT,-3.063672,0.750939
3,broad,"[SAR9,MET(O2)11]-SUBSTANCE-P",-2.878899,0.750939
4,drugbank,"SAR9, MET (O2)11-SUBSTANCE P",-2.878899,0.750939
5,reframe,RWJ-58643,-2.231559,0.750939
6,reframe,SEPIMOSTAT MESILATE,-2.119949,0.750939
7,drugbank,RWJ-56423,-2.05168,0.750939
8,broad,TELAPREVIR,-2.013527,0.750939
9,reframe,ATECEGATRAN,-1.981752,0.750939


In [5]:
# Each results table carries an RMSE column; take the value from the row
# labelled 0 (the index was reset on load) as that method's RMSE.
knn_RMSE = knn.loc[0, 'RMSE']
rf_RMSE = rf.loc[0, 'RMSE']
svm_RMSE = svm.loc[0, 'RMSE']

# Show each method's RMSE next to the weight (1/RMSE) it gets in the
# combined score: a lower RMSE yields a larger weight.
print('Method          RMSE     Weight (1/RMSE)')
for label, rmse in (('knn', knn_RMSE), ('random forest', rf_RMSE), ('svm', svm_RMSE)):
    print(f'{label:<15}{rmse: .2f}    {1/rmse: .2f}')

Method          RMSE     Weight (1/RMSE)
knn             0.72     1.40
random forest   0.59     1.69
svm             0.75     1.33


In [6]:
# Keep only the compound name and the predicted score in each table; the
# RMSE values were already captured above and the other columns are not
# used in the merge.
knn = knn.drop(columns=['source', 'RMSE'])
rf = rf.drop(columns=['smiles', 'RMSE'])
svm = svm.drop(columns=['source', 'RMSE'])

In [7]:
# Give every method's score column a distinct, method-specific name so the
# three columns stay distinguishable after the tables are joined.
knn, rf, svm = (
    knn.rename(columns={'predicted_acvalue(log10)': 'knn_acvalue'}),
    rf.rename(columns={'predicted_activity(log10)': 'random_forest_acvalue'}),
    svm.rename(columns={'pred_value': 'svm_acvalue'}),
)

In [8]:
# Outer-join the three tables on compound name: a compound scored by only
# some methods is kept, with NaN in the columns of the methods that did
# not score it.
# NOTE(review): this assumes `name` is unique within each table; duplicate
# names would multiply rows -- confirm for the KNN and random forest files.
combined = pd.merge(
    pd.merge(rf, svm, how='outer', on='name'),
    knn,
    how='outer',
    on='name',
)

In [9]:
# Preview the merged table: one row per compound, one score column per method.
combined.head(n=10)

Unnamed: 0,name,random_forest_acvalue,svm_acvalue,knn_acvalue
0,NAFAMOSTAT,-2.797827,-4.573851,-4.57386
1,RWJ-51084,-2.49919,-1.887822,-3.00353
2,RWJ-56423,-2.365654,-2.05168,-3.215892
3,CAMOSTAT,-2.212964,-3.063672,
4,"1-[(4S)-4-AMINO-5-(1,3-BENZOTHIAZOL-2-YL)-5-OX...",-1.974841,,-3.230251
5,RWJ-58643,-1.959521,-2.231559,-2.847431
6,N-(1-ADAMANTYL)-N'-(4-GUANIDINOBENZYL)UREA,-1.32094,,
7,CAMOSTAT-MESILATE,-1.237724,-0.694657,
8,TO-195,-1.235729,-0.040993,
9,GEPON,-1.23237,,-0.923724


In [10]:
# Score matrix: one row per compound, columns ordered (knn, svm, random forest).
acvalue = combined[['knn_acvalue', 'svm_acvalue', 'random_forest_acvalue']].to_numpy()

# Per-cell weights: each column gets its method's 1/RMSE, except that a
# missing (NaN) score gets weight 0 so it contributes to neither the
# numerator nor the denominator of the average.
weights = np.where(np.isnan(acvalue), 0, [1/knn_RMSE, 1/svm_RMSE, 1/rf_RMSE])

# Weighted mean over the scores that are present for each compound.
# nansum skips the NaN products that come from the missing scores.
combined['combined_acvalue'] = np.nansum(acvalue * weights, axis=1) / weights.sum(axis=1)

In [11]:
# Order compounds by combined score, ascending (most negative first),
# renumber the rows to match that ranking, and write the table to disk.
combined = combined.sort_values(by='combined_acvalue').reset_index(drop=True)
combined.to_csv('../results/combined_results.csv')

In [12]:
# Let pandas render up to 50 rows, then show the first 50 rows of the
# sorted table (the lowest combined scores).
pd.options.display.max_rows = 50
combined.head(n=50)

Unnamed: 0,name,random_forest_acvalue,svm_acvalue,knn_acvalue,combined_acvalue
0,NAFAMOSTAT,-2.797827,-4.573851,-4.57386,-3.893197
1,L-N(OMEGA)-NITROARGININE-(4R)-AMINO-L-PROLINE ...,,,-3.226611,-3.226611
2,INDIUM IN-111 PENTETREOTIDE,,,-2.905743,-2.905743
3,ETARFOLATIDE,,,-2.633206,-2.633206
4,CAMOSTAT,-2.212964,-3.063672,,-2.587292
5,"1-[(4S)-4-AMINO-5-(1,3-BENZOTHIAZOL-2-YL)-5-OX...",-1.974841,,-3.230251,-2.541791
6,RWJ-56423,-2.365654,-2.05168,-3.215892,-2.539442
7,RWJ-51084,-2.49919,-1.887822,-3.00353,-2.474252
8,"N,N-DIMETHYLARGININE",,,-2.327946,-2.327946
9,RWJ-58643,-1.959521,-2.231559,-2.847431,-2.321674
