In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
import scipy.stats


def PCC_score(y_pred,y_true):
    """
    Pearson correlation coefficient between two samples.

    Parameters
    ----------
    y_pred : array-like of float
        First sample (list, tuple, numpy array or pandas Series).
    y_true : array-like of float
        Second sample, with the same shape as ``y_pred``.

    Returns
    -------
    numpy.float64
        Pearson correlation. The measure is symmetric, so the argument order
        does not affect the result. If either input has zero variance the
        denominator is zero and the result is ``nan`` (numpy emits a
        RuntimeWarning).
    """
    # Convert to plain float arrays first: pandas objects would otherwise be
    # multiplied after *index-label* alignment, which silently yields NaNs
    # (and a correlation of 0) when the two Series carry different indexes.
    pred = np.asarray(y_pred, dtype=float)
    true = np.asarray(y_true, dtype=float)
    diff_pred = pred - pred.mean()
    diff_true = true - true.mean()
    return np.sum(diff_pred*diff_true)/np.sqrt(np.sum(diff_pred**2)*np.sum(diff_true**2))

## Normalization code taken from the MHCflurry publication
def from_ic50_limit_1(ic50, max_ic50=50000.0):
    """
    Map IC50 values onto regression targets clipped to [0.0, 1.0].

    The value is computed as ``1 - log(max(ic50, 1)) / log(max_ic50)`` and
    then clipped, so inputs at or below 1 map to 1.0 and inputs at or above
    ``max_ic50`` map to 0.0.

    Parameters
    ----------
    ic50 : float or numpy.array of float
        IC50 value(s); values below 1 are treated as 1.
    max_ic50 : float
        IC50 value that maps to 0.0 (base of the logarithm).

    Returns
    -------
    numpy.float64 or numpy.array of float
    """
    floored = np.maximum(ic50, 1)
    scaled = 1.0 - np.log(floored) / np.log(max_ic50)
    lower_bounded = np.maximum(0.0, scaled)
    return np.minimum(1.0, lower_bounded)

def to_ic50(x, max_ic50=50000.0):
    """
    Convert a normalized score back to an IC50 value.

    Computes ``max_ic50 ** (1 - x)``, so ``x = 1`` gives 1 and ``x = 0``
    gives ``max_ic50``.
    """
    exponent = 1.0 - x
    return max_ic50 ** exponent

def from_ic50_without_limit(ic50, max_ic50=50000.0):
    """
    Convert IC50 values to ``1 - log(ic50) / log(max_ic50)`` without clipping.

    Unlike ``from_ic50_limit_1`` the result is not restricted to [0, 1]:
    inputs below 1 give values above 1.0 and inputs above ``max_ic50`` give
    negative values.

    Parameters
    ----------
    ic50 : float or numpy.array of float
        IC50 value(s). Non-positive values are passed to ``np.log`` unchanged
        and therefore produce ``inf``/``nan``.
    max_ic50 : float
        IC50 value that maps to 0.0. Defaults to 50000.0, the constant that
        was previously hard-coded, so existing one-argument calls are
        unaffected.

    Returns
    -------
    numpy.float64 or numpy.array of float
    """
    x = 1.0 - np.log(ic50) / np.log(max_ic50)
    return x


def nmol_to_mol(ic50_nm):
    """Scale a value expressed in nano-units to base units (multiply by 1e-9)."""
    nano = 1e-9
    return ic50_nm * nano

def negative_log10_ic50_mol(ic50_mol):
    """Return the negative base-10 logarithm of ``ic50_mol``."""
    return -np.log10(ic50_mol)

def negative_log10_ic50_from_nm(ic50_nm):
    """
    Negative log10 of an IC50 given in nano-units.

    The input is first scaled by ``nmol_to_mol`` and the result is passed to
    ``negative_log10_ic50_mol``.
    """
    ic50_mol = nmol_to_mol(ic50_nm)
    return negative_log10_ic50_mol(ic50_mol)

测试函数功能正常

In [2]:
# Sanity checks for the conversion helpers. Each expectation is asserted so
# that a regression fails loudly instead of being silently discarded (only the
# last expression of a cell is displayed).

# An IC50 below 1 maps above 1.0 when the normalization is not clipped ...
assert from_ic50_without_limit(0.17) > 1.0
# ... and the unclipped normalization round-trips through to_ic50.
assert np.isclose(to_ic50(from_ic50_without_limit(0.17)), 0.17)
# The clipped normalization floors the IC50 at 1 first, so 0.17 comes back as 1.
assert np.isclose(to_ic50(from_ic50_limit_1(0.17)), 1.0)
# -log10(1) is zero.
assert negative_log10_ic50_mol(1) == 0
# 1e9 nano-units scale to 1, whose negative log10 is (negative) zero; kept as
# the final expression so the cell still displays the value.
negative_log10_ic50_from_nm(1e9)

-0.0

In [3]:
# Prediction tables. NOTE: each file is read here, extended with derived
# columns, and written back to the SAME path at the end of this cell.
tools_pred = '/mnt/zt/Dopaap/calculate_webtools_score/compare_HPV16_dataset/no_large_data/hpv_predictions.csv'
noMS_pred = '/mnt/zt/Dopaap/calculate_webtools_score/compare_HPV16_dataset/no_large_data/predresults_noMS_3D_R1.csv'
withMS_pred = '/mnt/zt/Dopaap/calculate_webtools_score/compare_HPV16_dataset/no_large_data/predresults_withMS_R1.csv'


df_tools = pd.read_csv(tools_pred)
df_noMS = pd.read_csv(noMS_pred)
df_withMS = pd.read_csv(withMS_pred)


# Column suffix -> transform applied to IC50 values (given in nmol units):
#   N1      : MHCflurry-style normalization restricted to [0, 1]
#   N2      : the same normalization without the range restriction
#   n_log10 : negative log10 of the IC50 after conversion to mol units
IC50_TRANSFORMS = (
    ('N1', from_ic50_limit_1),
    ('N2', from_ic50_without_limit),
    ('n_log10', negative_log10_ic50_from_nm),
)

# Prediction columns of df_tools that get transformed. The SMM, SMMPMBEC and
# NetMHCpan columns are not processed here (their conversions were commented
# out in the previous version of this cell).
BENCHMARK_TOOLS = ('nmp3', 'nmp4', 'ANN4', 'MHCf1_2_0', 'MHCf_MS', 'MHCf_noMS')


def add_ic50_transforms(df, columns, transforms=IC50_TRANSFORMS):
    """
    Add ``{prefix}_{suffix}`` columns to ``df`` in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to extend (modified in place).
    columns : iterable of (source_col, prefix)
        Source column to read and the prefix of the derived column names.
    transforms : iterable of (suffix, callable)
        Element-wise transforms; iterated in the outer loop so that all
        columns of one transform are created before the next transform.
    """
    for suffix, transform in transforms:
        for source_col, prefix in columns:
            df[f'{prefix}_{suffix}'] = df[source_col].apply(transform)


# Measured affinity -> QM_N1 / QM_N2 / QM_n_log10 in every table.
add_ic50_transforms(df_tools, [('affinity', 'QM')])

add_ic50_transforms(df_noMS, [('affinity', 'QM')])
df_noMS['pred_QM_n_log10'] = df_noMS['Pred_QM'].apply(negative_log10_ic50_from_nm)

add_ic50_transforms(df_withMS, [('affinity', 'QM')])
df_withMS['pred_QM_n_log10'] = df_withMS['Pred_QM'].apply(negative_log10_ic50_from_nm)

# Tool predictions -> <tool>_N1, then <tool>_N2, then <tool>_n_log10.
add_ic50_transforms(df_tools, [(tool, tool) for tool in BENCHMARK_TOOLS])


# Save the extended tables (index=False: do not write the row index column).
df_tools.to_csv(tools_pred, index=False)
df_noMS.to_csv(noMS_pred, index=False)
df_withMS.to_csv(withMS_pred, index=False)

In [4]:
def output_result(y_true, tool_name_list, tool_data_list):
    """
    Print error and correlation metrics for each tool's predictions.

    Parameters
    ----------
    y_true : sequence of float
        Reference values every tool is compared against.
    tool_name_list : sequence of str
        Labels, one per entry of ``tool_data_list`` (same order).
    tool_data_list : sequence of sequence of float
        Predictions of each tool, aligned element-wise with ``y_true``.

    For every tool one line is printed containing MAE, MSE, RMSE, R^2,
    the Pearson correlation (``PCC_score``) and Kendall's tau. Nothing is
    returned.
    """
    for idx, y_tool in enumerate(tool_data_list):
        mae = mean_absolute_error(y_true, y_tool)
        mse = mean_squared_error(y_true, y_tool)
        rmse = np.sqrt(mse)
        r_squared = r2_score(y_true, y_tool)
        pearson = PCC_score(y_true, y_tool)
        # kendalltau returns (statistic, p-value); only the statistic is reported.
        kendall = scipy.stats.kendalltau(y_true, y_tool)[0]
        print(f'{tool_name_list[idx]}', 'MAE', mae, 'MSE', mse, 'RMSE', rmse, 'r2:', r_squared, 'PCC', pearson, 'tau', kendall)

直接使用实验测定的 IC50 原始值计算各项指标, 单位为 nM

In [5]:
print('实验测定值直接计算, 单位为nm')

# Dopaap trained without mass-spectrometry data: raw measured affinity vs. raw 'Pred_QM'.
affinity_noMS = df_noMS['affinity'].to_list()
Dopaap_noMS = df_noMS['Pred_QM'].to_list()
tool_name_list, tool_data_list = ['Dopaap_noMS'], [Dopaap_noMS]
print('\n未使用质谱数据训练')
output_result(affinity_noMS, tool_name_list, tool_data_list)

# Dopaap trained with mass-spectrometry data.
affinity_withMS = df_withMS['affinity'].to_list()
Dopaap_withMS = df_withMS['Pred_QM'].to_list()
tool_name_list, tool_data_list = ['Dopaap_withMS'], [Dopaap_withMS]
print('\n使用质谱数据训练')
output_result(affinity_withMS, tool_name_list, tool_data_list)

# Other tools: the raw prediction columns of df_tools, in the order of tool_name_list.
affinity_for_tools = df_tools['affinity'].to_list()
tool_name_list = ['nmp3','nmp4','ANN4','MHCf1_2_0','MHCf_MS','MHCf_noMS']
tool_data_list = [df_tools[tool].to_list() for tool in tool_name_list]
# Keep the per-tool lists available under their own names as well.
nmp3, nmp4, ANN4, MHCf1_2_0, MHCf_MS, MHCf_noMS = tool_data_list

print('\n其他工具')
output_result(affinity_for_tools, tool_name_list, tool_data_list)

实验测定值直接计算, 单位为nm

未使用质谱数据训练
Dopaap_noMS MAE 12153.644188109372 MSE 303204796.2212834 RMSE 17412.776809609757 r2: -0.5575414945661841 PCC 0.26593996280630894 tau 0.19866231639753532

使用质谱数据训练
Dopaap_withMS MAE 12811.412053013522 MSE 330596105.67093617 RMSE 18182.30199042289 r2: -0.6982487049732413 PCC 0.2601250861004573 tau 0.19770682629862935

其他工具
nmp3 MAE 11614.663947467296 MSE 270268486.4950261 RMSE 16439.844479040126 r2: -0.38835001172733663 PCC 0.26069375537154976 tau 0.16267218933874336
nmp4 MAE 11759.240257744024 MSE 276065486.4343993 RMSE 16615.218519008387 r2: -0.41812878852142754 PCC 0.24123468214617325 tau 0.14818058950533597
ANN4 MAE 11969.998475806919 MSE 282500456.800719 RMSE 16807.749902967946 r2: -0.45118477406900426 PCC 0.19744960459578628 tau 0.11012023389891437
MHCf1_2_0 MAE 12955.937709213144 MSE 340929535.06332004 RMSE 18464.277268913615 r2: -0.7513307975403443 PCC 0.16733775495763173 tau 0.1607612091409314
MHCf_MS MAE 12602.137291291887 MSE 321348782.67264485 RMSE

$1-\log_{50000}(x)$, 结果限制在 [0,1] 范围内计算, x 单位为 nM

In [6]:
print('1-log50000(x), 限制[0,1]范围计算, x单位为nm')

# Dopaap trained without mass-spectrometry data: clipped-normalized measurement
# ('QM_N1') vs. the 'P_N_QM' column read from the CSV.
affinity_noMS = df_noMS['QM_N1'].to_list()
Dopaap_noMS = df_noMS['P_N_QM'].to_list()
tool_name_list, tool_data_list = ['Dopaap_noMS'], [Dopaap_noMS]
print('\n未使用质谱数据训练')
output_result(affinity_noMS, tool_name_list, tool_data_list)

# Dopaap trained with mass-spectrometry data.
affinity_withMS = df_withMS['QM_N1'].to_list()
Dopaap_withMS = df_withMS['P_N_QM'].to_list()
tool_name_list, tool_data_list = ['Dopaap_withMS'], [Dopaap_withMS]
print('\n使用质谱数据训练')
output_result(affinity_withMS, tool_name_list, tool_data_list)

# Other tools: the '<tool>_N1' columns of df_tools, in the order of tool_name_list.
affinity_for_tools = df_tools['QM_N1'].to_list()
tool_name_list = ['nmp3','nmp4','ANN4','MHCf1_2_0','MHCf_MS','MHCf_noMS']
tool_data_list = [df_tools[f'{tool}_N1'].to_list() for tool in tool_name_list]
# Keep the per-tool lists available under their own names as well.
nmp3, nmp4, ANN4, MHCf1_2_0, MHCf_MS, MHCf_noMS = tool_data_list

print('\n其他工具')
output_result(affinity_for_tools, tool_name_list, tool_data_list)

1-log50000(x), 限制[0,1]范围计算, x单位为nm

未使用质谱数据训练
Dopaap_noMS MAE 0.18891936934578787 MSE 0.05697519382123108 RMSE 0.23869477124820115 r2: -2.7962424117190166 PCC 0.2800231427499939 tau 0.19866231639753532

使用质谱数据训练
Dopaap_withMS MAE 0.2101635282763628 MSE 0.06583568880826238 RMSE 0.25658466206743996 r2: -3.3866148984565294 PCC 0.2830493440926677 tau 0.19770682629862935

其他工具
nmp3 MAE 0.15621647996074678 MSE 0.03832157377664481 RMSE 0.1957589685726935 r2: -1.5533565381309469 PCC 0.2161166916040525 tau 0.16267218933874336
nmp4 MAE 0.15850514779150907 MSE 0.0398522766095779 RMSE 0.1996303499209925 r2: -1.6553468715443223 PCC 0.20827025574421912 tau 0.14818058950533597
ANN4 MAE 0.15914689833448953 MSE 0.039736507471990264 RMSE 0.19934018027480124 r2: -1.6476332038829664 PCC 0.1563886824737687 tau 0.11012023389891437
MHCf1_2_0 MAE 0.20928370771708535 MSE 0.06296326171631288 RMSE 0.2509248128749185 r2: -3.1952258250776158 PCC 0.26163317565492306 tau 0.1607612091409314
MHCf_MS MAE 0.184923650152

$1-\log_{50000}(x)$, 结果不限制在 [0,1] 范围内计算, x 单位为 nM

In [7]:
print('1-log50000(x), 不限制[0,1]范围计算, x单位为nm')

# Dopaap trained without mass-spectrometry data: unclipped-normalized
# measurement ('QM_N2') vs. the 'P_N_QM' column read from the CSV.
# NOTE(review): 'P_N_QM' is also the prediction column used for the clipped
# ('QM_N1') comparison — confirm that the same column is intended here.
affinity_noMS = df_noMS['QM_N2'].to_list()
Dopaap_noMS = df_noMS['P_N_QM'].to_list()
tool_name_list, tool_data_list = ['Dopaap_noMS'], [Dopaap_noMS]
print('\n未使用质谱数据训练')
output_result(affinity_noMS, tool_name_list, tool_data_list)

# Dopaap trained with mass-spectrometry data.
affinity_withMS = df_withMS['QM_N2'].to_list()
Dopaap_withMS = df_withMS['P_N_QM'].to_list()
tool_name_list, tool_data_list = ['Dopaap_withMS'], [Dopaap_withMS]
print('\n使用质谱数据训练')
output_result(affinity_withMS, tool_name_list, tool_data_list)

# Other tools: the '<tool>_N2' columns of df_tools, in the order of tool_name_list.
affinity_for_tools = df_tools['QM_N2'].to_list()
tool_name_list = ['nmp3','nmp4','ANN4','MHCf1_2_0','MHCf_MS','MHCf_noMS']
tool_data_list = [df_tools[f'{tool}_N2'].to_list() for tool in tool_name_list]
# Keep the per-tool lists available under their own names as well.
nmp3, nmp4, ANN4, MHCf1_2_0, MHCf_MS, MHCf_noMS = tool_data_list

print('\n其他工具')
output_result(affinity_for_tools, tool_name_list, tool_data_list)

1-log50000(x), 不限制[0,1]范围计算, x单位为nm

未使用质谱数据训练
Dopaap_noMS MAE 0.18891936934578787 MSE 0.05697519382123108 RMSE 0.23869477124820115 r2: -2.7962424117190166 PCC 0.2800231427499939 tau 0.19866231639753532

使用质谱数据训练
Dopaap_withMS MAE 0.2101635282763628 MSE 0.06583568880826238 RMSE 0.25658466206743996 r2: -3.3866148984565294 PCC 0.2830493440926677 tau 0.19770682629862935

其他工具
nmp3 MAE 0.15621647996074678 MSE 0.03832157377664481 RMSE 0.1957589685726935 r2: -1.5533565381309469 PCC 0.2161166916040525 tau 0.16267218933874336
nmp4 MAE 0.15850514779150907 MSE 0.0398522766095779 RMSE 0.1996303499209925 r2: -1.6553468715443223 PCC 0.20827025574421912 tau 0.14818058950533597
ANN4 MAE 0.15914689833448953 MSE 0.039736507471990264 RMSE 0.19934018027480124 r2: -1.6476332038829664 PCC 0.1563886824737687 tau 0.11012023389891437
MHCf1_2_0 MAE 0.20928370771708535 MSE 0.06296326171631288 RMSE 0.2509248128749185 r2: -3.1952258250776158 PCC 0.26163317565492306 tau 0.1607612091409314
MHCf_MS MAE 0.18492365015

$-\log_{10}(x)$ 计算, x 单位为 mol/L (M)

In [8]:
print('-log10(x)计算, x单位为mol')

# Dopaap trained without mass-spectrometry data: negative-log10 measurement
# ('QM_n_log10') vs. negative-log10 prediction ('pred_QM_n_log10').
affinity_noMS = df_noMS['QM_n_log10'].to_list()
Dopaap_noMS = df_noMS['pred_QM_n_log10'].to_list()
tool_name_list, tool_data_list = ['Dopaap_noMS'], [Dopaap_noMS]
print('\n未使用质谱数据训练')
output_result(affinity_noMS, tool_name_list, tool_data_list)

# Dopaap trained with mass-spectrometry data.
affinity_withMS = df_withMS['QM_n_log10'].to_list()
Dopaap_withMS = df_withMS['pred_QM_n_log10'].to_list()
tool_name_list, tool_data_list = ['Dopaap_withMS'], [Dopaap_withMS]
print('\n使用质谱数据训练')
output_result(affinity_withMS, tool_name_list, tool_data_list)

# Other tools: the '<tool>_n_log10' columns of df_tools, in the order of tool_name_list.
affinity_for_tools = df_tools['QM_n_log10'].to_list()
tool_name_list = ['nmp3','nmp4','ANN4','MHCf1_2_0','MHCf_MS','MHCf_noMS']
tool_data_list = [df_tools[f'{tool}_n_log10'].to_list() for tool in tool_name_list]
# Keep the per-tool lists available under their own names as well.
nmp3, nmp4, ANN4, MHCf1_2_0, MHCf_MS, MHCf_noMS = tool_data_list

print('\n其他工具')
output_result(affinity_for_tools, tool_name_list, tool_data_list)

-log10(x)计算, x单位为mol

未使用质谱数据训练
Dopaap_noMS MAE 0.8877264496640778 MSE 1.2580304600122076 RMSE 1.121619570091485 r2: -2.796242410394554 PCC 0.28002314291556457 tau 0.19866231639753532

使用质谱数据训练
Dopaap_withMS MAE 0.9875521155128615 MSE 1.4536730174291486 RMSE 1.205683630737827 r2: -3.3866148992586433 PCC 0.2830493439586296 tau 0.19770682629862935

其他工具
nmp3 MAE 0.734056553518508 MSE 0.8461525774657265 RMSE 0.9198655214028443 r2: -1.5533565381309478 PCC 0.21611669160405259 tau 0.16267218933874336
nmp4 MAE 0.7448109350051487 MSE 0.8799509844666883 RMSE 0.938057026233847 r2: -1.6553468715443231 PCC 0.2082702557442192 tau 0.14818058950533597
ANN4 MAE 0.7478265015568804 MSE 0.8773947649666303 RMSE 0.9366935277702255 r2: -1.6476332038829664 PCC 0.1563886824737687 tau 0.11012023389891437
MHCf1_2_0 MAE 0.9834178649588107 MSE 1.3902489103768692 RMSE 1.1790881690428707 r2: -3.1952258250776167 PCC 0.261633175654923 tau 0.1607612091409314
MHCf_MS MAE 0.8689506851568033 MSE 1.1220863202191662 RMSE 1