In [17]:
import pandas as pd
import numpy as np
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
import scipy.stats


def PCC_score(y_pred,y_true):
    """
    Pearson correlation coefficient between two equally sized value sequences.

    Both inputs are mean-centred; the sum of the element-wise products of the
    centred values is then divided by the square root of the product of their
    sums of squares.

    Parameters
    ----------
    y_pred : array-like of float
    y_true : array-like of float

    Returns
    -------
    float
        The correlation coefficient (the formula is symmetric in its arguments).
    """
    centered_pred = y_pred - np.mean(y_pred)
    centered_true = y_true - np.mean(y_true)
    cross_sum = np.sum(centered_pred * centered_true)
    scale = np.sqrt(np.sum(centered_pred ** 2) * np.sum(centered_true ** 2))
    return cross_sum / scale

## Normalization code adapted from the MHCflurry paper
def from_ic50_limit_1(ic50, max_ic50=50000.0):
    """
    Map IC50 values onto regression targets bounded to [0.0, 1.0].

    The target is ``1 - log(ic50) / log(max_ic50)``. Inputs below 1 are raised
    to 1 before the logarithm is taken, and the result is finally clipped to
    the closed interval [0.0, 1.0].

    Parameters
    ----------
    ic50 : float or numpy.array of float
    max_ic50 : float
        Value used as the base of the logarithm; an ``ic50`` equal to it maps
        to 0.0.

    Returns
    -------
    float or numpy.array of float
    """
    floored_ic50 = np.maximum(ic50, 1)
    scaled = 1.0 - (np.log(floored_ic50) / np.log(max_ic50))
    return np.clip(scaled, 0.0, 1.0)

def to_ic50(x, max_ic50=50000.0):
    """
    Convert a regression target back to an IC50 value.

    Computes ``max_ic50 ** (1 - x)``, i.e. the algebraic inverse of
    ``x = 1 - log(ic50) / log(max_ic50)``.

    Parameters
    ----------
    x : float or numpy.array of float
    max_ic50 : float

    Returns
    -------
    float or numpy.array of float
    """
    exponent = 1.0 - x
    return max_ic50 ** exponent

def from_ic50_without_limit(ic50, max_ic50=50000.0):
    """
    Convert IC50 values to regression targets without clamping the result.

    Computes ``1 - log(ic50) / log(max_ic50)``. Unlike ``from_ic50_limit_1``
    neither the input nor the output is bounded, so ``ic50 < 1`` yields values
    above 1.0 and ``ic50 > max_ic50`` yields negative values.

    Parameters
    ----------
    ic50 : float or numpy.array of float
    max_ic50 : float
        Base of the logarithm. Defaults to 50000.0, the value that was
        previously hard-coded, so existing single-argument calls are unchanged.

    Returns
    -------
    float or numpy.array of float
    """
    return 1.0 - np.log(ic50) / np.log(max_ic50)


def nmol_to_mol(ic50_nm):
    """Scale a value given in nano-units (nmol) to base units (mol) by multiplying with 1e-9."""
    mol_per_nmol = 1e-9
    return ic50_nm * mol_per_nmol

def negative_log10_ic50_mol(ic50_mol):
    """Return the negative base-10 logarithm of ``ic50_mol``."""
    return -np.log10(ic50_mol)

def negative_log10_ic50_from_nm(ic50_nm):
    """Convert ``ic50_nm`` with ``nmol_to_mol`` and return its negative base-10 logarithm."""
    ic50_in_mol = nmol_to_mol(ic50_nm)
    return negative_log10_ic50_mol(ic50_in_mol)

测试函数功能正常

In [18]:
# Sanity checks for the conversion helpers. Only the last expression of a cell
# is displayed, so the earlier checks are written as assertions; otherwise
# their results would be computed and silently discarded.

# An IC50 below 1 maps above 1.0 when no limit is applied ...
assert from_ic50_without_limit(0.17) > 1.0
# ... and the unlimited transform round-trips through to_ic50.
assert np.isclose(to_ic50(from_ic50_without_limit(0.17)), 0.17)
# The limited transform raises inputs below 1 to 1, so the round trip gives 1.
assert to_ic50(from_ic50_limit_1(0.17)) == 1.0
# -log10(1) is zero.
assert negative_log10_ic50_mol(1) == 0
# 1e9 nano-units equal 1 base unit, whose negative log10 is (negative) zero.
negative_log10_ic50_from_nm(1e9)

-0.0

In [19]:
# Input prediction tables. NOTE: the same paths are written back to at the end
# of this cell, so the CSV files are overwritten in place.
tools_pred = '/mnt/zt/Dopaap/calculate_webtools_score/compare_HPV16_dataset/hpv_predictions.csv'
noMS_pred = '/mnt/zt/Dopaap/calculate_webtools_score/compare_HPV16_dataset/predresults_noMS_3D_R1.csv'
withMS_pred = '/mnt/zt/Dopaap/calculate_webtools_score/compare_HPV16_dataset/predresults_withMS_R1.csv'

df_tools, df_noMS, df_withMS = (
    pd.read_csv(csv_path) for csv_path in (tools_pred, noMS_pred, withMS_pred)
)

# Data processing: each transform is stored in a column named <source>_<suffix>.
#   N1      : MHCflurry-style normalization of the nmol-scale IC50, limited to [0, 1]
#   N2      : the same normalization without limiting the range
#   n_log10 : negative log10 of the IC50 after converting it to mol
transforms = (
    ('N1', from_ic50_limit_1),
    ('N2', from_ic50_without_limit),
    ('n_log10', negative_log10_ic50_from_nm),
)

# Measured affinity ('affinity' column) -> QM_* columns in every table.
for frame in (df_tools, df_noMS, df_withMS):
    for suffix, transform in transforms:
        frame[f'QM_{suffix}'] = frame['affinity'].apply(transform)

# The 'Pred_QM' column of the two Dopaap result tables, on the -log10 scale.
for frame in (df_noMS, df_withMS):
    frame['pred_QM_n_log10'] = frame['Pred_QM'].apply(negative_log10_ic50_from_nm)

# Predictions of the other tools: all N1 columns first, then N2, then n_log10.
tool_columns = ('nmp3', 'nmp4', 'ANN4', 'MHCf1_2_0', 'MHCf_MS', 'MHCf_noMS')
for suffix, transform in transforms:
    for tool in tool_columns:
        df_tools[f'{tool}_{suffix}'] = df_tools[tool].apply(transform)

# Save the augmented tables back to the files they were read from.
for frame, csv_path in ((df_tools, tools_pred), (df_noMS, noMS_pred), (df_withMS, withMS_pred)):
    frame.to_csv(csv_path, index=False)

In [20]:
def output_result(y_true, tool_name_list, tool_data_list):
    """
    Print one line of evaluation metrics per entry of ``tool_data_list``.

    For every prediction sequence the line contains MAE, MSE, RMSE (square
    root of the MSE), r2, PCC (via ``PCC_score``) and Kendall's tau, each
    computed against ``y_true``.

    Parameters
    ----------
    y_true : sequence of float
        Reference values every prediction sequence is compared with.
    tool_name_list : sequence of str
        Labels printed at the start of each line, indexed in parallel with
        ``tool_data_list``.
    tool_data_list : sequence of sequence of float
        One prediction sequence per tool.

    Returns
    -------
    None
    """
    for idx, predictions in enumerate(tool_data_list):
        mae = mean_absolute_error(y_true, predictions)
        mse = mean_squared_error(y_true, predictions)
        rmse = np.sqrt(mse)
        r2 = r2_score(y_true, predictions)
        pcc = PCC_score(y_true, predictions)
        # kendalltau returns (statistic, p-value); only the statistic is reported.
        tau = scipy.stats.kendalltau(y_true, predictions)[0]
        print(
            f'{tool_name_list[idx]}',
            'MAE', mae,
            'MSE', mse,
            'RMSE', rmse,
            'r2:', r2,
            'PCC', pcc,
            'tau', tau,
        )

实验测定值直接计算, 单位为 nM (nmol/L)

In [21]:
# Metrics on the untransformed columns ('affinity' vs. the raw predictions).
print('实验测定值直接计算, 单位为nm')

# Dopaap results from the "noMS" table.
affinity_noMS, Dopaap_noMS = (df_noMS[column].to_list() for column in ('affinity', 'Pred_QM'))
tool_name_list, tool_data_list = ['Dopaap_noMS'], [Dopaap_noMS]
print('\n未使用质谱数据训练')
output_result(affinity_noMS, tool_name_list, tool_data_list)

# Dopaap results from the "withMS" table.
affinity_withMS, Dopaap_withMS = (df_withMS[column].to_list() for column in ('affinity', 'Pred_QM'))
tool_name_list, tool_data_list = ['Dopaap_withMS'], [Dopaap_withMS]
print('\n使用质谱数据训练')
output_result(affinity_withMS, tool_name_list, tool_data_list)

# Other tools: one prediction column per tool in df_tools.
affinity_for_tools = df_tools['affinity'].to_list()
tool_name_list = ['nmp3','nmp4','ANN4','MHCf1_2_0','MHCf_MS','MHCf_noMS']
tool_data_list = [df_tools[tool].to_list() for tool in tool_name_list]
# Keep the per-tool lists available under their individual names as well.
nmp3, nmp4, ANN4, MHCf1_2_0, MHCf_MS, MHCf_noMS = tool_data_list

print('\n其他工具')
output_result(affinity_for_tools, tool_name_list, tool_data_list)

实验测定值直接计算, 单位为nm

未使用质谱数据训练
Dopaap_noMS MAE 64131.14091961152 MSE 5589551374.160033 RMSE 74763.30232246322 r2: -2.389433865361325 PCC 0.34135566773428166 tau 0.2956295253001475

使用质谱数据训练
Dopaap_withMS MAE 64347.053125968734 MSE 5597317918.331108 RMSE 74815.22517730671 r2: -2.394143400361211 PCC 0.4393067431479667 tau 0.3944672649613143

其他工具
nmp3 MAE 58551.684773994326 MSE 4709568272.320985 RMSE 68626.29432164456 r2: -1.8558231465999842 PCC 0.4131375906131377 tau 0.3571028833241607
nmp4 MAE 58604.22842311851 MSE 4705048620.3658495 RMSE 68593.35696965012 r2: -1.853082486326746 PCC 0.4255161042627351 tau 0.36897610101844114
ANN4 MAE 59342.831510849064 MSE 4810206169.10274 RMSE 69355.64987153347 r2: -1.9168487053000471 PCC 0.4124206773269609 tau 0.348152652428663
MHCf1_2_0 MAE 64317.10206705299 MSE 5588415431.230289 RMSE 74755.7050079142 r2: -2.3887450438124276 PCC 0.4255571177597114 tau 0.38361369382474875
MHCf_MS MAE 62893.28412460385 MSE 5356592063.853003 RMSE 73188.7427399392 r2: -2.2

$1-\log_{50000}(x)$, 限制在 [0,1] 范围内计算, x 单位为 nM (nmol/L)

In [22]:
# Metrics on the N1 columns (1 - log_50000(x), limited to [0, 1]).
print('1-log50000(x), 限制[0,1]范围计算, x单位为nm')

# Dopaap results from the "noMS" table; 'P_N_QM' is read as the prediction column.
affinity_noMS, Dopaap_noMS = (df_noMS[column].to_list() for column in ('QM_N1', 'P_N_QM'))
tool_name_list, tool_data_list = ['Dopaap_noMS'], [Dopaap_noMS]
print('\n未使用质谱数据训练')
output_result(affinity_noMS, tool_name_list, tool_data_list)

# Dopaap results from the "withMS" table.
affinity_withMS, Dopaap_withMS = (df_withMS[column].to_list() for column in ('QM_N1', 'P_N_QM'))
tool_name_list, tool_data_list = ['Dopaap_withMS'], [Dopaap_withMS]
print('\n使用质谱数据训练')
output_result(affinity_withMS, tool_name_list, tool_data_list)

# Other tools: the <tool>_N1 columns of df_tools.
affinity_for_tools = df_tools['QM_N1'].to_list()
tool_name_list = ['nmp3','nmp4','ANN4','MHCf1_2_0','MHCf_MS','MHCf_noMS']
tool_data_list = [df_tools[f'{tool}_N1'].to_list() for tool in tool_name_list]
# Keep the per-tool lists available under their individual names as well.
nmp3, nmp4, ANN4, MHCf1_2_0, MHCf_MS, MHCf_noMS = tool_data_list

print('\n其他工具')
output_result(affinity_for_tools, tool_name_list, tool_data_list)

1-log50000(x), 限制[0,1]范围计算, x单位为nm

未使用质谱数据训练
Dopaap_noMS MAE 0.2056814853178532 MSE 0.05672799978868849 RMSE 0.23817640476900412 r2: -3.9542255486170497 PCC 0.4313593092624816 tau 0.3000276979667089

使用质谱数据训练
Dopaap_withMS MAE 0.20958786034303514 MSE 0.05713363666118648 RMSE 0.23902643506772736 r2: -3.989651027475401 PCC 0.5134130865930758 tau 0.39410226453890007

其他工具
nmp3 MAE 0.14632891907220585 MSE 0.03208788696548641 RMSE 0.17913092129916156 r2: -1.802330947639852 PCC 0.44786375771150416 tau 0.36023066332554987
nmp4 MAE 0.14526349766954708 MSE 0.03154090487534922 RMSE 0.17759759253815693 r2: -1.7545613690245676 PCC 0.46180261391142113 tau 0.3708868974151432
ANN4 MAE 0.14959600572198928 MSE 0.03262362872012489 RMSE 0.18062012268882138 r2: -1.8491188742048243 PCC 0.4219795267641099 tau 0.35278557120491877
MHCf1_2_0 MAE 0.21104611751636945 MSE 0.057658049668864286 RMSE 0.24012090635524488 r2: -4.035449580753185 PCC 0.48915775555649377 tau 0.3834698166861139
MHCf_MS MAE 0.185898769223

$1-\log_{50000}(x)$, 不限制在 [0,1] 范围内计算, x 单位为 nM (nmol/L)

In [23]:
# Metrics on the N2 columns (1 - log_50000(x), range not limited).
print('1-log50000(x), 不限制[0,1]范围计算, x单位为nm')

# Dopaap results from the "noMS" table; 'P_N_QM' is read as the prediction column.
affinity_noMS, Dopaap_noMS = (df_noMS[column].to_list() for column in ('QM_N2', 'P_N_QM'))
tool_name_list, tool_data_list = ['Dopaap_noMS'], [Dopaap_noMS]
print('\n未使用质谱数据训练')
output_result(affinity_noMS, tool_name_list, tool_data_list)

# Dopaap results from the "withMS" table.
affinity_withMS, Dopaap_withMS = (df_withMS[column].to_list() for column in ('QM_N2', 'P_N_QM'))
tool_name_list, tool_data_list = ['Dopaap_withMS'], [Dopaap_withMS]
print('\n使用质谱数据训练')
output_result(affinity_withMS, tool_name_list, tool_data_list)

# Other tools: the <tool>_N2 columns of df_tools.
affinity_for_tools = df_tools['QM_N2'].to_list()
tool_name_list = ['nmp3','nmp4','ANN4','MHCf1_2_0','MHCf_MS','MHCf_noMS']
tool_data_list = [df_tools[f'{tool}_N2'].to_list() for tool in tool_name_list]
# Keep the per-tool lists available under their individual names as well.
nmp3, nmp4, ANN4, MHCf1_2_0, MHCf_MS, MHCf_noMS = tool_data_list

print('\n其他工具')
output_result(affinity_for_tools, tool_name_list, tool_data_list)

1-log50000(x), 不限制[0,1]范围计算, x单位为nm

未使用质谱数据训练
Dopaap_noMS MAE 0.24665963065407487 MSE 0.07680004091292754 RMSE 0.27712820302691593 r2: -3.511057244826551 PCC 0.4465653542200587 tau 0.2956295253001475

使用质谱数据训练
Dopaap_withMS MAE 0.25056600567925674 MSE 0.07675777165134869 RMSE 0.2770519295210713 r2: -3.5085744459059764 PCC 0.541064905782849 tau 0.3944672649613143

其他工具
nmp3 MAE 0.18730706440842745 MSE 0.0461990277447484 RMSE 0.2149395909197475 r2: -1.7136243201767574 PCC 0.474998116384524 tau 0.3571028833241607
nmp4 MAE 0.1862416430057687 MSE 0.04540740588924646 RMSE 0.21309013559816994 r2: -1.6671262784573035 PCC 0.49171693463190763 tau 0.36897610101844114
ANN4 MAE 0.19057415105821093 MSE 0.04702808761297453 RMSE 0.2168596034603368 r2: -1.7623213844035406 PCC 0.45298101914292044 tau 0.348152652428663
MHCf1_2_0 MAE 0.2520242628525911 MSE 0.07748319399930206 RMSE 0.2783580320366238 r2: -3.5511840812576594 PCC 0.5186518812929107 tau 0.38361369382474875
MHCf_MS MAE 0.22687691455929002 MSE

$-\log_{10}(x)$ 计算, x 单位为 M (mol/L)

In [24]:
# Metrics on the -log10 columns (values converted to mol before the logarithm).
print('-log10(x)计算, x单位为mol')

# Dopaap results from the "noMS" table.
affinity_noMS, Dopaap_noMS = (df_noMS[column].to_list() for column in ('QM_n_log10', 'pred_QM_n_log10'))
tool_name_list, tool_data_list = ['Dopaap_noMS'], [Dopaap_noMS]
print('\n未使用质谱数据训练')
output_result(affinity_noMS, tool_name_list, tool_data_list)

# Dopaap results from the "withMS" table.
affinity_withMS, Dopaap_withMS = (df_withMS[column].to_list() for column in ('QM_n_log10', 'pred_QM_n_log10'))
tool_name_list, tool_data_list = ['Dopaap_withMS'], [Dopaap_withMS]
print('\n使用质谱数据训练')
output_result(affinity_withMS, tool_name_list, tool_data_list)

# Other tools: the <tool>_n_log10 columns of df_tools.
affinity_for_tools = df_tools['QM_n_log10'].to_list()
tool_name_list = ['nmp3','nmp4','ANN4','MHCf1_2_0','MHCf_MS','MHCf_noMS']
tool_data_list = [df_tools[f'{tool}_n_log10'].to_list() for tool in tool_name_list]
# Keep the per-tool lists available under their individual names as well.
nmp3, nmp4, ANN4, MHCf1_2_0, MHCf_MS, MHCf_noMS = tool_data_list

print('\n其他工具')
output_result(affinity_for_tools, tool_name_list, tool_data_list)

-log10(x)计算, x单位为mol

未使用质谱数据训练
Dopaap_noMS MAE 1.1590462056718842 MSE 1.6957694102511567 RMSE 1.302217113330629 r2: -3.511057244491284 PCC 0.44656535421945204 tau 0.2956295253001475

使用质谱数据训练
Dopaap_withMS MAE 1.177402144827828 MSE 1.6948360916164509 RMSE 1.3018587064718086 r2: -3.508574445967465 PCC 0.5410649057812863 tau 0.3944672649613143

其他工具
nmp3 MAE 0.8801502772554356 MSE 1.0200892747900105 RMSE 1.0099946904761483 r2: -1.7136243201767583 PCC 0.47499811638452416 tau 0.3571028833241607
nmp4 MAE 0.8751438940423644 MSE 1.0026100116126877 RMSE 1.0013041553956958 r2: -1.6671262784573044 PCC 0.49171693463190763 tau 0.36897610101844114
ANN4 MAE 0.8955022194243347 MSE 1.038395181234815 RMSE 1.0190167718123264 r2: -1.7623213844035415 PCC 0.45298101914292055 tau 0.348152652428663
MHCf1_2_0 MAE 1.1842544515092222 MSE 1.710853648519615 RMSE 1.3079960430061 r2: -3.5511840812576603 PCC 0.5186518812929106 tau 0.38361369382474875
MHCf_MS MAE 1.0660878161904097 MSE 1.4107203479290542 RMSE 1.1877