In [44]:
import pandas as pd
from sklearn.metrics import ndcg_score
from scipy.stats import spearmanr
from scipy.stats import pearsonr
from sklearn.metrics import matthews_corrcoef
from pathlib import Path


In [2]:
# Notebook configuration.
# NOTE: despite its name, PATH_FOLDER points at a CSV *file* (the Round1 result file);
# the per-round loop derives the other rounds' paths from its parent directories and file name.
PATH_FOLDER = Path('giacomelli_boltz2_100/REPL3/Round1/df_sorted_all.csv')
# File name of the per-round metrics log written with DataFrame.to_csv.
LOG_NAME = "giacomelli_REPL3_nDCG@k_rho_p_k10_RF100_PCA100.csv"
# Reference table; later cells read its 'hgvs_pro' and 'score' columns.
reference = pd.read_csv('giacomelli_ref.csv')
# k used for nDCG@k and for the top-k slice passed to spearmanr.
k_value = 10
# Threshold: values strictly greater than this are labelled 1, otherwise 0.
cutoff = 1

In [69]:
# Derive two helper columns on the reference table:
#   score_shifted - 'score' minus its minimum, so the smallest value becomes 0
#   binary_true   - 1 where 'score' is strictly above `cutoff`, else 0
raw_score = reference['score']
reference['score_shifted'] = raw_score - raw_score.min()
reference['binary_true'] = (raw_score > cutoff).astype('int64')

# Report the range before and after shifting.
shifted_score = reference['score_shifted']
print(f"minimum value {raw_score.min()}")
print(f"maximum value {raw_score.max()}")
print(f"minimum_shifted {shifted_score.min()}")
print(f"maximum_shifted {shifted_score.max()}")
reference

minimum value -3.943922005
maximum value 2.533066897
minimum_shifted 0.0
maximum_shifted 6.476988902


Unnamed: 0,hgvs_pro,score,activity_scaled,score_shifted,binary_true
0,M1A,-0.788753,0.487135,3.155169,0
1,M1C,-1.969077,0.304902,1.974845,0
2,M1D,0.536895,0.691806,4.480817,0
3,M1E,1.227243,0.798390,5.171165,1
4,M1F,0.536895,0.691806,4.480817,0
...,...,...,...,...,...
7443,D393S,1.664632,0.865920,5.608554,1
7444,D393T,1.358976,0.818729,5.302898,1
7445,D393V,1.182561,0.791492,5.126483,1
7446,D393W,1.583180,0.853344,5.527102,1


In [42]:
# Score each round's predictions against the reference table and log one row per round.
#
# For every Round{i} file (i = 1..10) this computes:
#   - nDCG@k_value of y_pred against the shifted reference score,
#   - Spearman rho / p-value on the k_value highest-predicted variants only,
#   - MCC between thresholded reference scores and thresholded predictions (all variants).
# The collected rows are written to LOG_NAME as CSV.
rows = []

# Loop-invariant lookup table (variant name -> reference columns); built once
# instead of twice per round.
reference_by_variant = reference.set_index('hgvs_pro')

for i in range(1, 11):
    # Check the file that is actually about to be read. Testing PATH_FOLDER itself
    # would only ever validate the Round1 file, regardless of `i`.
    round_path = PATH_FOLDER.parent.parent / f'Round{i}' / PATH_FOLDER.name
    if not round_path.exists():
        print(f'Check path: {round_path}')
        break

    df_sorted_all = pd.read_csv(round_path)

    suggested = df_sorted_all[['variant', 'y_pred']].copy()

    # Attach the reference values for each suggested variant.
    # NOTE(review): a variant missing from `reference` maps to NaN, which would make the
    # integer cast / ndcg_score below fail - confirm every variant is present.
    suggested['y_score'] = suggested['variant'].map(reference_by_variant['score_shifted'])
    suggested['binary_true'] = suggested['variant'].map(reference_by_variant['binary_true'])

    # Same threshold rule as for the reference: strictly above `cutoff` -> 1.
    suggested['binary_predicted'] = (suggested['y_pred'] > cutoff).astype(int)

    # Rank by predicted value, best first, so .iloc[:k_value] is the predicted top-k.
    suggested = suggested.sort_values(by='y_pred', ascending=False).reset_index(drop=True)

    # ndcg_score expects 2-D input of shape (n_samples, n_labels): one "query" row.
    y_true = suggested['y_score'].values.reshape(1, -1)
    y_score = suggested['y_pred'].values.reshape(1, -1)

    # Binary label vectors for MCC
    y_true_binary = suggested['binary_true'].values.astype(int)
    y_pred_binary = suggested['binary_predicted'].values.astype(int)

    # Spearman rho / p-value, restricted to the predicted top-k
    top_k = suggested.iloc[:k_value]
    rho, pval = spearmanr(top_k['y_score'], top_k['y_pred'])

    # nDCG@k
    score = ndcg_score(y_true, y_score, k=k_value)

    # MCC over all variants
    mcc = matthews_corrcoef(y_true_binary, y_pred_binary)

    row_log = {'': f'Round{i}', 'nDCG': score, 'Spearman_rho': rho, 'Spearman_pval': pval, 'MCC': mcc}

    rows.append(row_log)

    print(row_log)

log_df = pd.DataFrame(rows)
log_df.to_csv(LOG_NAME, index=False)


{'': 'Round1', 'nDCG': 0.767001536007717, 'Spearman_rho': 0.2727272727272727, 'Spearman_pval': 0.44583834154275137, 'MCC': 0.031146655193741102}
{'': 'Round2', 'nDCG': 0.8037781247488506, 'Spearman_rho': 0.6606060606060605, 'Spearman_pval': 0.03758837757140939, 'MCC': 0.031146655193741102}
{'': 'Round3', 'nDCG': 0.8630953852865065, 'Spearman_rho': 0.5757575757575757, 'Spearman_pval': 0.08155281477260236, 'MCC': 0.06230592931000352}
{'': 'Round4', 'nDCG': 0.8598520311723054, 'Spearman_rho': 0.7818181818181817, 'Spearman_pval': 0.007547007781067878, 'MCC': 0.09266635053070464}
{'': 'Round5', 'nDCG': 0.8918567578840845, 'Spearman_rho': 0.7333333333333332, 'Spearman_pval': 0.01580059625057158, 'MCC': 0.1277805269211782}
{'': 'Round6', 'nDCG': 0.8938730797443956, 'Spearman_rho': 0.8303030303030302, 'Spearman_pval': 0.0029402270232795065, 'MCC': 0.13476604161075478}
{'': 'Round7', 'nDCG': 0.8944226221019326, 'Spearman_rho': 0.8666666666666665, 'Spearman_pval': 0.0011735381801554687, 'MCC': 0

In [34]:
# Display `suggested` as last assigned by the per-round loop cell (i.e. the most recently processed round).
suggested

Unnamed: 0,variant,y_pred,y_score,binary_true,binary_predicted
0,G117N,1.118495,5.392732,1,1
1,H368S,1.029193,5.136152,1,1
2,T284Y,0.944952,5.091413,1,0
3,L369Y,0.905223,5.130412,1,0
4,H115E,0.881207,5.139941,1,0
...,...,...,...,...,...
7403,C275Y,-1.466926,2.504021,0,0
7404,P152C,-1.472827,2.317928,0,0
7405,A276R,-1.488104,2.132936,0,0
7406,L344N,-1.542031,1.952461,0,0


In [70]:
# First 10 rows of the Round5 result file, in the order stored in the CSV.
df_sorted_all_test = pd.read_csv('giacomelli_boltz2_100/REPL3/Round5/df_sorted_all.csv').head(10)

# Experiment (disabled): keep only rows whose y_actual is missing before taking the first 10.
#df_sorted_all_test = df_sorted_all_test[df_sorted_all_test['y_actual'].isna()].head(10)

# Reference score per variant, used to (re)fill y_actual.
ref_score_by_variant = reference.set_index('hgvs_pro')['score']

df_sorted_all_test = df_sorted_all_test.assign(
    # Every row in this slice is labelled as a predicted positive.
    binary_predicted=1,
    y_actual=lambda top: top['variant'].map(ref_score_by_variant),
    # 1 where the reference score is strictly above `cutoff`, else 0.
    binary_actual=lambda top: (top['y_actual'] > cutoff).astype('int64'),
)

# Working copy without the 'Unnamed: 0' column read from the CSV.
df = df_sorted_all_test.drop(columns=['Unnamed: 0']).copy()

df

Unnamed: 0,variant,y_pred,y_actual,y_actual_scaled,y_actual_binary,dist_metric,std_predictions,binary_predicted,binary_actual
0,S367F,1.36751,1.69089,1.0,1.0,17.768733,0.0,1,1
1,G374P,1.338307,1.452597,0.920395,1.0,21.471072,0.0,1,1
2,Q375T,1.306178,1.485655,0.931438,1.0,13.31603,0.0,1,1
3,L369G,1.299201,1.459392,0.922665,1.0,27.304873,0.0,1,1
4,Q375P,1.194835,1.288033,0.86542,1.0,19.552276,0.0,1,1
5,K370P,1.170758,1.17477,0.827583,1.0,17.265166,0.0,1,1
6,L369S,1.153153,1.31813,0.875474,1.0,19.843681,0.0,1,1
7,K381P,1.132407,1.155508,0.821148,1.0,17.175137,0.0,1,1
8,S362K,1.123806,1.339242,0.882527,1.0,19.324538,0.0,1,1
9,S371C,1.121907,1.263028,0.857067,1.0,15.209694,0.0,1,1


In [None]:
# Matthews correlation coefficient between the binary labels stored in `df`.
y_actual_bin_test = df['binary_actual'].to_numpy().astype(int)
y_predicted_bin_test = df['binary_predicted'].to_numpy().astype(int)

# Disabled alternative: rank correlation on the same binary vectors.
#rho , pval = spearmanr(y_actual_bin_test, y_predicted_bin_test)

mcc_test = matthews_corrcoef(y_actual_bin_test, y_predicted_bin_test)

# NOTE(review): the format spec '5f' means minimum width 5 with the default 6 decimals;
# '.5f' may have been intended - confirm before changing the printed output.
print(f'{mcc_test:5f}')

0.000000


In [None]:
# The 10 highest-scoring reference rows, with the helper columns recomputed:
#   binary_true   - 1 where 'score' is strictly above `cutoff`, else 0
#   score_shifted - 'score' minus the minimum score of the *full* reference table
reference_top10 = (
    reference
    .sort_values(by='score', ascending=False)
    .head(10)
    .assign(
        binary_true=lambda top: (top['score'] > cutoff).astype('int64'),
        score_shifted=lambda top: top['score'] - reference['score'].min(),
    )
)

# Show the first rows of both frames together (displayed as a tuple).
(reference_top10.head(), df_sorted_all_test.head())

Unnamed: 0,hgvs_pro,score,activity_scaled,score_shifted
2225,A119E,2.533067,1.0,6.476989
628,P34C,2.099894,0.933121,6.043816
7017,S371H,1.971965,0.91337,5.915887
7440,D393P,1.950754,0.910095,5.894676
7210,K381M,1.916446,0.904798,5.860368
2128,L114A,1.901525,0.902495,5.845447
942,I50P,1.88192,0.899468,5.825842
6923,S366I,1.877319,0.898757,5.821241
7014,S371E,1.875403,0.898461,5.819325
7435,D393I,1.860547,0.896168,5.804469


In [56]:
# Compare the reference top-10 scores with the reference scores of the rows held in `df`.
# The two arrays are paired by position, not by variant name.
y_true_top = reference_top10['score'].to_numpy()
y_predicted_top = df['y_actual'].to_numpy()

# Rank and linear correlation between the two score vectors.
srho, spval = spearmanr(y_true_top, y_predicted_top)
prho, ppval = pearsonr(y_true_top, y_predicted_top)

# ndcg_score needs 2-D input (one row per query); no `k` is passed, so every element is ranked.
ndcg_test = ndcg_score(y_true_top[None, :], y_predicted_top[None, :])

print(f'Spearman: rho {srho}, pval{spval}')
print(f'Pearson: rho {prho}, pval{ppval}')
print(f'nDCG@k : {ndcg_test}')

Spearman: rho 0.7333333333333332, pval0.01580059625057158
Pearson: rho 0.8311646976887862, pval0.002884138286397513
nDCG@k : 0.9967086202662444


In [59]:
# Print both score arrays, one per line (the separator keeps the space before the line break).
print(f'true : {y_true_top}', f'predicted . {y_predicted_top}', sep=' \n')

true : [2.5330669  2.09989438 1.97196537 1.95075376 1.91644559 1.90152491
 1.88191958 1.87731884 1.87540311 1.86054673] 
predicted . [1.69089044 1.45259748 1.48565513 1.45939217 1.28803328 1.1747695
 1.31813036 1.15550802 1.33924173 1.26302839]
