In [1]:
# Mount Google Drive at /content/gdrive (Colab only); the score CSV read
# later in this notebook lives under this mount point.
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [2]:
import numpy as np
import scipy as sc
import pandas as pd
import time
import os
import gc

In [3]:
# Name of the target-property column; it is used both in the results path
# below and as the sort key for the ground-truth ranking.
prop = "HOMO_energy"

# Loss-function tag of the run to evaluate; here it only selects which
# results folder / file name is read. Uncomment the alternative to switch.
loss_f="BinaryCrossentropy"
#loss_f="MSE"

# CSV holding the per-structure scores for the selected property / loss pair.
score_file_path = f"/content/gdrive/MyDrive/ModelParameters_Pairwise/{prop}_{loss_f}/structures_scores_{loss_f}.csv"

In [4]:
# Load the full score table.
df_resultado = pd.read_csv(score_file_path)
# One row per distinct (Molecula, Conformero, Isomero) combination; these
# combinations are the groups that get ranked and evaluated independently.
df_resultado_nodup  = df_resultado[['Molecula','Conformero', 'Isomero']].drop_duplicates()

In [5]:
# Preview the first 10 rows of the loaded score table.
df_resultado.head(10)

Unnamed: 0.1,Unnamed: 0,Molecula,FileName,HOMO_energy,Predito,Isomero,Conformero
0,0,molecula_31,Geom-m716-i1-c2-d8.xyz,-7.10024,0.001202,i1,c2
1,1,molecula_31,Geom-m716-i1-c2-d31.xyz,-7.10969,0.000878,i1,c2
2,2,molecula_31,Geom-m716-i1-c2-d56.xyz,-7.02815,0.002549,i1,c2
3,3,molecula_31,Geom-m716-i2-c1-d10.xyz,-6.9662,0.002698,i2,c1
4,4,molecula_31,Geom-m716-i2-c2-d4.xyz,-6.9413,0.01007,i2,c2
5,5,molecula_31,Geom-m716-i1-c1-d18.xyz,-7.18289,7.9e-05,i1,c1
6,6,molecula_31,Geom-m716-i1-c3-d4.xyz,-7.06057,0.001619,i1,c3
7,7,molecula_31,Geom-m716-i1-c2-d28.xyz,-7.26798,1.2e-05,i1,c2
8,8,molecula_31,Geom-m716-i2-c1-d73.xyz,-7.25751,0.00032,i2,c1
9,9,molecula_31,Geom-m716-i2-c2-d41.xyz,-7.02008,0.00213,i2,c2


In [6]:
# Per-group metric accumulators: one entry is appended for every evaluated
# (Molecula, Conformero, Isomero) group.
lst_spearman, lst_mae_rank, lst_ndcg = [], [], []

In [7]:
# Evaluate the predicted ranking against the true ranking, independently for
# every (Molecula, Conformero, Isomero) group.
#
# For each group the rows are ranked 1..n by ascending `prop` (true rank) and
# by ascending 'Predito' (predicted rank). Ranks are positions in the sorted
# order, so tied values still receive distinct ranks. Groups with more than
# one row contribute:
#   * Spearman : Pearson correlation between the two rank vectors
#   * MAE Rank : mean absolute difference between the two rank vectors
#   * nDCG     : sum(exp(-rank_true) / (1 + ln(rank_predito))) divided by the
#                same sum evaluated with rank_predito == rank_true
# `qtde_first_place` counts the groups (single-row groups included) whose
# predicted top item is also the true top item.
#
# The accumulators are (re)initialised here so that re-running this cell
# starts from a clean state instead of appending to a previous run's results.
lst_spearman = list()
lst_mae_rank = list()
lst_ndcg = list()
qtde_first_place = 0

group_cols = ['Molecula', 'Conformero', 'Isomero']

# sort=False visits the groups in order of first appearance (the same order as
# drop_duplicates on these columns); dropna=False keeps groups whose key
# contains a missing value so the group count matches that table.
for j, (_, df_grupo) in enumerate(df_resultado.groupby(group_cols, sort=False, dropna=False)):
  vec_rank = list(range(1, df_grupo.shape[0] + 1))

  # ------ Create the true ranking -------
  df_aux = df_grupo.sort_values(by=prop)
  df_aux['rank_true'] = vec_rank
  # ---------------------------------------

  # ------ Create the predicted ranking -------
  df_aux = df_aux.sort_values(by='Predito')
  df_aux['rank_predito'] = vec_rank

  rank_true = df_aux['rank_true'].to_numpy()
  rank_predito = df_aux['rank_predito'].to_numpy()

  # Rows are now ordered by predicted rank, so position 0 is the predicted
  # first place; it is a hit when that row is also the true first place.
  if rank_true[0] == 1:
    qtde_first_place = qtde_first_place + 1

  if df_aux.shape[0] <= 1:
    # Rank metrics are not computed for a single item.
    print("Somente uma linha no dataset:", j)
  else:
    spearman_rank = np.corrcoef(rank_predito, rank_true)[0, 1]
    mae_rank = np.mean(np.abs(rank_predito - rank_true))
    # Gain decays exponentially with the true rank; the discount grows with
    # the logarithm of the rank it is evaluated at.
    ganho = np.exp(-rank_true)
    dcg = np.sum(ganho / (1 + np.log(rank_predito)))
    idcg = np.sum(ganho / (1 + np.log(rank_true)))
    lst_ndcg.append(dcg/idcg)
    lst_spearman.append(spearman_rank)
    lst_mae_rank.append(mae_rank)

In [8]:
# Collect the per-group metrics into one table (one row per evaluated group).
# Columns are attached one at a time, in this order, to an empty frame.
df_resultado_consolidado = pd.DataFrame().assign(**{
    'Spearman': lst_spearman,
    'MAE Rank': lst_mae_rank,
    'nDCG': lst_ndcg,
})

In [9]:
# Preview the first rows of the per-group metrics table.
df_resultado_consolidado.head()

Unnamed: 0,Spearman,MAE Rank,nDCG
0,0.816727,12.970297,0.580697
1,0.874758,10.712871,0.346086
2,0.885428,10.29703,0.620133
3,0.832137,11.465347,0.446849
4,0.768002,14.059406,0.384643


In [10]:
# Interval for the mean Spearman correlation across groups:
# mean +/- 1.96 * std / sqrt(n).
# Despite its name, `stdev` holds the half-width of that interval.
# np.std is called with its default ddof=0.
spearman_vals = df_resultado_consolidado['Spearman'].values
media_spearman = np.mean(spearman_vals)
stdev = 1.96*np.std(spearman_vals)/np.sqrt(len(spearman_vals))
lb_spearman, ub_spearman = media_spearman - stdev, media_spearman + stdev
print("Media Spearmann:", lb_spearman, ub_spearman)

Media Spearmann: 0.8196264543840842 0.8284561980504058


In [11]:
# Interval for the mean rank MAE across groups:
# mean +/- 1.96 * std / sqrt(n).
# Despite its name, `stdev` holds the half-width of that interval.
# np.std is called with its default ddof=0.
mae_vals = df_resultado_consolidado['MAE Rank'].values
media_mae = np.mean(mae_vals)
stdev = 1.96*np.std(mae_vals)/np.sqrt(len(mae_vals))
lb_mae, ub_mae = media_mae - stdev, media_mae + stdev

print("Mae Ranking:", lb_mae, ub_mae)

Mae Ranking: 12.209896545151077 12.537136052845563


In [12]:
# Interval for the mean nDCG across groups:
# mean +/- 1.96 * std / sqrt(n).
# Despite its name, `stdev` holds the half-width of that interval.
# np.std is called with its default ddof=0.
ndcg_vals = df_resultado_consolidado['nDCG'].values
media_ndcg = np.mean(ndcg_vals)
stdev = 1.96*np.std(ndcg_vals)/np.sqrt(len(ndcg_vals))
lb_ndcg, ub_ndcg = media_ndcg - stdev, media_ndcg + stdev

print("nDCG:", lb_ndcg, ub_ndcg)

nDCG: 0.6603542388895894 0.6797961695331819


In [13]:
# Share of groups whose predicted first place matches the true first place.
# The denominator is the number of distinct groups, single-row ones included.
total_grupos = df_resultado_nodup.shape[0]
pct_first_place = np.round(100.0*qtde_first_place/total_grupos, 2)
print("Percentage first place:", pct_first_place, "%")

Percentage first place: 34.31 %


In [14]:
# Final step of the notebook: call Colab's runtime.unassign().
# NOTE(review): presumably this releases the Colab VM once the evaluation is
# done, so nothing can run after this cell — confirm against the Colab docs.
from google.colab import runtime
runtime.unassign()