In [1]:
import os
import glob
import pandas as pd

data_dir = 'multimers'
csv_list = glob.glob(os.path.join(data_dir, '*.csv'))
lab_name = 'CSSB'
group_abbreviation = {'CSSB-Human': 'human', 'CSSB_FAKER': 'FAKER', 'CSSB_experimental': 'EXP'}
ab_targets = ['H1204', 'H1215', 'H1222', 'H1232', 'H1233', 'H1244']

tmscore_list = []
ranking_list = []
z_score_list = []

for csv_fn in csv_list:
    target_id = os.path.basename(csv_fn)[:-4].replace('o', '')
    
    if int(target_id[1]) != 1:
        continue
        
    df = pd.read_csv(csv_fn, na_values=['', ' ', '-', 'NA', 'null']).dropna()
    df['Ranking'] = df['TMscore'].rank(method='min', ascending=False).astype(int)
    mean_tm, std_tm = df['TMscore'].mean(), df['TMscore'].std()
    df['Zscore'] = (df['TMscore'] - mean_tm) / std_tm
    
    df = df[df['Group'].str.contains(lab_name, na=False)]
    df['Model'] = df['Group'].map(group_abbreviation) + '_' + df['Model'].str.split('_').str[-1].str[0]
    df = df.set_index('Model').sort_index(ascending=True)[['TMscore', 'Ranking', 'Zscore']]
    
    tmscore_dict = {'target_id' : target_id}
    ranking_dict = {'target_id' : target_id}
    z_score_dict = {'target_id' : target_id}
    
    for i, row in df.iterrows():
        tmscore_dict[i] = row['TMscore']
        ranking_dict[i] = row['Ranking']
        z_score_dict[i] = row['Zscore']
    
    tmscore_list.append(tmscore_dict)
    ranking_list.append(ranking_dict)
    z_score_list.append(z_score_dict)

In [2]:
def compare_ranks(row):
    if row['EXP_best_rank'] < row['human_best_rank']:
        return 'win'
    elif row['EXP_best_rank'] > row['human_best_rank']:
        return 'lose'
    else:
        return 'tie'
    
tmscore_df = pd.DataFrame(tmscore_list).dropna()
ranking_df = pd.DataFrame(ranking_list).dropna().astype(int, errors='ignore')
ranking_df['EXP_best_rank'] = ranking_df[['EXP_1', 'EXP_2', 'EXP_3', 'EXP_4', 'EXP_5']].min(axis=1)
ranking_df['human_best_rank'] = ranking_df[['human_1', 'human_2', 'human_3', 'human_4', 'human_5']].min(axis=1)
ranking_df['EXP_best'] = ranking_df[['EXP_1', 'EXP_2', 'EXP_3', 'EXP_4', 'EXP_5']].idxmin(axis=1)
ranking_df['human_best'] = ranking_df[['human_1', 'human_2', 'human_3', 'human_4', 'human_5']].idxmin(axis=1)
ranking_df['result'] = ranking_df.apply(compare_ranks, axis=1)
z_score_df = pd.DataFrame(z_score_list).dropna()

tmscore_df.to_csv(data_dir + '_tmscore.csv', index=False)
ranking_df.to_csv(data_dir + '_ranking.csv', index=False)
z_score_df.to_csv(data_dir + '_z_score.csv', index=False)

In [3]:
ranking_df['result'].value_counts()

result
lose    18
win     16
tie      2
Name: count, dtype: int64

In [4]:
no_ab_df = ranking_df[~ranking_df['target_id'].isin(ab_targets)]

In [5]:
no_ab_df['result'].value_counts()

result
lose    15
win     14
tie      1
Name: count, dtype: int64

In [6]:
final_df = no_ab_df[['target_id', 'result', 'EXP_best', 'human_best', 'EXP_best_rank', 'human_best_rank']]

In [7]:
final_df[final_df['result'] == 'win']

Unnamed: 0,target_id,result,EXP_best,human_best,EXP_best_rank,human_best_rank
1,H1202,win,EXP_4,human_1,91,136
3,T1206,win,EXP_2,human_5,115,172
5,H1213,win,EXP_1,human_1,21,24
8,T1218,win,EXP_4,human_4,147,257
10,H1220,win,EXP_1,human_4,2,24
14,H1227,win,EXP_2,human_4,2,12
15,H1229,win,EXP_1,human_1,110,202
22,T1237,win,EXP_3,human_5,33,42
23,T1240,win,EXP_5,human_4,86,169
29,H1258,win,EXP_5,human_5,13,24
