In [6]:
from teim_utils import *

In [8]:
config = load_config('./TEIM/train_teim/configs/seqlevel_all.yml')
config.data.path = './TEIM/data/binding_data'
config.data.negative = 'original'

datasets = load_data(config.data)

original negative samples


In [2]:
config = load_config('./TEIM/train_teim/configs/reslevel_bothnew.yml')
config.data.path.summary = './TEIM/data/stcrdab_pdb.csv'
config.data.path.mat = './TEIM/data/contact_map'

#config.data.negative = 'original'

In [3]:
data = load_data(config.data)

Encoding cdr3s: 100%|██████████████████████| 122/122 [00:00<00:00, 84789.58it/s]
Encoding epi seqs: 100%|██████████████████| 122/122 [00:00<00:00, 118340.68it/s]
Encoding dist mat: 100%|██████████████████| 122/122 [00:00<00:00, 162291.50it/s]


In [None]:
cdr3s = []
epis = []
labels = []

dataset = data['train'][0]

for i in range(len(dataset)):
    inp = dataset[i]
    cdr3s.append(inp['cdr3'])
    epis.append(inp['epi'])
    labels.append(inp['labels'])

train_df = pd.DataFrame({'cdr3': cdr3s, 'epi': epis, 'Y': labels})

In [None]:
cdr3s = []
epis = []
labels = []

dataset = data['val'][0]

for i in range(len(dataset)):
    inp = dataset[i]
    cdr3s.append(inp['cdr3'])
    epis.append(inp['epi'])
    labels.append(inp['labels'])

val_df = pd.DataFrame({'cdr3': cdr3s, 'epi': epis, 'Y': labels})

In [None]:
train_df.to_csv('processed_data_seq_train.csv')
val_df.to_csv('processed_data_seq_val.csv')

In [1]:
import pandas as pd
import glob, os
import seaborn as sns
import matplotlib.pyplot as plt

In [36]:
def concat_dataframes_from_paths(file_paths):
    
    # Read each file into a dataframe and store in a list
    df_list = [pd.read_csv(file_path) for file_path in file_paths]

    for i in range(len(df_list)):
        df_list[i]['split'] = i

    # Concatenate the list of dataframes
    return pd.concat(df_list, ignore_index=True)

def get_results(fl_names, split='reslevel_newcdr3'):

    #splits = ['reslevel_newcdr3', 'reslevel_newepi']

    out_dict = {'Method':[], 'AUPRC': [], 'AUROC': []}

    for fl_name in fl_names:
        metric_fls = [os.path.join('results', fl_name, split, f'metrics_{i}.csv') for i in [0,2,4]]
        df_full = concat_dataframes_from_paths(metric_fls)
        num = len(df_full)
        
        out_dict['Method'] += [fl_name]*num
        out_dict['AUPRC'] += df_full.loc[:, 'auprc'].tolist()
        out_dict['AUROC'] += df_full.loc[:, 'auc'].tolist()

    df = pd.DataFrame.from_dict(out_dict)
    return df

def make_plot(df):
    df_long = df.melt(var_name="Method", value_name="AUPRC", id_vars='split')

    # Create the boxplot
    plt.figure(figsize=(10, 6))

    sns.stripplot(
        data=df_long, x='split', y='AUPRC', hue='Method',
        dodge=True, alpha=.2, legend=False,
    )
    sns.pointplot(
        data=df_long, x='split', y='AUPRC', hue='Method',
        dodge=.4, errorbar=None, linestyles="none",
        markers="_"
    )
    
    plt.xticks(rotation=45)
    plt.title("Boxplot of AUPRC values by Method")
    plt.show()

In [37]:
cdr3_df = get_results( ['teim','pretrain'], split='reslevel_newcdr3')
epi_df = get_results( ['teim','no_pretrain'], split='reslevel_newepi')

In [40]:
cdr3_df.to_csv('cdr3_results.csv')
epi_df.to_csv('epi_results.csv')

In [30]:
cdr3_df.mean()

teim        0.550676
pretrain    0.559745
split       0.986301
dtype: float64

In [31]:
epi_df.mean()

teim           0.602436
no_pretrain    0.622063
split          0.986301
dtype: float64

In [21]:
# make_plot(cdr3_df)

In [22]:
# make_plot(epi_df)