In [1]:
import numpy as np
import pandas as pd
import os
import re
import pyreadr
import pickle

## Settings and read data

In [1]:
#### Settings
# Directory layout implied by the path construction below:
#   <EPF root>/<experiment>/<condition folder>/<condition folder>.rds
#   <Split root>/<experiment>/<split files and CV csv files>
# os.listdir() returns entries in arbitrary order, so every listing is sorted
# to keep the order of the path lists (and of exp_cond) reproducible.
epf_dir = '/FREEPII_github/EPF-data/'
epf_dir = [epf_dir + i for i in sorted(os.listdir(epf_dir))]
epf_path = ['/'.join([i, j, j + '.rds']) for i in epf_dir for j in sorted(os.listdir(i))]
print(*epf_path, sep='\n')
print('*'*50)

split_dir = '/FREEPII_github/Split-data/'
split_dir = [split_dir + i for i in sorted(os.listdir(split_dir))]
# Keep only the files whose name marks them as reference, held-out or
# experimental splits.
split_path = ['/'.join([i, j]) for i in split_dir for j in sorted(os.listdir(i))
              if any(tag in j for tag in ('ref', 'heldout', 'exp'))]
print(*split_path, sep='\n')
print('*'*50)

# Cross-validation fold assignments are the files with 'csv' in their name.
cv_path = ['/'.join([i, j]) for i in split_dir for j in sorted(os.listdir(i)) if 'csv' in j]
print(*cv_path, sep='\n')
print('*'*50)

# Condition label = last '_'-separated token of the .rds file name, without
# the extension. The dot is escaped and the pattern anchored so that only a
# literal trailing '.rds' is removed (an unescaped '.' matches any character).
exp_cond = [re.sub(r'\.rds$', '', i.split('/')[-1].split('_')[-1]) for i in epf_path]
print(exp_cond)

### Read and transform EPF

In [1]:
# For every experimental condition, build a dict that maps each row name of
# the EPF table to its row position. One dict per entry of exp_cond, in order.
epf_name_idx_dict = []
# exp_cond[k] is derived from epf_path[k], so the two lists are paired by
# position instead of searching for the condition as a substring of the path
# (a substring search can pick the wrong file when one label contains another).
for cur_exp_cond, cur_epf_path in zip(exp_cond, epf_path):
    print('current data: ', cur_exp_cond)

    temp = list(pyreadr.read_r(cur_epf_path).values())[0]
    name = temp.select_dtypes(include='object').values

    if name.shape[1] == 0:
        # No string column: the names are carried by the index.
        name = temp.index.values
    else:
        # select_dtypes(...).values is 2-D; iterating it would yield
        # unhashable row arrays as dict keys. Use the first string column.
        # NOTE(review): assumes the first object column holds the names - confirm.
        name = name[:, 0]
    epf_name_idx_dict.append( dict(zip(name, np.arange(temp.shape[0]))) )
    print('*'*50)

# Drop the loop temporaries so they do not linger in the kernel namespace.
del temp
del name
del cur_exp_cond
del cur_epf_path

print(len(epf_name_idx_dict))
print([len(i) for i in epf_name_idx_dict])

### Read splits and create train/test masks for each CV fold

In [2]:
# Per-condition containers. Each condition appends to these lists while the
# loop below walks exp_cond in order.
# NOTE(review): code that indexes these lists by position in exp_cond relies on
# every condition providing exactly one split of each type - confirm.
ref_edge = []
heldout_edge = []
delfold_edge = []
exp_edge = []

ref_label = []
heldout_label = []
delfold_label = []

# train_mask[k][f] / test_mask[k][f]: boolean vector over the Ref pairs of
# condition k, True where the pair is in the Train / Test part of CV fold f.
train_mask = []
test_mask = []


def _edges_to_index(pairs, name_to_idx):
    """Translate gene-name pairs into row-index pairs.

    Parameters
    ----------
    pairs : DataFrame with 'Gene_name_A' and 'Gene_name_B' columns.
    name_to_idx : dict mapping a gene name to its row index.

    Returns
    -------
    numpy array holding one [index_A, index_B] row per pair.

    Raises
    ------
    KeyError if a gene name is not a key of `name_to_idx`.
    """
    return np.array(list(zip([name_to_idx[j] for j in pairs['Gene_name_A'].values],
                             [name_to_idx[j] for j in pairs['Gene_name_B'].values])))


for cur_exp_cond in exp_cond:
    print('current data: ', cur_exp_cond)
    cur_split_path = [i for i in split_path if cur_exp_cond in i]
    cur_dict = epf_name_idx_dict[exp_cond.index(cur_exp_cond)]

    for split_idx, split_file in enumerate(cur_split_path):
        temp = list(pyreadr.read_r(split_file).values())[0].reset_index(drop=True)
        split_types = set(temp['Type'])

        if len(split_types) > 1:
            # A file mixing several types: Held_out and Del_fold rows are
            # separated, any other type in such a file is ignored.
            for type_ in split_types:
                temp_ = temp.loc[temp['Type'] == type_].loc[:, temp.columns != 'Type']

                print('current split type: ', type_)
                print('TP number: ', temp_.loc[temp_['Label'] == 1].shape[0], '  TN number: ', temp_.loc[temp_['Label'] == 0].shape[0])

                if type_ == 'Held_out':
                    heldout_edge.append( _edges_to_index(temp_, cur_dict) )
                    heldout_label.append( temp_['Label'].values )
                elif type_ == 'Del_fold':
                    delfold_edge.append( _edges_to_index(temp_, cur_dict) )
                    delfold_label.append( temp_['Label'].values )
        else:
            only_type = list(split_types)[0]
            if only_type == 'Exp':
                print('current split type: Exp')
                exp_edge.append( _edges_to_index(temp, cur_dict) )

            elif only_type == 'Ref':
                print('current split type: Ref')
                # TN count must use Label == 0 (the original repeated Label == 1).
                print('TP number: ', temp.loc[temp['Label'] == 1].shape[0], '  TN number: ', temp.loc[temp['Label'] == 0].shape[0])

                ref_edge.append( _edges_to_index(temp, cur_dict) )
                ref_label.append( temp['Label'].values )

                # First CV file whose path contains the condition label.
                cur_cv_path = [i for i in cv_path if cur_exp_cond in i][0]
                temp_ = pd.read_csv(cur_cv_path)
                print('current cv: ', cur_cv_path)

                ref_pairs = temp.loc[:, ['Gene_name_A', 'Gene_name_B']]
                train_mask.append([])
                test_mask.append([])
                # sorted() fixes the fold order; plain set iteration order is
                # not guaranteed (and varies between sessions for strings).
                for fold in sorted(set(temp_['CV_fold'])):
                    print('current fold: ', fold)
                    temp_1 = temp_.loc[(temp_['CV_fold']==fold)&(temp_['Type']=='Train')]
                    temp_2 = temp_.loc[(temp_['CV_fold']==fold)&(temp_['Type']=='Test')]
                    print('Train TP number in this fold: ', temp_1.loc[temp_1['Label']==1].shape[0], '  Train TN number in this fold: ', temp_1.loc[temp_1['Label']==0].shape[0])
                    print('Test TP number in this fold: ', temp_2.loc[temp_2['Label']==1].shape[0], '  Test TN number in this fold: ', temp_2.loc[temp_2['Label']==0].shape[0])

                    # Left-join the fold rows onto the Ref pairs: a pair that is
                    # absent from the fold subset gets a NaN Label, so "not NaN"
                    # marks membership.
                    # NOTE(review): a pair listed twice in one fold subset would
                    # lengthen the merge result and misalign the mask - confirm
                    # pairs are unique per fold.
                    temp1 = pd.merge(ref_pairs, temp_1, how='left', on=['Gene_name_A', 'Gene_name_B'])
                    temp2 = pd.merge(ref_pairs, temp_2, how='left', on=['Gene_name_A', 'Gene_name_B'])
                    train_mask[-1].append( ~np.isnan(temp1.loc[:,['Label']].values.reshape(-1)) )
                    test_mask[-1].append( ~np.isnan(temp2.loc[:,['Label']].values.reshape(-1)) )
        print('*'*50)

# Drop the loop temporaries so they do not linger in the kernel namespace.
del temp
del temp_
del temp_1
del temp_2
del cur_dict
del cur_exp_cond
del cur_split_path
del split_idx
del split_file

In [3]:
# Sanity check of the collected splits: number of conditions per container,
# then the array shapes of each edge list next to its label list.
edge_groups = (ref_edge, heldout_edge, delfold_edge, exp_edge)
label_groups = (ref_label, heldout_label, delfold_label)
print(*map(len, edge_groups))
print(*map(len, label_groups))
# zip stops after the three labelled groups (exp_edge has no label list).
for edges, labels in zip(edge_groups, label_groups):
    print([arr.shape for arr in edges], [arr.shape for arr in labels])
print('*' * 50)

# Mask overview: conditions, folds per condition, then one line per condition
# with the number of selected pairs in every fold (train first, then test).
print(len(train_mask), len(test_mask))
print(list(map(len, train_mask)), list(map(len, test_mask)))
for masks in (train_mask, test_mask):
    print(*([fold_mask.sum() for fold_mask in per_cond] for per_cond in masks), sep='\n')

## Save data

In [127]:
# Root folder for all generated inputs. exist_ok=True creates it when missing
# and is a no-op when it is already there, without a separate existence check
# that could race with another process creating the same folder.
out_path = '/FREEPII_github/input/'
os.makedirs(out_path, exist_ok=True)

In [4]:
# Containers written for every condition as '<prefix>_<condition>.npz'.
# np.savez appends the '.npz' suffix itself and, because each array is passed
# positionally, stores it under the key 'arr_0'.
arrays_to_save = [
    ('ref_edge', ref_edge),
    ('heldout_edge', heldout_edge),
    ('delfold_edge', delfold_edge),
    ('exp_edge', exp_edge),
    ('ref_label', ref_label),
    ('heldout_label', heldout_label),
    ('delfold_label', delfold_label),
    ('train_mask', train_mask),
    ('test_mask', test_mask),
]

# exp_cond[k] is derived from epf_path[k], so the condition, its source path
# and its position are taken together instead of searching the paths for the
# condition label (a substring search can match another experiment's path).
for cur_idx, (cur_exp_cond, cur_epf_path) in enumerate(zip(exp_cond, epf_path)):
    # Experiment name = third path component from the end:
    # .../<experiment>/<condition folder>/<file>.rds
    cur_exp_name = cur_epf_path.split('/')[-3]
    print(cur_exp_name, cur_exp_cond)

    # One output folder per experiment.
    out_path_ = os.path.join(out_path, cur_exp_name)
    os.makedirs(out_path_, exist_ok=True)

    for prefix, per_condition in arrays_to_save:
        np.savez(os.path.join(out_path_, prefix + '_' + cur_exp_cond), per_condition[cur_idx])

    # The name -> row-index lookup is a plain dict, stored with pickle.
    with open(os.path.join(out_path_, 'name_idx_dict_' + cur_exp_cond + '.pickle'), 'wb') as f:
        pickle.dump(epf_name_idx_dict[cur_idx], f, pickle.HIGHEST_PROTOCOL)