In [1]:
import numpy as np
import pandas as pd
import os
import re
import pyreadr
import pickle

## Settings and read data

In [1]:
#### Setting
# Builds the lists of input files used by the rest of the notebook:
#   epf_path   - one '<experiment>/<dataset>/<dataset>.rds' path per dataset folder
#   split_path - files under Split-data whose name contains 'ref', 'heldout' or 'exp'
#   exp_cond   - last '_'-separated token of each EPF file name, without '.rds'
# NOTE(review): the roots below are absolute, machine-specific paths; adjust as needed.
#
# sorted(): os.listdir returns entries in arbitrary order; sorting keeps runs reproducible.
# isdir():  stray files (e.g. '.DS_Store') would otherwise make the nested listdir fail
#           or produce a bogus '<file>/<file>.rds' path.
epf_dir = '/FREEPII_github/EPF-data/'
epf_dir = [epf_dir + i for i in sorted(os.listdir(epf_dir)) if os.path.isdir(epf_dir + i)]
epf_path = [
    '/'.join([i, j, j + '.rds'])
    for i in epf_dir
    for j in sorted(os.listdir(i))
    if os.path.isdir('/'.join([i, j]))
]
print(*epf_path, sep='\n')
print('*'*50)

split_dir = '/FREEPII_github/Split-data/'
split_dir = [split_dir + i for i in sorted(os.listdir(split_dir)) if os.path.isdir(split_dir + i)]
split_path = [
    '/'.join([i, j])
    for i in split_dir
    for j in sorted(os.listdir(i))
    if any(tag in j for tag in ('ref', 'heldout', 'exp'))
]
print(*split_path, sep='\n')
print('*'*50)

seq_path = '/FREEPII_github/FCGR-data/uniprot_seq_Human_FCGR_16x.rds'
name_path = '/FREEPII_github/UniProt-data/uniprot_gene_Human.rds'
print(seq_path)
print(name_path)

# The pattern is escaped and anchored: the previous '.rds' treated '.' as a wildcard and
# removed any "<char>rds" anywhere in the token, not just the file extension.
exp_cond = [re.sub(r'\.rds$', '', i.split('/')[-1].split('_')[-1]) for i in epf_path]
print(exp_cond)

### Read sequence and name information

In [1]:
# pyreadr.read_r returns a mapping of objects stored in the file; the first one is used.
seq_df = list(pyreadr.read_r(seq_path).values())[0]
name_df = list(pyreadr.read_r(name_path).values())[0]

print(seq_df.shape)
print(seq_df.head())
print('*'*50)

print(name_df.shape)
print(name_df.head())
print('*'*50)

# Pick the name_df identifier column that overlaps most with seq_df's column names.
# The candidates are the object-dtype columns only, so positions must be taken from that
# subset: indexing the full name_df with these positions would score the wrong column
# whenever a non-object column precedes an object one.
name_obj_df = name_df.select_dtypes(include='object')
if name_obj_df.shape[1] == 0:
    raise ValueError('name_df has no object-dtype column to match against seq_df columns')

seq_columns = set(seq_df.columns.values)
match_seq_num = [
    len(set(name_obj_df.iloc[:, idx].values) & seq_columns)
    for idx in range(name_obj_df.shape[1])
]
print(list(zip(name_obj_df.columns.values.tolist(), match_seq_num)))

best_match_seq_name = name_obj_df.columns.values[np.argmax(match_seq_num)]
print('best match seq name: ', best_match_seq_name)

### Read EPF, select and transform sequence

In [2]:
def _collect_seq_rows(epf_names, uname_lookup, seq_df, seq_columns, total, log_every=500):
    """Build one feature row per EPF entry that has at least one sequence column.

    Parameters
    ----------
    epf_names : 1-D sequence of identifiers, one per EPF row (row order is preserved).
    uname_lookup : dict mapping an EPF identifier to the list of seq-side identifiers.
    seq_df : DataFrame whose columns are seq-side identifiers.
    seq_columns : set of seq_df column labels (precomputed for O(1) membership tests).
    total : number of EPF rows, only used in the progress message.
    log_every : print a progress line every `log_every` rows.

    Returns
    -------
    (val, in_idx, off_idx) where `val` has one row per entry of `in_idx` and
    seq_df.shape[0] columns, `in_idx` are the EPF row positions with a sequence and
    `off_idx` the positions without one.
    """
    in_idx, off_idx, rows = [], [], []
    for row_idx, cur_name in enumerate(epf_names):
        # Keep only identifiers that really are seq_df columns; selecting a list that
        # contains a missing label (or NaN) would raise KeyError.
        cur_unames = [u for u in uname_lookup.get(cur_name, ()) if u in seq_columns]
        if cur_unames:
            in_idx.append(row_idx)
            counts = seq_df[cur_unames].values
            # Normalise every selected column by its own sum, then average the columns.
            # All-zero columns give 0/0 = NaN, which nan_to_num maps to 0.
            with np.errstate(divide='ignore', invalid='ignore'):
                rows.append(np.nan_to_num(counts / counts.sum(0)).mean(1))
        else:
            off_idx.append(row_idx)

        if (row_idx + 1) % log_every == 0:
            print('seq record: ', len(in_idx), '/', len(rows), ' non seq record', len(off_idx), ' total: ', total)

    # Stack once at the end (repeated concatenate is quadratic); keep a well-formed
    # empty array when nothing matched so that `.shape` and saving still work.
    val = np.stack(rows, 0) if rows else np.zeros((0, seq_df.shape[0]))
    return val, in_idx, off_idx


def _build_seq_features(exp_cond, epf_path, name_df, seq_df, best_match_seq_name):
    """Read every EPF file and derive its sequence features and row-index lists.

    `exp_cond` and `epf_path` are iterated in parallel (exp_cond is derived from
    epf_path element by element), so each condition is paired with its own file
    instead of the first path that happens to contain the condition as a substring.

    Returns three lists aligned with `exp_cond`: feature arrays, row positions with a
    sequence, and row positions without one.
    """
    seq, seq_in_exp_idx_list, seq_off_exp_idx_list = [], [], []

    seq_columns = set(seq_df.columns.values)
    # Candidate identifier columns; positions below always refer to this subset.
    name_obj_df = name_df.select_dtypes(include='object')
    if name_obj_df.shape[1] == 0:
        raise ValueError('name_df has no object-dtype column to match EPF names against')
    seq_side_names = name_df[best_match_seq_name].values

    for cur_exp_cond, cur_epf_path in zip(exp_cond, epf_path):
        print('current data: ', cur_exp_cond)

        temp = list(pyreadr.read_r(cur_epf_path).values())[0]
        epf_obj_df = temp.select_dtypes(include='object')
        if epf_obj_df.shape[1] == 0:
            # No text column: the identifiers are taken from the index.
            name = temp.index.values
        else:
            # The 2-D `.values` array used before cannot serve as dict keys (rows are
            # unhashable arrays). NOTE(review): assumes the first object column holds
            # the identifiers - confirm against the EPF files.
            name = epf_obj_df.iloc[:, 0].values

        name_set = set(name)
        match_epf_num = [
            len(set(name_obj_df.iloc[:, idx].values) & name_set)
            for idx in range(name_obj_df.shape[1])
        ]
        print(list(zip(name_obj_df.columns.values.tolist(), match_epf_num)))

        best_pos = int(np.argmax(match_epf_num))
        best_match_epf_name = name_obj_df.columns.values[best_pos]
        print('best match epf_ name: ', best_match_epf_name)

        # One pass over name_df instead of a boolean-mask scan per EPF row.
        uname_lookup = {}
        for epf_key, uname in zip(name_obj_df.iloc[:, best_pos].values, seq_side_names):
            uname_lookup.setdefault(epf_key, []).append(uname)

        val, cur_in_idx, cur_off_idx = _collect_seq_rows(
            name, uname_lookup, seq_df, seq_columns, total=temp.shape[0]
        )

        print('seq raw size: ', val.shape)
        seq.append(val)
        seq_in_exp_idx_list.append(cur_in_idx)
        seq_off_exp_idx_list.append(cur_off_idx)
        print('*'*50)

    return seq, seq_in_exp_idx_list, seq_off_exp_idx_list


# All loop temporaries live inside the helpers, so no manual `del` clean-up is needed
# (the previous `del` block raised NameError whenever a loop body had never run).
seq, seq_in_exp_idx_list, seq_off_exp_idx_list = _build_seq_features(
    exp_cond, epf_path, name_df, seq_df, best_match_seq_name
)

In [3]:
# Sanity checks on the per-dataset results: list lengths, feature shapes, and the
# size of each index list.
print(len(seq), len(seq_in_exp_idx_list), len(seq_off_exp_idx_list))
print([feat.shape for feat in seq])
print(list(map(len, seq_in_exp_idx_list)))
print(list(map(len, seq_off_exp_idx_list)))
# For every dataset: [number of indices present in both lists, number of distinct indices overall].
print([
    [
        len(set(in_idx).intersection(seq_off_exp_idx_list[pos])),
        len(set(in_idx).union(seq_off_exp_idx_list[pos])),
    ]
    for pos, in_idx in enumerate(seq_in_exp_idx_list)
])

## Save data

In [20]:
# Root folder for the generated inputs.
# NOTE(review): absolute, machine-specific path; adjust as needed.
out_path = '/FREEPII_github/input/'
# exist_ok=True creates the folder if missing and is a no-op if it already exists,
# without the check-then-create race of a separate os.path.exists test.
os.makedirs(out_path, exist_ok=True)

In [4]:
# Write, for every dataset, the feature array and the two row-index lists as .npz files
# under '<out_path><experiment>/'. Arrays are passed positionally to np.savez, so each
# file stores its content under the default key 'arr_0'.
#
# exp_cond, epf_path and the result lists are aligned element by element, so they are
# walked in parallel: looking the path up by substring, or the index up by first equal
# condition name, would pick the wrong entry when one name contains or repeats another.
for cur_idx, (cur_exp_cond, cur_epf_path) in enumerate(zip(exp_cond, epf_path)):
    # epf_path entries look like '<root>/<experiment>/<dataset>/<dataset>.rds'.
    cur_exp_name = cur_epf_path.split('/')[-3]
    print(cur_exp_name, cur_exp_cond)

    out_path_ = out_path + cur_exp_name
    os.makedirs(out_path_, exist_ok=True)

    np.savez('/'.join([out_path_, 'feature_seq_FCGR_16x_' + cur_exp_cond]), seq[cur_idx])
    np.savez('/'.join([out_path_, 'seq_in_exp_idx_' + cur_exp_cond]),    seq_in_exp_idx_list[cur_idx])
    np.savez('/'.join([out_path_, 'seq_off_exp_idx_' + cur_exp_cond]),   seq_off_exp_idx_list[cur_idx])