In [1]:
from os import listdir as ls
from os import makedirs as mkd
from plotly.subplots import make_subplots

from scipy.io.arff import loadarff
from glob import glob
from tqdm import tqdm
import plotly.graph_objects as go
import plotly.express as px
import plotly.io as pio
import pandas as pd 
import numpy as np
import re
import json

In [2]:
# Group labels (not referenced by any other cell visible in this notebook).
grp = ['Methylome', 'RNA1', 'RNA2', 'RNA_mix']

# Directory layout, relative to the notebook's working directory.
inp_src = '../Input/prep/'              # globbed by later cells for bs_test / feature CSVs
inp_raw = '../Input/raw/'
out_src = '../Output/Script1/Results/'


# Always run the cells above first

# 1. Bootstrapped t-test

In [1]:
# Collect the per-dataset config files found in the current working directory.
# Adjust the glob pattern if your config files follow a different naming convention.
my_files = glob('config_*.json')
print('Total number of datasets ',len(my_files))


In [2]:
# Runing following code will create a bs_test_[dataset] for each dataset, which has p-value for each samples subset of features
for file in tqdm(my_files):
    !python get_bs_ttest.py --inp_file {file}
    

# 2. Scoring using the bootstrap matrix

In [3]:
# Find every bootstrap-test CSV one directory level below inp_src.
# Adjust the pattern if your files follow a different naming convention.
my_files = glob(f'{inp_src}*/bs_test*.csv')
len(my_files)

13

In [5]:
# calculate the new significance score
def get_score(x, threshold=2, n_boot=1000):
    '''
    Return the filter score of one feature: the number of entries in ``x``
    that are strictly greater than ``threshold``, divided by ``n_boot``.

    Parameters
    ----------
    x : array-like
        Values for a single feature, e.g. one column of a bootstrap matrix
        (a pandas Series). It must support element-wise ``>`` and ``.sum()``.
        What the values represent (transformed p-value, test statistic, ...)
        is not visible in this notebook -- confirm against get_bs_ttest.py.
    threshold : number, default 2
        An entry counts as significant when it is strictly above this value.
    n_boot : number, default 1000
        Denominator of the score. The default assumes 1000 bootstrap rows;
        if ``x`` has a different length, pass ``n_boot=len(x)`` to obtain a
        true fraction in [0, 1].

    Returns
    -------
    float
        ``(x > threshold).sum() / n_boot``.
    '''
    return (x > threshold).sum() / n_boot

## 2.1 Calculating scores from the bootstrap matrix files

In [3]:
# Build one score Series per dataset, in the same order as my_files:
# every column of the bootstrap CSV is reduced to a single value by get_score.
my_scores = [
    pd.read_csv(bs_path).apply(get_score, axis=0)
    for bs_path in tqdm(my_files)
]

## 2.2 Saving each dataset's scores to a matching-named file

In [4]:
# Write one complete score table per dataset.
# my_scores was built by iterating over my_files, so the two lists are
# index-aligned and are walked in lockstep here. (The previous version read
# my_scores[cnt] with a counter that was never incremented, so every output
# file received the scores of the first dataset.)
assert len(my_scores) == len(my_files), (
    'my_scores is out of sync with my_files; re-run the scoring cell first'
)

for my_file, score_file in zip(my_files, my_scores):
    # Everything before 'bs_test' is the folder the result is written to.
    base_loc = my_file.split('bs_test')[0]
    # Dataset key: text after the last 'bs_test_' up to the first '.'.
    file_key = my_file.split('bs_test_')[-1].split('.')[0]

    # Two-column table (feature name, score), highest score first.
    score_all_df = (
        pd.DataFrame({"Genes": score_file.index, "Score": score_file.values})
        .sort_values('Score', ascending=False)
    )

    # Storing the result next to the bootstrap matrix it was derived from.
    score_all_df.to_csv(base_loc+f'score_bs_complete_{file_key}.csv', index=False)
    

    

# 3. Performing recursive feature elimination on the filtered subset

In [5]:
# Gather the filtered-feature CSVs on which RFE will be performed.
# Adjust `pattern` if your files follow a different naming convention.
# NOTE(review): in glob syntax `[!GSE]` matches any ONE character other than
# 'G', 'S' or 'E'; it does not exclude names that start with the literal
# prefix "GSE". Confirm this is the intended filter.
pattern = 'method2_score_bs_filter_toprnk_4percent_[!GSE]'
my_files = glob(f'{inp_src}*/{pattern}*.csv')
len(my_files)

In [6]:
# Performing RFE on the final features
for feat_file in tqdm(my_files):
    f_tag = feat_file.split('percent_')[-1].split('.')[0]
    res_name = feat_file.split('\\')[-1].split('.')[0]
    config_file = f'./configs/config_{f_tag}.json'
   
    !python perform_rfe.py --model svm --inp_file {config_file} --gene_file {feat_file} --res_file {res_name} --step_size 1

# 4. Comparison of models for final features

In [7]:
# Gather the final (post-RFE) feature files to be compared across models.
# Adjust `pattern` if your files follow a different naming convention.
pattern = 'RFE_svm_step1_score_bs_filter_toprnk_4percent'
my_files = glob(f'{inp_src}*/{pattern}*.csv')
len(my_files)

In [8]:
# Performing comparison on final feature for multiple ML models
for feat_file in tqdm(my_files):
    f_tag = feat_file.split('_toprnk_')[-1].split('.')[0]
    config_tag = feat_file.split('percent_')[-1].split('.')[0]
    config_file = f'./configs/config_{config_tag}.json'
    res_name = f'{pattern}_{f_tag}'
    #print(config_file,feat_file,res_name)
    !python compare_ml_models.py --inp_file {config_file} --gene_file {feat_file} --res_file {res_name}

# ----------------------------------------------End----------------------------------------------