In [1]:
# Build the experiment output paths with os.path.join so this cell runs
# unchanged on Windows, Linux and macOS (no per-OS block to comment in/out).
# `os` is imported here because this cell executes before the imports cell.
import os

# Common parent folder of all severity-group experiment outputs.
EXPERIMENTS_DIR = os.path.join("..", "output", "bigdatacluster", "experiments")

MILD_PATH = os.path.join(EXPERIMENTS_DIR, "mild_TD")
MODERATE_PATH = os.path.join(EXPERIMENTS_DIR, "moderate_TD")
SEVER_PATH = os.path.join(EXPERIMENTS_DIR, "sever_TD")

In [2]:
## imports
import pandas as pd
import numpy as np
import scipy.stats as stats
import statsmodels.api as sp

import os
import json

# Goals
## Perform the analysis at different levels, following the factors used to split the data into separate experiments. Currently, the major split is based on the severity of the disorder, followed by an age-based split and, last but not least, a split based on the behavioral report

## Base level analysis
Base level analysis is defined at the level of the output of a single behavioral report. Data utilized, Features selected, and ML models trained, in order to classify ASD in a single run, are considered as the base level results. <br><br>

For the base level analysis, we are answering 3 main groups of questions: <br>

<b>Data related questions</b>
1. What is the total number of subjects used to run this experiment?
2. What is the number of ASD and TD within the utilized data for this experiment?<br>
<b>Features related questions</b>
3. What are the selected features to utilize in learning (sorted by importance)?
4. What are the brain regions involved in the learning process (sorted by # of occurrences in all of the selected features)? (Anatomy)
5. What is the frequency of each hemisphere in the selected features? (Anatomy) Is there any statistically significant difference between the 2 hemispheres?
6. What is the frequency of each of the morphological features in the selected features? (Physiology)<br>
<b>Machine learning related questions</b>
7. What is the highest classification accuracy?
8. Is the top performing classifier statistically significantly better than the other classifiers?

9. List the hyperparameters selected to train the top performing classifier (for reproducibility)

## Behavioral based analysis
Q. What is the similarity among the features extracted from different behavioral reports? <br><br>

This question may be interesting because it can show whether there are any similarities among the behavioral reports themselves


## Age based analysis
<b> Data related questions </b><br>
    1. What is the number of subjects involved in the experiments of this age group?<br>
    2. How many ASD vs TD in this age group?<br>
<b> Features related questions </b><br>
    3. For each behavioral report, find the similarity between the selected features, brain region frequencies, morphological feature frequencies, and hemisphere frequencies.<br>
<b> ML related questions </b><br>
    4. Which age group has the highest classification accuracy?<br>
    5. Is it statistically significant?

## Severity based analysis
<b> Data related questions </b><br>
    1. What is the number of subjects involved in the experiments of this severity group?<br>
    2. How many ASD vs TD in this severity group?<br>
<b> Features related questions </b><br>
    3. Over all age groups within a severity group, extract the top nominated features, brain regions, morphology, and hemispheres <br>
<b> ML related questions </b><br>
    4. Which severity group has the highest classification accuracy?<br>
    5. Is it statistically significant?

In [3]:
# Module-level accumulator filled by get2theroot(); holds every folder that
# directly contains at least one file.
fldrs = []


def get2theroot(main_fldr):
    """Recursively collect the folders under ``main_fldr`` that contain files.

    Every directory that directly holds at least one regular (non-directory)
    entry is appended to the module-level ``fldrs`` list. Each folder is
    recorded only once, so a folder with many files, or a repeated call on the
    same tree, does not create duplicates.

    Entries whose full path contains ``'ML_obj_hyperparams'`` or
    ``'RFE_obj_params'`` are skipped: they are neither recursed into nor
    counted as files.

    Parameters
    ----------
    main_fldr : str
        Directory to start the walk from.

    Returns
    -------
    None
        Results are written to ``fldrs``.

    Raises
    ------
    OSError
        Propagated from ``os.listdir`` when ``main_fldr`` cannot be listed.
    """
    for entry in os.listdir(main_fldr):
        fullpath = os.path.join(main_fldr, entry)
        # The match is done on the whole path, not only on the entry name.
        if ('ML_obj_hyperparams' in fullpath) or ('RFE_obj_params' in fullpath):
            continue
        if os.path.isdir(fullpath):
            get2theroot(fullpath)
        elif main_fldr not in fldrs:
            # First file seen in this folder: record the folder exactly once.
            fldrs.append(main_fldr)

In [22]:
# Walk the tree rooted at MILD_PATH; the folders found are appended to the
# module-level `fldrs` list (the call itself returns nothing).
get2theroot(MILD_PATH)

In [23]:
# De-duplicate the collected folders and sort them. A plain set of strings has
# no stable iteration order across kernel restarts, so sorting keeps every
# later loop over these folders reproducible.
mild_td_struct = sorted(set(fldrs))

In [40]:
# Base-level pass over the experiment folders: load the subject table, count
# subjects per label, and read the RFE and ML JSON records of each folder.
# TODO: base_info_dict and rfe_info_dict are created but never filled here.
base_info_dict = {}
for base_dir in mild_td_struct:
    base_files = os.listdir(base_dir)

    # Data related question
    # Prefer the "afterFixation" table when present, else fall back to "beforeFixation".
    if 'group_df_afterFixation.csv' in base_files:
        df = pd.read_csv(os.path.join(base_dir, 'group_df_afterFixation.csv'))
        print('we found afterFixation')
    else:
        df = pd.read_csv(os.path.join(base_dir, 'group_df_beforeFixation.csv'))
        print('we used beforeFixation')

    total_num_subjects = len(df)
    # Count the labels once; .get(label, 0) yields 0 instead of raising
    # KeyError when a class is missing from this experiment's table.
    label_counts = df['mylabels'].value_counts()
    TD_num = label_counts.get(2, 0)
    ASD_num = label_counts.get(1, 0)

    # Feature selection
    rfe_info_dict = {}
    rfe_file = os.path.join(base_dir, 'RFE_obj_params', 'RFE.json')
    with open(rfe_file, 'r') as f:
        rfe_data = json.load(f)

    # ML
    ml_dir = os.path.join(base_dir, 'ML_obj_hyperparams')
    for mlobjfile in [x for x in os.listdir(ml_dir) if x.endswith('.json')]:
        ml_fullpath = os.path.join(ml_dir, mlobjfile)
        with open(ml_fullpath, 'r') as f:
            ml_data = json.load(f)
        # NOTE(review): stops after the first JSON file - looks like prototype
        # scaffolding for inspecting a single model; confirm before removing.
        break
    # NOTE(review): stops after the first experiment folder - same remark as above.
    break


we used beforeFixation


In [43]:
# Display the name of the last ML JSON file read by the loop in the previous
# cell together with its parsed contents.
mlobjfile, ml_data


('lr_gnb.json',
 {'priors': None,
  'var_smoothing': 1e-09,
  'n_features_in_': 1,
  'epsilon_': 7.139923630788194e-11,
  'classes_': [1, 2],
  'theta_': [[0.9560420137444616], [0.3961773906546386]],
  'sigma_': [[0.0015109885201040451], [0.04855354717553998]],
  'class_count_': [3.0, 28.0],
  'class_prior_': [0.0967741935483871, 0.9032258064516129]})