## Get Samseg v6.1 data (via THINQ v.1.0.0-rc.11)
```
mkdir -p /home/paul/cmet/data/20200609-mclaren-1.0.0-rc.11-42-g8d976b0--take4
cd /home/paul/cmet/data/20200609-mclaren-1.0.0-rc.11-42-g8d976b0--take4
aws s3 cp s3://cmet-scratch/maclaren-cmeds/demographics.tsv .
aws s3 cp \
  --recursive \
  --exclude "*" \
  --include "*subject_info.json" \
  --include "*.pdf" \
  s3://cmet-scratch/20200609-mclaren-1.0.0-rc.11-42-g8d976b0--take4/maclaren-cmeds/ .
find . -type d -name 'cache' -exec rm -rf {} \;
```

Get rid of cached `subject_info.json` files:

```
find . -type d -name 'cache' -exec rm -rf {} \;
```

## Get FreeSurfer v6.1 data 
```
mkdir -p /home/paul/cmet/data/20200714-maclaren-fs6/
cd /home/paul/cmet/data/20200714-maclaren-fs6/
aws s3 cp s3://cmet-scratch/maclaren-cmeds/demographics.tsv .
aws s3 cp \
  --recursive \
  --exclude "*" \
  --include "*.stats" \
  s3://cmet-scratch/20200714-maclaren-fs6/ .
```

## Get v7.1 data (both samseg and aseg)
```
mkdir -p /home/paul/cmet/data/20201006-maclaren-fs-7.1-samseg-aseg-long/
cd /home/paul/cmet/data/20201006-maclaren-fs-7.1-samseg-aseg-long/
aws s3 cp s3://cmet-scratch/maclaren-cmeds/demographics.tsv .
aws s3 cp \
  --recursive \
  --exclude "*" \
  --include "*.stats" \
  s3://cmet-scratch/20201006-maclaren-fs-7.1-samseg-aseg-long/ .
```

### Split data into seperate subdirs

To faciliate recursive processing of *.stats files
```
cd /home/paul/cmet/data/20201006-maclaren-fs-7.1-samseg-aseg-long/
mkdir cross
mkdir long
mkdir long-base
mv sub-??_run-?? ./cross/
mv sub-??_base ./long-base
mv sub* ./long/
```

### Rename the long dirs
This is gross
```
cd ./long
for DIR_STEM in `ls -1|sed 's/\..*//'`; do  SUB_NUM=`echo $DIR_STEM|sed 's/sub-//'|sed 's/_run.*//'`; mv ${DIR_STEM}.long.sub-${SUB_NUM}_base ${DIR_STEM}; done
```

# Get Samseg v? data from container `a30d4ca`

Which corresponds to git commitID `??todo??`

```
mkdir -p /home/paul/cmet/data/20201104-maclaren-rethinq-a30d4ca
cd /home/paul/cmet/data/20201104-maclaren-rethinq-a30d4ca
aws s3 cp s3://cmet-scratch/maclaren-cmeds/demographics.tsv .
aws s3 cp \
  --recursive \
  --exclude "*" \
  --include "*subject_info.json" \
  --include "*.pdf" \
  s3://cmet-scratch/20201104-maclaren-rethinq-a30d4ca/ .
```

Get rid of cached `subject_info.json` files:

```
find . -type d -name 'cache' -exec rm -rf {} \;
```

In [2]:
import json
import os
import fnmatch
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
# imports find_json_files(); load_json_file(); load_dataset(); load_fs_dataset()
from cmeds import *
# imports calc_cvs(); session_permute(); monte_carlo_perm_test
from test_retest import *

In [3]:
structs_of_interest = [
    'Left-Lateral-Ventricle',
    'Left-Hippocampus',
    'Left-Amygdala',
    'Left-Caudate',
    'Left-Putamen',
    'Right-Lateral-Ventricle',
    'Right-Hippocampus',
    'Right-Amygdala',
    'Right-Caudate',
    'Right-Putamen'
]

fs61aseg_demofile = '/home/paul/cmet/data/20200714-maclaren-fs6/demographics.tsv'
fs61aseg_datadir = '/home/paul/cmet/data/20200714-maclaren-fs6/'

fs61samseg_demofile = '/home/paul/cmet/data/20200609-mclaren-1.0.0-rc.11-42-g8d976b0--take4/demographics.tsv'
fs61samseg_datadir = '/home/paul/cmet/data/20200609-mclaren-1.0.0-rc.11-42-g8d976b0--take4/'

fs71aseg_demofile = '/home/paul/cmet/data/20201006-maclaren-fs-7.1-samseg-aseg-long/demographics.tsv'
fs71aseg_datadir = '/home/paul/cmet/data/20201006-maclaren-fs-7.1-samseg-aseg-long/cross'

fs71samseg_demofile = '/home/paul/cmet/data/20201006-maclaren-fs-7.1-samseg-aseg-long/demographics.tsv'
fs71aseg_datadir = '/home/paul/cmet/data/20201006-maclaren-fs-7.1-samseg-aseg-long/cross'

rethinq_a30d4ca_demofile = '/home/paul/cmet/data/20201006-maclaren-fs-7.1-samseg-aseg-long/demographics.tsv'
rethinq_a30d4ca_datadir = '/home/paul/cmet/data/20201104-maclaren-rethinq-a30d4ca'

## Load data into pandas dataframes

In [15]:
maclaren_fs61aseg_df = load_fs_dataset(fs61aseg_datadir, fs61aseg_demofile, structs_of_interest);
maclaren_fs71aseg_df = load_fs_dataset(fs71aseg_datadir, fs61aseg_demofile, structs_of_interest);
maclaren_fs71samseg_df = load_fssamseg_dataset(fs71aseg_datadir, fs61aseg_demofile, structs_of_interest);

maclaren_rethinq_rc11_df, maclaren_fs61samseg_df_normative = \
  load_dataset(fs61samseg_datadir, fs61samseg_demofile, drop_subjects=[], vol_data_src='volume');

maclaren_rethinq_rc11_samseg6only_df, maclaren_fs61samseg_df_normative = \
  load_dataset(fs61samseg_datadir, fs61samseg_demofile, drop_subjects=[], vol_data_src='samseg_volume');

maclaren_rethinq_a30d4ca_df, maclaren_rethinq_a30d4ca_normative_df = \
  load_dataset(rethinq_a30d4ca_datadir, rethinq_a30d4ca_demofile, drop_subjects=[], vol_data_src='volume');

maclaren_rethinq_a30d4ca_samseg7only_df, maclaren_rethinq_a30d4ca_normative_df = \
  load_dataset(rethinq_a30d4ca_datadir, rethinq_a30d4ca_demofile, drop_subjects=[], vol_data_src='samseg_volume');

Dropping the following subjects []
Dropping the following subjects []
Dropping the following subjects []
Ignoring Subject (did it error out?) sub-01_run-39
Ignoring Subject (did it error out?) sub-01_run-02
Ignoring Subject (did it error out?) sub-01_run-09
Ignoring Subject (did it error out?) sub-01_run-08
Ignoring Subject (did it error out?) sub-01_run-24
Ignoring Subject (did it error out?) sub-01_run-33
Ignoring Subject (did it error out?) sub-01_run-13
Ignoring Subject (did it error out?) sub-01_run-16
Ignoring Subject (did it error out?) sub-01_run-14
Ignoring Subject (did it error out?) sub-01_run-32
Ignoring Subject (did it error out?) sub-01_run-01
Ignoring Subject (did it error out?) sub-01_run-36
Ignoring Subject (did it error out?) sub-01_run-06
Ignoring Subject (did it error out?) sub-01_run-26
Ignoring Subject (did it error out?) sub-01_run-27
Ignoring Subject (did it error out?) sub-01_run-40
Ignoring Subject (did it error out?) sub-01_run-03
Ignoring Subject (did it err

Ignoring Subject (did it error out?) sub-01_run-39
Ignoring Subject (did it error out?) rethinq
Ignoring Subject (did it error out?) sub-01_run-02
Ignoring Subject (did it error out?) rethinq
Ignoring Subject (did it error out?) sub-01_run-09
Ignoring Subject (did it error out?) rethinq
Ignoring Subject (did it error out?) sub-01_run-08
Ignoring Subject (did it error out?) rethinq
Ignoring Subject (did it error out?) sub-01_run-24
Ignoring Subject (did it error out?) rethinq
Ignoring Subject (did it error out?) sub-01_run-13
Ignoring Subject (did it error out?) rethinq
Ignoring Subject (did it error out?) sub-01_run-28
Ignoring Subject (did it error out?) rethinq
Ignoring Subject (did it error out?) sub-01_run-16
Ignoring Subject (did it error out?) rethinq
Ignoring Subject (did it error out?) sub-01_run-14
Ignoring Subject (did it error out?) rethinq
Ignoring Subject (did it error out?) sub-01_run-29
Ignoring Subject (did it error out?) rethinq
Ignoring Subject (did it error out?) sub

In [18]:
# Add some lateral regions together so we can directly compare to table 1 in MacLaren et al.
# https://www.nature.com/articles/sdata201437/tables/2
regions = [             
            [ ['Left-Hippocampus', 'Right-Hippocampus'],'Hippocampus' ],
            [ ['Left-Lateral-Ventricle', 'Right-Lateral-Ventricle'],'Lateral-Ventricles' ],
            [ ['Left-Amygdala', 'Right-Amygdala',],'Amygdala' ],
            [ ['Left-Putamen', 'Right-Putamen'],'Putamen' ],
            [ ['Left-Caudate', 'Right-Caudate'],'Caudate' ],
          ]

maclaren_fs61aseg_df = add_regions(maclaren_fs61aseg_df,regions)
maclaren_fs71aseg_df = add_regions(maclaren_fs71aseg_df,regions)
maclaren_fs71samseg_df = add_regions(maclaren_fs71samseg_df,regions)
maclaren_rethinq_rc11_df = add_regions(maclaren_rethinq_rc11_df,regions)
maclaren_rethinq_rc11_samseg6only_df = add_regions(maclaren_rethinq_rc11_samseg6only_df,regions)
maclaren_rethinq_a30d4ca_df = add_regions(maclaren_rethinq_a30d4ca_df,regions)
maclaren_rethinq_a30d4ca_samseg7only_df = add_regions(maclaren_rethinq_a30d4ca_samseg7only_df,regions)

In [19]:
# Setup for permutation tests

# Since samseg 6 has some issues with sub1, remove from all analyses
session_list= [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20]
subject_list= [2,3]

# The column name that holds session info in the demographics.tsv
session_col='session'
# The column name that holds subject info in the demographics.tsv
subject_col='subject_num'

# To match with the rows of table 1 in https://www.nature.com/articles/sdata201437/tables/2
structs_of_interest = ['Hippocampus', 'Lateral-Ventricles', 'Amygdala', 'Putamen', 'Caudate']

In [29]:
# Run the permutation tests, this will take a while for large n
n = 10000

maclaren_covs_fs61aseg_df = monte_carlo_perm_test( \
  maclaren_fs61aseg_df, subject_list, session_list, subject_col, session_col, \
  structs_of_interest, n_itrs=n, method='gluer')

maclaren_covs_fs71aseg_df = monte_carlo_perm_test( \
  maclaren_fs71aseg_df, subject_list, session_list, subject_col, session_col, \
  structs_of_interest, n_itrs=n, method='gluer')

maclaren_covs_fs71samseg_df = monte_carlo_perm_test( \
  maclaren_fs71samseg_df, subject_list, session_list, subject_col, session_col, \
  structs_of_interest, n_itrs=n, method='gluer')

maclaren_covs_rethinq_rc11_df = monte_carlo_perm_test( \
  maclaren_rethinq_rc11_df, subject_list, session_list, subject_col, session_col, \
  structs_of_interest, n_itrs=n, method='gluer')

maclaren_covs_rethinq_rc11_samseg6only_df = monte_carlo_perm_test( \
  maclaren_rethinq_rc11_samseg6only_df, subject_list, session_list, subject_col, session_col, \
  structs_of_interest, n_itrs=n, method='gluer')

maclaren_covs_rethinq_a30d4ca_df = monte_carlo_perm_test( \
  maclaren_rethinq_a30d4ca_df, subject_list, session_list, subject_col, session_col, \
  structs_of_interest, n_itrs=n, method='gluer')

maclaren_covs_rethinq_a30d4ca_samseg7only_df = monte_carlo_perm_test( \
  maclaren_rethinq_a30d4ca_samseg7only_df, subject_list, session_list, subject_col, session_col, \
  structs_of_interest, n_itrs=n, method='gluer')

These tables are comparable to [Table 1 in MacLaren et al](https://www.nature.com/articles/sdata201437/tables/2)

In [44]:
maclaren_covs_fs61aseg_df.T

Unnamed: 0,mean-vol,total-cov,session-cov,abs-diff-cov,p-vals
Hippocampus,8522.69875,2.053487,2.258124,0.204637,0.2217
Lateral-Ventricles,13563.5025,2.081874,0.902939,1.178935,0.0
Amygdala,3424.3325,3.230995,3.159073,0.071922,0.7908
Putamen,9855.9875,1.895118,2.135972,0.240854,0.1628
Caudate,6863.805,1.706345,1.557038,0.149307,0.292


In [45]:
maclaren_covs_fs71aseg_df.T

Unnamed: 0,mean-vol,total-cov,session-cov,abs-diff-cov,p-vals
Hippocampus,8590.835,1.670197,1.749787,0.079589,0.5756
Lateral-Ventricles,13670.435,1.910059,0.728072,1.181987,0.0
Amygdala,3510.1775,3.013569,3.255942,0.242373,0.3148
Putamen,9883.4525,1.429833,1.496864,0.067031,0.5615
Caudate,6867.28375,1.996576,1.644559,0.352017,0.0218


In [46]:
maclaren_covs_fs71samseg_df.T

Unnamed: 0,mean-vol,total-cov,session-cov,abs-diff-cov,p-vals
Hippocampus,10770.732396,1.38953,1.069301,0.320229,0.0029
Lateral-Ventricles,23208.657477,1.672704,1.406157,0.266547,0.027
Amygdala,3779.890494,1.556568,1.675307,0.118739,0.3221
Putamen,13517.24048,8.80864,9.338202,0.529562,0.4597
Caudate,8726.35149,2.07905,2.004842,0.074207,0.6922


In [47]:
maclaren_covs_rethinq_rc11_df.T

Unnamed: 0,mean-vol,total-cov,session-cov,abs-diff-cov,p-vals
Hippocampus,8660.34375,0.732712,0.754715,0.022003,0.7117
Lateral-Ventricles,15643.9925,1.428306,0.950413,0.477893,0.0001
Amygdala,3290.56625,1.213812,1.216737,0.002925,0.9762
Putamen,11229.76125,1.103016,0.979911,0.123105,0.1658
Caudate,7051.12625,0.928835,0.789903,0.138931,0.0536


In [48]:
maclaren_covs_rethinq_rc11_samseg6only_df.T

Unnamed: 0,mean-vol,total-cov,session-cov,abs-diff-cov,p-vals
Hippocampus,9355.625,0.780972,0.827046,0.046074,0.4563
Lateral-Ventricles,17344.8375,1.450123,1.177534,0.272589,0.0156
Amygdala,3414.0125,1.155101,1.156447,0.001346,0.9892
Putamen,11434.6,1.06806,1.027789,0.040271,0.6183
Caudate,7083.5875,1.106091,0.942654,0.163436,0.0543


In [49]:
maclaren_covs_rethinq_a30d4ca_df.T

Unnamed: 0,mean-vol,total-cov,session-cov,abs-diff-cov,p-vals
Hippocampus,8627.19125,0.754608,0.75016,0.004448,0.9408
Lateral-Ventricles,15596.68625,1.434468,0.927965,0.506504,0.0
Amygdala,3293.11375,1.161294,1.142108,0.019186,0.8399
Putamen,11224.87125,1.152517,1.036997,0.11552,0.2391
Caudate,7009.9275,1.06361,0.879061,0.184549,0.0229


In [50]:
maclaren_covs_rethinq_a30d4ca_samseg7only_df.T

Unnamed: 0,mean-vol,total-cov,session-cov,abs-diff-cov,p-vals
Hippocampus,9312.0,0.80786,0.846088,0.038227,0.5503
Lateral-Ventricles,17266.9125,1.454232,1.142637,0.311595,0.0044
Amygdala,3415.975,1.145045,1.147874,0.002828,0.9766
Putamen,11579.1375,1.00529,0.912069,0.093222,0.2484
Caudate,7023.5875,1.264741,1.052177,0.212564,0.0245


In [62]:
# add version info to dataframes
n_datapoints = len(maclaren_covs_fs61aseg_df.index)
maclaren_covs_fs61aseg_df['version'] = ['fs61aseg'] * n_datapoints
maclaren_covs_fs71aseg_df['version'] = ['fs71aseg'] * n_datapoints 
maclaren_covs_fs71samseg_df['version'] = ['fs71samseg'] * n_datapoints 
maclaren_covs_rethinq_rc11_df['version'] = ['rethinq_rc11'] * n_datapoints
maclaren_covs_rethinq_rc11_samseg6only_df['version'] = ['rethinq_rc11_samseg6only'] * n_datapoints 
maclaren_covs_rethinq_a30d4ca_df['version'] = ['rethinq_a30d4ca'] * n_datapoints
maclaren_covs_rethinq_a30d4ca_samseg7only_df['version'] = ['rethinq_a30d4ca'] * n_datapoints

# concatenate
maclaren_covs_all_df = pd.concat( \
  [maclaren_covs_fs61aseg_df, \
   maclaren_covs_fs71aseg_df, \
   maclaren_covs_fs71samseg_df, \
   maclaren_covs_rethinq_rc11_df, \
   maclaren_covs_rethinq_rc11_samseg6only_df, \
   maclaren_covs_rethinq_a30d4ca_df, \
   maclaren_covs_rethinq_a30d4ca_samseg7only_df \
])



In [67]:
maclaren_covs_all_df.T

Unnamed: 0,mean-vol,total-cov,session-cov,abs-diff-cov,p-vals,mean-vol.1,total-cov.1,session-cov.1,abs-diff-cov.1,p-vals.1,...,mean-vol.2,total-cov.2,session-cov.2,abs-diff-cov.2,p-vals.2,mean-vol.3,total-cov.3,session-cov.3,abs-diff-cov.3,p-vals.3
Hippocampus,8522.7,2.05349,2.25812,0.204637,0.2217,8590.84,1.6702,1.74979,0.0795893,0.5756,...,8627.19,0.754608,0.75016,0.0044477,0.9408,9312,0.80786,0.846088,0.0382273,0.5503
Lateral-Ventricles,13563.5,2.08187,0.902939,1.17894,0,13670.4,1.91006,0.728072,1.18199,0,...,15596.7,1.43447,0.927965,0.506504,0,17266.9,1.45423,1.14264,0.311595,0.0044
Amygdala,3424.33,3.23099,3.15907,0.071922,0.7908,3510.18,3.01357,3.25594,0.242373,0.3148,...,3293.11,1.16129,1.14211,0.0191862,0.8399,3415.97,1.14505,1.14787,0.00282803,0.9766
Putamen,9855.99,1.89512,2.13597,0.240854,0.1628,9883.45,1.42983,1.49686,0.0670309,0.5615,...,11224.9,1.15252,1.037,0.11552,0.2391,11579.1,1.00529,0.912069,0.0932216,0.2484
Caudate,6863.8,1.70635,1.55704,0.149307,0.292,6867.28,1.99658,1.64456,0.352017,0.0218,...,7009.93,1.06361,0.879061,0.184549,0.0229,7023.59,1.26474,1.05218,0.212564,0.0245
version,fs61aseg,fs61aseg,fs61aseg,fs61aseg,fs61aseg,fs71aseg,fs71aseg,fs71aseg,fs71aseg,fs71aseg,...,rethinq_a30d4ca,rethinq_a30d4ca,rethinq_a30d4ca,rethinq_a30d4ca,rethinq_a30d4ca,rethinq_a30d4ca,rethinq_a30d4ca,rethinq_a30d4ca,rethinq_a30d4ca,rethinq_a30d4ca


In [66]:
maclaren_covs_all_df.to_csv(r'maclaren-covs.csv')