# Data Merge
This notebook describes steps to pull the stats from the txt files and combine them to create the final data structure.

In [99]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

import os
import re
from collections import OrderedDict


### Location Path
Define the location of files here.

In [19]:
DATA_PATH = '/Users/stereopickles/CU_Google_Drive/COLUMBIA/DSI-Schizo/Data2020/DATA040320'
HC_PATH = f"{DATA_PATH}/HC/COBRE_prep/HC_Stats"
SZ_PATH = f"{DATA_PATH}/SZ/COBRE_prep/SZ_Stats"
HC_FAT_DTI_PATH = f'{HC_PATH}/FAT'
SZ_FAT_DTI_PATH = f'{SZ_PATH}/FAT'
HC_UF_DTI_PATH = f'{HC_PATH}/UF'
SZ_UF_DTI_PATH = f'{SZ_PATH}/UF'

In [31]:
TRACTS = ['FAT', 'UF']
CONDS = ['SZ', 'HC']

### Extract stats 
The resulting table should look like 
| subnum | admin | condition | tract | datetime | side | track_count | voxel_count | mean_length | mean_FA_trk | mean_Ang | mean_FA | mean_AD | mean_MD | mean_RD | 

In [56]:
df = pd.DataFrame(
    columns = ['subnum',
               'admin',
               'condition',
               'tract',
               'datetime_extracted',
               'yr_scanned',
               'side',
               'track_count',
               'voxel_count',
               'mean_length', 'se_length',
               'mean_FA_trk', 'se_FA_trk',
               'mean_Ang', 'se_Ang_trk', 
               'mean_FA', 'se_FA_trk',
               'mean_AD', 'se_AD_trk',
               'mean_MD', 'se_MD_trk', 
               'mean_RD', 'se_RD_trk'])

In [151]:
def find_num(str_):
    return int(re.findall('(?<=: )([.0-9]*)', str_)[0])

def find_mean_se(str_):
    mean = float([x if x != 'nan' else 0 for x in re.findall('(?<=: )([na.0-9]*)', str_)][0])
    se = float(re.findall('(?<=- )([.0-9]*)', str_)[0])
    return mean, se

def find_type(str_):
    return re.findall('(?<=DTI_)([A-Z]*)', str_)[0]

def find_year(str_):
    return int(re.findall('(?<=ses-)([0-9]*)', str_)[0][:4])

In [152]:
def extract_stats(list_, group = ['FAT', 'UF']):
    ''' From a list of lines extract information '''
    result = {}
    for i in range(0, len(list_), 12):
        grp = re.findall('(?<=: )(.*)(?=\\n)', list_[i])[0]
        trk_group = grp[:3]
        side = grp[-1]
        if trk_group in group:
            trk_cnt = find_num(list_[i+1])
            vxl_cnt = find_num(list_[i+2])
            mean_length, se_length = find_mean_se(list_[i+4])
            mean_fatrk, se_fatrk = find_mean_se(list_[i+5])
            mean_angtrk, se_angtrk = find_mean_se(list_[i+6])
            yr_scan = find_year(list_[i+7])
            stats = {}
            for j in range(7, 11):
                stats[find_type(list_[i+j])] = find_mean_se(list_[i+j])
            if len(stats) != 4: 
                print('not enough stats')
            info = {'track_count': trk_cnt, 
                    'voxel_count': vxl_cnt, 
                    'mean_length': mean_length, 
                    'se_length': se_length, 
                    'mean_fatrk': mean_fatrk, 
                    'se_fatrk': se_fatrk, 
                    'mean_angtrk': mean_angtrk, 
                    'se_angtrk': se_angtrk, 
                    'mean_AD': stats['AD'][0], 
                    'se_AD': stats['AD'][1], 
                    'mean_FA': stats['FA'][0], 
                    'se_FA': stats['FA'][1],
                    'mean_MD': stats['MD'][0], 
                    'se_MD': stats['MD'][1],
                    'mean_RD': stats['RD'][0], 
                    'se_RD': stats['RD'][1],
                    'yr_scan': yr_scan
                   }
            if trk_group in result:
                result[trk_group][side] = info
            else:
                result[trk_group] = {side: info}
        else:
            continue
    return result

In [153]:
extract_stats(line)

{'FAT': {'L': {'track_count': 0,
   'voxel_count': 0,
   'mean_length': 0.0,
   'se_length': 0.0,
   'mean_fatrk': 0.0,
   'se_fatrk': 0.0,
   'mean_angtrk': 0.0,
   'se_angtrk': 0.0,
   'mean_AD': 0.0,
   'se_AD': 0.0,
   'mean_FA': 0.0,
   'se_FA': 0.0,
   'mean_MD': 0.0,
   'se_MD': 0.0,
   'mean_RD': 0.0,
   'se_RD': 0.0,
   'yr_scan': 2013},
  'R': {'track_count': 0,
   'voxel_count': 0,
   'mean_length': 0.0,
   'se_length': 0.0,
   'mean_fatrk': 0.0,
   'se_fatrk': 0.0,
   'mean_angtrk': 0.0,
   'se_angtrk': 0.0,
   'mean_AD': 0.0,
   'se_AD': 0.0,
   'mean_FA': 0.0,
   'se_FA': 0.0,
   'mean_MD': 0.0,
   'se_MD': 0.0,
   'mean_RD': 0.0,
   'se_RD': 0.0,
   'yr_scan': 2013}}}

In [155]:
i = 0
for tr in TRACTS:
    for cnd in CONDS: 
        path = f'{DATA_PATH}/{cnd}/COBRE_prep/{cnd}_Stats/{tr}'
        sub_dirs = next(os.walk(path))[1]
        for admin in sub_dirs: 
            
            files = [x for x in os.listdir(os.path.join(path, admin)) if x.startswith('sub')]
            for f in files:
                subnum = re.findall('(?<=sub-A)([0-9]*)', f)[0]
                path2 = os.path.join(path, admin, f)
                dt = os.path.getmtime(path2)
                dt = datetime.fromtimestamp(dt).strftime('%Y-%m-%d %H:%M:%S')
                
                with open(path2, 'r') as fp: 
                    line = fp.readlines()
                    info = extract_stats(line, group = [tr])
                vals = []
                for side in info[tr].keys():
                    for k in info[tr][side].keys():
                        vals.append(info[tr][side][k])
                    full_list = [subnum, admin, cnd, tr, dt, side]
                    full_list.extend(vals)
                    print(full_list)
                    i += 1

StopIteration: 

In [39]:
from datetime import datetime


'2020-05-17 21:26:16'

In [38]:
tmp = os.path.getmtime('')
datetime.fromtimestamp(tmp).strftime('%Y-%m-%d %H:%M:%S')