# Data Merge
This notebook describes the steps to pull the stats from the txt files and combine them into a single table (a pandas DataFrame).

In [9]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

import os
import re
from collections import OrderedDict
from datetime import datetime


### Location Path
Define the locations of the input stats folders here.

In [2]:
# Root folder of the data export. Set the DSI_SCHIZO_DATA_PATH environment
# variable to point somewhere else; when it is unset the original local
# folder below is used, so existing runs are unaffected.
DATA_PATH = os.environ.get(
    'DSI_SCHIZO_DATA_PATH',
    '/Users/stereopickles/CU_Google_Drive/COLUMBIA/DSI-Schizo/Data2020/DATA040320',
)
# Stats folder for each condition (HC, SZ).
HC_PATH = f"{DATA_PATH}/HC/COBRE_prep/HC_Stats"
SZ_PATH = f"{DATA_PATH}/SZ/COBRE_prep/SZ_Stats"
# Per-tract (FAT, UF) sub-folders inside each condition's stats folder.
HC_FAT_DTI_PATH = f'{HC_PATH}/FAT'
SZ_FAT_DTI_PATH = f'{SZ_PATH}/FAT'
HC_UF_DTI_PATH = f'{HC_PATH}/UF'
SZ_UF_DTI_PATH = f'{SZ_PATH}/UF'

In [3]:
# Tract names; each one is a sub-folder of a condition's stats folder.
TRACTS = [
    'FAT',
    'UF',
]
# Condition labels; each one selects a "<cond>/COBRE_prep/<cond>_Stats" folder.
CONDS = [
    'SZ',
    'HC',
]

### Extract stats 
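The resulting table should have the following columns:

| subnum | admin | condition | tract | datetime_extracted | side | track_count | voxel_count | mean_length | se_length | mean_fatrk | se_fatrk | mean_angtrk | se_angtrk | mean_AD | se_AD | mean_FA | se_FA | mean_MD | se_MD | mean_RD | se_RD | yr_scan |
| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |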

In [5]:
def find_num(str_):
    '''Return, as an int, the run of digits/dots that follows the first ': ' in str_.

    Raises IndexError when str_ contains no ': ', and ValueError when the
    captured text is not a valid integer literal (for example empty or '1.5').
    '''
    after_colon = re.findall('(?<=: )([.0-9]*)', str_)
    first_hit = after_colon[0]
    return int(first_hit)

def find_mean_se(str_):
    '''Parse a "<label>: <mean> +/- <se>" style line into a (mean, se) pair of floats.

    The mean is the token that follows the first ': ' and the se is the token
    that follows the first '- '. A token that is exactly 'nan' is returned as
    0.0. This applies to the se as well as the mean, so a line such as
    "x: nan +/- nan" yields (0.0, 0.0) instead of failing on the se.

    Raises IndexError when either marker is missing, and ValueError when a
    captured token is neither 'nan' nor a valid float literal.
    '''
    def _to_float(token):
        # 'nan' is deliberately mapped to zero rather than float('nan').
        return 0.0 if token == 'nan' else float(token)

    mean_token = re.findall('(?<=: )([na.0-9]*)', str_)[0]
    se_token = re.findall('(?<=- )([na.0-9]*)', str_)[0]
    return _to_float(mean_token), _to_float(se_token)

def find_type(str_):
    '''Return the run of capital letters that immediately follows the first 'DTI_' in str_.

    Raises IndexError when str_ does not contain 'DTI_'.
    '''
    labels = re.findall('(?<=DTI_)([A-Z]*)', str_)
    return labels[0]

def find_year(str_):
    '''Return, as an int, up to the first four digits that follow the first 'ses-' in str_.

    Raises IndexError when str_ has no 'ses-', and ValueError when no digit
    follows it.
    '''
    session_digits = re.findall('(?<=ses-)([0-9]*)', str_)[0]
    year_part = session_digits[:4]
    return int(year_part)

In [49]:
def extract_stats(list_, group = {'FAT', 'UF'}):
    ''' From a list of lines extract information

    The lines are read as consecutive 12-line records. Offsets within a
    record, relative to its first line:

      0       header: the text between ': ' and the newline names the tract
              (it must contain FAT or UF) and its last character is the side
      1, 2    integers stored as track_count and voxel_count
      4, 5, 6 mean/se pairs stored as *_length, *_fatrk and *_angtrk
      7-10    mean/se pairs keyed by the letters after 'DTI_' on each line
              (AD, FA, MD and RD are required); the scan year is read from
              the 'ses-' token on line 7

    Offsets 3 and 11 are not read. Blank lines at the very end of list_ are
    ignored, so a file ending in extra empty lines does not start a bogus
    record.

    Parameters
    ----------
    list_ : list of str
        Lines of one stats file; header lines must end with a newline.
    group : set of str
        Tract names to keep. Records for other tracts are skipped.

    Returns
    -------
    dict
        {tract: {side: info}} where info maps column names to values. The
        insertion order of info is: track_count, voxel_count, the mean/se
        pairs for length, fatrk, angtrk, AD, FA, MD, RD, then yr_scan.

    Raises
    ------
    IndexError
        If a header or value line does not match the expected pattern, or a
        record is truncated.
    KeyError
        If one of AD/FA/MD/RD is missing from a record ('not enough stats'
        is printed first).
    '''
    result = {}

    # Only the loop bound is trimmed; list_ itself is left untouched, so a
    # record that is genuinely truncated still fails loudly below.
    n_lines = len(list_)
    while n_lines and not list_[n_lines - 1].strip():
        n_lines -= 1

    for i in range(0, n_lines, 12):
        grp = re.findall('(?<=: )(.*)(?=\\n)', list_[i])[0]
        trk_group = re.findall('(FAT|UF)', grp)[0]
        if trk_group not in group:
            continue
        side = grp[-1]

        trk_cnt = find_num(list_[i+1])
        vxl_cnt = find_num(list_[i+2])
        mean_length, se_length = find_mean_se(list_[i+4])
        mean_fatrk, se_fatrk = find_mean_se(list_[i+5])
        mean_angtrk, se_angtrk = find_mean_se(list_[i+6])
        yr_scan = find_year(list_[i+7])

        stats = {}
        for j in range(7, 11):
            stats[find_type(list_[i+j])] = find_mean_se(list_[i+j])
        if len(stats) != 4:
            print('not enough stats')

        info = {'track_count': trk_cnt,
                'voxel_count': vxl_cnt,
                'mean_length': mean_length,
                'se_length': se_length,
                'mean_fatrk': mean_fatrk,
                'se_fatrk': se_fatrk,
                'mean_angtrk': mean_angtrk,
                'se_angtrk': se_angtrk}
        # Keep this key order: AD, FA, MD, RD (mean before se), then yr_scan.
        for measure in ('AD', 'FA', 'MD', 'RD'):
            info[f'mean_{measure}'], info[f'se_{measure}'] = stats[measure]
        info['yr_scan'] = yr_scan

        result.setdefault(trk_group, {})[side] = info
    return result

In [79]:
# Empty output table: identifying columns, the two counts, a mean/se column
# pair for every measured quantity, and finally the scan year.
df = pd.DataFrame(
    columns = (
        ['subnum', 'admin', 'condition', 'tract', 'datetime_extracted', 'side']
        + ['track_count', 'voxel_count']
        + [f'{stat}_{quantity}'
           for quantity in ('length', 'fatrk', 'angtrk', 'AD', 'FA', 'MD', 'RD')
           for stat in ('mean', 'se')]
        + ['yr_scan']
    )
)

In [92]:
# Walk every tract/condition stats folder, parse each subject file found in
# its sub-folders and append one row per side to `df`. Rows are labelled
# 1, 2, 3, ... through the running counter `i`.
i = 0
for tr in TRACTS:
    for cnd in CONDS:
        path = f'{DATA_PATH}/{cnd}/COBRE_prep/{cnd}_Stats/{tr}'
        print(f'{cnd}_{tr}', end = ': ')
        try:
            # os.walk yields nothing for a missing folder, hence StopIteration.
            sub_dirs = next(os.walk(path))[1]
        except StopIteration:
            print("Directory doesn't exist")
            # Skip just this condition/tract folder and carry on with the
            # next condition.
            continue
        for admin in sub_dirs:
            print(f'{admin}')
            files = [x for x in os.listdir(os.path.join(path, admin)) if x.startswith('sub')]
            for f in files:
                subnum = re.findall('(?<=sub-A)([0-9]*)', f)[0]
                print(f'{subnum}', end = '')
                path2 = os.path.join(path, admin, f)
                # The file's modification time is what ends up in the
                # 'datetime_extracted' column.
                dt = os.path.getmtime(path2)
                dt = datetime.fromtimestamp(dt).strftime('%Y-%m-%d %H:%M:%S')

                with open(path2, 'r') as fp:
                    line = fp.readlines()
                # Drop the underscore right after FAT/UF (e.g. 'FAT_L' -> 'FATL').
                line = [re.sub('(?:(?<=FAT)|(?<=UF))_', '', x) for x in line]
                info = extract_stats(line, group = {tr})

                if info:
                    print('(logged)', end = ' ')
                    for side in info[tr].keys():
                        row = {'subnum': subnum,
                               'admin': admin,
                               'condition': cnd,
                               'tract': tr,
                               'datetime_extracted': dt,
                               'side': side}
                        row.update(info[tr][side])
                        i += 1
                        # Look values up by column name so the row cannot be
                        # silently misaligned if the dict order ever changes.
                        df.loc[i] = [row[col] for col in df.columns]
                else:
                    print('(no info)', end = ' ')
            print('\n')

SZ_FAT: HW
00024684(logged) 00038441(logged) 00035836(logged) 00020787(logged) 00027969(logged) 00037854(logged) 00024568(logged) 00024959(logged) 00023158(logged) 00038624(logged) 00037649(logged) 00038172(logged) 00023750(logged) 00027537(logged) 00028189(logged) 00024228(logged) 00028404(logged) 00027119(logged) 00035859(logged) 00022500(logged) 00024953(logged) 00011110(logged) 00024198(logged) 00035485(logged) 00023243(logged) 00037034(logged) 00020602(logged) 00027391(logged) 00037619(logged) 00027755(logged) 00037224(logged) 00023246(logged) 

IR
00011110(logged) 

SS
00018598(logged) 00017147(logged) 00000909(logged) 00001243(logged) 00020414(logged) 00016720(logged) 00018403(logged) 00014804(logged) 00016197(logged) 00015518(logged) 00011110(logged) 00001251(logged) 00014175(logged) 00018317(logged) 00001452(logged) 00000368(logged) 00009280(logged) 00015648(logged) 00020416(logged) 00016723(logged) 00014607(logged) 00000456(logged) 00000838(logged) 00014830(logged) 00014719(l

In [93]:
# Show the assembled table.
df

Unnamed: 0,subnum,admin,condition,tract,datetime_extracted,side,track_count,voxel_count,mean_length,se_length,...,se_angtrk,mean_AD,se_AD,mean_FA,se_FA,mean_MD,se_MD,mean_RD,se_RD,yr_scan
1,00024684,HW,SZ,FAT,2020-11-24 02:20:18,L,515,1259,70.6583,8.1768,...,4.0739,1.1927,0.1756,0.4766,0.1420,0.7591,0.0903,0.5423,0.1271,2010
2,00024684,HW,SZ,FAT,2020-11-24 02:20:18,R,69,516,73.4203,4.3298,...,4.8529,1.1268,0.1687,0.4158,0.1430,0.7658,0.0925,0.5853,0.1278,2010
3,00038441,HW,SZ,FAT,2020-12-22 15:45:40,L,29,242,74.0345,3.5705,...,3.8367,1.1101,0.1908,0.3919,0.1476,0.7632,0.0925,0.5897,0.1236,2013
4,00038441,HW,SZ,FAT,2020-12-22 15:45:40,R,269,860,69.9628,5.1310,...,3.6154,1.1182,0.1773,0.4062,0.1360,0.7613,0.0825,0.5829,0.1068,2013
5,00035836,HW,SZ,FAT,2020-12-22 15:25:51,L,0,0,0.0000,0.0000,...,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,2013
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
210,00023848,SS,HC,FAT,2020-12-20 23:21:55,L,94,745,85.0000,5.5125,...,4.7988,1.1904,0.1875,0.4247,0.1373,0.8018,0.1223,0.6075,0.1481,2009
211,00023131,SS,HC,FAT,2020-12-21 00:00:10,R,9,139,75.0000,1.1180,...,4.6054,1.1546,0.1950,0.4036,0.1434,0.7862,0.0821,0.6021,0.1127,2011
212,00023131,SS,HC,FAT,2020-12-21 00:00:10,L,0,0,0.0000,0.0000,...,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,2011
213,00024546,SS,HC,FAT,2020-12-21 00:14:30,L,312,919,68.3526,5.6929,...,4.7126,1.1341,0.1506,0.4394,0.1416,0.7528,0.0582,0.5621,0.1041,2009
