# Data Merge
This notebook describes steps to pull the stats from the txt files and combine them to create the final data structure.

In [99]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

import os
import re
from collections import OrderedDict


### Location Path
Define the location of files here.

In [19]:
DATA_PATH = '/Users/stereopickles/CU_Google_Drive/COLUMBIA/DSI-Schizo/Data2020/DATA040320'
HC_PATH = f"{DATA_PATH}/HC/COBRE_prep/HC_Stats"
SZ_PATH = f"{DATA_PATH}/SZ/COBRE_prep/SZ_Stats"
HC_FAT_DTI_PATH = f'{HC_PATH}/FAT'
SZ_FAT_DTI_PATH = f'{SZ_PATH}/FAT'
HC_UF_DTI_PATH = f'{HC_PATH}/UF'
SZ_UF_DTI_PATH = f'{SZ_PATH}/UF'

In [31]:
TRACTS = ['FAT', 'UF']
CONDS = ['SZ', 'HC']

### Extract stats 
The resulting table should look like 
| subnum | admin | condition | tract | datetime | side | track_count | voxel_count | mean_length | mean_FA_trk | mean_Ang | mean_FA | mean_AD | mean_MD | mean_RD | 

In [56]:
df = pd.DataFrame(
    columns = ['subnum',
               'admin',
               'condition',
               'tract',
               'datetime_extracted',
               'yr_scanned',
               'side',
               'track_count',
               'voxel_count',
               'mean_length', 'se_length',
               'mean_FA_trk', 'se_FA_trk',
               'mean_Ang', 'se_Ang_trk', 
               'mean_FA', 'se_FA_trk',
               'mean_AD', 'se_AD_trk',
               'mean_MD', 'se_MD_trk', 
               'mean_RD', 'se_RD_trk'])

In [120]:
def find_num(str_):
    return int(re.findall('(?<=: )([.0-9]*)', str_)[0])

def find_mean_se(str_):
    mean = float(re.findall('(?<=: )([.0-9]*)', str_)[0])
    se = float(re.findall('(?<=- )([.0-9]*)', str_)[0])
    return mean, se

def find_type(str_):
    return re.findall('(?<=DTI_)([A-Z]*)', str_)[0]

def find_year(str_):
    return int(re.findall('(?<=ses-)([0-9]*)', str_)[0][:4])

In [121]:
def extract_stats(list_, group = ['FAT', 'UF']):
    ''' From a list of lines extract information '''
    result = {}
    for i in range(0, len(list_), 12):
        grp = re.findall('(?<=: )(.*)(?=\\n)', list_[i])[0]
        trk_group = grp[:3]
        side = grp[-1]
        if trk_group in group:
            trk_cnt = find_num(list_[i+1])
            vxl_cnt = find_num(list_[i+2])
            mean_length, se_length = find_mean_se(list_[i+4])
            mean_fatrk, se_fatrk = find_mean_se(list_[i+5])
            mean_angtrk, se_angtrk = find_mean_se(list_[i+6])
            yr_scan = find_year(list_[i+7])
            stats = {}
            for j in range(7, 11):
                stats[find_type(list_[i+j])] = find_mean_se(list_[i+j])
            if len(stats) != 4: 
                print('not enough stats')
            info = {'track_count': trk_cnt, 
                    'voxel_count': vxl_cnt, 
                    'mean_length': mean_length, 
                    'se_length': se_length, 
                    'mean_fatrk': mean_fatrk, 
                    'se_fatrk': se_fatrk, 
                    'mean_angtrk': mean_angtrk, 
                    'se_angtrk': se_angtrk, 
                    'AD': stats['AD'], 
                    'FA': stats['FA'], 
                    'MD': stats['MD'], 
                    'RD': stats['RD'], 
                    'yr_scan': yr_scan
                   }
            if trk_group in result:
                result[trk_group][side] = info
            else:
                result[trk_group] = {side: info}
        else:
            continue
    return result

In [124]:
extract_stats(line)

{'FAT': {'L': {'track_count': 16,
   'voxel_count': 206,
   'mean_length': 74.9375,
   'se_length': 1.8062,
   'mean_fatrk': 0.4408,
   'se_fatrk': 0.153,
   'mean_angtrk': 6.1663,
   'se_angtrk': 5.0377,
   'AD': (1.2106, 0.1891),
   'FA': (0.4689, 0.1567),
   'MD': (0.7816, 0.0963),
   'RD': (0.5671, 0.1429),
   'yr_scan': 2009},
  'R': {'track_count': 45,
   'voxel_count': 396,
   'mean_length': 66.0222,
   'se_length': 5.891,
   'mean_fatrk': 0.4276,
   'se_fatrk': 0.1356,
   'mean_angtrk': 7.142,
   'se_angtrk': 5.3548,
   'AD': (1.1318, 0.1711),
   'FA': (0.4615, 0.1411),
   'MD': (0.7467, 0.132),
   'RD': (0.5541, 0.1606),
   'yr_scan': 2009}}}

In [133]:
i = 0
for tr in TRACTS:
    for cnd in CONDS: 
        path = f'{DATA_PATH}/{cnd}/COBRE_prep/{cnd}_Stats/{tr}'
        sub_dirs = next(os.walk(path))[1]
        for admin in sub_dirs: 
            
            files = [x for x in os.listdir(os.path.join(path, admin)) if x.startswith('sub')]
            for f in files:
                subnum = re.findall('(?<=sub-A)([0-9]*)', f)[0]
                path2 = os.path.join(path, admin, f)
                dt = os.path.getmtime(path2)
                dt = datetime.fromtimestamp(dt).strftime('%Y-%m-%d %H:%M:%S')
                
                with open(path2, 'r') as fp: 
                    line = fp.readlines()
                    info = extract_stats(line, group = [tr])
                vals = []
                for side in info[tr].keys():
                    for k in info[tr][side].keys():
                        vals.append(info[tr][side][k])
                    full_list = [subnum, admin, cnd, tr, dt, side]
                    full_list.extend(vals)
                    print(full_list)
                    i += 1

[['00024684'], 'HW', 'SZ', 'FAT', '2020-11-24 02:20:18', 'L', 515, 1259, 70.6583, 8.1768, 0.4511, 0.1347, 6.1912, 4.0739, (1.1927, 0.1756), (0.4766, 0.142), (0.7591, 0.0903), (0.5423, 0.1271), 2010]
[['00024684'], 'HW', 'SZ', 'FAT', '2020-11-24 02:20:18', 'R', 515, 1259, 70.6583, 8.1768, 0.4511, 0.1347, 6.1912, 4.0739, (1.1927, 0.1756), (0.4766, 0.142), (0.7591, 0.0903), (0.5423, 0.1271), 2010, 69, 516, 73.4203, 4.3298, 0.3897, 0.1346, 6.9242, 4.8529, (1.1268, 0.1687), (0.4158, 0.143), (0.7658, 0.0925), (0.5853, 0.1278), 2010]
[['00038441'], 'HW', 'SZ', 'FAT', '2020-12-22 15:45:40', 'L', 29, 242, 74.0345, 3.5705, 0.3671, 0.1393, 5.6721, 3.8367, (1.1101, 0.1908), (0.3919, 0.1476), (0.7632, 0.0925), (0.5897, 0.1236), 2013]
[['00038441'], 'HW', 'SZ', 'FAT', '2020-12-22 15:45:40', 'R', 29, 242, 74.0345, 3.5705, 0.3671, 0.1393, 5.6721, 3.8367, (1.1101, 0.1908), (0.3919, 0.1476), (0.7632, 0.0925), (0.5897, 0.1236), 2013, 269, 860, 69.9628, 5.131, 0.3814, 0.1282, 5.264, 3.6154, (1.1182, 0.177

ValueError: could not convert string to float: 

In [39]:
from datetime import datetime


'2020-05-17 21:26:16'

In [38]:
tmp = os.path.getmtime('')
datetime.fromtimestamp(tmp).strftime('%Y-%m-%d %H:%M:%S')