In [89]:
import os 
import sys
import re
import glob
import zipfile
from os.path import basename
from shutil import copy
from lxml import etree

In [34]:
# Move into the project directory so the relative data/output paths used by
# the rest of the notebook resolve.
# The location defaults to ~/projects/ppmi/data_collection but can be
# overridden by setting the PPMI_PROJECT_DIR environment variable before the
# kernel starts, so the notebook is not tied to one machine's layout.
# FIXME: the path should ideally be set relative to the execution directory
project_dir = os.environ.get(
    "PPMI_PROJECT_DIR",
    os.path.join(os.path.expanduser("~"), "projects", "ppmi", "data_collection"),
)
os.chdir(project_dir)

os.getcwd()

In [11]:
# Directory names used by the rest of the notebook. They are joined with "/"
# into relative paths, so they resolve against the current working directory.
data_dir, output_dir = "PPMI", "output"

In [36]:
# Collect the metadata XML files found directly under data_dir.
# glob returns matches in arbitrary order, so the list is sorted to keep every
# later step (including the numbering of output folders) reproducible.
metadata_files = sorted(glob.glob(data_dir + "/PPMI_*.xml"))
metadata_files[:5]

['PPMI/PPMI_3105_Axial_PD-T2_TSE_FS_S189363_I372313.xml',
 'PPMI/PPMI_3105_Axial_PD-T2_TSE_FS_S189363_I372314.xml',
 'PPMI/PPMI_3105_Axial_PD-T2_TSE_S103325_I226402.xml',
 'PPMI/PPMI_3105_Axial_PD-T2_TSE_S103325_I226404.xml',
 'PPMI/PPMI_3105_Axial_PD-T2_TSE_S148998_I301557.xml']

In [40]:
# Pattern to extract information from a metadata filename of the form
#   <data_dir>/PPMI_<digits>_<free text>_S<digits>_I<digits>.xml
# The four capture groups are, in order: the digits after "PPMI_", the free
# text, the "S..." token and the "I..." token.
# data_dir is passed through re.escape so regex metacharacters in the folder
# name are matched literally, the dot before "xml" is escaped so it only
# matches a real ".", and "$" anchors the extension to the end of the name.
p = re.compile("^" + re.escape(data_dir) + "/" + r"PPMI_([0-9]*)_(.*)_(S[0-9]*)_(I[0-9]*)\.xml$")

In [41]:
# Split each metadata filename into the tuple of groups captured by the regex.
# The result must stay aligned one-to-one with metadata_files (the two lists
# are zipped together afterwards), so a filename that does not match is
# reported with a clear error instead of being skipped or failing with an
# opaque AttributeError on None.
metadata_splits = []
for s in metadata_files:
    name_match = p.search(s)
    if name_match is None:
        raise ValueError("Metadata filename does not match the expected pattern: %r" % s)
    metadata_splits.append(name_match.groups())

In [96]:
# Names given to the regex capture groups, in capture order.
metadata_keys = ("subject_id", "image_type", "series_id", "image_id")

# One dict of named fields per metadata file, in the same order as metadata_splits.
mdict_list = [
    {key: value for key, value in zip(metadata_keys, parts)}
    for parts in metadata_splits
]

# Map each metadata file path to its dict of named fields.
mdict = {path: fields for path, fields in zip(metadata_files, mdict_list)}

In [145]:
# Same mapping as mdict, minus every entry whose subject_id is "3105".
mdict_except_3105 = dict(
    (path, fields)
    for path, fields in mdict.items()
    if not fields["subject_id"] == "3105"
)

In [None]:
# Work out where the dcm files could be, based on the fields taken from the
# metadata filename.
def find_dir_from_mdict(m):
    """Return "<data_dir>/<subject_id>/<image_type>" for a metadata dict.

    m is indexed with the "subject_id" and "image_type" keys. The path is
    assembled with plain "/" separators and is not checked for existence.
    """
    path_parts = (data_dir, m["subject_id"], m["image_type"])
    return "/".join(path_parts)

def build_output_dir(m, visit_identifier, mri_type, increment=1):
    """Return "<output_dir>/<subject_id>/<visit_identifier>/<mri_type>_<increment>".

    m is indexed with the "subject_id" key; visit_identifier and mri_type
    must be strings (anything else raises TypeError), and increment is
    converted with str(). Nothing is created on disk.
    """
    return "/".join((
        output_dir,
        m["subject_id"],
        visit_identifier,
        mri_type + "_" + str(increment),
    ))

# For every metadata file: locate its DICOM files, read the visit and MRI type
# from the XML, pick the first free "<mri_type>_<n>" output folder, then write
# the DICOMs into dcm.zip there and copy the metadata XML next to it.
# Re-running this cell does not overwrite earlier results; it fills the next
# free "<mri_type>_<n>" slot instead.
for mfile, md in mdict_except_3105.items():
    mdir = find_dir_from_mdict(md)
    # We do not know the exact directory location as we do not know what time and date
    # this folder is named with. So we use the pattern /*/ to match it
    # NOTE(review): the pattern selects every .dcm in the series folder and does
    # not filter on image_id, so two metadata files of the same series are given
    # the same set of DICOMs -- confirm this is intended.
    match_str = mdir + "/*/%(series_id)s/PPMI_%(subject_id)s_*.dcm" % md
    # Sorted so the archive contents and the printed example are reproducible.
    matched_dcms = sorted(glob.glob(match_str))

    metaroot = etree.parse(mfile)
    visit_identifier = metaroot.findtext('.//visitIdentifier')
    # Read but not used further in this cell.
    date_acquired = metaroot.findtext(".//dateAcquired")
    weighting = metaroot.findtext(".//protocolTerm/protocol[@term = 'Weighting']")
    grad_directions = metaroot.findtext(".//protocolTerm/protocol[@term = 'Gradient Directions']")

    # Use the weighting as the MRI type; a scan without a weighting but with
    # gradient directions is labelled as DTI.
    mri_type = weighting
    if weighting is None and grad_directions is not None:
        mri_type = "DTI"

    print("Mfile  : %s" % mfile)
    print("Weight : %s" % weighting)
    print("GRAD   : %s" % grad_directions)
    print("MR_type: %s" % mri_type)
    print("MatchSt: %s" % match_str)
    # Guarded: indexing an empty match list would raise IndexError.
    print("MatchEx: %s" % (matched_dcms[0] if matched_dcms else None))

    if not matched_dcms:
        print("WARNING: No dcm files matched, skipping %s" % mfile)
        continue

    # Both values become path components; a missing one (e.g. neither weighting
    # nor gradient directions present) cannot be turned into an output folder.
    if not mri_type or not visit_identifier:
        print("WARNING: Missing MRI type or visit identifier, skipping %s" % mfile)
        continue

    # Take the first numbered folder (1..7) that does not exist yet.
    output_subject_dir = None
    for increment in range(1, 8):
        candidate_dir = build_output_dir(md, visit_identifier, mri_type, increment)
        if not os.path.exists(candidate_dir):
            output_subject_dir = candidate_dir
            break
    # We haven't been able to find an empty directory: stop processing.
    if output_subject_dir is None:
        print ("ERROR: Cannot find an empty output subject directory")
        break

    os.makedirs(output_subject_dir)
    print("Output Subject Dir : ", output_subject_dir)

    print("Writing out zip file...")

    # Write out the raw dcm files, stored flat under their base names. The
    # context manager closes the archive even if one of the writes fails.
    with zipfile.ZipFile(output_subject_dir + "/" + "dcm.zip", mode="w",
                         compression=zipfile.ZIP_DEFLATED) as xarfile:
        for filename in matched_dcms:
            xarfile.write(filename, basename(filename))

    print("Copying Metadata...")

    copy(mfile, output_subject_dir + "/" + basename(mfile))
    print ("Done")

Mfile  : PPMI/PPMI_3107_Axial_PD-T2_TSE_FS_S193428_I378214.xml
Weight : T2
GRAD   : None
MR_type: T2
MatchSt: PPMI/3107/Axial_PD-T2_TSE_FS/*/S193428/PPMI_3107_*.dcm
MatchEx: PPMI/3107/Axial_PD-T2_TSE_FS/2013-05-15_10_04_16.0/S193428/PPMI_3107_MR_Axial_PD-T2_TSE_FS_br_raw_20130626132622097_99_S193428_I378214.dcm
Output Subject Dir :  output/3107/Month 24/T2_1
Writing out zip file...
Copying Metadata...
Done
Mfile  : PPMI/PPMI_3107_Axial_PD-T2_TSE_FS_S193428_I378216.xml
Weight : PD
GRAD   : None
MR_type: PD
MatchSt: PPMI/3107/Axial_PD-T2_TSE_FS/*/S193428/PPMI_3107_*.dcm
MatchEx: PPMI/3107/Axial_PD-T2_TSE_FS/2013-05-15_10_04_16.0/S193428/PPMI_3107_MR_Axial_PD-T2_TSE_FS_br_raw_20130626132622097_99_S193428_I378214.dcm
Output Subject Dir :  output/3107/Month 24/PD_1
Writing out zip file...
Copying Metadata...
Done
Mfile  : PPMI/PPMI_3107_Axial_PD-T2_TSE_FS_S264295_I498878.xml
Weight : PD
GRAD   : None
MR_type: PD
MatchSt: PPMI/3107/Axial_PD-T2_TSE_FS/*/S264295/PPMI_3107_*.dcm
MatchEx: PPMI/3

Copying Metadata...
Done
Mfile  : PPMI/PPMI_3107_Sag_MPRAGE_GRAPPA_S264293_I498876.xml
Weight : PD
GRAD   : None
MR_type: PD
MatchSt: PPMI/3107/Sag_MPRAGE_GRAPPA/*/S264293/PPMI_3107_*.dcm
MatchEx: PPMI/3107/Sag_MPRAGE_GRAPPA/2015-05-08_09_33_05.0/S264293/PPMI_3107_MR_Sag_MPRAGE_GRAPPA__br_raw_20150629091740748_1_S264293_I498876.dcm
Output Subject Dir :  output/3107/Month 48/PD_2
Writing out zip file...
Copying Metadata...
Done
Mfile  : PPMI/PPMI_3108_Axial_PD-T2_TSE_FS_S193434_I378221.xml
Weight : T2
GRAD   : None
MR_type: T2
MatchSt: PPMI/3108/Axial_PD-T2_TSE_FS/*/S193434/PPMI_3108_*.dcm
MatchEx: PPMI/3108/Axial_PD-T2_TSE_FS/2013-04-24_10_04_37.0/S193434/PPMI_3108_MR_Axial_PD-T2_TSE_FS_br_raw_20130626150555643_96_S193434_I378221.dcm
Output Subject Dir :  output/3108/Month 24/T2_1
Writing out zip file...
Copying Metadata...
Done
Mfile  : PPMI/PPMI_3108_Axial_PD-T2_TSE_FS_S193434_I378224.xml
Weight : PD
GRAD   : None
MR_type: PD
MatchSt: PPMI/3108/Axial_PD-T2_TSE_FS/*/S193434/PPMI_3108_

Copying Metadata...
Done
Mfile  : PPMI/PPMI_3108_Sag_MPRAGE_GRAPPA_S264300_I498885.xml
Weight : PD
GRAD   : None
MR_type: PD
MatchSt: PPMI/3108/Sag_MPRAGE_GRAPPA/*/S264300/PPMI_3108_*.dcm
MatchEx: PPMI/3108/Sag_MPRAGE_GRAPPA/2015-05-06_09_33_18.0/S264300/PPMI_3108_MR_Sag_MPRAGE_GRAPPA__br_raw_20150629092807166_70_S264300_I498885.dcm
Output Subject Dir :  output/3108/Month 48/PD_2
Writing out zip file...
Copying Metadata...
Done
Mfile  : PPMI/PPMI_3116_Axial_PD-T2_TSE_FS_S186401_I366128.xml
Weight : PD
GRAD   : None
MR_type: PD
MatchSt: PPMI/3116/Axial_PD-T2_TSE_FS/*/S186401/PPMI_3116_*.dcm
MatchEx: PPMI/3116/Axial_PD-T2_TSE_FS/2012-11-14_11_13_50.0/S186401/PPMI_3116_MR_Axial_PD-T2_TSE_FS_br_raw_20130409152519069_35_S186401_I366128.dcm
Output Subject Dir :  output/3116/Month 12/PD_1
Writing out zip file...
Copying Metadata...
Done
Mfile  : PPMI/PPMI_3116_Axial_PD-T2_TSE_FS_S186401_I366129.xml
Weight : T2
GRAD   : None
MR_type: T2
MatchSt: PPMI/3116/Axial_PD-T2_TSE_FS/*/S186401/PPMI_3116