In [1]:
import os 
import sys
import re
import glob
import zipfile
from os.path import basename
from shutil import copy
from lxml import etree

In [2]:
os.getcwd()

'/home/sld0465/projects/ppmi/scripts/python/notebooks'

In [3]:
os.chdir("../../../data_collection/")

In [4]:
data_dir = "PPMI"
output_dir = "working/raw_zip"

In [5]:
os.makedirs(output_dir, exist_ok=True)

In [6]:
metadata_files = glob.glob(data_dir + "/PPMI_*.xml")
metadata_files[:5]

['PPMI/PPMI_3105_Axial_PD-T2_TSE_FS_S189363_I372313.xml',
 'PPMI/PPMI_3105_Axial_PD-T2_TSE_FS_S189363_I372314.xml',
 'PPMI/PPMI_3105_Axial_PD-T2_TSE_S103325_I226402.xml',
 'PPMI/PPMI_3105_Axial_PD-T2_TSE_S103325_I226404.xml',
 'PPMI/PPMI_3105_Axial_PD-T2_TSE_S148998_I301557.xml']

In [7]:
# Pattern to extract information from metadata filename
p = re.compile("^" + data_dir + "/" + r"PPMI_([0-9]*)_(.*)_(S[0-9]*)_(I[0-9]*).xml")

In [8]:
# Split the metadata filenames into lists which have information extracted out from the regex
metadata_splits = [re.search(p,s).groups() for s in metadata_files]

In [9]:
metadata_dicts = [dict(zip(["subject_id", "image_type", "series_id", "image_id"], m)) for m in metadata_splits]

In [10]:
metadata_dicts[:5]

[{'image_id': 'I372313',
  'image_type': 'Axial_PD-T2_TSE_FS',
  'series_id': 'S189363',
  'subject_id': '3105'},
 {'image_id': 'I372314',
  'image_type': 'Axial_PD-T2_TSE_FS',
  'series_id': 'S189363',
  'subject_id': '3105'},
 {'image_id': 'I226402',
  'image_type': 'Axial_PD-T2_TSE',
  'series_id': 'S103325',
  'subject_id': '3105'},
 {'image_id': 'I226404',
  'image_type': 'Axial_PD-T2_TSE',
  'series_id': 'S103325',
  'subject_id': '3105'},
 {'image_id': 'I301557',
  'image_type': 'Axial_PD-T2_TSE',
  'series_id': 'S148998',
  'subject_id': '3105'}]

In [11]:
metadata_subject_sorted_zip = sorted(zip(metadata_files, metadata_dicts), key=lambda pair: pair[1]["subject_id"])

In [12]:
# The first element in this zip of two lists is the metadata_filename
# The second element is the information that has been extracted from the filename
metadata_subject_sorted_zip[:5]

[('PPMI/PPMI_3105_Axial_PD-T2_TSE_FS_S189363_I372313.xml',
  {'image_id': 'I372313',
   'image_type': 'Axial_PD-T2_TSE_FS',
   'series_id': 'S189363',
   'subject_id': '3105'}),
 ('PPMI/PPMI_3105_Axial_PD-T2_TSE_FS_S189363_I372314.xml',
  {'image_id': 'I372314',
   'image_type': 'Axial_PD-T2_TSE_FS',
   'series_id': 'S189363',
   'subject_id': '3105'}),
 ('PPMI/PPMI_3105_Axial_PD-T2_TSE_S103325_I226402.xml',
  {'image_id': 'I226402',
   'image_type': 'Axial_PD-T2_TSE',
   'series_id': 'S103325',
   'subject_id': '3105'}),
 ('PPMI/PPMI_3105_Axial_PD-T2_TSE_S103325_I226404.xml',
  {'image_id': 'I226404',
   'image_type': 'Axial_PD-T2_TSE',
   'series_id': 'S103325',
   'subject_id': '3105'}),
 ('PPMI/PPMI_3105_Axial_PD-T2_TSE_S148998_I301557.xml',
  {'image_id': 'I301557',
   'image_type': 'Axial_PD-T2_TSE',
   'series_id': 'S148998',
   'subject_id': '3105'})]

In [151]:
# find out where the dcm files could be based on the metadata filename
def find_dir_from_mdict(m):
    return(data_dir + "/" + m["subject_id"] + "/" + m["image_type"])

def build_output_dir(m, visit_identifier, mri_type, increment=1):
    return (output_dir + "/" + m["subject_id"] + "/" + visit_identifier \
            + "/" + mri_type + "_" + str(increment))

for mfile, md in metadata_subject_sorted_zip:
    mdir = find_dir_from_mdict(md)
    # We do not know the exact directory location as we do not know what time and date
    # this folder is named with. So we use the pattern /*/ to match it
    match_str = mdir + "/*/%(series_id)s/PPMI_%(subject_id)s_*.dcm" % md
    matched_dcms = glob.glob(match_str)

    metaroot = etree.parse(mfile)
    visit_identifier = metaroot.findtext('.//visitIdentifier')
    date_acquired = metaroot.findtext(".//dateAcquired")
    weighting = metaroot.findtext(".//protocolTerm/protocol[@term = 'Weighting']")
    grad_directions = metaroot.findtext(".//protocolTerm/protocol[@term = 'Gradient Directions']")

    mri_type = weighting
    if weighting is None and grad_directions is not None:
        mri_type = "DTI"
    # if mri_type is still none that means that both weighting and grad_directions were none
    
    print("Mfile  : %s" % mfile)
    print("Weight : %s" % weighting)
    print("GRAD   : %s" % grad_directions)
    print("MR_type: %s" % mri_type)
    print("MatchSt: %s" % match_str)    
    print("MatchEx: %s" % matched_dcms[0])
    

    for increment in range(1,16):
        output_subject_dir = build_output_dir(md, visit_identifier, mri_type, increment)
        if not os.path.exists(output_subject_dir):
            break
    # We haven't been able to find an empty directory
    if os.path.exists(output_subject_dir):
        output_subject_dir = None
        print ("ERROR: Cannot find an empty output subject directory")
        break
    
    os.makedirs(output_subject_dir)
    print("Output Subject Dir : ", output_subject_dir)

    print("Writing out zip file...")
    
    # Write out the raw dcm files
    xarfile = zipfile.ZipFile(output_subject_dir + "/" + "dcm.zip", mode="w", compression=zipfile.ZIP_DEFLATED)
    for filename in matched_dcms:
        xarfile.write(filename, basename(filename))
    xarfile.close()

    print("Copying Metadata...")
    
    copy(mfile, output_subject_dir + "/" + basename(mfile))
    print ("Done")

Mfile  : PPMI/PPMI_3105_Axial_PD-T2_TSE_FS_S189363_I372313.xml
Weight : PD
GRAD   : None
MR_type: PD
MatchSt: PPMI/3105/Axial_PD-T2_TSE_FS/*/S189363/PPMI_3105_*.dcm
MatchEx: PPMI/3105/Axial_PD-T2_TSE_FS/2013-04-18_12_01_06.0/S189363/PPMI_3105_MR_Axial_PD-T2_TSE_FS_br_raw_20130515091800455_43_S189363_I372313.dcm
Output Subject Dir :  output/3105/Month 24/PD_1
Writing out zip file...
Copying Metadata...
Done
Mfile  : PPMI/PPMI_3105_Axial_PD-T2_TSE_FS_S189363_I372314.xml
Weight : T2
GRAD   : None
MR_type: T2
MatchSt: PPMI/3105/Axial_PD-T2_TSE_FS/*/S189363/PPMI_3105_*.dcm
MatchEx: PPMI/3105/Axial_PD-T2_TSE_FS/2013-04-18_12_01_06.0/S189363/PPMI_3105_MR_Axial_PD-T2_TSE_FS_br_raw_20130515091800455_43_S189363_I372313.dcm
Output Subject Dir :  output/3105/Month 24/T2_1
Writing out zip file...
Copying Metadata...
Done
Mfile  : PPMI/PPMI_3105_Axial_PD-T2_TSE_S103325_I226402.xml
Weight : T2
GRAD   : None
MR_type: T2
MatchSt: PPMI/3105/Axial_PD-T2_TSE/*/S103325/PPMI_3105_*.dcm
MatchEx: PPMI/3105/Ax

Copying Metadata...
Done
Mfile  : PPMI/PPMI_3107_Axial_PD-T2_TSE_S107255_I232144.xml
Weight : PD
GRAD   : None
MR_type: PD
MatchSt: PPMI/3107/Axial_PD-T2_TSE/*/S107255/PPMI_3107_*.dcm
MatchEx: PPMI/3107/Axial_PD-T2_TSE/2011-04-13_11_25_02.0/S107255/PPMI_3107_MR_Axial_PD-T2_TSE__br_raw_20110502095453341_17_S107255_I232144.dcm
Output Subject Dir :  output/3107/Baseline/PD_1
Writing out zip file...
Copying Metadata...
Done
Mfile  : PPMI/PPMI_3107_Axial_PD-T2_TSE_S107255_I232145.xml
Weight : T2
GRAD   : None
MR_type: T2
MatchSt: PPMI/3107/Axial_PD-T2_TSE/*/S107255/PPMI_3107_*.dcm
MatchEx: PPMI/3107/Axial_PD-T2_TSE/2011-04-13_11_25_02.0/S107255/PPMI_3107_MR_Axial_PD-T2_TSE__br_raw_20110502095453341_17_S107255_I232144.dcm
Output Subject Dir :  output/3107/Baseline/T2_1
Writing out zip file...
Copying Metadata...
Done
Mfile  : PPMI/PPMI_3107_Axial_PD-T2_TSE_S146598_I296430.xml
Weight : PD
GRAD   : None
MR_type: PD
MatchSt: PPMI/3107/Axial_PD-T2_TSE/*/S146598/PPMI_3107_*.dcm
MatchEx: PPMI/3107

Copying Metadata...
Done
Mfile  : PPMI/PPMI_3108_Axial_PD-T2_TSE_FS_S264298_I498884.xml
Weight : PD
GRAD   : None
MR_type: PD
MatchSt: PPMI/3108/Axial_PD-T2_TSE_FS/*/S264298/PPMI_3108_*.dcm
MatchEx: PPMI/3108/Axial_PD-T2_TSE_FS/2015-05-06_09_33_18.0/S264298/PPMI_3108_MR_Axial_PD-T2_TSE_FS_br_raw_20150629092841276_57_S264298_I498882.dcm
Output Subject Dir :  output/3108/Month 48/PD_1
Writing out zip file...
Copying Metadata...
Done
Mfile  : PPMI/PPMI_3108_Axial_PD-T2_TSE_S107266_I232162.xml
Weight : PD
GRAD   : None
MR_type: PD
MatchSt: PPMI/3108/Axial_PD-T2_TSE/*/S107266/PPMI_3108_*.dcm
MatchEx: PPMI/3108/Axial_PD-T2_TSE/2011-04-20_10_07_57.0/S107266/PPMI_3108_MR_Axial_PD-T2_TSE__br_raw_20110502103124444_48_S107266_I232162.dcm
Output Subject Dir :  output/3108/Baseline/PD_1
Writing out zip file...
Copying Metadata...
Done
Mfile  : PPMI/PPMI_3108_Axial_PD-T2_TSE_S107266_I232164.xml
Weight : T2
GRAD   : None
MR_type: T2
MatchSt: PPMI/3108/Axial_PD-T2_TSE/*/S107266/PPMI_3108_*.dcm
MatchEx

KeyboardInterrupt: 