# Making a script to organize high-throughput imaging data

- Generates folders for each drug and dose
- Renames .nd2 files to remove spaces and commas
- Moves the .nd2 files into the organized drug/dose folders

By Prech Uapinyoying

12/20/2018

In [1]:
import shutil
import os # for dealing with operating system commands like paths to files
#import sys
import re
import itertools

# Step 1. Generate test files

Before you even start, you need to create some test files to play with, using the included `create_orig_files.sh` script.

Run the script from the directory where you want the test files to be created by doing `bash create_orig_files.sh`

---

# Step 2. Get the current directory path and a list of all the files in it

In [2]:
# Get the current working directory
cwd = os.getcwd()
cwd

'/Users/uapinyoyingpb/misc-scripts/python_scripts/organize_nd2'

In [3]:
# I generated the test files into a folder called 'test_files'; let's create that path
# os.path.join will intelligently combine path names with the correct slash for any operating system
filesDir = os.path.join(cwd, 'test_files') 
filesDir

'/Users/uapinyoyingpb/misc-scripts/python_scripts/organize_nd2/test_files'

In [4]:
# Get a list of files in that test_files directory
fileList = os.listdir(filesDir)
fileList[0:5] # take a look at first 5

['grn_d6_24hr_posttx_WellB24_Channel405 nm,488 nm,561 nm,640 nm_Seq0024.nd2',
 'grn_d6_24hr_posttx_WellF03_Channel405 nm,488 nm,561 nm,640 nm_Seq0141.nd2',
 'grn_d6_24hr_posttx_WellA22_Channel405 nm,488 nm,561 nm,640 nm_Seq0021.nd2',
 'grn_d6_24hr_posttx_WellJ20_Channel405 nm,488 nm,561 nm,640 nm_Seq0220.nd2',
 'grn_d6_24hr_posttx_WellI01_Channel405 nm,488 nm,561 nm,640 nm_Seq0192.nd2']

In [5]:
# We need to check if the filename also ends with an nd2, can do this with os.path.splitext
filename, file_extension = os.path.splitext(
    'grn_d6_24hr_posttx_WellB24_Channel405 nm,488 nm,561 nm,640 nm_Seq0024.nd2')

print(f'The extension is: "{file_extension}"')

The extension is: ".nd2"


In [6]:
# Make it into a function, you will probably use the script in the folder anyways

def get_nd2_fileList(filesDir):
    """Return the names of the entries in a directory that end in '.nd2'.

    Parameters
    ----------
    filesDir : str
        Path of the directory to scan. Only its direct contents are
        listed; sub-directories are not searched.

    Returns
    -------
    list of str
        Bare names (no directory part) whose extension is exactly '.nd2',
        in the order reported by os.listdir.
    """
    # splitext()[1] is the extension including its leading dot;
    # the comparison is case-sensitive, so '.ND2' is not selected
    return [name for name in os.listdir(filesDir)
            if os.path.splitext(name)[1] == '.nd2']

---


# Step 3. Automating the generation of new drug/dose directories

In [7]:
# Build the 12 drug directory names: 'drug1' through 'drug12'
drugDirNames = [f'drug{number}' for number in range(1, 13)]
drugDirNames

['drug1',
 'drug2',
 'drug3',
 'drug4',
 'drug5',
 'drug6',
 'drug7',
 'drug8',
 'drug9',
 'drug10',
 'drug11',
 'drug12']

In [8]:
# Build the 8 dose directory names: 'dose1' through 'dose8'
doseDirNames = [f'dose{number}' for number in range(1, 9)]
doseDirNames

['dose1', 'dose2', 'dose3', 'dose4', 'dose5', 'dose6', 'dose7', 'dose8']

In [9]:
# Lets add the full path to the first drug folder and drug dose
os.path.join(filesDir, drugDirNames[0], doseDirNames[0])

'/Users/uapinyoyingpb/misc-scripts/python_scripts/organize_nd2/test_files/drug1/dose1'

In [10]:
# Generate all of the drug/dose directory paths and collect them in a list.
# itertools.product yields every (drug, dose) pair with the dose changing
# fastest, i.e. the same order as a dose loop nested inside a drug loop.
fullDrugDirPaths = [
    os.path.join(filesDir, drugName, doseName)
    for drugName, doseName in itertools.product(drugDirNames, doseDirNames)
]

fullDrugDirPaths[0:16]

['/Users/uapinyoyingpb/misc-scripts/python_scripts/organize_nd2/test_files/drug1/dose1',
 '/Users/uapinyoyingpb/misc-scripts/python_scripts/organize_nd2/test_files/drug1/dose2',
 '/Users/uapinyoyingpb/misc-scripts/python_scripts/organize_nd2/test_files/drug1/dose3',
 '/Users/uapinyoyingpb/misc-scripts/python_scripts/organize_nd2/test_files/drug1/dose4',
 '/Users/uapinyoyingpb/misc-scripts/python_scripts/organize_nd2/test_files/drug1/dose5',
 '/Users/uapinyoyingpb/misc-scripts/python_scripts/organize_nd2/test_files/drug1/dose6',
 '/Users/uapinyoyingpb/misc-scripts/python_scripts/organize_nd2/test_files/drug1/dose7',
 '/Users/uapinyoyingpb/misc-scripts/python_scripts/organize_nd2/test_files/drug1/dose8',
 '/Users/uapinyoyingpb/misc-scripts/python_scripts/organize_nd2/test_files/drug2/dose1',
 '/Users/uapinyoyingpb/misc-scripts/python_scripts/organize_nd2/test_files/drug2/dose2',
 '/Users/uapinyoyingpb/misc-scripts/python_scripts/organize_nd2/test_files/drug2/dose3',
 '/Users/uapinyoyingp

In [11]:
# make this into a function

def gen_dir_paths(drugDirNames, drugDoseNames, filesDir):
    """Build the full path of every drug/dose directory.

    Parameters
    ----------
    drugDirNames : list of str
        Names of the drug folders (first level below filesDir).
    drugDoseNames : list of str
        Names of the dose folders created inside each drug folder.
    filesDir : str
        Base directory under which the folders will live.

    Returns
    -------
    list of str
        One path per (drug, dose) combination, grouped by drug and, within
        a drug, in the order of drugDoseNames.
    """
    # Use the drugDoseNames argument (not a module-level list) so the
    # function works with whatever dose names the caller passes in
    return [os.path.join(filesDir, drugDir, doseDir)
            for drugDir in drugDirNames
            for doseDir in drugDoseNames]

In [12]:
# Lets make a new directory, we only want to make the new directory if it doesn't exist yet
# again, lets try it with the first folder only
# if not os.path.exists(fullDrugDirPaths[1]):
#     os.makedirs(fullDrugDirPaths[1])
#     print('I just created this folder: {}'.format(fullDrugDirPaths[1]))
# else:
#     print('Folder already exists, man!')
    

In [13]:
def make_dirs(fullDrugDirPaths, exist_ok=True):
    """Create every directory in the list, including missing parent folders.

    Parameters
    ----------
    fullDrugDirPaths : list of str
        Directory paths to create.
    exist_ok : bool, optional
        Passed straight to os.makedirs. With True (the default) a directory
        that already exists is accepted silently. With False an existing
        path is reported with a printed message and the loop carries on.

    Returns
    -------
    None
    """
    for path in fullDrugDirPaths:
        try:
            # makedirs never overwrites anything; exist_ok only decides
            # whether an already existing directory counts as an error
            os.makedirs(path, exist_ok=exist_ok)
        except FileExistsError:
            # Raised when the path exists and exist_ok is False, or when the
            # path exists but is not a directory (e.g. a file with that name)
            print(f'Error: {path} exists dude. Have you already run this script before?')

In [14]:
# Now actually make all the directories
make_dirs(fullDrugDirPaths)

In [167]:
# Check if the folders are there
next(os.walk(filesDir))[1]

['drug12',
 'drug8',
 'drug6',
 'drug1',
 'drug7',
 'drug9',
 'drug11',
 'drug10',
 'drug2',
 'drug5',
 'drug4',
 'drug3']

---

# Step 4. Figure out how to rename files to remove spaces and commas 

First, let's try to use regular expressions to determine which well each file is from, based on its name

In [168]:
# Capture groups: 1 = prefix, 2 = well, 3-6 = the four channel wavelengths,
# 7 = sequence tag, 8 = extension.
# The dot before 'nd2' is escaped so that it only matches a literal '.'
REGEXP = r'(\w+)_Well(\w+)_Channel(\d+) nm,(\d+) nm,(\d+) nm,(\d+) nm_(\w+)\.(nd2)'

test = 'grn_d6_24hr_posttx_WellA01_Channel405 nm,488 nm,561 nm,640 nm_Seq0024.nd2'

r = re.search(REGEXP, test)

# This (f'{}') is a special technique for building strings called 'f-string interpolation'
# Look it up, its super useful
# Adjacent string literals inside parentheses are joined into one string,
# which avoids a backslash line-continuation in the middle of the f-string
replacement = (
    f'{r.group(1)}_Well{r.group(2)}_Channel_{r.group(3)}nm_'
    f'{r.group(4)}nm_{r.group(5)}nm_{r.group(6)}nm_{r.group(7)}.{r.group(8)}'
)

print(test) # The original file name in our test case (spaces, commas and all)

grn_d6_24hr_posttx_WellA01_Channel405 nm,488 nm,561 nm,640 nm_Seq0024.nd2


In [169]:
replacement # Check the replacement string if it removed all our spaces and commas

'grn_d6_24hr_posttx_WellA01_Channel_405nm_488nm_561nm_640nm_Seq0024.nd2'

In [170]:
# Quick test if renaming works
#shutil.move(os.path.join(filesDir,test),os.path.join(filesDir,replacement))

In [171]:
# renaming all the files using a for loop. If you do this you will need to recreate the files using the
# bash script 'create_orig_files.sh'

# for file in fileList:
#     r = re.search(REGEXP,file)
#     replacement = f'{r.group(1)}_Well{r.group(2)}_Channel_{r.group(3)}nm_\
# {r.group(4)}nm_{r.group(5)}nm_{r.group(6)}nm_{r.group(7)}.{r.group(8)}'
    
#     sourcePath = os.path.join(filesDir,file)
#     replacementPath = os.path.join(filesDir, replacement)
    
#     shutil.move(sourcePath,replacementPath)

Let's reserve the actual renaming for later down the line. We will try to combine renaming and moving into a single step, but first, turn the name-changing portion into a function so that we can reuse it later

In [173]:
def get_new_filenames(fileList):
    """Pair each original file name with its cleaned-up replacement name.

    The new name drops the spaces and commas from the channel section,
    e.g. '..._Channel405 nm,488 nm,561 nm,640 nm_Seq0024.nd2' becomes
    '..._Channel_405nm_488nm_561nm_640nm_Seq0024.nd2'.

    Parameters
    ----------
    fileList : list of str
        Original file names.

    Returns
    -------
    list of tuple
        (old name, new name) pairs, in the order of fileList. Names that do
        not match REGEXP are reported with a printed message and left out.
    """
    oldNewFilenameList = []

    for oldFilename in fileList:
        r = re.search(REGEXP, oldFilename)
        if r is None:
            # re.search returns None when the name does not follow the
            # expected pattern; skip it instead of failing on r.group()
            print(f'Skipping (unexpected file name): {oldFilename}')
            continue

        # Groups 3-6 are the four wavelengths; everything else is unpacked by position
        prefix, well, *wavelengths, seq, ext = r.groups()
        channels = '_'.join(f'{wavelength}nm' for wavelength in wavelengths)
        newFilename = f'{prefix}_Well{well}_Channel_{channels}_{seq}.{ext}'
        oldNewFilenameList.append((oldFilename, newFilename))

    return oldNewFilenameList


In [199]:
# Lets get the old and new file name list

oldNewFilenameList = get_new_filenames(fileList)

# for (oldName, newName) in oldNewFilenameList:
#     print(f'old name: {oldName}')
#     print(f'new name: {newName}\n')

---


# Step 5. Matching specific files to specific directories.

Now we need to figure out how to match up the file with its destination if we want to move the correct files to their new directories. We can do this by using the captured well name within the file.

In [175]:
# To arrange the wells for one drug, map each dose (key) to its list of wells (value).
# For drug 1, dose N sits on every other plate row (A, C, E, ... O) and
# uses columns 01, 03, 05 and 07.
drug1Dict = {
    f'dose{doseNumber}': [f'{plateRow}{column:02d}' for column in (1, 3, 5, 7)]
    for doseNumber, plateRow in enumerate('ACEGIKMO', start=1)
}

# The dictionary's own .items() method loops through key (dose), value (wells) pairs
for dose, wells in drug1Dict.items():
    print(dose, wells)

dose1 ['A01', 'A03', 'A05', 'A07']
dose2 ['C01', 'C03', 'C05', 'C07']
dose3 ['E01', 'E03', 'E05', 'E07']
dose4 ['G01', 'G03', 'G05', 'G07']
dose5 ['I01', 'I03', 'I05', 'I07']
dose6 ['K01', 'K03', 'K05', 'K07']
dose7 ['M01', 'M03', 'M05', 'M07']
dose8 ['O01', 'O03', 'O05', 'O07']


In [200]:
# Lets try to figure out the logic for determining what file goes to what folder

# for file in fileList:
#     r = re.search(REGEXP,file)
#     for dose, wells in drug1Dict.items():
#         if r.group(2) in wells:
#             print(file,'- belongs to --> ', dose)

In [177]:
# Nested dictionary describing the whole 384-well plate:
# { drug: {dose: wells} }
#
# The layout is regular, so it is generated instead of typed out:
#   - drug1-drug6 use rows A, C, E, ... O; drug7-drug12 use rows B, D, F, ... P
#   - dose N of a drug sits on the N-th row of its row set
#   - each drug owns four columns, two apart, starting at column
#     1, 2, 9, 10, 17 or 18 (e.g. drug1 -> 01, 03, 05, 07)
# For an irregular plate layout, write the dictionary out by hand instead.
drugDict = {
    f'drug{6 * rowOffset + slot + 1}': {
        f'dose{doseIndex + 1}': [
            f'{"ABCDEFGHIJKLMNOP"[2 * doseIndex + rowOffset]}{firstColumn + step:02d}'
            for step in (0, 2, 4, 6)
        ]
        for doseIndex in range(8)
    }
    for rowOffset in (0, 1)
    for slot, firstColumn in enumerate((1, 2, 9, 10, 17, 18))
}
           

In [181]:
# Print out everything in order

# for drug, doseList in drugDict.items():
#         for dose, wells in doseList.items():
#             print(drug, dose, wells)

In [183]:
# Now try matching the file with the drug and dose using the new nested dictionary data structure
# for file in fileList:
#     r = re.search(REGEXP,file)
#     for drug, doseList in drugDict.items():
#         for dose, wells in doseList.items():
#             if r.group(2) in wells:
#                 print(file,'- belongs to --> ', r.group(2), drug, dose)

In [145]:
# Build on the matching logic to work out the folder each file should be moved to

# Recall from above that:
# filesDir = '/Users/uapinyoyingpb/sandbox/ward_test/test_files'

fileDestList = []

for origName in fileList: # original file names in a list
    fileMatch = re.search(REGEXP, origName)
    for drugName, doseMap in drugDict.items():
        for doseName, doseWells in doseMap.items():
            # Group 2 of the match is the well id; ignore doses that do not use it
            if fileMatch.group(2) not in doseWells:
                continue

            # Pair the file and its destination folder into a (file, path) tuple
            fileDestList.append((origName, os.path.join(filesDir, drugName, doseName)))

# Show the first (file, destination) pair; print(fileDestList) would show the entire list
print(fileDestList[0])


# The pairs can be accessed just like a nested list
# fileDestList[0][0] <-- item 1 in the pair

#print(fileDestList[0][0],'\n\n', fileDestList[0][1])
                

('grn_d6_24hr_posttx_WellB24_Channel405 nm,488 nm,561 nm,640 nm_Seq0024.nd2', '/Users/uapinyoyingpb/sandbox/ward_test/test_files/drug12/dose1')


In [146]:
# Lets create a function for it

def gen_fileDestList(filesDir, fileList, drugDict):
    """Pair each file with the drug/dose folder that its well belongs to.

    Parameters
    ----------
    filesDir : str
        Base directory that contains the drug folders.
    fileList : list of str
        Original file names; the well id is taken from group 2 of REGEXP.
    drugDict : dict
        Nested mapping {drug: {dose: [wells]}}.

    Returns
    -------
    list of tuple
        (file name, destination folder) pairs, one for every drug/dose whose
        well list contains the file's well. Files whose name does not match
        REGEXP are reported with a printed message and left out; files whose
        well appears in no well list produce no pair.
    """
    fileDestList = []

    for file in fileList: # original file names in a list
        r = re.search(REGEXP, file)
        if r is None:
            # re.search returns None when the name does not follow the
            # expected pattern; skip it instead of failing on r.group()
            print(f'Skipping (unexpected file name): {file}')
            continue

        well = r.group(2) # extract the well id once per file
        for drug, doseDict in drugDict.items():
            for dose, wells in doseDict.items():
                if well in wells:
                    destPath = os.path.join(filesDir, drug, dose) # destination path

                    # Pair the file and destination path into a tuple
                    fileDestList.append((file, destPath))

    return fileDestList

---

# Step 6. Putting it all together

Next we want to take this list of tuples that includes the file and destination path and use it to move and rename our files at the same time, by combining what we learned earlier

The first thing I am going to try is to take the two lists of information and pair them together to create the final list of (source path + original file name, destination path + renamed file name)

- `fileDestList` has the original file name paired with the destination folder
- `oldNewFilenameList` has the original file name paired with the new name

In [185]:
fileDestList[0]

('grn_d6_24hr_posttx_WellB24_Channel405 nm,488 nm,561 nm,640 nm_Seq0024.nd2',
 '/Users/uapinyoyingpb/sandbox/ward_test/test_files/drug12/dose1')

In [186]:
oldNewFilenameList[0]

('grn_d6_24hr_posttx_WellB24_Channel405 nm,488 nm,561 nm,640 nm_Seq0024.nd2',
 'grn_d6_24hr_posttx_WellB24_Channel_405nm_488nm_561nm_640nm_Seq0024.nd2')

In [187]:
# moveList = [] # a list of tuples with (originalFullFilePath, renamedNewFilePath)

# for (file, destPath) in fileDestList:
#     for (oldFile, newFile) in oldNewFilenameList:
#         if file == oldFile:
#             oldFilePath = os.path.join(filesDir, oldFile)
#             newFilePath = os.path.join(destPath, newFile)
            
#             moveList.append((oldFilePath, newFilePath))
            
# moveList[0] # peek at the first entry


('/Users/uapinyoyingpb/sandbox/ward_test/test_files/grn_d6_24hr_posttx_WellB24_Channel405 nm,488 nm,561 nm,640 nm_Seq0024.nd2',
 '/Users/uapinyoyingpb/sandbox/ward_test/test_files/drug12/dose1/grn_d6_24hr_posttx_WellB24_Channel_405nm_488nm_561nm_640nm_Seq0024.nd2')

In [188]:
# again, make it into a function

def gen_moveList(fileDestList, oldNewFilenameList, sourceDir=None):
    """Combine destinations and new names into (source path, target path) pairs.

    Parameters
    ----------
    fileDestList : list of tuple
        (original file name, destination folder) pairs.
    oldNewFilenameList : list of tuple
        (original file name, new file name) pairs.
    sourceDir : str, optional
        Folder that currently holds the original files. When omitted, the
        module-level `filesDir` is used.

    Returns
    -------
    list of tuple
        (original full path, renamed full path) pairs in the order of
        fileDestList. Files without an entry in oldNewFilenameList are
        left out.
    """
    if sourceDir is None:
        sourceDir = filesDir # fall back to the notebook-level folder

    # One dictionary lookup per file instead of rescanning the whole
    # name list for every destination. If an old name were listed more than
    # once, only its last new name is used.
    newNameFor = dict(oldNewFilenameList)

    moveList = [] # a list of tuples with (originalFullFilePath, renamedNewFilePath)
    for file, destPath in fileDestList:
        if file in newNameFor:
            oldFilePath = os.path.join(sourceDir, file)
            newFilePath = os.path.join(destPath, newNameFor[file])
            moveList.append((oldFilePath, newFilePath))

    return moveList

In [189]:
moveList = gen_moveList(fileDestList, oldNewFilenameList)
moveList[0]

('/Users/uapinyoyingpb/sandbox/ward_test/test_files/grn_d6_24hr_posttx_WellB24_Channel405 nm,488 nm,561 nm,640 nm_Seq0024.nd2',
 '/Users/uapinyoyingpb/sandbox/ward_test/test_files/drug12/dose1/grn_d6_24hr_posttx_WellB24_Channel_405nm_488nm_561nm_640nm_Seq0024.nd2')

In [190]:
# Finally all we have to do now is do the moving! I'll make this into a simple function

def move_files(moveList):
    """Move every file to its new path, printing a line for each move.

    Parameters
    ----------
    moveList : list of tuple
        (source path, destination path) pairs; each pair is handed to
        shutil.move, so a destination with a different file name also
        renames the file.

    Returns
    -------
    None
    """
    for movePair in moveList:
        sourcePath, destinationPath = movePair
        shutil.move(sourcePath, destinationPath)
        print(f'Moved: {sourcePath}\n\t--> {destinationPath}')

In [191]:
move_files(moveList)

Moved: /Users/uapinyoyingpb/sandbox/ward_test/test_files/grn_d6_24hr_posttx_WellB24_Channel405 nm,488 nm,561 nm,640 nm_Seq0024.nd2
	--> /Users/uapinyoyingpb/sandbox/ward_test/test_files/drug12/dose1/grn_d6_24hr_posttx_WellB24_Channel_405nm_488nm_561nm_640nm_Seq0024.nd2
Moved: /Users/uapinyoyingpb/sandbox/ward_test/test_files/grn_d6_24hr_posttx_WellF03_Channel405 nm,488 nm,561 nm,640 nm_Seq0141.nd2
	--> /Users/uapinyoyingpb/sandbox/ward_test/test_files/drug7/dose3/grn_d6_24hr_posttx_WellF03_Channel_405nm_488nm_561nm_640nm_Seq0141.nd2
Moved: /Users/uapinyoyingpb/sandbox/ward_test/test_files/grn_d6_24hr_posttx_WellA22_Channel405 nm,488 nm,561 nm,640 nm_Seq0021.nd2
	--> /Users/uapinyoyingpb/sandbox/ward_test/test_files/drug6/dose1/grn_d6_24hr_posttx_WellA22_Channel_405nm_488nm_561nm_640nm_Seq0021.nd2
Moved: /Users/uapinyoyingpb/sandbox/ward_test/test_files/grn_d6_24hr_posttx_WellJ20_Channel405 nm,488 nm,561 nm,640 nm_Seq0220.nd2
	--> /Users/uapinyoyingpb/sandbox/ward_test/test_files/drug12