**Objective:** Load the Excel file containing the TMA core information and sort the patients, samples and annotations so that the images exported from QuPath can be renamed.

# 1.0 Get the excel file

In [499]:
# import dependencies
import pandas as pd
import os

In [500]:
# read the TMA layout sheet without a header row and keep only columns 0 to 10
FILE_PATH = 'C:/Users/tariq/GDrive (m.rifqi901)/SCIN/PHD/tasks/tma_analysis/Consecutive Cohort Core Analysis _2.xlsx'
df = pd.read_excel(FILE_PATH, header=None).loc[:, 0:10]
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,TMA 1,,,kontroll,,kontroll,,,,,
1,,1,7,12,18,23,28,33,39,43,48
2,,NT,MC,MC,,,FC,FC,,,PC
3,,1,7,12,18,23,28,33,39,43,48
4,,MC,,MC,,,FC,,PC,,PC
...,...,...,...,...,...,...,...,...,...,...,...
228,,,FC,PC,,,,FC,,,MC
229,,lgl 457,463,468,475,482,487,493,499,505,512
230,,,,,PC,,PC,,,MC,
231,,lgl 457,463,468,475,482,487,493,499,505,512


In [501]:
# row labels where the first column holds a TMA title, i.e. where each TMA starts
# (cells that are not text count as "no match")
tma_loc = df.index[df[0].str.contains("TMA", case=False, na=False)].tolist()

# pair every TMA name (spaces removed) with the row it starts on
tma_list = [[df.iloc[row, 0].replace(" ", ""), row] for row in tma_loc]
tma_list

[['TMA1', 0],
 ['TMA2', 26],
 ['TMA3', 52],
 ['TMA4', 78],
 ['TMA5', 104],
 ['TMA6', 130],
 ['TMA7', 156],
 ['TMA8', 182],
 ['TMA9', 208]]

# 2.0 Load helper functions

Helper functions:

In [502]:
def _stack_columns(frame):
    '''
    Flatten a dataframe column by column (first column on top, then the
    second, ...) and return every cell as a string.
    '''
    stacked = pd.concat(
        [frame.iloc[:, k] for k in range(frame.shape[1])],
        ignore_index=True,
    )
    return stacked.astype(str).tolist()


def CreateDfs(tma_loc=[], sort=True, save=True, *args):
    '''
    Functions:
    Take one TMA block out of the module-level dataframe `df`
    (the Excel sheet read without header) and convert it into a
    dataframe with one row per core. Every core gets an annotated name
    "<TMA>_<patient>_<n>[_<annotation>]", where n counts the cores of the
    same patient in positional order.
    Optionally the dataframe is saved as csv and the annotated names as a
    textfile that is used by the renamer function.

    Arguments:
    tma_loc (list) = expect TMA# and location index ie: ['TMA1', 0]
                     (required in practice; the empty default cannot be processed)
    sort (bool) = True sorts the result by positional index,
                  False keeps it sorted by patients
    save (bool) = save dataframe as <TMA>.csv and filenames as <TMA>.txt
                  in the current working directory
    *args = accepted for backward compatibility, not used

    Returns:
    dataframe with the columns patients, pat_sam, positions, patients II,
    annotations and annotated.

    '''
    # imported here so the function also works on a fresh kernel
    from natsort import natsorted

    # rows taken per TMA: the title row followed by alternating
    # patient / annotation rows
    block_rows = 25
    tma_name = str(tma_loc[0])
    start_row = tma_loc[1]

    ##_____select the array_____

    df_tma = df[start_row:(start_row + block_rows)]
    df_tma = df_tma.iloc[:, 1:11]

    # The last TMA of the sheet can be shorter than block_rows (trailing
    # empty rows are not part of df). Pad it, otherwise there is one
    # annotation row less than patient rows and the stacked patients and
    # annotations no longer line up.
    df_tma = df_tma.reset_index(drop=True).reindex(range(block_rows))

    # replace NaN with 0
    df_tma = df_tma.fillna(0)

    # add an all-zero row below the title row so that the title row gets
    # an (empty) annotation row of its own
    zero_row = pd.DataFrame([[0] * df_tma.shape[1]], columns=df_tma.columns)
    df_tma = pd.concat([df_tma.iloc[:1], zero_row, df_tma.iloc[1:]], ignore_index=True)

    # even rows hold the patients, odd rows the annotations
    patient_rows = df_tma.iloc[::2]
    annotation_rows = df_tma.iloc[1::2]

    ##_____stacking patients and samples_____

    # Both lists are stacked in the same column-by-column order, so entry k
    # of one belongs to entry k of the other. They must NOT be sorted
    # independently, that would destroy this pairing.
    patients = _stack_columns(patient_rows)
    annotations = _stack_columns(annotation_rows)

    # (position, patient, annotation) for every core
    cores = list(zip(range(len(patients)), patients, annotations))

    ##_____annotating the data_____

    # natural sort by patient; the sort is stable, so the cores of one
    # patient stay in positional order
    cores_by_patient = natsorted(cores, key=lambda core: core[1])

    seen = {}      # patient -> number of cores met so far
    records = []
    for position, patient, annotation in cores_by_patient:
        seen[patient] = seen.get(patient, 0) + 1
        pat_sam = patient + "_" + str(seen[patient])

        # '0' marks a core without annotation
        if annotation == '0':
            annotated = tma_name + '_' + pat_sam
        else:
            annotated = tma_name + '_' + pat_sam + '_' + annotation

        records.append([patient, pat_sam, position, patient, annotation, annotated])

    col_names = [
        'patients',
        'pat_sam',
        'positions',
        'patients II',
        'annotations',
        'annotated',
    ]
    # object dtype keeps the columns as they were in earlier versions
    df_all = pd.DataFrame(records, columns=col_names, dtype=object)

    ##_____conditions of 'sort'_____

    if sort:
        df_all = df_all.sort_values(by='positions')
        print('Data is sorted by positions.')
    else:
        print('Data is sorted by patients (default).')

    ##_____conditions of 'save'_____

    if save:

        # save as csv
        filename = tma_name + ".csv"
        df_all.to_csv(filename, index=True)
        print(f'Data is save as {filename}')

        # rearrange list according TMA's export: cut the list into chunks
        # of one array column each (13 cores) and reverse the chunk order
        list_final = df_all['annotated'].tolist()
        chunk_size = len(patient_rows)
        chunk_starts = reversed(range(0, len(list_final), chunk_size))
        list_final = [
            name
            for start in chunk_starts
            for name in list_final[start:start + chunk_size]
        ]

        # save annotated list as text file
        filename_txt = tma_name + ".txt"
        with open(filename_txt, 'w') as text_file:
            for name in list_final:
                text_file.write(name + '\n')
        print(f'Annotated list is save as {filename_txt}')

    return df_all

In [503]:
def renamer(path, textfile, imgtype='.png'):
    '''
    Functions:
    Rename the core images exported by QuPath
    using the produced textfile (created from CreateDfs function)
    as a reference. The images are taken in natural sort order and the
    n-th image receives the n-th name of the textfile (plus imgtype).
    The working directory of the process is not changed.

    Arguments:
    path (str) = directory that contains both TMA core images and the text file
    textfile (str) = the textfile name with .txt
    imgtype (str) = the TMA core images type

    Raises:
    ValueError if there are more images than names in the textfile
    (nothing is renamed in that case).

    '''
    import os
    # imported here so the function also works on a fresh kernel
    from natsort import natsorted

    ##_____get the cores_____

    # get TMA core images from the given directory, in natural order
    filelist = natsorted([name for name in os.listdir(path) if name.endswith(imgtype)])

    ##_____get the filenames_____

    # get the new file names from the text file, one name per line
    with open(os.path.join(path, textfile), 'r') as names_file:
        new_names = [line.strip() for line in names_file]

    # check before touching any file, so a mismatch cannot leave the
    # directory half renamed
    if len(filelist) > len(new_names):
        raise ValueError(
            f"{len(filelist)} '{imgtype}' files found in {path} "
            f"but only {len(new_names)} names in {textfile}"
        )

    ##_____renaming the files_____

    # rename each file accordingly; names without a matching image are ignored
    for old_name, new_name in zip(filelist, new_names):
        src = os.path.join(path, old_name)
        dst = os.path.join(path, new_name + imgtype)
        os.rename(src, dst)
        print(f"Renaming: {src}\n to: {dst}")

# 3.0 Process the data

In [508]:
# convert every TMA found in the sheet; each call writes <TMA>.csv and <TMA>.txt
for tma_entry in tma_list:
    print(f"_____Processing {tma_entry[0]}_____")
    CreateDfs(tma_entry, sort=True, save=True)

_____Processing TMA1_____
Data is sorted by positions.
Data is save as TMA1.csv
Annotated list is save as TMA1.txt
_____Processing TMA2_____
Data is sorted by positions.
Data is save as TMA2.csv
Annotated list is save as TMA2.txt
_____Processing TMA3_____
Data is sorted by positions.
Data is save as TMA3.csv
Annotated list is save as TMA3.txt
_____Processing TMA4_____
Data is sorted by positions.
Data is save as TMA4.csv
Annotated list is save as TMA4.txt
_____Processing TMA5_____
Data is sorted by positions.
Data is save as TMA5.csv
Annotated list is save as TMA5.txt
_____Processing TMA6_____
Data is sorted by positions.
Data is save as TMA6.csv
Annotated list is save as TMA6.txt
_____Processing TMA7_____
Data is sorted by positions.
Data is save as TMA7.csv
Annotated list is save as TMA7.txt
_____Processing TMA8_____
Data is sorted by positions.
Data is save as TMA8.csv
Annotated list is save as TMA8.txt
_____Processing TMA9_____
Data is sorted by positions.
Data is save as TMA9.csv


In [None]:
# renames the files

# 4.0 Etc

See https://stackoverflow.com/questions/42414968/insert-a-row-in-a-pandas-dataframe-without-changing-to-a-list-in-python for how to insert rows into a DataFrame.

See https://stackoverflow.com/questions/6618515/sorting-list-based-on-values-from-another-list for how to sort a list based on the values of another list.