# Data preparation: Pristine images

### Looking into the LuNoTim untampered masks for cancer

In [None]:
import pandas as pd
import glob 
import os 

# Index the LuNoTim untampered cancer-nodule masks.
# Directory layout, as implied by the globbing below:
#   <root>/<anything>-<patient id>/<nodule id>/<slice id>_<rest of file name>
root = 'untampered_scans/cancer_nodule_mask'


def collect_mask_records(root):
    """Walk the mask tree under *root* and return one row per mask file.

    Parameters
    ----------
    root : str
        Directory holding one sub-directory per CT scan / patient.

    Returns
    -------
    pandas.DataFrame
        Columns ``id_patient`` (text after the last ``-`` of the patient
        directory name), ``id_nodule`` (name of the second-level directory),
        ``id_slice`` (integer prefix of the file name, before the first
        ``_``) and ``mask_path`` (path of the mask file). The frame is empty,
        but still has these four columns, when nothing is found.

    Raises
    ------
    ValueError
        If a file name does not start with an integer followed by ``_``.
    """
    rows = []
    # sorted() makes the row order (and therefore the index) reproducible;
    # glob itself returns entries in arbitrary order.
    for patient_dir in sorted(glob.glob(os.path.join(root, '*'))):
        id_p = os.path.basename(patient_dir).split("-")[-1]

        # Second level: one directory per nodule of this patient.
        for nodule_dir in sorted(glob.glob(os.path.join(patient_dir, '*'))):
            id_nodule = os.path.basename(nodule_dir)

            # Third level: one mask file per slice of this nodule.
            for mask_path in sorted(glob.glob(os.path.join(nodule_dir, '*'))):
                id_slice = int(os.path.basename(mask_path).split('_')[0])
                rows.append({'id_patient': id_p,
                             'id_nodule': id_nodule,
                             'id_slice': id_slice,
                             'mask_path': mask_path})

    # Build the frame once: DataFrame.append was removed in pandas 2.0 and
    # copied the whole frame on every call.
    return pd.DataFrame(rows, columns=['id_patient', 'id_nodule', 'id_slice', 'mask_path'])


# Kept at module level for the scan count printed below.
folders = {os.path.basename(name): name for name in glob.glob(os.path.join(root, '*'))}
print("%d untamperd CT scans" % len(folders))

df = collect_mask_records(root)

### Looking into pristine CT scans from LIDC using pylidc 

In [None]:
import pylidc as pl
import numpy as np
from skimage import measure
import cv2
import matplotlib.pyplot as plt
import matplotlib.patches as patches
from matplotlib.patches import Rectangle


# Build the bounding-box table for the pristine (untampered) slices.
# For every patient listed in `df`, the pylidc scan is looked up, its
# annotations are clustered into nodules, and one CSV row is written for each
# non-empty mask of each nodule.
sliceData = []

# Padding added on every side of the annotation bounding box.
offset = 10

for id_p in df['id_patient'].unique():
    print(id_p)
    pid = 'LIDC-IDRI-' + id_p
    scan = pl.query(pl.Scan).filter(pl.Scan.patient_id == pid).first()
    if scan is None:
        # No scan for this patient in the pylidc database: skip this patient
        # only. (A `break` here would silently drop every remaining patient.)
        continue
    nods = scan.cluster_annotations()

    # Column selection uses a list; selecting with a set is not supported by
    # current pandas versions.
    df_patient = df.loc[df['id_patient'] == id_p, ['id_nodule', 'id_slice']]

    for id_n, nod in enumerate(nods):
        # The first annotation of the cluster supplies the bounding box that
        # is reused for every slice of this nodule.
        annot = nod[0]
        bmat = annot.bbox_matrix()
        y0, y1 = bmat[0]   # treated as row (y) min/max
        x0, x1 = bmat[1]   # treated as column (x) min/max

        min_row, max_row = y0 - offset, y1 + offset
        min_col, max_col = x0 - offset, x1 + offset

        # Masks whose nodule folder name equals the cluster index.
        # NOTE(review): this assumes the folder numbering follows the order
        # returned by cluster_annotations() -- confirm against the mask export.
        nodule_index = df_patient.index[df_patient['id_nodule'] == str(id_n)]
        nodule_masks = df.loc[nodule_index, ['id_slice', 'mask_path']]

        for id_slice, mask_path in zip(nodule_masks['id_slice'], nodule_masks['mask_path']):
            mask = np.load(mask_path).astype(float)

            # Keep only masks that contain at least one labelled region.
            if not measure.regionprops(measure.label(mask)):
                continue

            str_id_slice = "%03d" % int(id_slice)

            # -----Add the slice mask information to the sliceData list-----
            # Class ------> pristine = '0', tampered = '1'
            filename = "p_LIDC-cancer_LIDC-IDRI-%s_%s_%d.png" % (id_p, str_id_slice, id_n)

            # Values are listed in the same order as `columns` below, with
            # x = column and y = row: width is the column extent, height the
            # row extent, and (xmin, ymin, xmax, ymax) are
            # (min_col, min_row, max_col, max_row).
            sliceData.append([filename,
                              max_col - min_col,   # width
                              max_row - min_row,   # height
                              0,                   # class: pristine
                              min_col,             # xmin
                              min_row,             # ymin
                              max_col,             # xmax
                              max_row])            # ymax

columns = ['filename', 'width', 'height', 'class', 'xmin', 'ymin', 'xmax', 'ymax']
untampered_df = pd.DataFrame(sliceData, columns=columns)
untampered_df.to_csv('untampered_LIDC-IDRI_cancer.csv', index=False)