In [1]:
"""
FYP project imaging
"""

import os
from os.path import exists
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tqdm as tqdm

# Our own extract_features function. See extract_features.py for more info
from extract_features import extract_features

In [2]:
#Finding the raw data (Make sure you have the same file/folder structure)
file_data = '.' + os.sep + 'data' + os.sep +'metadata.csv'
path_image = '.' + os.sep + 'data' + os.sep + 'images' + os.sep + 'imgs_part_1'
path_mask = '.' + os.sep + 'data' + os.sep + 'images' + os.sep + 'masks_part_1'

# Where we will store the features:
file_features = 'features/features.csv'

In [4]:
#Read meta-data into a Pandas dataframe (This is from the PAD-UFES-20 Dataset)
df = pd.read_csv(file_data)

# Extract image IDs and labels from the data. 
label = np.array(df['diagnostic'])
mask_id = list(os.listdir(path_mask))

# Filter out images from the metadata that are not present in our image folder.
our_list = list(os.listdir(path_image))
filtered_features = df[df["img_id"].isin(our_list)]
image_id = list(filtered_features['img_id'])
filtered_features

Unnamed: 0,patient_id,lesion_id,smoke,drink,background_father,background_mother,age,pesticide,gender,skin_cancer_history,...,diameter_2,diagnostic,itch,grew,hurt,changed,bleed,elevation,img_id,biopsed
0,PAT_1516,1765,,,,,8,,,,...,,NEV,False,False,False,False,False,False,PAT_1516_1765_530.png,False
1,PAT_46,881,False,False,POMERANIA,POMERANIA,55,False,FEMALE,True,...,5.0,BCC,True,True,False,True,True,True,PAT_46_881_939.png,True
2,PAT_1545,1867,,,,,77,,,,...,,ACK,True,False,False,False,False,False,PAT_1545_1867_547.png,False
3,PAT_1989,4061,,,,,75,,,,...,,ACK,True,False,False,False,False,False,PAT_1989_4061_934.png,False
4,PAT_684,1302,False,True,POMERANIA,POMERANIA,79,False,MALE,True,...,5.0,BCC,True,True,False,False,True,True,PAT_684_1302_588.png,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2293,PAT_1708,3156,,,,,73,,,,...,,ACK,True,False,False,False,False,False,PAT_1708_3156_175.png,False
2294,PAT_46,880,False,False,POMERANIA,POMERANIA,55,False,FEMALE,True,...,12.0,BCC,True,True,False,True,False,False,PAT_46_880_140.png,True
2295,PAT_1343,1217,,,,,74,,,,...,,SEK,False,False,False,False,False,False,PAT_1343_1217_404.png,False
2296,PAT_326,690,False,False,POMERANIA,POMERANIA,58,True,FEMALE,True,...,4.0,BCC,True,False,False,False,False,True,PAT_326_690_823.png,True


In [8]:
# Using this for a list of file names without .png extension. Very hacky, might need a rewrite
foo = [x[:-4] for x in image_id]

# Make an array to store features.
feature_names = ['file_name', 'asymmetry', 'color', 'blue-white_veil']
num_features = len(feature_names)
num_images = len(filtered_features)
features = np.zeros([num_images,num_features], dtype=np.float16)  

# Make a list for each feature to store the values.
filename = []
asym = []
col = []
blue_white = []

# The big loop - will run as many iterations as there are images in the folder. 
for i in tqdm.tqdm(range(len(filtered_features))):
    
    # Define filenames related to this image - Uses the hacky foo variable found above to find the image and mask, and to get the naming for the csv.
    file_image = path_image + os.sep + foo[i] + '.png'
    mask_image = path_mask + os.sep + foo[i] + '_mask.png'
    temp_image = foo[i] + ".png"
    
    # If image does not have a corresponding mask, we skip it.
    if file_image and os.path.exists(mask_image):
        # Extract the features with our function and appends the outputs to their corresponding feature lists.
        x = extract_features(file_image, mask_image, temp_image)
        
        filename.append(x[0])
        asym.append(x[1])
        col.append(x[2])
        blue_white.append(x[3])
        


 50%|█████████████████████████████████████▏                                    | 1156/2297 [2:02:26<2:00:51,  6.36s/it]


ValueError: zero-size array to reduction operation minimum which has no identity

In [9]:
#Assign all the list values to a dict, then parse these to our data frame before writing it to the csv.
assign_vals = dict(zip(feature_names, [filename, asym, col, blue_white]))

df_features = pd.DataFrame.from_dict(assign_vals)     
df_features.to_csv(file_features, index=False)

df_features

Unnamed: 0,file_name,asymmetry,color,blue-white_veil
0,PAT_1516_1765_530.png,0.231,2,0.000000
1,PAT_46_881_939.png,0.600,2,0.001784
2,PAT_1545_1867_547.png,0.478,2,0.000000
3,PAT_1989_4061_934.png,0.316,2,0.000196
4,PAT_684_1302_588.png,0.231,3,0.000000
...,...,...,...,...
1057,PAT_1761_3329_837.png,1.028,4,0.006234
1058,PAT_564_1079_430.png,0.190,2,0.000000
1059,PAT_747_1409_116.png,0.283,3,0.004494
1060,PAT_566_179_23.png,1.041,1,0.114178
