# Encode features in metadata

**ADDED ENCODING**! Order:
- 'df', 'nv', 'bkl', 'mel', 'vasc', 'bcc', 'akiec' --> 0, 1, 2, 3, 4, 5, 6

- head/neck, lower extremity, oral/genital, palms/soles, torso, upper extremity --> 0, 1, 2, 3, 4, 5
- male, female --> 0, 1

In [1]:
# imports
import pandas as pd
import numpy as np
import torch
import time
import torch.nn as nn
import torch.optim as optim
from torchvision import datasets, transforms
import torchvision.models as models
import timeit
import torchvision
# import torchsummary
from torchvision import transforms, datasets
from torch.utils.data import DataLoader, Dataset
# from torchsummary import summary
import os
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split
from PIL import Image

import matplotlib.pyplot as plt
import seaborn as sns

# to save session
import pickle

from tqdm import tqdm

In [2]:
# Show which PyTorch build is installed (label and version on separate lines).
print("PyTorch version:", torch.__version__, sep="\n")

# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("Using {} device".format(device))

PyTorch version:
2.2.1+cu121
Using cpu device


In [3]:
# Google Colab only: mount Google Drive at /content/drive so the
# /content/drive/... paths used by this notebook can be read and written.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


### Define paths
(comment out whichever set of paths is not in use)

In [None]:
# ## FOR JUPYTER
# metadata_path = 'data/encoded_final_data.csv'
# images_path = 'processed_images'

In [4]:
## FOR GOOGLE COLAB
# Both locations sit under the same project folder on Google Drive.
_project_dir = '/content/drive/MyDrive/CDS Project'
metadata_path = _project_dir + '/data/encoded_final_data.csv'  # metadata CSV
images_path = _project_dir + '/directly_processed_images'      # image folder

In [5]:
# Load the metadata CSV found at `metadata_path` into a DataFrame.
metadata_df = pd.read_csv(metadata_path)
# Bare expression: the notebook renders the frame as the cell output.
metadata_df

Unnamed: 0,image_id,age,head/neck,lower extremity,oral/genital,palms/soles,torso,upper extremity,female,male,target
0,ISIC_5136612,65.0,0,False,False,0,1,False,False,True,0
1,ISIC_0031002,65.0,0,False,False,0,0,True,False,True,0
2,ISIC_0026875,35.0,0,False,False,0,0,True,True,False,1
3,ISIC_0024371,70.0,0,True,False,0,0,False,False,True,2
4,ISIC_0027263,85.0,0,False,False,0,1,False,False,True,3
...,...,...,...,...,...,...,...,...,...,...,...
1395,ISIC_0027531,65.0,0,True,False,0,0,False,False,True,5
1396,ISIC_0025691,50.0,0,False,False,0,1,False,True,False,5
1397,ISIC_0028381,55.0,1,False,False,0,0,False,True,False,6
1398,ISIC_0033254,15.0,0,False,False,0,1,False,False,True,4


### Change metadata to represent each demographic feature in one column
- use numerical values to represent localization and gender
- head/neck, lower extremity, oral/genital, palms/soles, torso, upper extremity --> 0, 1, 2, 3, 4, 5
- male, female --> 0, 1

In [6]:
# Swap boolean False/True values in the frame for 0/1 so that the indicator
# columns, which were loaded as a mix of booleans and integers, hold only
# integer flags.
metadata_df = metadata_df.replace(to_replace={False: 0, True: 1})
# Bare expression: the notebook renders the frame as the cell output.
metadata_df

Unnamed: 0,image_id,age,head/neck,lower extremity,oral/genital,palms/soles,torso,upper extremity,female,male,target
0,ISIC_5136612,65.0,0,0,0,0,1,0,0,1,0
1,ISIC_0031002,65.0,0,0,0,0,0,1,0,1,0
2,ISIC_0026875,35.0,0,0,0,0,0,1,1,0,1
3,ISIC_0024371,70.0,0,1,0,0,0,0,0,1,2
4,ISIC_0027263,85.0,0,0,0,0,1,0,0,1,3
...,...,...,...,...,...,...,...,...,...,...,...
1395,ISIC_0027531,65.0,0,1,0,0,0,0,0,1,5
1396,ISIC_0025691,50.0,0,0,0,0,1,0,1,0,5
1397,ISIC_0028381,55.0,1,0,0,0,0,0,1,0,6
1398,ISIC_0033254,15.0,0,0,0,0,1,0,0,1,4


In [7]:
# Collapse the one-hot localization and gender indicator columns of
# `metadata_df` into two integer-coded columns, `localization` and `gender`,
# stored in a copy named `edited_metadata_df`.

# Integer code assigned to each one-hot indicator column.
localization_map = {'head/neck': 0, 'lower extremity': 1, 'oral/genital': 2, 'palms/soles': 3, 'torso': 4, 'upper extremity': 5}
gender_map = {'male': 0, 'female': 1}


def _indicator_columns(frame, code_by_column):
    """Return the columns of ``frame`` named in ``code_by_column``.

    Columns are selected by name (not by position) and kept in the order
    they appear in ``frame``.

    Raises:
        KeyError: if ``frame`` lacks any column named in ``code_by_column``.
    """
    missing = [name for name in code_by_column if name not in frame.columns]
    if missing:
        raise KeyError(f"missing one-hot columns: {missing}")
    return frame[[name for name in frame.columns if name in code_by_column]]


def _decode_one_hot(indicators, code_by_column, feature_name):
    """Return one integer code per row of the one-hot frame ``indicators``.

    A cell counts as set when it equals 1. If several cells of a row are
    set, the last set column (in column order) determines the code.

    Args:
        indicators: DataFrame holding only the indicator columns.
        code_by_column: mapping from indicator column name to integer code.
        feature_name: label used in error messages.

    Returns:
        A 1-D integer NumPy array with one code per row of ``indicators``.

    Raises:
        ValueError: if some row has no indicator set.
    """
    is_set = (indicators == 1).to_numpy()

    # Fail with a clear message instead of letting a missing value surface
    # later as an integer-cast error.
    unset_rows = indicators.index[~is_set.any(axis=1)]
    if len(unset_rows) > 0:
        raise ValueError(
            f"{feature_name}: no indicator set for rows "
            f"{list(unset_rows[:5])} ({len(unset_rows)} row(s) in total)"
        )

    # argmax over the reversed column axis finds the first set flag counted
    # from the right, i.e. the last set flag in the original column order.
    last_set = is_set.shape[1] - 1 - is_set[:, ::-1].argmax(axis=1)
    codes = np.array([code_by_column[name] for name in indicators.columns], dtype=int)
    return codes[last_set]


edited_metadata_df = metadata_df.copy()

# Replace categorical values with numerical representations for localization
localization_columns = _indicator_columns(metadata_df, localization_map)
edited_metadata_df['localization'] = _decode_one_hot(localization_columns, localization_map, 'localization')

# Replace categorical values with numerical representations for gender
gender_columns = _indicator_columns(metadata_df, gender_map)
edited_metadata_df['gender'] = _decode_one_hot(gender_columns, gender_map, 'gender')

# The one-hot columns are now redundant; the maps' keys are exactly their names.
edited_metadata_df.drop(columns=[*localization_map, *gender_map], inplace=True)

edited_metadata_df

Unnamed: 0,image_id,age,target,localization,gender
0,ISIC_5136612,65.0,0,4,0
1,ISIC_0031002,65.0,0,5,0
2,ISIC_0026875,35.0,1,5,1
3,ISIC_0024371,70.0,2,1,0
4,ISIC_0027263,85.0,3,4,0
...,...,...,...,...,...
1395,ISIC_0027531,65.0,5,1,0
1396,ISIC_0025691,50.0,5,4,1
1397,ISIC_0028381,55.0,6,0,1
1398,ISIC_0033254,15.0,4,4,0


In [None]:
# Destination CSV for the encoded metadata (Google Drive location).
# The commented-out assignment below is presumably the alternative for a
# local (non-Colab) run -- TODO confirm the intended local file name.
# new_path = 'data'
new_path = '/content/drive/MyDrive/CDS Project/data/encoded_columns_data.csv'

In [8]:
# save to csv
# Writes `edited_metadata_df` to `new_path`; index=False keeps the row index
# out of the file.
edited_metadata_df.to_csv(new_path, index=False)

In [9]:
# Reload the CSV that was just written to check the saved result.
# Reading from `new_path` instead of a second hard-coded copy of the Colab
# path keeps this cell pointing at the same file the save step wrote, even
# when `new_path` is changed to another location.
metadata_df = pd.read_csv(new_path)
# Bare expression: the notebook renders the frame as the cell output.
metadata_df

Unnamed: 0,image_id,age,target,localization,gender
0,ISIC_5136612,65.0,0,4,0
1,ISIC_0031002,65.0,0,5,0
2,ISIC_0026875,35.0,1,5,1
3,ISIC_0024371,70.0,2,1,0
4,ISIC_0027263,85.0,3,4,0
...,...,...,...,...,...
1395,ISIC_0027531,65.0,5,1,0
1396,ISIC_0025691,50.0,5,4,1
1397,ISIC_0028381,55.0,6,0,1
1398,ISIC_0033254,15.0,4,4,0
