In [1]:
# Mounting to google drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset, TensorDataset
import torchvision.transforms as transforms
from torchvision.io import read_image
import pandas as pd
import numpy as np
import os
from glob import glob
from PIL import Image
from tqdm import tqdm
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

In [3]:
# Importing the data
df = pd.read_csv('/content/drive/MyDrive/HAIFinalProject/archive/HAM10000_metadata.csv')
df.head()

Unnamed: 0,lesion_id,image_id,dx,dx_type,age,sex,localization
0,HAM_0000118,ISIC_0027419,bkl,histo,80.0,male,scalp
1,HAM_0000118,ISIC_0025030,bkl,histo,80.0,male,scalp
2,HAM_0002730,ISIC_0026769,bkl,histo,80.0,male,scalp
3,HAM_0002730,ISIC_0025661,bkl,histo,80.0,male,scalp
4,HAM_0001466,ISIC_0031633,bkl,histo,75.0,male,ear


In [4]:
# Checking data information
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10015 entries, 0 to 10014
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   lesion_id     10015 non-null  object 
 1   image_id      10015 non-null  object 
 2   dx            10015 non-null  object 
 3   dx_type       10015 non-null  object 
 4   age           9958 non-null   float64
 5   sex           10015 non-null  object 
 6   localization  10015 non-null  object 
dtypes: float64(1), object(6)
memory usage: 547.8+ KB


In [5]:
# Defining more readable labels for when I was playing with the data
lesion_type_dict = {
    'nv': 'Melanocytic nevi',
    'mel': 'Melanoma',
    'bkl': 'Benign keratosis-like lesions ',
    'bcc': 'Basal cell carcinoma',
    'akiec': 'Actinic keratoses',
    'vasc': 'Vascular lesions',
    'df': 'Dermatofibroma'
}
base_skin_dir = '/content/drive/MyDrive/HAIFinalProject/archive'


# Merging images into a single dataset
imageid_path_dict = {os.path.splitext(os.path.basename(x))[0]: x
                     for x in glob(os.path.join(base_skin_dir, '*', '*.jpg'))}


# New columns for the image and cell type
df['path'] = df['image_id'].map(imageid_path_dict.get)
df['cell_type'] = df['dx'].map(lesion_type_dict.get)
df['cell_type_idx'] = pd.Categorical(df['cell_type']).codes

df.head()

Unnamed: 0,lesion_id,image_id,dx,dx_type,age,sex,localization,path,cell_type,cell_type_idx
0,HAM_0000118,ISIC_0027419,bkl,histo,80.0,male,scalp,/content/drive/MyDrive/HAIFinalProject/archive...,Benign keratosis-like lesions,2
1,HAM_0000118,ISIC_0025030,bkl,histo,80.0,male,scalp,/content/drive/MyDrive/HAIFinalProject/archive...,Benign keratosis-like lesions,2
2,HAM_0002730,ISIC_0026769,bkl,histo,80.0,male,scalp,/content/drive/MyDrive/HAIFinalProject/archive...,Benign keratosis-like lesions,2
3,HAM_0002730,ISIC_0025661,bkl,histo,80.0,male,scalp,/content/drive/MyDrive/HAIFinalProject/archive...,Benign keratosis-like lesions,2
4,HAM_0001466,ISIC_0031633,bkl,histo,75.0,male,ear,/content/drive/MyDrive/HAIFinalProject/archive...,Benign keratosis-like lesions,2


In [6]:
# Data Cleaning - Checking for Missing Values
df.isna().sum()

lesion_id         0
image_id          0
dx                0
dx_type           0
age              57
sex               0
localization      0
path              0
cell_type         0
cell_type_idx     0
dtype: int64

In [7]:
# Since all missing values are in the age column I am just imputing with the mean
df['age'].fillna((df['age'].mean()), inplace=True)

# Confirming no remaining missing values
df.isna().sum().sum()

0

In [8]:
# Define a function to resize images in batches (originally I did not use batch processing but the computational load was too big for my computer)
def resize_images_in_batch(image_paths, size=(100, 75)):
    resized_images = []
    for path in image_paths:
        img = np.asarray(Image.open(path).resize(size))
        resized_images.append(img)
    return np.array(resized_images)

In [9]:
# Resizing images in batches (to reduce processing time when it comes to applying the model)
image_paths = df['path'].tolist()
batch_size = 100

# Process and store resized images
resized_images = []
for i in tqdm(range(0, len(image_paths), batch_size), desc="Resizing Images"):
    batch = image_paths[i:i + batch_size]
    resized_images.extend(resize_images_in_batch(batch))

# Adding resized images to the DataFrame
df['image'] = resized_images

Resizing Images: 100%|██████████| 101/101 [27:37<00:00, 16.41s/it]


In [10]:
# Checking the image size distribution to trouble shoot data size errors in my model
df['image'].map(lambda x: x.shape).value_counts()

(75, 100, 3)    10015
Name: image, dtype: int64

In [11]:
# Setting the target variable
features=df.drop(columns=['cell_type_idx'],axis=1)
target=df['cell_type_idx']

In [12]:
# Splitting the dataset into training and testing set
x_train_o, x_test_o, y_train_o, y_test_o = train_test_split(features, target, test_size=0.20,random_state=1234)

In [13]:
# Normalization of images
x_train = np.asarray(x_train_o['image'].tolist())
x_test = np.asarray(x_test_o['image'].tolist())
x_train_mean = np.mean(x_train)
x_train_std = np.std(x_train)
x_test_mean = np.mean(x_test)
x_test_std = np.std(x_test)
x_train = (x_train - x_train_mean)/x_train_std
x_test = (x_test - x_test_mean)/x_test_std

In [14]:
# Renaming the target sets
y_train = y_train_o
y_test = y_test_o

In [15]:
# Confirming shape alignment
print(x_train.shape)
print(y_train.shape)

(8012, 75, 100, 3)
(8012,)


In [16]:
# Splitting into training and validation sets (subsampling from the existing training sets)
x_train, x_validate, y_train, y_validate = train_test_split(x_train, y_train, test_size = 0.1, random_state = 2)

In [17]:
# Reshaping image in 3 dimensions (channel = 3, height = 75px, width = 100px)
x_train = x_train.reshape(x_train.shape[0], 3, 75, 100)
x_test = x_test.reshape(x_test.shape[0], 3, 75, 100)
x_validate = x_validate.reshape(x_validate.shape[0], 3, 75, 100)

In [18]:
# Converting numpy arrays to PyTorch tensors
x_train_tensor = torch.tensor(x_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.to_numpy(), dtype=torch.float32)
x_validate_tensor = torch.tensor(x_validate, dtype=torch.float32)
y_validate_tensor = torch.tensor(y_validate.to_numpy(), dtype=torch.float32)
x_test_tensor = torch.tensor(x_test, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test.to_numpy(), dtype=torch.float32)


# Create TensorDatasets
train_dataset = TensorDataset(x_train_tensor, y_train_tensor)
validate_dataset = TensorDataset(x_validate_tensor, y_validate_tensor)
test_dataset = TensorDataset(x_test_tensor, y_test_tensor)


# Defining batch size
batch_size = 64

# Creating data loaders
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
validate_loader = DataLoader(validate_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Inspecting a batch of data from the loader to trouble shoot error messages
for inputs, labels in train_loader:
    print("Input batch shape:", inputs.shape)
    print("Labels batch shape:", labels.shape)
    print("Labels data type:", labels.dtype)
    break  # Only inspect the first batch

Input batch shape: torch.Size([64, 3, 75, 100])
Labels batch shape: torch.Size([64])
Labels data type: torch.float32


In [None]:
# Defining the VAE components (encoder, sampling layer, decoder)
class Sampling(nn.Module):
    def forward(self, z_mean, z_log_var):
        batch, dim = z_mean.size()
        epsilon = torch.randn(batch, dim)
        return epsilon * torch.exp(z_log_var * 0.5) + z_mean

class Encoder(nn.Module):
    def __init__(self, input_shape, num_latent_vars):
        super(Encoder, self).__init__()
        self.conv1 = nn.Conv2d(input_shape[0], 32, kernel_size=3, padding=1)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, padding=1)
        self.fc1 = nn.Linear(28800, 128)
        self.fc2 = nn.Linear(128, num_latent_vars)
        self.fc3 = nn.Linear(128, num_latent_vars)
        self.sampling = Sampling()

    def forward(self, x):
        x = F.relu(self.conv1(x))
        x = F.max_pool2d(x, 2)
        x = F.relu(self.conv2(x))
        x = F.max_pool2d(x, 2)
        x = torch.flatten(x, start_dim=1)
        x = F.relu(self.fc1(x))
        z_mean = self.fc2(x)
        z_log_var = self.fc3(x)
        return self.sampling(z_mean, z_log_var)

class Decoder(nn.Module):
    def __init__(self, num_latent_vars, output_shape):
        super(Decoder, self).__init__()
        self.fc1 = nn.Linear(num_latent_vars, 128)
        self.fc2 = nn.Linear(128, 64 * 18 * 25)
        self.conv1 = nn.Conv2d(64, 64, kernel_size=3, padding=1)
        self.conv2 = nn.Conv2d(64, 32, kernel_size=3, padding=1)
        self.conv3 = nn.Conv2d(32, output_shape[0], kernel_size=3, padding=1)
        self.output_shape = output_shape

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = x.view(-1, 64, 18, 25)
        x = F.interpolate(x, scale_factor=2, mode='nearest')
        x = F.pad(x, (0, 1, 0, 1))
        x = F.relu(self.conv1(x))
        x = F.interpolate(x, scale_factor=2, mode='nearest')
        x = F.pad(x, (0, 1, 0, 1))
        x = F.relu(self.conv2(x))
        reconstruction = torch.sigmoid(self.conv3(x))
        return reconstruction

class VAE(nn.Module):
    def __init__(self, input_shape, num_latent_vars):
        super(VAE, self).__init__()
        self.encoder = Encoder(input_shape, num_latent_vars)
        self.decoder = Decoder(num_latent_vars, input_shape)

    def forward(self, x):
        z = self.encoder(x)
        return self.decoder(z)

# Adjusting input shape for PyTorch (Channels, Height, Width)
input_shape = (3, 75, 100)
num_latent_vars = 7

# Creating the VAE
vae = VAE(input_shape, num_latent_vars)

# Defining the modified CNN architecture
modified_cnn = nn.Sequential(
    nn.Linear(num_latent_vars, 128),
    nn.ReLU(),
    nn.Linear(128, 1024),
    nn.ReLU(),
    nn.Unflatten(1, (1, 32, 32)),  # Reshape to (1, 32, 32) for convolutional layers
    nn.Conv2d(1, 32, kernel_size=3, padding='same'),
    nn.ReLU(),
    nn.Conv2d(32, 32, kernel_size=3, padding='same'),
    nn.ReLU(),
    nn.MaxPool2d(kernel_size=2),
    nn.Dropout(0.25),
    nn.Conv2d(32, 64, kernel_size=3, padding='same'),
    nn.ReLU(),
    nn.Conv2d(64, 64, kernel_size=3, padding='same'),
    nn.ReLU(),
    nn.MaxPool2d(kernel_size=2),
    nn.Dropout(0.40),
    nn.Flatten(),
    nn.Linear(64 * 8 * 8, 128),  # Adjusted to match the flattening dimensions
    nn.ReLU(),
    nn.Dropout(0.5),
    nn.Linear(128, num_latent_vars),
    nn.LogSoftmax(dim=1)
)

# Integrating VAE encoder with the modified CNN
class IntegratedModel(nn.Module):
    def __init__(self, vae, modified_cnn):
        super(IntegratedModel, self).__init__()
        self.vae = vae
        self.modified_cnn = modified_cnn

    def forward(self, x):
        vae_features = self.vae.encoder(x)
        return self.modified_cnn(vae_features)

integrated_model = IntegratedModel(vae, modified_cnn)

# Defining loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(integrated_model.parameters(), lr=0.001)


# Model training function
def train_model(model, criterion, optimizer, train_loader, val_loader, epochs=5):
    for epoch in range(epochs):
        model.train()
        running_loss = 0.0
        for inputs, labels in train_loader:
            optimizer.zero_grad()

            outputs = model(inputs)
            labels = labels.long()
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()

        print(f"Epoch {epoch+1}/{epochs}, Loss: {running_loss/len(train_loader)}")

        # Validation
        model.eval()
        val_loss = 0.0
        correct = 0
        total = 0
        with torch.no_grad():
            for inputs, labels in val_loader:
                outputs = model(inputs)
                labels = labels.long()
                loss = criterion(outputs, labels)
                val_loss += loss.item()

                _, predicted = torch.max(outputs.data, 1)
                total += labels.size(0)
                correct += (predicted == labels).sum().item()

        print(f"Validation Loss: {val_loss/len(val_loader)}, Accuracy: {100 * correct / total}%")

# Model evaluation function
def evaluate_model(model, test_loader, criterion):
    model.eval()
    test_loss = 0.0
    correct = 0
    total = 0
    with torch.no_grad():
        for inputs, labels in test_loader:
            outputs = model(inputs)
            labels = labels.long()
            loss = criterion(outputs, labels)
            test_loss += loss.item()

            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    print(f"Test Loss: {test_loss/len(test_loader)}, Accuracy: {100 * correct / total}%")

# Training the model
train_model(integrated_model, criterion, optimizer, train_loader, validate_loader, epochs=10)

# Evaluating the model
evaluate_model(integrated_model, test_loader, criterion)


Epoch 1/10, Loss: 1.2220297766997752
Validation Loss: 1.0325992703437805, Accuracy: 68.07980049875312%
Epoch 2/10, Loss: 0.9902514577966876
Validation Loss: 1.0304542779922485, Accuracy: 68.07980049875312%
Epoch 3/10, Loss: 0.935999696761106
Validation Loss: 0.9054578634408804, Accuracy: 68.70324189526184%
Epoch 4/10, Loss: 0.9082634485928358
Validation Loss: 0.8917347284463736, Accuracy: 68.3291770573566%
Epoch 5/10, Loss: 0.9097358790119138
Validation Loss: 0.8718614807495704, Accuracy: 69.45137157107231%
Epoch 6/10, Loss: 0.8716446725668106
Validation Loss: 0.8698230890127329, Accuracy: 70.94763092269326%
Epoch 7/10, Loss: 0.8661355117780972
Validation Loss: 0.865370498253749, Accuracy: 69.57605985037407%
Epoch 8/10, Loss: 0.8351219328103867
Validation Loss: 0.8550793115909283, Accuracy: 70.69825436408978%
Epoch 9/10, Loss: 0.8082806694824084
Validation Loss: 0.8287592117603009, Accuracy: 70.57356608478803%
Epoch 10/10, Loss: 0.7913832577456416
Validation Loss: 0.8232902976182791, A