Using only two major classes

In [18]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from PIL import Image
import os
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import LabelEncoder, StandardScaler
import torch
import collections
import collections.abc
collections.Iterable = collections.abc.Iterable
import torchvision.transforms as transforms
from torch.utils.data import Dataset, DataLoader
import warnings
warnings.filterwarnings('ignore')

# Set style for better plots
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")


In [19]:
# Dataset Configuration
BASE_DIR = 'archive'
IMAGE_DIRS = {
    'part1': 'imgs_part_1/imgs_part_1',
    'part2': 'imgs_part_2/imgs_part_2', 
    'part3': 'imgs_part_3/imgs_part_3'
}
METADATA_FILE = 'metadata.csv'



In [20]:
# Load the dataset
df = pd.read_csv(os.path.join(BASE_DIR, METADATA_FILE))
print(f"Dataset shape: {df.shape}")
print(f"Columns: {list(df.columns)}")

# Display basic info
df.info()


Dataset shape: (2298, 26)
Columns: ['patient_id', 'lesion_id', 'smoke', 'drink', 'background_father', 'background_mother', 'age', 'pesticide', 'gender', 'skin_cancer_history', 'cancer_history', 'has_piped_water', 'has_sewage_system', 'fitspatrick', 'region', 'diameter_1', 'diameter_2', 'diagnostic', 'itch', 'grew', 'hurt', 'changed', 'bleed', 'elevation', 'img_id', 'biopsed']
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2298 entries, 0 to 2297
Data columns (total 26 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   patient_id           2298 non-null   object 
 1   lesion_id            2298 non-null   int64  
 2   smoke                1494 non-null   object 
 3   drink                1494 non-null   object 
 4   background_father    1480 non-null   object 
 5   background_mother    1476 non-null   object 
 6   age                  2298 non-null   int64  
 7   pesticide            1494 non-null   object 
 8   gender   

Filtering dataset to only have major classes

In [21]:
df = df.loc[df['diagnostic'].isin(['BCC', 'ACK'])].copy()
df.reset_index(drop=True, inplace=True)

In [22]:
# Check missing values
missing_data = df.isnull().sum()
missing_percent = (missing_data / len(df)) * 100
missing_df = pd.DataFrame({
    'Missing Count': missing_data,
    'Missing %': missing_percent
}).sort_values('Missing %', ascending=False)

print(missing_df[missing_df['Missing Count'] > 0])

# Handle missing values strategically
print("Handling missing values:")

# For categorical variables with missing values, fill with 'Unknown'
categorical_columns = ['smoke', 'drink', 'background_father', 'background_mother', 
                      'pesticide', 'gender', 'skin_cancer_history', 'cancer_history',
                      'has_piped_water', 'has_sewage_system', 'fitspatrick', 'region']

for col in categorical_columns:
    if col in df.columns:
        missing_count = df[col].isnull().sum()
        if missing_count > 0:
            df[col] = df[col].fillna('Unknown')
            print(f"  - {col}: Filled {missing_count} missing values with 'Unknown'")

# For numerical variables, fill with median
numerical_columns = ['age', 'diameter_1', 'diameter_2']
for col in numerical_columns:
    if col in df.columns:
        missing_count = df[col].isnull().sum()
        if missing_count > 0:
            median_val = df[col].median()
            df[col] = df[col].fillna(median_val)
            print(f"  - {col}: Filled {missing_count} missing values with median ({median_val})")

print(f"Missing values after cleaning: {df.isnull().sum().sum()}")


                     Missing Count  Missing %
background_mother              455  28.888889
background_father              452  28.698413
pesticide                      447  28.380952
gender                         447  28.380952
drink                          447  28.380952
smoke                          447  28.380952
skin_cancer_history            447  28.380952
has_sewage_system              447  28.380952
has_piped_water                447  28.380952
cancer_history                 447  28.380952
diameter_1                     447  28.380952
diameter_2                     447  28.380952
fitspatrick                    447  28.380952
Handling missing values:
  - smoke: Filled 447 missing values with 'Unknown'
  - drink: Filled 447 missing values with 'Unknown'
  - background_father: Filled 452 missing values with 'Unknown'
  - background_mother: Filled 455 missing values with 'Unknown'
  - pesticide: Filled 447 missing values with 'Unknown'
  - gender: Filled 447 missing values with 

In [23]:
##Check missing values again

df.isnull().sum()

patient_id             0
lesion_id              0
smoke                  0
drink                  0
background_father      0
background_mother      0
age                    0
pesticide              0
gender                 0
skin_cancer_history    0
cancer_history         0
has_piped_water        0
has_sewage_system      0
fitspatrick            0
region                 0
diameter_1             0
diameter_2             0
diagnostic             0
itch                   0
grew                   0
hurt                   0
changed                0
bleed                  0
elevation              0
img_id                 0
biopsed                0
dtype: int64

In [24]:
# Create age groups
df['age_group'] = pd.cut(df['age'], bins=[0, 30, 50, 70, 100], 
                        labels=['Young', 'Middle', 'Senior', 'Elderly'])

# Create lesion size categories
df['lesion_area'] = df['diameter_1'] * df['diameter_2']
df['lesion_size'] = pd.cut(df['lesion_area'], bins=[0, 25, 100, 400, float('inf')], 
                          labels=['Small', 'Medium', 'Large', 'Very_Large'])

# Create symptom score (count of positive symptoms)
symptom_columns = ['itch', 'grew', 'hurt', 'changed', 'bleed', 'elevation']
df['symptom_score'] = 0
for col in symptom_columns:
    if col in df.columns:
        df['symptom_score'] += (df[col] == True).astype(int)

# Create risk factors score
risk_factors = ['smoke', 'drink', 'pesticide', 'skin_cancer_history', 'cancer_history']
df['risk_score'] = 0
for col in risk_factors:
    if col in df.columns:
        df['risk_score'] += (df[col] == True).astype(int)

# Create text description for multimodal approach
def create_text_description(row):
    """Create a natural language description from metadata"""
    text_parts = []
    
    # Demographics
    if pd.notna(row['age']):
        text_parts.append(f"{int(row['age'])}-year-old")
    
    if pd.notna(row['gender']) and row['gender'] != 'Unknown':
        text_parts.append(row['gender'].lower())
    
    # Lesion location
    if pd.notna(row['region']) and row['region'] != 'Unknown':
        text_parts.append(f"lesion on {row['region'].lower()}")
    
    # Symptoms
    symptoms = []
    for symptom in symptom_columns:
        if symptom in row and row[symptom] == True:
            symptoms.append(symptom)
    
    if symptoms:
        text_parts.append(f"with symptoms: {', '.join(symptoms)}")
    
    # Risk factors
    risk_factors_present = []
    for risk in risk_factors:
        if risk in row and row[risk] == True:
            risk_factors_present.append(risk.replace('_', ' '))
    
    if risk_factors_present:
        text_parts.append(f"risk factors: {', '.join(risk_factors_present)}")
    
    return " ".join(text_parts)

df['text_description'] = df.apply(create_text_description, axis=1)

print("Feature engineering completed:")
print(f"Age groups: {df['age_group'].value_counts().to_dict()}")
print(f"Lesion sizes: {df['lesion_size'].value_counts().to_dict()}")
print(f"Symptom scores: {df['symptom_score'].describe()}")
print(f"Risk scores: {df['risk_score'].describe()}")
print(f"Text descriptions created: {len(df['text_description'])} samples")


Feature engineering completed:
Age groups: {'Senior': 805, 'Elderly': 488, 'Middle': 276, 'Young': 6}
Lesion sizes: {'Medium': 948, 'Large': 371, 'Small': 189, 'Very_Large': 64}
Symptom scores: count    1575.0
mean        0.0
std         0.0
min         0.0
25%         0.0
50%         0.0
75%         0.0
max         0.0
Name: symptom_score, dtype: float64
Risk scores: count    1575.000000
mean        1.271746
std         1.227515
min         0.000000
25%         0.000000
50%         1.000000
75%         2.000000
max         5.000000
Name: risk_score, dtype: float64
Text descriptions created: 1575 samples


In [7]:
# Show sample text descriptions
print("Sample text descriptions:")
for i in range(3):
    print(f"{i+1}. {df['text_description'].iloc[i]}")


Sample text descriptions:
1. 8-year-old lesion on arm
2. 55-year-old female lesion on neck risk factors: skin cancer history, cancer history
3. 77-year-old lesion on face


In [25]:
class SkinLesionDataset(Dataset):
    """Custom dataset for skin lesion images with metadata"""
    def __init__(self, df, image_dirs, transform=None, target_transform=None):
        self.df = df.reset_index(drop=True)
        self.image_dirs = image_dirs
        self.transform = transform
        self.target_transform = target_transform
        
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        img_id = row['img_id']
        
        # Find and load image
        img = None
        for part, dir_path in self.image_dirs.items():
            full_path = os.path.join(BASE_DIR, dir_path, img_id)
            if os.path.exists(full_path):
                try:
                    img = Image.open(full_path).convert('RGB')
                    break
                except Exception as e:
                    print(f"Error loading {img_id} from {part}: {e}")
                    continue
        
        if img is None:
            # Create a placeholder image if not found
            img = Image.new('RGB', (224, 224), color='black')
            print(f"Warning: Image {img_id} not found, using placeholder")
        
        # Apply transforms
        if self.transform:
            img = self.transform(img)
        
        # Get target (diagnostic category)
        target = row['diagnostic']
        if self.target_transform:
            target = self.target_transform(target)
        
        # Get metadata for multimodal approach
        metadata = {
            'age': row['age'],
            'gender': row['gender'],
            'region': row['region'],
            'symptom_score': row['symptom_score'],
            'risk_score': row['risk_score'],
            'text_description': row['text_description']
        }
        
        return img, target, metadata


In [26]:
train_transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.RandomHorizontalFlip(),
    transforms.RandomRotation(degrees=15),
    transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.1),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

val_transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])


In [27]:
# Check class distribution
print("Class distribution before splitting:")
class_counts = df['diagnostic'].value_counts()
print(class_counts)
print(f"Class balance ratio (max/min): {class_counts.max() / class_counts.min():.2f}")

# Create stratified splits
# First split: 80% train+val, 20% test
X = df.drop('diagnostic', axis=1)
y = df['diagnostic']

X_temp, X_test, y_temp, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Second split: 80% train, 20% val from the remaining 80%
X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp, test_size=0.25, random_state=42, stratify=y_temp
)

# Reconstruct dataframes
train_df = X_train.copy()
train_df['diagnostic'] = y_train

val_df = X_val.copy()
val_df['diagnostic'] = y_val

test_df = X_test.copy()
test_df['diagnostic'] = y_test


Class distribution before splitting:
diagnostic
BCC    845
ACK    730
Name: count, dtype: int64
Class balance ratio (max/min): 1.16


The filtered dataframe now has only two major classes

In [28]:
print(f"\nClass distribution in splits:")
for split_name, split_df in [('Train', filtered_train_df), ('Val', filtered_val_df), ('Test', filtered_test_df)]:
    split_counts = split_df['diagnostic'].value_counts()
    print(f"\n{split_name} set:")
    for class_name, count in split_counts.items():
        percentage = count / len(split_df) * 100
        print(f"  {class_name}: {count} ({percentage:.1f}%)")



Class distribution in splits:

Train set:
  BCC: 507 (53.7%)
  ACK: 438 (46.3%)

Val set:
  BCC: 169 (53.7%)
  ACK: 146 (46.3%)

Test set:
  BCC: 169 (53.7%)
  ACK: 146 (46.3%)


In [29]:
# Create label encoder for diagnostic categories
label_encoder = LabelEncoder()
all_labels = df['diagnostic'].unique()
label_encoder.fit(all_labels)

print(f"Label mapping:")
for i, label in enumerate(label_encoder.classes_):
    print(f"  {label} -> {i}")

Label mapping:
  ACK -> 0
  BCC -> 1


In [30]:
# Create datasets
train_dataset = SkinLesionDataset(train_df, IMAGE_DIRS, transform=train_transform)
val_dataset = SkinLesionDataset(val_df, IMAGE_DIRS, transform=val_transform)
test_dataset = SkinLesionDataset(test_df, IMAGE_DIRS, transform=val_transform)

# Create data loaders
batch_size = 32
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=0)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, num_workers=0)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, num_workers=0)

print(f"Batch size: {batch_size}")
print(f"Training batches: {len(train_loader)}")
print(f"Validation batches: {len(val_loader)}")
print(f"Test batches: {len(test_loader)}")

Batch size: 32
Training batches: 30
Validation batches: 10
Test batches: 10


In [31]:
# Save the processed dataframes
os.makedirs('processed_data_v2', exist_ok=True)

train_df.to_csv('processed_data_v2/train_df.csv', index=False)
val_df.to_csv('processed_data_v2/val_df.csv', index=False)
test_df.to_csv('processed_data_v2/test_df.csv', index=False)

# Save the full processed dataset
df.to_csv('processed_data_v2/full_processed_df.csv', index=False)

# Save label encoder
import pickle
with open('processed_data_v2/label_encoder.pkl', 'wb') as f:
    pickle.dump(label_encoder, f)

# Save preprocessing configuration
preprocessing_config = {
    'image_size': (224, 224),
    'batch_size': batch_size,
    'num_classes': len(label_encoder.classes_),
    'class_names': label_encoder.classes_.tolist(),
    'train_samples': len(train_df),
    'val_samples': len(val_df),
    'test_samples': len(test_df),
    'image_normalization': {
        'mean': [0.485, 0.456, 0.406],
        'std': [0.229, 0.224, 0.225]
    }
}

import json
with open('processed_data_v2/preprocessing_config.json', 'w') as f:
    json.dump(preprocessing_config, f, indent=2)



Will retry training the CNN baseline now