In [1]:
import os
import shutil
from sklearn.model_selection import train_test_split

# Define paths
data_dir = r'D:\7th sem\computer vision\project\histopathological image dataset for ET\histopathological image dataset for ET'  # Replace with the correct path
output_dir = r'D:\7th sem\computer vision\project\histopathological image dataset for ET\split_dataset'

# Create base directories for train, val, and test
os.makedirs(output_dir, exist_ok=True)
os.makedirs(f'{output_dir}/train', exist_ok=True)
os.makedirs(f'{output_dir}/val', exist_ok=True)
os.makedirs(f'{output_dir}/test', exist_ok=True)

# Process each folder, treating each subfolder as its own class
for main_class_name in os.listdir(data_dir):
    main_class_path = os.path.join(data_dir, main_class_name)
    
    if os.path.isdir(main_class_path):
        # Check for subdirectories within the main class folder
        subdirs = [d for d in os.listdir(main_class_path) if os.path.isdir(os.path.join(main_class_path, d))]
        
        if subdirs:
            # If subfolders exist, treat each subfolder as a distinct class
            for subdir in subdirs:
                subdir_path = os.path.join(main_class_path, subdir)
                class_name = f"{main_class_name}_{subdir}"  # Unique class name, e.g., "NE_Follicular"
                images = os.listdir(subdir_path)

                # Skip if there are not enough images to split
                if len(images) < 4:
                    print(f"Skipping class '{class_name}' as it has fewer than 4 images.")
                    continue

                # Split into train (70%), val (15%), and test (15%)
                train_images, temp_images = train_test_split(images, test_size=0.3, random_state=42)
                if len(temp_images) >= 2:
                    val_images, test_images = train_test_split(temp_images, test_size=0.5, random_state=42)
                else:
                    val_images, test_images = temp_images, []

                # Create class directories within train, val, and test folders
                for split in ['train', 'val', 'test']:
                    os.makedirs(f'{output_dir}/{split}/{class_name}', exist_ok=True)

                # Copy images to the respective split folders
                for image in train_images:
                    shutil.copy(os.path.join(subdir_path, image), f'{output_dir}/train/{class_name}/{image}')
                
                for image in val_images:
                    shutil.copy(os.path.join(subdir_path, image), f'{output_dir}/val/{class_name}/{image}')
                
                for image in test_images:
                    shutil.copy(os.path.join(subdir_path, image), f'{output_dir}/test/{class_name}/{image}')
        else:
            # No subfolders; treat main folder as a single class
            class_name = main_class_name
            images = os.listdir(main_class_path)

            # Skip if there are not enough images to split
            if len(images) < 4:
                print(f"Skipping class '{class_name}' as it has fewer than 4 images.")
                continue

            # Split into train (70%), val (15%), and test (15%)
            train_images, temp_images = train_test_split(images, test_size=0.3, random_state=42)
            if len(temp_images) >= 2:
                val_images, test_images = train_test_split(temp_images, test_size=0.5, random_state=42)
            else:
                val_images, test_images = temp_images, []

            # Create class directories within train, val, and test folders
            for split in ['train', 'val', 'test']:
                os.makedirs(f'{output_dir}/{split}/{class_name}', exist_ok=True)

            # Copy images to the respective split folders
            for image in train_images:
                shutil.copy(os.path.join(main_class_path, image), f'{output_dir}/train/{class_name}/{image}')
            
            for image in val_images:
                shutil.copy(os.path.join(main_class_path, image), f'{output_dir}/val/{class_name}/{image}')
            
            for image in test_images:
                shutil.copy(os.path.join(main_class_path, image), f'{output_dir}/test/{class_name}/{image}')

print("Dataset split completed and organized into train, val, and test folders.")


Dataset split completed and organized into train, val, and test folders.


In [2]:
!pip install torch torchvision





[notice] A new release of pip is available: 24.0 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import torch
import torchvision.transforms as transforms
from torchvision.datasets import ImageFolder
from torch.utils.data import DataLoader
from sklearn.metrics import classification_report, confusion_matrix

# Define paths to the train, validation, and test directories
train_dir = r'D:\Engineering\SEM7\COMPUTER VISION\proj_imp\DATASET\org_datahistopath_img_data\train'
val_dir = r'D:\Engineering\SEM7\COMPUTER VISION\proj_imp\DATASET\org_datahistopath_img_data\val'
test_dir = r'D:\Engineering\SEM7\COMPUTER VISION\proj_imp\DATASET\org_datahistopath_img_data\test'

# Define image transformations (resize, convert to tensor, normalize)
transform = transforms.Compose([
    transforms.Resize((224, 224)),  # Resizing to fit model input (e.g., ResNet50)
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])  # Standard ImageNet normalization
])

# Load the datasets
train_dataset = ImageFolder(root=train_dir, transform=transform)
val_dataset = ImageFolder(root=val_dir, transform=transform)
test_dataset = ImageFolder(root=test_dir, transform=transform)

# Create data loaders for batching and shuffling
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# Check the number of classes and class names
num_classes = len(train_dataset.classes)
print(f"Number of classes: {num_classes}")
print("Class names:", train_dataset.classes)


Number of classes: 6
Class names: ['EA', 'EH_Complex', 'EH_Simple', 'EP', 'NE_Follicular', 'NE_Luteal']


# VGG

In [3]:
import torch.nn as nn
from torchvision import models

# Load the pretrained model
model = models.vgg16(weights='VGG16_Weights.DEFAULT')

# Modify the last layer to match the number of classes (7 in this case)
num_classes = 7
model.classifier[6] = nn.Linear(model.classifier[6].in_features, num_classes)

# Move model to the correct device (if using GPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)


VGG(
  (features): Sequential(
    (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): ReLU(inplace=True)
    (2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (3): ReLU(inplace=True)
    (4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (5): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (6): ReLU(inplace=True)
    (7): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (8): ReLU(inplace=True)
    (9): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (10): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (11): ReLU(inplace=True)
    (12): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (13): ReLU(inplace=True)
    (14): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (15): ReLU(inplace=True)
    (16): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1

In [10]:
import torch
import torch.nn as nn
from torchvision import models, datasets, transforms
from torch.utils.data import DataLoader
import os
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Dataset path
data_dir = r'D:\Engineering\SEM7\COMPUTER VISION\proj_imp\DATASET\org_datahistopath_img_data'
batch_size = 32
img_size = 224  # VGG16 requires 224x224 input

# Define transformations
transform = transforms.Compose([
    transforms.Resize((img_size, img_size)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# Load dataset
train_data = datasets.ImageFolder(root=os.path.join(data_dir, 'train'), transform=transform)
val_data = datasets.ImageFolder(root=os.path.join(data_dir, 'val'), transform=transform)

train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True, num_workers=4)
val_loader = DataLoader(val_data, batch_size=batch_size, shuffle=False, num_workers=4)

# Load pre-trained VGG16
model = models.vgg16(pretrained=True)

# Modify the classifier for our dataset
num_features = model.classifier[6].in_features
model.classifier[6] = nn.Linear(num_features, len(train_data.classes))
model = model.to(device)

# Define loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)

def plot_confusion_matrix(cm, classes):
    plt.figure(figsize=(10, 8))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=classes, yticklabels=classes)
    plt.title('Confusion Matrix')
    plt.ylabel('True Label')
    plt.xlabel('Predicted Label')
    plt.tight_layout()
    plt.savefig('confusion_matrix.png')
    plt.close()

def evaluate_model(model, data_loader, criterion):
    model.eval()
    all_preds = []
    all_labels = []
    total_loss = 0
    total = 0
    correct = 0

    with torch.no_grad():
        for images, labels in data_loader:
            images, labels = images.to(device), labels.to(device)
            outputs = model(images)
            loss = criterion(outputs, labels)
            
            total_loss += loss.item() * images.size(0)
            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
            
            all_preds.extend(predicted.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())
    
    avg_loss = total_loss / len(data_loader.dataset)
    accuracy = 100 * correct / total
    
    return avg_loss, accuracy, all_preds, all_labels

def train_model(model, train_loader, val_loader, criterion, optimizer, num_epochs=10):
    best_val_acc = 0.0
    
    for epoch in range(num_epochs):
        # Training phase
        model.train()
        running_loss = 0.0
        correct = 0
        total = 0

        for images, labels in train_loader:
            images, labels = images.to(device), labels.to(device)
            
            optimizer.zero_grad()
            outputs = model(images)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            running_loss += loss.item() * images.size(0)
            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

        epoch_loss = running_loss / len(train_loader.dataset)
        epoch_acc = 100 * correct / total
        print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {epoch_loss:.4f}, Accuracy: {epoch_acc:.2f}%")

        # Validation phase
        val_loss, val_acc, _, _ = evaluate_model(model, val_loader, criterion)
        print(f"Validation Loss: {val_loss:.4f}, Validation Accuracy: {val_acc:.2f}%\n")
        
        # Save best model
        if val_acc > best_val_acc:
            best_val_acc = val_acc
            torch.save(model.state_dict(), 'vgg16_best_model.pth')

    return model

# Train the model
print("Starting training...")
model = train_model(model, train_loader, val_loader, criterion, optimizer, num_epochs=100)

# Load best model for final evaluation
model.load_state_dict(torch.load('vgg16_best_model.pth'))

# Generate classification report and confusion matrix
print("\nGenerating final evaluation metrics...")
_, _, val_preds, val_labels = evaluate_model(model, val_loader, criterion)

# Print classification report
print("\nClassification Report:")
print(classification_report(val_labels, val_preds, target_names=train_data.classes))

# Generate and save confusion matrix
cm = confusion_matrix(val_labels, val_preds)
plot_confusion_matrix(cm, train_data.classes)
print("\nConfusion matrix has been saved as 'confusion_matrix.png'")

# Save the final model
torch.save({
    'model_state_dict': model.state_dict(),
    'optimizer_state_dict': optimizer.state_dict(),
    'class_to_idx': train_data.class_to_idx
}, 'vgg16_final_model.pth')

Using device: cuda
Starting training...
Epoch [1/100], Loss: 1.2003, Accuracy: 49.56%
Validation Loss: 1.0873, Validation Accuracy: 59.15%

Epoch [2/100], Loss: 0.8609, Accuracy: 65.52%
Validation Loss: 0.8200, Validation Accuracy: 65.85%

Epoch [3/100], Loss: 0.6357, Accuracy: 75.20%
Validation Loss: 0.8156, Validation Accuracy: 68.50%

Epoch [4/100], Loss: 0.4904, Accuracy: 81.60%
Validation Loss: 1.0148, Validation Accuracy: 62.80%

Epoch [5/100], Loss: 0.3536, Accuracy: 86.92%
Validation Loss: 1.3238, Validation Accuracy: 58.54%

Epoch [6/100], Loss: 0.2174, Accuracy: 92.15%
Validation Loss: 1.1572, Validation Accuracy: 70.33%

Epoch [7/100], Loss: 0.1079, Accuracy: 96.34%
Validation Loss: 1.4236, Validation Accuracy: 63.82%

Epoch [8/100], Loss: 0.1330, Accuracy: 96.03%
Validation Loss: 1.2894, Validation Accuracy: 62.40%

Epoch [9/100], Loss: 0.1176, Accuracy: 95.42%
Validation Loss: 1.2422, Validation Accuracy: 71.75%

Epoch [10/100], Loss: 0.0968, Accuracy: 96.56%
Validation Lo

  model.load_state_dict(torch.load('vgg16_best_model.pth'))



Generating final evaluation metrics...

Classification Report:
               precision    recall  f1-score   support

           EA       0.79      0.89      0.84        80
   EH_Complex       0.53      0.45      0.49        42
    EH_Simple       0.77      0.79      0.78        78
           EP       0.63      0.61      0.62        95
NE_Follicular       0.69      0.69      0.69       107
    NE_Luteal       0.83      0.79      0.81        90

     accuracy                           0.72       492
    macro avg       0.70      0.70      0.70       492
 weighted avg       0.72      0.72      0.72       492


Confusion matrix has been saved as 'confusion_matrix.png'
