In [3]:
import zipfile
import os
import pandas as pd
from sklearn.model_selection import train_test_split

In [4]:
zip_file = 'Dataset.zip'  # Path to your zip file
unzip_dir = 'dataset'  # Directory where the files will be extracted

try:
    with zipfile.ZipFile(zip_file, 'r') as zip_ref:
        zip_ref.extractall(unzip_dir)
except zipfile.BadZipFile:
    print("Error: The zip file might have unusual formatting or features.")
    print("Trying to extract using 'shutil' module as an alternative...")
    import shutil
    shutil.unpack_archive(zip_file, unzip_dir, 'zip')  # Use shutil to extract
    print("Extraction successful using shutil.")
except Exception as e:
    print(f"An unexpected error occurred: {e}")


In [5]:
data = []

# Adjusting paths based on your new directory structure
unzip_dir = 'dataset'  # This is the base folder where 'Dataset' is located
healthy_folder = os.path.join(unzip_dir, 'Dataset/Healthy')  # Path to 'Healthy' folder
unhealthy_folder = os.path.join(unzip_dir, 'Dataset/Unhealthy')  # Path to 'Unhealthy' folder

# Check if the folders exist (just for safety)
if not os.path.exists(healthy_folder):
    print(f"Healthy folder does not exist: {healthy_folder}")
if not os.path.exists(unhealthy_folder):
    print(f"Unhealthy folder does not exist: {unhealthy_folder}")

# Iterate through both folders and create labels
for label, folder in [('healthy', healthy_folder), ('unhealthy', unhealthy_folder)]:
    if os.path.exists(folder):  # Proceed only if the folder exists
        for filename in os.listdir(folder):
            if filename.endswith(('.png', '.jpg', '.jpeg', 'JPG')):  # Check for image files
                image_path = os.path.join(folder, filename)
                data.append([image_path, label])

# Step 3: Load into a pandas DataFrame
df = pd.DataFrame(data, columns=['image_path', 'label'])

# Step 4: Split the dataset into train and test sets (80% train, 20% test)
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# Optionally, you can save these datasets to CSVs or use them in further processing
train_df.to_csv('train_dataset.csv', index=False)
test_df.to_csv('test_dataset.csv', index=False)

# Show the first few rows of the training and test data
print("Training Data:")
print(train_df['label'].value_counts())

print("Test Data:")
print(test_df.count())

Training Data:
label
healthy      6395
unhealthy    5299
Name: count, dtype: int64
Test Data:
image_path    2924
label         2924
dtype: int64


In [6]:
import os
import torch
import pandas as pd
from torch import nn, optim
from torch.utils.data import DataLoader, Dataset
from torchvision import transforms, datasets, models
from sklearn.model_selection import train_test_split
from efficientnet_pytorch import EfficientNet

# Step 1: Load the train and test datasets
train_df = pd.read_csv('train_dataset.csv')
test_df = pd.read_csv('test_dataset.csv')

In [74]:


# Step 2: Custom Dataset Class to Load Images
class CustomImageDataset(Dataset):
    def __init__(self, dataframe, transform=None):
        self.dataframe = dataframe
        self.transform = transform

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        img_path = self.dataframe.iloc[idx, 0]
        label = 1 if self.dataframe.iloc[idx, 1] == 'healthy' else 0
        image = datasets.folder.default_loader(img_path)
        
        if self.transform:
            image = self.transform(image)

        return image, label

# Step 3: Define transformations (for training and testing)
train_transforms = transforms.Compose([
    transforms.Resize((224, 224)),  # EfficientNet expects 224x224 input size
    transforms.RandomHorizontalFlip(),
    transforms.RandomRotation(30),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])  # ImageNet stats
])

test_transforms = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])  # ImageNet stats
])

# Step 4: Create DataLoaders
train_dataset = CustomImageDataset(dataframe=train_df, transform=train_transforms)
test_dataset = CustomImageDataset(dataframe=test_df, transform=test_transforms)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# Step 5: Load EfficientNet Model (Pre-trained)
model = EfficientNet.from_pretrained('efficientnet-b0')

# Modify the final layer for binary classification (2 classes: healthy, unhealthy)
model._fc = nn.Linear(in_features=model._fc.in_features, out_features=2)

# Step 6: Set device for training (CUDA if available, else CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

# Step 7: Define Loss Function and Optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.0001)

# Step 8: Train the Model
num_epochs = 10
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    correct = 0
    total = 0
    
    for images, labels in train_loader:
        images, labels = images.to(device), labels.to(device)

        optimizer.zero_grad()

        outputs = model(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {running_loss/len(train_loader):.4f}, Accuracy: {100 * correct / total:.2f}%")

# Step 9: Evaluate the Model on Test Set
model.eval()  # Set the model to evaluation mode
correct = 0
total = 0

with torch.no_grad():
    for images, labels in test_loader:
        images, labels = images.to(device), labels.to(device)

        outputs = model(images)
        _, predicted = torch.max(outputs.data, 1)

        total += labels.size(0)
        correct += (predicted == labels).sum().item()

test_accuracy = 100 * correct / total
print(f'Test Accuracy: {test_accuracy:.2f}%')


Loaded pretrained weights for efficientnet-b0
Epoch [1/10], Loss: 0.2089, Accuracy: 91.26%
Epoch [2/10], Loss: 0.0377, Accuracy: 98.84%
Epoch [3/10], Loss: 0.0193, Accuracy: 99.44%
Epoch [4/10], Loss: 0.0138, Accuracy: 99.52%
Epoch [5/10], Loss: 0.0106, Accuracy: 99.72%


KeyboardInterrupt: 

In [75]:
# Save the model's state_dict (weights)
torch.save(model.state_dict(), 'efficientnet_model2.pth')
print("Model saved successfully!")


Model saved successfully!


In [18]:
# Recreate the model architecture
model = EfficientNet.from_pretrained('efficientnet-b0')
model._fc = nn.Linear(in_features=model._fc.in_features, out_features=2)  # Modify for binary classification

# Load the saved state_dict
model.load_state_dict(torch.load('efficientnet_model.pth'))
model = model.to(device)  # Make sure to move the model to the correct device (CPU or GPU)
print("Model loaded successfully!")


Loaded pretrained weights for efficientnet-b0
Model loaded successfully!


In [76]:
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix, classification_report, roc_auc_score, average_precision_score
import numpy as np
import torch

# Step 10: Compute all necessary metrics (accuracy, precision, recall, F1 score, ROC AUC, PR AUC, confusion matrix)
model.eval()  # Set the model to evaluation mode
y_true = []
y_pred = []
y_prob = []  # To store predicted probabilities for ROC AUC and PR AUC

with torch.no_grad():
    for images, labels in test_loader:
        images, labels = images.to(device), labels.to(device)

        outputs = model(images)
        _, predicted = torch.max(outputs.data, 1)
        
        y_true.extend(labels.cpu().numpy())
        y_pred.extend(predicted.cpu().numpy())
        
        # Store the predicted probabilities for ROC AUC and PR AUC (softmax output)
        prob = torch.softmax(outputs, dim=1)[:, 1]  # Probabilities for the 'healthy' class
        y_prob.extend(prob.cpu().numpy())

# Convert to numpy arrays for easier handling with sklearn
y_true = np.array(y_true)
y_pred = np.array(y_pred)
y_prob = np.array(y_prob)

# Accuracy
accuracy = np.sum(y_pred == y_true) / len(y_true)

# Precision, Recall, F1 Score
precision = precision_score(y_true, y_pred, average='binary')
recall = recall_score(y_true, y_pred, average='binary')
f1 = f1_score(y_true, y_pred, average='binary')

# Confusion Matrix
conf_matrix = confusion_matrix(y_true, y_pred)

# Classification Report
class_report = classification_report(y_true, y_pred)

# ROC AUC (Area Under the ROC Curve)
roc_auc = roc_auc_score(y_true, y_prob)

# PR AUC (Area Under the Precision-Recall Curve)
pr_auc = average_precision_score(y_true, y_prob)

# Print the results
print(f'Accuracy: {accuracy:.4f}')
print(f'Precision: {precision:.4f}')
print(f'Recall: {recall:.4f}')
print(f'F1 Score: {f1:.4f}')
print(f'ROC AUC: {roc_auc:.4f}')
print(f'PR AUC: {pr_auc:.4f}')
print(f'Confusion Matrix:\n{conf_matrix}')
print(f'Classification Report:\n{class_report}')


Accuracy: 0.9959
Precision: 0.9950
Recall: 0.9975
F1 Score: 0.9962
ROC AUC: 0.9996
PR AUC: 0.9996
Confusion Matrix:
[[1320    8]
 [   4 1592]]
Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.99      1.00      1328
           1       0.99      1.00      1.00      1596

    accuracy                           1.00      2924
   macro avg       1.00      1.00      1.00      2924
weighted avg       1.00      1.00      1.00      2924



In [9]:
import os
import torch
import pandas as pd
from torch import nn, optim
from torch.utils.data import DataLoader, Dataset
from torchvision import transforms, datasets, models
from sklearn.model_selection import train_test_split

# Step 1: Load the train and test datasets
train_df = pd.read_csv('train_dataset.csv')
test_df = pd.read_csv('test_dataset.csv')

# Step 2: Custom Dataset Class to Load Images
class CustomImageDataset(Dataset):
    def __init__(self, dataframe, transform=None):
        self.dataframe = dataframe
        self.transform = transform

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        img_path = self.dataframe.iloc[idx, 0]
        label = 1 if self.dataframe.iloc[idx, 1] == 'healthy' else 0
        image = datasets.folder.default_loader(img_path)
        
        if self.transform:
            image = self.transform(image)

        return image, label

# Step 3: Define transformations (for training and testing)
train_transforms = transforms.Compose([
    transforms.Resize((224, 224)),  # ResNet expects 224x224 input size
    transforms.RandomHorizontalFlip(),
    transforms.RandomRotation(30),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])  # ImageNet stats
])

test_transforms = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])  # ImageNet stats
])

# Step 4: Create DataLoaders
train_dataset = CustomImageDataset(dataframe=train_df, transform=train_transforms)
test_dataset = CustomImageDataset(dataframe=test_df, transform=test_transforms)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# Step 5: Load ResNet Model (Pre-trained)
model = models.resnet50(pretrained=True)  # Use ResNet-18, or you can try resnet34, resnet50, etc.

# Modify the final layer for binary classification (2 classes: healthy, unhealthy)
num_ftrs = model.fc.in_features
model.fc = nn.Linear(num_ftrs, 2)  # Output 2 classes

# Step 6: Set device for training (CUDA if available, else CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

# Step 7: Define Loss Function and Optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.0001)

# Step 8: Train the Model
num_epochs = 10
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    correct = 0
    total = 0
    
    for images, labels in train_loader:
        images, labels = images.to(device), labels.to(device)

        optimizer.zero_grad()

        outputs = model(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {running_loss/len(train_loader):.4f}, Accuracy: {100 * correct / total:.2f}%")

# Step 9: Evaluate the Model on Test Set
model.eval()  # Set the model to evaluation mode
correct = 0
total = 0

with torch.no_grad():
    for images, labels in test_loader:
        images, labels = images.to(device), labels.to(device)

        outputs = model(images)
        _, predicted = torch.max(outputs.data, 1)

        total += labels.size(0)
        correct += (predicted == labels).sum().item()

test_accuracy = 100 * correct / total
print(f'Test Accuracy: {test_accuracy:.2f}%')


Downloading: "https://download.pytorch.org/models/resnet50-0676ba61.pth" to /Users/shafayat/.cache/torch/hub/checkpoints/resnet50-0676ba61.pth
100%|██████████| 97.8M/97.8M [00:25<00:00, 4.04MB/s]


Epoch [1/10], Loss: 0.3288, Accuracy: 86.18%
Epoch [2/10], Loss: 0.1428, Accuracy: 95.70%
Epoch [3/10], Loss: 0.0931, Accuracy: 97.44%
Epoch [4/10], Loss: 0.0649, Accuracy: 97.33%
Epoch [5/10], Loss: 0.0533, Accuracy: 97.79%
Epoch [6/10], Loss: 0.0540, Accuracy: 97.56%
Epoch [7/10], Loss: 0.0418, Accuracy: 98.95%
Epoch [8/10], Loss: 0.0479, Accuracy: 97.79%
Epoch [9/10], Loss: 0.0455, Accuracy: 98.14%
Epoch [10/10], Loss: 0.0328, Accuracy: 98.61%
Test Accuracy: 97.69%


In [58]:
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix, classification_report, roc_auc_score, average_precision_score
import numpy as np
import torch

# Step 10: Compute all necessary metrics (accuracy, precision, recall, F1 score, ROC AUC, PR AUC, confusion matrix)
model.eval()  # Set the model to evaluation mode
y_true = []
y_pred = []
y_prob = []  # To store predicted probabilities for ROC AUC and PR AUC

with torch.no_grad():
    for images, labels in test_loader:
        images, labels = images.to(device), labels.to(device)

        outputs = model(images)
        _, predicted = torch.max(outputs.data, 1)
        
        y_true.extend(labels.cpu().numpy())
        y_pred.extend(predicted.cpu().numpy())
        
        # Store the predicted probabilities for ROC AUC and PR AUC (softmax output)
        prob = torch.softmax(outputs, dim=1)[:, 1]  # Probabilities for the 'healthy' class
        y_prob.extend(prob.cpu().numpy())

# Convert to numpy arrays for easier handling with sklearn
y_true = np.array(y_true)
y_pred = np.array(y_pred)
y_prob = np.array(y_prob)

# Accuracy
accuracy = np.sum(y_pred == y_true) / len(y_true)

# Precision, Recall, F1 Score
precision = precision_score(y_true, y_pred, average='binary')
recall = recall_score(y_true, y_pred, average='binary')
f1 = f1_score(y_true, y_pred, average='binary')

# Confusion Matrix
conf_matrix = confusion_matrix(y_true, y_pred)

# Classification Report
class_report = classification_report(y_true, y_pred)

# ROC AUC (Area Under the ROC Curve)
roc_auc = roc_auc_score(y_true, y_prob)

# PR AUC (Area Under the Precision-Recall Curve)
pr_auc = average_precision_score(y_true, y_prob)

# Print the results
print(f'Accuracy: {accuracy:.4f}')
print(f'Precision: {precision:.4f}')
print(f'Recall: {recall:.4f}')
print(f'F1 Score: {f1:.4f}')
print(f'ROC AUC: {roc_auc:.4f}')
print(f'PR AUC: {pr_auc:.4f}')
print(f'Confusion Matrix:\n{conf_matrix}')
print(f'Classification Report:\n{class_report}')


Accuracy: 0.9907
Precision: 0.9880
Recall: 1.0000
F1 Score: 0.9939
ROC AUC: 0.9999
PR AUC: 1.0000
Confusion Matrix:
[[196   8]
 [  0 657]]
Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.96      0.98       204
           1       0.99      1.00      0.99       657

    accuracy                           0.99       861
   macro avg       0.99      0.98      0.99       861
weighted avg       0.99      0.99      0.99       861



In [11]:
# Save the model's state_dict
torch.save(model.state_dict(), 'resnet50_model.pth')
print("model saved successfully")

model saved successfully


In [54]:
model = models.resnet50(pretrained=False)

# Modify the final layer for binary classification (2 classes: healthy, unhealthy)
model.fc = torch.nn.Linear(in_features=model.fc.in_features, out_features=2)

# Step 2: Load the saved state_dict (weights), allowing for mismatch in final layer
model.load_state_dict(torch.load('resnet50_model.pth'), strict=False)
print("loaded successfully")



loaded successfully


In [64]:
test_df.iloc[9]['image_path']


image_path    861
label         861
dtype: int64

In [61]:
from PIL import Image
test_transformss = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])  # ImageNet stats
])

def predict_image(image_path):
    # Load the image from the file path
    image = Image.open(image_path)
    
    # Apply the necessary transformations
    image = test_transforms(image).unsqueeze(0)  # Add batch dimension
    
    # Move the image tensor to the device (GPU or CPU)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    image = image.to(device)
    model.to(device)
    model.eval()

    # Perform the prediction
    with torch.no_grad():  # No need to calculate gradients during inference
        outputs = model(image)
        softmax = torch.nn.Softmax(dim=1)  # Apply Softmax to get probabilities
        probabilities = softmax(outputs)

        # Get the predicted class
        predicted_class = torch.argmax(probabilities, dim=1).item()
        prediction_prob = probabilities[0][predicted_class].item()

    # Output the prediction class and its probability
    return predicted_class, prediction_prob

# Example usage:
image_path = test_df.iloc[9]['image_path']  # Specify your image path here
predicted_class, prediction_prob = predict_image(image_path)
print(f"Predicted class: {predicted_class}, Probability: {prediction_prob}")

Predicted class: 0, Probability: 0.9989749193191528


In [7]:
test_df.head(20)

Unnamed: 0,image_path,label
0,dataset/Dataset/Healthy/IMG_9940.JPG,healthy
1,dataset/Dataset/Healthy/IMG_1511(0).JPG,healthy
2,dataset/Dataset/Unhealthy/IMG_5535.JPG,unhealthy
3,dataset/Dataset/Healthy/IMG_1292(0).JPG,healthy
4,dataset/Dataset/Unhealthy/IMG_5471.JPG,unhealthy
5,dataset/Dataset/Healthy/IMG_4557.jpeg,healthy
6,dataset/Dataset/Healthy/IMG_5468.JPG,healthy
7,dataset/Dataset/Healthy/IMG_8006.JPG,healthy
8,dataset/Dataset/Unhealthy/IMG_2354.JPG,unhealthy
9,dataset/Dataset/Healthy/IMG_0883(0).JPG,healthy


In [12]:
import os
import pandas as pd
import shutil

# Step 1: Load the test dataset
test_df = pd.read_csv('test_dataset.csv')

# Step 2: Create a folder to save the test samples
sample_folder = 'test_samples'
os.makedirs(sample_folder, exist_ok=True)

# Assuming the dataset has a 'class' column that indicates the class (e.g., 0, 1, etc.)
# If your dataset has different class labels, replace 'class' with the correct column name.

# Step 3: Split the dataset into two folders based on class
classes = test_df['label'].unique()  # Get unique class labels

# Create subfolders for each class inside 'test_samples'
for class_label in classes:
    class_folder = os.path.join(sample_folder, str(class_label))
    os.makedirs(class_folder, exist_ok=True)

# Step 4: Select 10 random samples from each class
sample_size_per_class = 10

for class_label in classes:
    # Filter the dataset for the current class
    class_df = test_df[test_df['label'] == class_label]
    
    # Randomly sample 10 rows from the current class
    sampled_class_df = class_df.sample(n=sample_size_per_class, random_state=42)

    # Step 5: Copy the corresponding images to the class-specific folder
    for index, row in sampled_class_df.iterrows():
        image_path = row['image_path']  # Adjust the column name if needed
        image_name = os.path.basename(image_path)  # Get the image file name

        # Check if the image file exists before copying
        if os.path.exists(image_path):
            destination_path = os.path.join(sample_folder, str(class_label), image_name)
            shutil.copy(image_path, destination_path)
            print(f"Copied {image_name} to {destination_path}")
        else:
            print(f"Image {image_path} not found!")

print(f"Successfully saved {sample_size_per_class} samples from each class to {sample_folder}")


KeyError: 'class'