# X-Ray Pneumonia Logistic Regression

### 1a) Feature Extraction

In [1]:
import torch
import os
import torchvision.models as models
import torchvision.transforms as transforms
from torchvision import datasets, models, transforms
from torch.utils.data import DataLoader
from torchvision.datasets import ImageFolder
import numpy as np
import pandas as pd

In [2]:
# Define a function to extract and flatten features using the VGG16 model
def extract_features(data_loader, model, device):
    features = []
    labels = []
    
    # Set the model to evaluation mode
    model.eval()
    
    with torch.no_grad():  # Disable gradient calculations
        for images, targets in data_loader:
            images = images.to(device)
            
            # Forward pass through the VGG16 model to extract features
            outputs = model(images)
            
            # Flatten the features (batch size, -1)
            outputs_flat = outputs.view(outputs.size(0), -1)
            
            # Collect the flattened features and corresponding labels
            features.append(outputs_flat.cpu().numpy())
            labels.append(targets.numpy())
    
    # Concatenate the feature and label arrays
    features = np.concatenate(features, axis=0)
    labels = np.concatenate(labels, axis=0)
    
    return features, labels

In [3]:
# Device configuration (use GPU if available)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the pre-trained VGG16 model
vgg16 = models.vgg16(pretrained=True)

# Remove the classifier part of VGG16 (keep only the convolutional layers)
vgg16_features = vgg16.features  # This includes only the convolutional layers, no fully connected layers
vgg16_features = vgg16_features.to(device)

# Define the image transformations (resizing, normalization, etc.)
img_size = (224, 224)
transform = transforms.Compose([
    transforms.Resize(img_size),  # Resize images to the desired size (32x32, 64x64, 128x128)
    transforms.ToTensor(),        # Convert images to PyTorch tensors
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])  # Normalize to match VGG16's training
])

# Test feature extraction and flatten the result
with torch.no_grad():
    sample_image = torch.randn(1, 3, img_size[0], img_size[1]).to(device)  # Create a dummy image of the specified size
    features = vgg16_features(sample_image)  # Extract features using convolutional layers
    print(f"Original feature shape for {img_size}: {features.shape}")

    # Flatten the feature map (batch size is 1, so we flatten from index 1 onward)
    features_flat = features.view(features.size(0), -1)  # Flatten to (num_samples, num_features)
    print(f"Flattened feature shape for {img_size}: {features_flat.shape}")




Original feature shape for (224, 224): torch.Size([1, 512, 7, 7])
Flattened feature shape for (224, 224): torch.Size([1, 25088])


In [4]:
data_transforms = {
    'train': transforms.Compose([
        transforms.Resize((224, 224)),  # ResNet expects 224x224 images
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])  # Normalization for pre-trained models
    ]),
    'val': transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ]),
    'test': transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ]),
}

data_dir = '../data/chest_xray'
image_datasets = {
    'train': datasets.ImageFolder(os.path.join(data_dir, 'train'), data_transforms['train']),
    'val': datasets.ImageFolder(os.path.join(data_dir, 'val'), data_transforms['val']),
    'test': datasets.ImageFolder(os.path.join(data_dir, 'test'), data_transforms['test'])
}

dataloaders = {
    'train': DataLoader(image_datasets['train'], batch_size=32, shuffle=True),
    'val': DataLoader(image_datasets['val'], batch_size=32, shuffle=False),
    'test': DataLoader(image_datasets['test'], batch_size=32, shuffle=False)
}

dataset_sizes = {x: len(image_datasets[x]) for x in ['train', 'val', 'test']}
class_names = image_datasets['train'].classes

In [5]:
# Extract features and labels using the VGG16 model for each dataset split
X_train, y_train = extract_features(dataloaders['train'], vgg16, device)
X_val, y_val = extract_features(dataloaders['val'], vgg16, device)
X_test, y_test = extract_features(dataloaders['test'], vgg16, device)

In [6]:
# Print the shape of the extracted features
print(f"Extracted features shape: {X_train.shape[1:]}")
print(f"Labels shape: {y_train.shape}")

Extracted features shape: (1000,)
Labels shape: (5216,)


### 1b) Feature Extraction

In [7]:
# Import necessary libraries
import numpy as np  # For numerical computations and array manipulation
from time import time  # For measuring performance time
from sklearn.linear_model import LogisticRegressionCV  # Logistic Regression with cross-validation
from sklearn.model_selection import train_test_split  # For splitting data

# Define regularization hyper-parameter (inverse of alpha)
alpha = 10**np.linspace(-10, 10, 21)
C = 1 / alpha

# Logistic Regression with Cross-Validation
lgCV = LogisticRegressionCV(Cs=C, n_jobs=-1, max_iter=10000)

# Fit logistic regressor using the flattened training data
time_start = time()
lgCV.fit(X_train, y_train)  # Fit on flattened training data
time_stop = time()
time_elapsed = time_stop - time_start
print('Elapsed time:', round(time_elapsed / 60, 1), 'min')

# Error rate calculation function
def ErrorRate(y_true, y_pred):
    return (y_true != y_pred).mean()

# Compute training and test error rates
print('Alpha (1 / lgCV.C_):', (1 / lgCV.C_)[0])

# Predictions
y_pred_train = lgCV.predict(X_train)  # Predictions on flattened training data
y_pred_test = lgCV.predict(X_test)    # Predictions on flattened test data

# Error rates
print('Training error rate:', ErrorRate(y_train, y_pred_train).round(3))
print('Testing error rate:', ErrorRate(y_test, y_pred_test).round(3))
print('Training Accuracy:', ErrorRate(y_train, y_pred_train).round(3))
print('Test Accuracy:', 1 - ErrorRate(y_test, y_pred_test).round(3))

Elapsed time: 1.3 min
Alpha (1 / lgCV.C_): 1.0
Training error rate: 0.008
Testing error rate: 0.196
Training Accuracy: 0.008
Test Accuracy: 0.804
