In [None]:
# install requirments
# It is run on Colab T4 GPU
!pip install timm
!pip install kaggle
!pip install datasets
!pip install matplotlib
!pip install seaborn
!pip install scikit-learn
!pip install Pillow

In [None]:
# Import libraries
import sys
import timm
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from PIL import Image
import random
from tqdm.auto import tqdm
import time
import warnings
# warnings.filterwarnings('ignore')

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset, Subset
import torchvision.transforms as transforms
from torchvision.datasets import ImageFolder
from torchvision.models import resnet50, ResNet50_Weights, densenet121, DenseNet121_Weights

from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split

In [None]:
# set seed and device
def set_seed(seed=228):
    """Seed every random number generator used by this notebook.

    Seeds Python's ``random`` module, NumPy and PyTorch (CPU and CUDA), and
    configures cuDNN to prefer deterministic behaviour.

    Args:
        seed: Integer seed. Defaults to 228 because this is the final project
            of ECE228.
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every visible CUDA device, not only the current one; this call is
    # documented as safe (silently ignored) when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)
    # Ask cuDNN for deterministic algorithms and turn off its benchmark mode.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

# Seed all RNGs before anything random happens.
set_seed()

# Pick the first GPU when CUDA is available, otherwise fall back to the CPU,
# and report what was selected.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
    print(f"Using device: {device}")
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"CUDA Version: {torch.version.cuda}")
else:
    device = torch.device("cpu")
    print(f"Using device: {device}")

In [None]:
# Download the dataset from Kaggle.
# A Kaggle API key (kaggle.json) is required.
def _run_shell(command):
    """Run ``command`` through the shell and raise if it exits with a non-zero status."""
    status = os.system(command)
    if status != 0:
        raise RuntimeError(f"Command failed (status {status}): {command}")


def download_dataset():
    """Download the air-pollution image dataset into ``./IMAGES`` (Colab only).

    Does nothing except print a message when ``IMAGES`` already exists.
    Otherwise it asks the user to upload ``kaggle.json`` through the Colab file
    dialog, installs it under ``~/.kaggle``, downloads the Kaggle archive,
    extracts the ``IND_and_NEP`` folder and renames it to ``IMAGES``.

    Raises:
        FileNotFoundError: If ``kaggle.json`` is not in the working directory
            after the upload step.
        RuntimeError: If any download/extract/move shell command fails, so a
            broken download is never reported as a success.
    """
    if os.path.exists("IMAGES"):
        print("Dataset already exists!")
        return

    print("Downloading dataset...")
    # google.colab only exists inside Colab, so it is imported lazily here.
    from google.colab import files

    print("Please upload your kaggle.json file:")
    files.upload()
    if not os.path.exists("kaggle.json"):
        raise FileNotFoundError(
            "kaggle.json was not found in the working directory after the upload"
        )

    os.makedirs(os.path.expanduser("~/.kaggle"), exist_ok=True)
    _run_shell("cp kaggle.json ~/.kaggle/")
    _run_shell("chmod 600 ~/.kaggle/kaggle.json")

    _run_shell("kaggle datasets download -q adarshrouniyar/air-pollution-image-dataset-from-india-and-nepal")

    # Extract dataset based on its dir tree
    _run_shell('unzip -q air-pollution-image-dataset-from-india-and-nepal.zip "Air Pollution Image Dataset/Air Pollution Image Dataset/Combined_Dataset/IND_and_NEP/*"')
    _run_shell('mv "Air Pollution Image Dataset/Air Pollution Image Dataset/Combined_Dataset/IND_and_NEP" .')
    _run_shell('mv IND_and_NEP IMAGES')

    # Cleanup is best-effort: the dataset is already in place at this point.
    if os.system('rm -r "Air Pollution Image Dataset/" air-pollution-image-dataset-from-india-and-nepal.zip') != 0:
        print("Warning: could not remove temporary download files.")

    print("Dataset downloaded and organized!")


download_dataset()

# Check dataset structure.
# Only sub-directories count as classes (the rule torchvision's ImageFolder
# applies), so stray files inside IMAGES neither inflate num_classes nor break
# the per-class listing below.
classes = sorted(
    entry for entry in os.listdir("IMAGES")
    if os.path.isdir(os.path.join("IMAGES", entry))
)
print(f"Found classes: {classes}")
num_classes = len(classes)

# Show samples per class
for class_name in classes:
    class_path = os.path.join("IMAGES", class_name)
    num_samples = len(os.listdir(class_path))
    print(f"{class_name}: {num_samples} samples")

In [None]:
# preprocess and load data

def get_transforms(image_size=224):
    """Build the training and validation preprocessing pipelines.

    Both pipelines resize to ``image_size`` x ``image_size``, convert to a
    tensor and normalize each channel with mean 0.5 and std 0.5 (a reasonable
    default when the dataset statistics are unknown). The training pipeline
    additionally applies a random horizontal flip, ``RandomRotation(10)`` and
    colour jitter.

    Returns:
        A ``(train_transforms, val_transforms)`` tuple of ``transforms.Compose``.
    """
    target_size = (image_size, image_size)

    def tensor_and_normalize():
        # Fresh transform objects on every call so the pipelines share nothing.
        return [
            transforms.ToTensor(),
            transforms.Normalize([0.5, 0.5, 0.5], [0.5, 0.5, 0.5]),
        ]

    augmentations = [
        transforms.RandomHorizontalFlip(p=0.5),
        transforms.RandomRotation(10),
        transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.1),
    ]

    train_steps = [transforms.Resize(target_size)] + augmentations + tensor_and_normalize()
    val_steps = [transforms.Resize(target_size)] + tensor_and_normalize()

    return transforms.Compose(train_steps), transforms.Compose(val_steps)

def create_data_loaders(image_size=224, batch_size=32, data_dir="IMAGES",
                        val_fraction=0.2, seed=228, num_workers=0, pin_memory=False):
    """Create stratified train and validation data loaders.

    The image folder is indexed twice, once per transform pipeline, and the
    same stratified index split is applied to both views so the subsets do not
    overlap.

    Args:
        image_size: Side length images are resized to.
        batch_size: Batch size for both loaders.
        data_dir: Root folder with one sub-directory per class.
        val_fraction: Fraction of samples held out for validation.
        seed: Random state for the stratified split (228, as elsewhere in
            this notebook).
        num_workers: Worker processes per loader. You may need to raise this
            to run faster.
        pin_memory: Passed to both loaders. Colab may have issues when this
            is True.

    Returns:
        ``(train_loader, val_loader, classes)`` where ``classes`` is the class
        name list reported by the dataset itself.
    """
    train_transforms, val_transforms = get_transforms(image_size)

    # Two views of the same folder that differ only in their transforms.
    train_dataset = ImageFolder(data_dir, transform=train_transforms)
    val_dataset = ImageFolder(data_dir, transform=val_transforms)

    indices = list(range(len(train_dataset)))
    targets = train_dataset.targets

    # Stratified split keeps the class proportions in both subsets.
    train_indices, val_indices = train_test_split(
        indices, test_size=val_fraction, random_state=seed, stratify=targets
    )

    train_subset = Subset(train_dataset, train_indices)
    val_subset = Subset(val_dataset, val_indices)

    # Create data loaders
    train_loader = DataLoader(
        train_subset, batch_size=batch_size, shuffle=True,
        num_workers=num_workers, pin_memory=pin_memory
    )
    val_loader = DataLoader(
        val_subset, batch_size=batch_size, shuffle=False,
        num_workers=num_workers, pin_memory=pin_memory
    )

    print(f"Training samples: {len(train_subset)}")
    print(f"Validation samples: {len(val_subset)}")

    # Return the dataset's own class list instead of a notebook-level global,
    # so the names always match the label indices the loaders produce.
    return train_loader, val_loader, train_dataset.classes

In [None]:
# define models: ResNet50, DenseNet121, EfficientNet-B0, Vision Transformer
class AirPollutionResNet(nn.Module):
    """ResNet-50 (``IMAGENET1K_V2`` weights) with a new classification head.

    The backbone's ``fc`` layer is replaced by ``Dropout(0.2)`` followed by a
    ``Linear`` layer with ``num_classes`` outputs.
    """

    def __init__(self, num_classes):
        super().__init__()
        backbone = resnet50(weights=ResNet50_Weights.IMAGENET1K_V2)
        feature_dim = backbone.fc.in_features
        head = nn.Sequential(
            nn.Dropout(0.2),
            nn.Linear(feature_dim, num_classes),
        )
        backbone.fc = head
        self.resnet = backbone

    def forward(self, x):
        """Run ``x`` through the network and return the head's output."""
        scores = self.resnet(x)
        return scores

class AirPollutionDenseNet(nn.Module):
    """DenseNet-121 (``IMAGENET1K_V1`` weights) with a new classification head.

    The backbone's ``classifier`` layer is replaced by ``Dropout(0.2)``
    followed by a ``Linear`` layer with ``num_classes`` outputs.
    """

    def __init__(self, num_classes):
        super().__init__()
        backbone = densenet121(weights=DenseNet121_Weights.IMAGENET1K_V1)
        feature_dim = backbone.classifier.in_features
        head = nn.Sequential(
            nn.Dropout(0.2),
            nn.Linear(feature_dim, num_classes),
        )
        backbone.classifier = head
        self.densenet = backbone

    def forward(self, x):
        """Run ``x`` through the network and return the head's output."""
        scores = self.densenet(x)
        return scores

class AirPollutionEfficientNet(nn.Module):
    """timm ``efficientnet_b0`` (pretrained) with a new classification head.

    The backbone's ``classifier`` layer is replaced by ``Dropout(0.2)``
    followed by a ``Linear`` layer with ``num_classes`` outputs.
    """

    def __init__(self, num_classes):
        super().__init__()
        backbone = timm.create_model('efficientnet_b0', pretrained=True)
        feature_dim = backbone.classifier.in_features
        head = nn.Sequential(
            nn.Dropout(0.2),
            nn.Linear(feature_dim, num_classes),
        )
        backbone.classifier = head
        self.efficientnet = backbone

    def forward(self, x):
        """Run ``x`` through the network and return the head's output."""
        scores = self.efficientnet(x)
        return scores

class AirPollutionViT(nn.Module):
    """timm ``vit_base_patch16_224`` (pretrained) with a new classification head.

    The backbone's ``head`` layer is replaced by ``Dropout(0.2)`` followed by
    a ``Linear`` layer with ``num_classes`` outputs.
    """

    def __init__(self, num_classes):
        super().__init__()
        backbone = timm.create_model('vit_base_patch16_224', pretrained=True)
        feature_dim = backbone.head.in_features
        head = nn.Sequential(
            nn.Dropout(0.2),
            nn.Linear(feature_dim, num_classes),
        )
        backbone.head = head
        self.vit = backbone

    def forward(self, x):
        """Run ``x`` through the network and return the head's output."""
        scores = self.vit(x)
        return scores

In [None]:
# functions used during training
def train_epoch(model, dataloader, criterion, optimizer, device):
    """Train ``model`` for one pass over ``dataloader``.

    Args:
        model: Module to optimize; it is switched to training mode.
        dataloader: Iterable yielding ``(inputs, labels)`` batches.
        criterion: Loss callable applied as ``criterion(outputs, labels)``.
        optimizer: Optimizer that updates ``model``'s parameters.
        device: Device the batches are moved to before the forward pass.

    Returns:
        ``(epoch_loss, epoch_acc)``: the sample-weighted mean loss and the
        accuracy in percent over all samples seen.

    Raises:
        ValueError: If ``dataloader`` yields no samples (previously this
            surfaced as an unexplained ``ZeroDivisionError``).
    """
    model.train()
    running_loss = 0.0
    correct = 0
    total = 0

    for inputs, labels in tqdm(dataloader, desc="Training"):
        inputs, labels = inputs.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # Weight the batch loss by batch size so a smaller last batch does not
        # skew the average (assumes criterion returns a per-batch mean).
        running_loss += loss.item() * inputs.size(0)
        predicted = outputs.argmax(dim=1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

    if total == 0:
        raise ValueError("train_epoch received an empty dataloader")

    epoch_loss = running_loss / total
    epoch_acc = 100 * correct / total
    return epoch_loss, epoch_acc

def validate_epoch(model, dataloader, criterion, device):
    """Evaluate ``model`` on ``dataloader`` without updating any weights.

    Args:
        model: Module to evaluate; it is switched to eval mode.
        dataloader: Iterable yielding ``(inputs, labels)`` batches.
        criterion: Loss callable applied as ``criterion(outputs, labels)``.
        device: Device the batches are moved to before the forward pass.

    Returns:
        ``(epoch_loss, epoch_acc, all_preds, all_labels)``: the sample-weighted
        mean loss, the accuracy in percent, and flat lists holding the
        predicted and true label of every sample in iteration order.

    Raises:
        ValueError: If ``dataloader`` yields no samples (previously this
            surfaced as an unexplained ``ZeroDivisionError``).
    """
    model.eval()
    running_loss = 0.0
    correct = 0
    total = 0
    all_preds = []
    all_labels = []

    # No gradients are needed for evaluation.
    with torch.no_grad():
        for inputs, labels in tqdm(dataloader, desc="Validation"):
            inputs, labels = inputs.to(device), labels.to(device)

            outputs = model(inputs)
            loss = criterion(outputs, labels)

            # Weight the batch loss by batch size (assumes criterion returns a
            # per-batch mean).
            running_loss += loss.item() * inputs.size(0)
            predicted = outputs.argmax(dim=1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

            all_preds.extend(predicted.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    if total == 0:
        raise ValueError("validate_epoch received an empty dataloader")

    epoch_loss = running_loss / total
    epoch_acc = 100 * correct / total
    return epoch_loss, epoch_acc, all_preds, all_labels

def train_model(model, train_loader, val_loader, model_name, num_epochs=15,
                lr=0.0001, weight_decay=1e-4):
    """Train ``model`` and checkpoint the weights with the best validation accuracy.

    Uses cross-entropy loss, AdamW and a ``ReduceLROnPlateau`` scheduler that
    halves the learning rate after 3 epochs without validation-loss
    improvement. The best weights are written to
    ``best_<model_name lowercased>_model.pth``.

    Args:
        model: Module to train (already placed on the notebook's ``device``).
        train_loader: Loader for the training batches.
        val_loader: Loader for the validation batches.
        model_name: Display name, also used to build the checkpoint filename.
        num_epochs: Number of epochs to run.
        lr: Initial AdamW learning rate.
        weight_decay: AdamW weight decay.

    Returns:
        ``(model, history, best_val_acc)`` where ``history`` maps
        ``'train_loss'``, ``'train_acc'``, ``'val_loss'`` and ``'val_acc'`` to
        per-epoch lists.
    """
    print(f"\n{'='*60}")
    print(f"Training {model_name}")
    print(f"{'='*60}")

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)
    # The scheduler's `verbose` argument is deprecated in recent PyTorch
    # releases, so learning-rate changes are reported manually below instead.
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(
        optimizer, mode='min', factor=0.5, patience=3
    )

    checkpoint_path = f'best_{model_name.lower()}_model.pth'
    best_val_acc = 0.0
    history = {'train_loss': [], 'train_acc': [], 'val_loss': [], 'val_acc': []}

    for epoch in range(num_epochs):
        print(f"\nEpoch {epoch+1}/{num_epochs}")

        train_loss, train_acc = train_epoch(model, train_loader, criterion, optimizer, device)

        val_loss, val_acc, _, _ = validate_epoch(model, val_loader, criterion, device)

        lr_before = optimizer.param_groups[0]['lr']
        scheduler.step(val_loss)
        lr_after = optimizer.param_groups[0]['lr']
        if lr_after != lr_before:
            print(f"Learning rate reduced from {lr_before:.2e} to {lr_after:.2e}")

        # Save best model. The first epoch is always saved so a checkpoint
        # exists even if validation accuracy never rises above 0%.
        if epoch == 0 or val_acc > best_val_acc:
            best_val_acc = val_acc
            torch.save(model.state_dict(), checkpoint_path)
            print(f"New best model saved! Accuracy: {best_val_acc:.2f}%")

        # Store history
        history['train_loss'].append(train_loss)
        history['train_acc'].append(train_acc)
        history['val_loss'].append(val_loss)
        history['val_acc'].append(val_acc)

        # Print results
        print(f"Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.2f}%")
        print(f"Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.2f}%")

    return model, history, best_val_acc

def plot_training_history(history, model_name):
    """Show loss and accuracy curves for one training run side by side.

    Args:
        history: Dict with ``'train_loss'``, ``'val_loss'``, ``'train_acc'``
            and ``'val_acc'`` sequences, one value per epoch.
        model_name: Name used as the prefix of both panel titles.
    """
    # One entry per panel: (train key, val key, train label, val label, title, y label)
    panels = [
        ('train_loss', 'val_loss', 'Train Loss', 'Validation Loss',
         f'{model_name} - Loss Over Epochs', 'Loss'),
        ('train_acc', 'val_acc', 'Train Accuracy', 'Validation Accuracy',
         f'{model_name} - Accuracy Over Epochs', 'Accuracy (%)'),
    ]

    plt.figure(figsize=(12, 4))

    for position, panel in enumerate(panels, start=1):
        train_key, val_key, train_label, val_label, title, y_label = panel
        plt.subplot(1, 2, position)
        plt.plot(history[train_key], label=train_label)
        plt.plot(history[val_key], label=val_label)
        plt.title(title)
        plt.xlabel('Epoch')
        plt.ylabel(y_label)
        plt.legend()

    plt.tight_layout()
    plt.show()

def evaluate_model(model, val_loader, model_name, classes):
    """Evaluate ``model`` on the validation set and display the results.

    Prints the validation accuracy and a per-class classification report, then
    draws the confusion matrix as a heatmap.

    Args:
        model: Trained module to evaluate (on the notebook's ``device``).
        val_loader: Loader for the validation batches.
        model_name: Name shown in the printed banner and the plot title.
        classes: Class names ordered by label index.

    Returns:
        The validation accuracy in percent.
    """
    print(f"\n{'='*60}")
    print(f"Evaluating {model_name}")
    print(f"{'='*60}")

    criterion = nn.CrossEntropyLoss()
    _, val_acc, val_preds, val_labels = validate_epoch(model, val_loader, criterion, device)

    print(f"Final Validation Accuracy: {val_acc:.2f}%")

    # Pass the full label set explicitly so the report rows and the matrix axes
    # always line up with `classes`, even when a class never occurs in the
    # validation labels or predictions.
    class_ids = list(range(len(classes)))

    # Classification report
    print("\nClassification Report:")
    print(classification_report(val_labels, val_preds, labels=class_ids,
                                target_names=classes, digits=4))

    # Confusion matrix
    cm = confusion_matrix(val_labels, val_preds, labels=class_ids)
    plt.figure(figsize=(10, 8))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=classes, yticklabels=classes)
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.title(f'{model_name} - Confusion Matrix')
    plt.xticks(rotation=45)
    plt.yticks(rotation=0)
    plt.tight_layout()
    plt.show()

    return val_acc

In [None]:
# main pipeline
def main():
    """Train, evaluate and compare every model architecture in this notebook.

    For each architecture: build the model, train it, plot its history, reload
    the best checkpoint, evaluate it and save a final bundle to
    ``<model_name lowercased>_final.pth``. A failure in one model is reported
    with its traceback and the pipeline moves on to the next model. Finally a
    comparison table and bar chart are shown for the models that finished.

    Returns:
        Dict mapping each successfully trained model name to a dict with
        ``'best_accuracy'``, ``'final_accuracy'``, ``'history'`` and
        ``'parameters'``.
    """
    import traceback  # only needed to report per-model failures

    print("Starting Air Pollution Detection Training Pipeline")
    print(f"Device: {device}")

    # Load data
    train_loader, val_loader, classes = create_data_loaders(image_size=224, batch_size=32)
    # Size the classifier heads from the classes the loaders actually use
    # rather than from a notebook-level global that may be stale or missing.
    n_classes = len(classes)

    # Define the models used
    models_to_train = [
        ("ResNet50", AirPollutionResNet),
        ("DenseNet121", AirPollutionDenseNet),
        ("EfficientNet-B0", AirPollutionEfficientNet),
        ("ViT", AirPollutionViT)
    ]
    results = {}

    for model_name, model_class in models_to_train:
        model = None
        try:
            print(f"\n{'='*80}")
            print(f"TRAINING {model_name}")
            print(f"{'='*80}")

            model = model_class(n_classes).to(device)
            param_count = sum(p.numel() for p in model.parameters())
            print(f"Model has {param_count:,} parameters")

            model, history, best_acc = train_model(
                model, train_loader, val_loader, model_name, num_epochs=15
            )

            plot_training_history(history, model_name)

            # map_location keeps the reload working on whatever device is active.
            model.load_state_dict(
                torch.load(f'best_{model_name.lower()}_model.pth', map_location=device)
            )
            final_acc = evaluate_model(model, val_loader, model_name, classes)

            results[model_name] = {
                'best_accuracy': best_acc,
                'final_accuracy': final_acc,
                'history': history,
                'parameters': param_count
            }

            torch.save({
                'model_state_dict': model.state_dict(),
                'model_class': model_class.__name__,
                'classes': classes,
                'best_accuracy': best_acc,
                'history': history,
                'parameters': param_count
            }, f'{model_name.lower()}_final.pth')

            print(f"{model_name} training completed!")

        except Exception as e:
            # Keep going with the remaining models, but show the full traceback
            # so the failure can actually be diagnosed.
            print(f"Error training {model_name}: {str(e)}")
            traceback.print_exc()
        finally:
            # Drop this model before building the next one so GPU memory does
            # not accumulate across architectures.
            model = None
            if torch.cuda.is_available():
                torch.cuda.empty_cache()

    # After training all models, we are about to compare them
    print("\n" + "="*80)
    print("FINAL COMPARISON OF ALL MODELS")
    print("="*80)

    if results:
        comparison_data = [
            {
                'Model': model_name,
                'Best Accuracy (%)': f"{result['best_accuracy']:.2f}",
                'Final Accuracy (%)': f"{result['final_accuracy']:.2f}",
                'Parameters': f"{result['parameters']:,}"
            }
            for model_name, result in results.items()
        ]

        df = pd.DataFrame(comparison_data)
        print(df.to_string(index=False))

        plt.figure(figsize=(12, 6))

        model_names = list(results.keys())
        best_accs = [results[name]['best_accuracy'] for name in model_names]
        final_accs = [results[name]['final_accuracy'] for name in model_names]

        x = np.arange(len(model_names))
        width = 0.35

        plt.bar(x - width/2, best_accs, width, label='Best Validation Accuracy', alpha=0.8)
        plt.bar(x + width/2, final_accs, width, label='Final Validation Accuracy', alpha=0.8)

        plt.xlabel('Models')
        plt.ylabel('Accuracy (%)')
        plt.title('Model Comparison - Air Pollution Detection')
        plt.xticks(x, model_names)
        plt.legend()
        plt.grid(axis='y', alpha=0.3)

        # Annotate each bar with its value just above the bar top.
        for i, (best, final) in enumerate(zip(best_accs, final_accs)):
            plt.text(i - width/2, best + 0.5, f'{best:.1f}%', ha='center', va='bottom')
            plt.text(i + width/2, final + 0.5, f'{final:.1f}%', ha='center', va='bottom')

        plt.tight_layout()
        plt.show()
    else:
        print("No model finished training successfully; nothing to compare.")

    print("\nTraining pipeline completed!")
    return results

# Run the main pipeline.
# The guard lets the cell run as-is in a notebook while avoiding an automatic
# training run if this code is ever imported as a module. `results` holds the
# per-model metrics dict returned by main().
if __name__ == "__main__":
    results = main()