# Image Classification on Galileo 🔭

In [None]:
try:
    import dataquality as dq
except ImportError:
    # Upgrade pip
    !pip install -U pip &> /dev/null

    # Install all dependecies
    !pip install -U dataquality torch torchvision datasets &> /dev/null

    print('👋 Installed necessary libraries and restarting runtime! This should only need to happen once.')
    print('🙏 Continue with the rest of the notebook or hit "Run All" again!')

    # Restart the runtime
    import os, time
    time.sleep(1) # gives the print statements time to flush
    os._exit(0) # exits without allowing the next cell to run

In [2]:
import os
from typing import Optional, List

import random
import numpy as np
import torch

# Random Seeds.
def seed_all(seed: int) -> None:
    """Set all relevant seed for training a Pytorch Model.

    Based on the following post:
    https://discuss.pytorch.org/t/reproducibility-with-all-the-bells-and-whistles/81097
    """
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.cuda.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False


def seed_worker(worker_id: int) -> None:
    """Set seed for dataloader worker.

    Based on the following post:
    https://discuss.pytorch.org/t/reproducibility-with-all-the-bells-and-whistles/81097
    """
    worker_seed = torch.initial_seed() % 2**32
    np.random.seed(worker_seed)
    random.seed(worker_seed)

# Check Cuda.
print(f"torch.cuda.is_available(): {torch.cuda.is_available()}")
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# A small function for minimizing the dataset for testing purposes
import os

def _minimize_for_ci() -> bool:
    return os.getenv("MINIMIZE_FOR_CI", "false") == "true"

torch.cuda.is_available(): False


# Connect to Galileo

In [3]:
import os

import dataquality as dq

dq.init(
      task_type="image_classification",
      project_name="Image Classification Example",
      run_name="BEANS",
  )

dataquality version is v0.8.13

✨ Initializing existing public project 'Image Classification Example'
🏃‍♂️ Fetching existing run 'BEANS'
🛰 Connected to existing project 'Image Classification Example', and existing run 'BEANS'.




In [4]:
#
# Create your dataset
#

import os
import torch
from torchvision.transforms import Compose
from torchvision import transforms
import pandas as pd
from PIL import Image
from typing import Dict, Any, List, Optional


def find_label_col_name(col_names: List[str]) -> Optional[str]:
    for col_name in col_names:
        if "label" in col_name:
            return col_name
    return None


def find_imgs_location_col_name(col_names: List[str]) -> Optional[str]:
    for col_name in col_names:
        if "path" in col_name:
            return col_name
    return None


def _get_imgs_dir() -> str:
    d = f"{os.environ['HOME']}/.cache/huggingface/datasets/beans/default/0.0.0"
    for p in list(os.walk(d)):
        if p[0] != d and os.path.isdir(p[0]):
            return (p[0])
    raise Exception("Images directory not found. Did the dataset download correctly?")


STANDARD_DATA_COLUMNS_CV = ["id", "text", "label_idx"]


class ImageDatasetFromCSV(torch.utils.data.Dataset):
    def __init__(
        self,
        df: pd.DataFrame,
        imgs_dir: str,
        split: str,
        transform: Optional[Compose] = None,
    ):
        self.ds = df
        self.imgs_dir = imgs_dir
        self.transform = transform

        # Find the id column, or create it if it doesn't exist.
        if "id" not in self.ds.columns:
            self.ds = self.ds.reset_index().rename(columns={"index": "id"})

        # Find the label column name: could be label, labels, coarse_label, etc.
        self.label_col_name = find_label_col_name(self.ds.columns)
        if self.label_col_name is None:
            raise ValueError("Could not find the label column in the dataframe")
        STANDARD_DATA_COLUMNS_CV.append(self.label_col_name)
        self.list_of_labels = list(self.ds[self.label_col_name].unique())

        # Convert string labels to indexes, store them in the column label_idx.
        str_to_int = {
            label: index
            for index, label in enumerate(self.ds[self.label_col_name].unique())
        }
        self.ds["label_idx"] = self.ds[self.label_col_name].map(str_to_int)

        # Find the images paths column name: could be path, rel_path, imgs_path, etc.
        self.imgs_location_colname = find_imgs_location_col_name(self.ds.columns)
        if self.imgs_location_colname is None:
            raise ValueError(
                "Could not find the images location column in the dataframe"
            )
        STANDARD_DATA_COLUMNS_CV.append(self.imgs_location_colname)

        # Get the metadata columns.
        meta_data_cols = [
            column
            for column in self.ds.columns
            if column not in STANDARD_DATA_COLUMNS_CV
        ]

        # 🔭🌕 Galileo logging -- Input Data
        dq.log_image_dataset(
            dataset=self.ds,
            label="labels",
            split=split,
            meta=meta_data_cols,
            imgs_dir=self.imgs_dir,
            imgs_location_colname=self.imgs_location_colname,
        )

    def __getitem__(self, idx: int) -> Dict[str, Any]:
        img_path = os.path.join(
            self.imgs_dir, self.ds.loc[idx, self.imgs_location_colname]
        )
        image = Image.open(img_path)
        id = self.ds.loc[idx, "id"]
        label = self.ds.loc[idx, "label_idx"]

        if self.transform is not None:
            image = self.transform(image)

        return {"image": image, "label": label, "id": id}

    def __len__(self) -> int:
        return len(self.ds)  # type: ignore

# Load Data and Create a Dataset

In [None]:
from datasets import load_dataset
from datasets import Image as datasetsImage

import numpy as np

dataset_train = load_dataset("beans", split="train").cast_column("image", datasetsImage(decode=False))
dataset_test = load_dataset("beans", split="test").cast_column("image", datasetsImage(decode=False))

train_labels = dataset_train.features["labels"].names
train_df = dataset_train.to_pandas()
train_df["labels"] = train_df["labels"].map(lambda x: train_labels[x])

test_labels = dataset_test.features["labels"].names
test_df = dataset_test.to_pandas()
test_df["labels"] = test_df["labels"].map(lambda x: test_labels[x])

train_df = train_df.reset_index()
train_df["index"] = np.arange(len(train_df))
test_df = test_df.reset_index()
test_df["index"] = np.arange(len(test_df))

if _minimize_for_ci():
    train_df = train_df[:10]
    test_df = test_df[test_df.labels.isin(train_df.labels.unique())][:10]

data_transform = transforms.Compose(
    [
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
    ]
)

imgs_dir = _get_imgs_dir()

train_dataset = ImageDatasetFromCSV(
    df=train_df, imgs_dir=imgs_dir, split="train", transform=data_transform
)
test_dataset = ImageDatasetFromCSV(
    df=test_df, imgs_dir=imgs_dir, split="test", transform=data_transform
)

print(f"Loaded train dataset with {len(train_dataset.ds)} samples and {len(train_dataset.list_of_labels)} labels")
print(f"Loaded val dataset with {len(test_dataset.ds)} samples and {len(test_dataset.list_of_labels)} labels")

dq.set_labels_for_run(train_dataset.list_of_labels)

# Create the dataloaders and model

In [10]:
from torchvision.models import resnet50
from dataquality.integrations.torch import watch

# Some global HP.
BATCH_SIZE = 128

EPOCHS = 4
if _minimize_for_ci():
    EPOCHS = 1

# Create data loaders.
NUM_WORKERS = 0
SEED_WORKER = 42

seed_all(SEED_WORKER)

train_dataloader = torch.utils.data.DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    num_workers=NUM_WORKERS,
    worker_init_fn=seed_worker,
    pin_memory=True
)

test_dataloader = torch.utils.data.DataLoader(
    test_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
    num_workers=NUM_WORKERS,
    worker_init_fn=seed_worker,
    pin_memory=True
)

model = resnet50(pretrained=True)

# Load model and replace last layer.
model = resnet50(pretrained=True)
model.fc = torch.nn.Linear(model.fc.in_features, len(train_dataset.list_of_labels))
torch.nn.init.xavier_uniform_(model.fc.weight)

model = model.to(device)

# Set optimizer and loss.
params_1x = [  # get the original weights, they'll be updated with a lower learning rate
    param
    for name, param in model.named_parameters()
    if "fc" not in str(name)
]
lr, weight_decay = 1e-5, 5e-4
optimizer = torch.optim.Adam(
    [
        {"params": params_1x, "lr": lr},
        {"params": model.fc.parameters(), "lr": lr * 10},
    ],
    weight_decay=weight_decay,
)
criterion = torch.nn.CrossEntropyLoss()

watch(
    model=model,
    classifier_layer=model.fc,
    dataloaders=[train_dataloader, test_dataloader],
    unpatch_on_start=False
)

Attaching dataquality to model and dataloaders


# Train the model!

In [11]:
from tqdm import tqdm
from time import sleep, time

# Train !
start = time()
print(f"Training for {EPOCHS} epochs on {device}")

for epoch in range(1, EPOCHS + 1):
    print(f"Epoch {epoch}/{EPOCHS}")
    dq.set_epoch(epoch)

    model.train()
    train_loss = torch.tensor(0.0, device=device)
    train_correct = torch.tensor(0, device=device)
    
    dq.set_split("train")
    with tqdm(train_dataloader, unit="batch") as train_minibatchs:
        for train_minibatch in train_minibatchs:
            train_minibatchs.set_description(f"Epoch {epoch}")

            images = train_minibatch["image"].to(device)
            labels = train_minibatch["label"].to(device)

            preds = model(images)
            loss = criterion(preds, labels)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        with torch.no_grad():
            train_loss += loss
            train_batch_correct = (torch.argmax(preds, dim=1) == labels).sum()
            train_correct += train_batch_correct

        train_minibatchs.set_postfix(batch_loss=loss.item(), batch_accuracy=float(train_batch_correct) / BATCH_SIZE)
        sleep(0.01)

    print(f"Training loss: {train_loss:.2f}")
    print(f"Training accuracy: {100 * float(train_correct) / len(train_dataloader.dataset):.2f}")
    
    dq.set_split("test")
    if test_dataloader is not None:
        model.eval()
        val_loss = torch.tensor(0.0, device=device)
        val_correct = torch.tensor(0, device=device)

        with torch.no_grad():
            for val_minibatch in tqdm(test_dataloader):
                images = val_minibatch["image"].to(device)
                labels = val_minibatch["label"].to(device)
                
                preds = model(images)
                loss = criterion(preds, labels)

                val_loss += loss
                val_correct += (torch.argmax(preds, dim=1) == labels).sum()

        print(f"Validation loss: {val_loss:.2f}")
        print(f"Validation accuracy: {100*val_correct/len(test_dataloader.dataset):.2f}")

end = time()
print(f"Total training time: {end-start:.1f} seconds")
dq.finish()
print("done!")

Training for 1 epochs on cpu
Epoch 1/1


Epoch 1: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 9/9 [04:28<00:00, 29.79s/batch]


Training loss: 0.71
Training accuracy: 0.58


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:08<00:00,  8.77s/it]


Validation loss: 0.85
Validation accuracy: 64.06
Total training time: 277.0 seconds
☁️ Uploading Data


training:   0%|          | 0/1 [00:00<?, ?it/s]

Processing data for upload:   0%|          | 0/9 [00:00<?, ?it/s]

training (epoch=1):   0%|          | 0/3 [00:00<?, ?it/s]

Uploading data to Galileo:   0%|          | 0.00/8.10M [00:00<?, ?B/s]

Uploading data to Galileo:   0%|          | 0.00/47.9k [00:00<?, ?B/s]

Uploading data to Galileo:   0%|          | 0.00/183M [00:00<?, ?B/s]

test:   0%|          | 0/1 [00:00<?, ?it/s]

Processing data for upload:   0%|          | 0/1 [00:00<?, ?it/s]

test (epoch=1):   0%|          | 0/3 [00:00<?, ?it/s]

Uploading data to Galileo:   0%|          | 0.00/1.01M [00:00<?, ?B/s]

Uploading data to Galileo:   0%|          | 0.00/17.9k [00:00<?, ?B/s]

Uploading data to Galileo:   0%|          | 0.00/22.5M [00:00<?, ?B/s]

Job default successfully submitted. Results will be available soon at https://console.cloud.rungalileo.io/insights?projectId=736c3b50-458c-4b11-8232-f771e27ed0d4&runId=b906bc90-d7aa-4d3c-a1a9-a4ec22b7777b&split=training&metric=f1&depHigh=1&depLow=0&taskType=3
Waiting for job...
	Found embs. Analyzing dimensions
	Applying dimensionality reduction to embs
	Looking for data embeddings
	No data embs found, skipping processing
	Saving processed training data
	Calculating test data error potential
	Saving processed test data
Done! Job finished with status completed
Click here to see your run! https://console.cloud.rungalileo.io/insights?projectId=736c3b50-458c-4b11-8232-f771e27ed0d4&runId=b906bc90-d7aa-4d3c-a1a9-a4ec22b7777b&split=training&metric=f1&depHigh=1&depLow=0&taskType=3
🧹 Cleaning up
🧹 Cleaning up
done!
