# Prepare Data

## Setup

Create a conda environment to do the CLIP embedding.
This won't be used after the vectors are generated.

To install the conda environment, run:
```shell
source /opt/conda/bin/activate
conda create --yes -n clip
conda activate clip
conda install --yes -c pytorch torchvision cudatoolkit ipykernel pandas pyarrow
pip install git+https://github.com/openai/CLIP.git
```

Install instructions from https://github.com/openai/CLIP/tree/main#usage

To add a conda environment as a kernel for jupyter, run:
```shell
conda activate clip
conda install ipykernel
python -m ipykernel install --user --name clip --display-name clip
```
Then in the top right corner of the notebook, switch the kernel to `clip`.

To list the installed kernels (requires `jupyter` to be installed), run:
```shell
jupyter kernelspec list
```
To remove an installed kernel, run:
```shell
jupyter kernelspec uninstall clip
```

In [None]:
import gzip
import tarfile
import urllib.request
from itertools import islice
from pathlib import Path

import clip
import numpy as np
import pandas as pd
import pyarrow as pa
import torch
import matplotlib.pyplot as plt
from PIL import Image
from tqdm import tqdm

In [None]:
def batched(iterable, n):
    """Yield consecutive tuples of at most ``n`` items from ``iterable``.

    batched('ABCDEFG', 3) → ABC DEF G

    The last tuple is shorter when the input does not divide evenly.
    Raises ValueError (on first iteration) when ``n`` is less than one.
    """
    if n < 1:
        raise ValueError("n must be at least one")
    source = iter(iterable)
    while True:
        chunk = tuple(islice(source, n))
        if not chunk:
            # Source exhausted: stop the generator.
            return
        yield chunk

In [None]:
# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

In [None]:
# Display the model names the installed `clip` package reports as available.
clip.available_models()

In [None]:
# Load the ViT-B/32 CLIP model onto `device`. `preprocess` is the callable
# applied to each PIL image before it is stacked and passed to the model.
model, preprocess = clip.load("ViT-B/32", device=device)

In [None]:
# Collect a few facts about the loaded model and print them one per line.
n_parameters = np.sum([int(np.prod(p.shape)) for p in model.parameters()])
input_resolution = model.visual.input_resolution
context_length = model.context_length
vocab_size = model.vocab_size

model_summary = (
    ("Model parameters:", f"{n_parameters:,}"),
    ("Input resolution:", input_resolution),
    ("Context length:", context_length),
    ("Vocab size:", vocab_size),
)
for label, value in model_summary:
    print(label, value)

In [None]:
# Root directory (relative to the working directory) under which every
# dataset in this notebook is downloaded and written.
data_path = Path("data/")

## MNIST

The [MNIST Dataset](http://yann.lecun.com/exdb/mnist/) contains 70,000 images of handwritten digits divided across 10 classes. Each class contains roughly 7,000 images. It is further divided by writer into 60,000 train and 10,000 test examples.

```
@misc{mnist,
  title={The MNIST database of handwritten digits},
  author={LeCun, Yann and Cortes, Corinna and Burges, CJ},
  howpublished={\url{http://yann.lecun.com/exdb/mnist/}},
  year={1998},
}
```

### Download

In [None]:
# MNIST archive names: the image file then the label file, first for the
# "train" prefix and then for the "t10k" (test) prefix.
filenames = tuple(
    f"{prefix}-{kind}-ubyte.gz"
    for prefix in ("train", "t10k")
    for kind in ("images-idx3", "labels-idx1")
)

In [None]:
# http://yann.lecun.com/exdb/mnist/
# has blacklisted the default urllib user agent.
# https://github.com/pytorch/vision/issues/3497#issuecomment-789996883
# Download from the Google Cloud Storage (CVDF) mirror instead.
mnist_path = data_path / "mnist"
# urlretrieve does not create missing directories, so make sure the target exists.
mnist_path.mkdir(parents=True, exist_ok=True)
for filename in filenames:
    target = mnist_path / filename
    if target.exists():
        # Already downloaded; delete the file to force a fresh download.
        continue
    url = f"https://storage.googleapis.com/cvdf-datasets/mnist/{filename}"
    _ = urllib.request.urlretrieve(url, str(target))

In [None]:
def _read_idx_count(file, header_size):
    """Consume an IDX header and return the item count stored in bytes 4-8.

    IDX headers consist of big-endian 32-bit integers: a magic number,
    the number of items, and (for image files) the row and column counts.
    """
    header = file.read(header_size)
    return int.from_bytes(header[4:8], "big")


image_size = 28
images = dict()
labels = dict()
n_examples = {"train": 60_000, "test": 10_000}
for filename in filenames:
    if filename.startswith("train"):
        split = "train"
    elif filename.startswith("t10k"):
        # Are test examples.
        split = "test"
    else:
        # Without this branch `split` would be unbound or left over from
        # the previous iteration.
        raise ValueError(f"Unrecognized MNIST file name: {filename}")
    expected = n_examples[split]
    with gzip.open(data_path / "mnist" / filename, "rb") as file:
        if "images" in filename:
            header_size = 16
        elif "labels" in filename:
            header_size = 8
        else:
            continue
        # Fail loudly on a truncated or mismatched archive instead of
        # silently producing fewer examples than expected.
        found = _read_idx_count(file, header_size)
        if found != expected:
            raise ValueError(
                f"{filename}: header lists {found} items, expected {expected}")
        if "images" in filename:
            # One unsigned byte per pixel, image_size x image_size per example.
            buffer = file.read(image_size * image_size * expected)
            images[split] = np.frombuffer(
                buffer,
                dtype=np.uint8,
            ).reshape(expected, image_size, image_size)
        else:
            # One unsigned byte per label.
            buffer = file.read(expected)
            labels[split] = np.frombuffer(buffer, dtype=np.uint8)

In [None]:
# One frame per split: the label, the split name, and one 2-D image array per row.
df_train, df_test = (
    pd.DataFrame({
        "class": labels[split_name],
        "split": split_name,
        "image": list(images[split_name]),
    })
    for split_name in ("train", "test")
)

In [None]:
# Stack train above test and renumber the rows 0..N-1.
df = pd.concat([df_train, df_test], ignore_index=True)

In [None]:
# Replace every stored image with its own uint8 copy.
df["image"] = df["image"].map(lambda pixels: np.array(pixels, dtype=np.uint8))

In [None]:
# Number of rows per digit class.
df.groupby("class").size()

### Embed

In [None]:
clip_vectors = []
n = 64  # Batch size.
# Number of batches, rounded up, so tqdm can show progress.
total = len(df["image"]) // n + (len(df["image"]) % n > 0)
# Inference only: disable autograd so no graph is kept per batch.
with torch.no_grad():
    for batch in tqdm(batched(df["image"], n), total=total):
        image_input = [preprocess(Image.fromarray(x, mode="L")) for x in batch]
        image_features = model.encode_image(torch.stack(image_input).to(device))
        # .numpy() raises on CUDA tensors, so copy to host memory first.
        # Cast to float32 so the stored vectors have the same dtype whether
        # the model ran on CPU or (possibly in half precision) on GPU.
        clip_vectors.append(image_features.float().cpu().numpy())
clip_vectors = np.vstack(clip_vectors)

In [None]:
# Store each embedding row as a plain Python list in the "clip" column.
df["clip"] = clip_vectors.tolist()

In [None]:
# Scale every embedding to unit Euclidean length (row-wise L2 normalization).
row_norms = np.linalg.norm(clip_vectors, axis=1, keepdims=True)
df["clip_norm"] = (clip_vectors / row_norms).tolist()

In [None]:
# Convert every image to a nested (28 x 28) list before writing the frame out.
# np.asarray keeps the cell idempotent: on a re-run the column already holds
# lists, which have no .reshape of their own.
df["image"] = df["image"].apply(
    lambda x: np.asarray(x, dtype=np.uint8).reshape((28, 28)).tolist())

In [None]:
# Write the labels, images and both embedding columns to a single parquet file.
df.to_parquet(data_path / "mnist" / "mnist.parquet")

### Preprocessing

PyTorch expects tensors to be in channels first format, unlike TensorFlow which uses channels last. https://pytorch.org/blog/tensor-memory-format-matters/

`preprocess` takes care of grayscale-to-RGB conversion, channels-first formatting, and ensuring the image size is larger than the kernel size of (32 x 32).

In [None]:
def grayscale_to_rgb(image: np.ndarray) -> np.ndarray:
    """Converts a channels-first grayscale image to RGB.

    Repeats the image 3 times along the first axis. This is the
    channels-first counterpart of tensorflow.image.grayscale_to_rgb,
    which repeats along the last axis instead.
    The input image's first dimension must be size 1, e.g. an input of
    shape (1, H, W) gives an output of shape (3, H, W).
    """
    return np.repeat(image, 3, axis=0)

In [None]:
# Show the first stored digit with its label as the title.
fig, ax = plt.subplots()
ax.imshow(df.loc[0, "image"], cmap="gray")
ax.set_title(f'Digit: {df.loc[0, "class"]}')
plt.show()

In [None]:
# Show one channel of the same digit after CLIP preprocessing.
# By this point the "image" column holds nested lists (converted for parquet),
# and Image.fromarray needs an array, so rebuild a uint8 array first.
sample_image = np.asarray(df.loc[0, "image"], dtype=np.uint8)
plt.figure()
plt.imshow(preprocess(Image.fromarray(sample_image, mode="L")).numpy()[1], cmap="gray")
plt.title(f'Digit: {df.loc[0, "class"]}')
plt.show()

## Pets

The [Oxford-IIIT Pet Dataset](https://www.robots.ox.ac.uk/~vgg/data/pets/) contains 7,390 images divided across 37 classes of cat and dog breeds. Each class contains roughly 200 images.

```
@misc{pets,
  title={The Oxford-IIIT PET Dataset},
  author={Parkhi, Omkar M and Vedaldi, Andrea and Zisserman, Andrew and Jawahar, CV},
  howpublished={\url{https://www.robots.ox.ac.uk/~vgg/data/pets/}},
  year={2012},
}
```

### Download

In [None]:
# Download and unpack the annotation archive unless it is already extracted.
pets_path = data_path / "pets"
if not (pets_path / "annotations").exists():
    # urlretrieve does not create missing directories.
    pets_path.mkdir(parents=True, exist_ok=True)
    annotations_archive = pets_path / "annotations.tar.gz"
    urllib.request.urlretrieve(
        "https://thor.robots.ox.ac.uk/~vgg/data/pets/annotations.tar.gz",
        str(annotations_archive))

    # The context manager closes the archive even if extraction fails.
    with tarfile.open(str(annotations_archive), "r:gz") as tar:
        tar.extractall(path=str(pets_path))

In [None]:
# Download and unpack the image archive unless it is already extracted.
pets_path = data_path / "pets"
if not (pets_path / "images").exists():
    # urlretrieve does not create missing directories.
    pets_path.mkdir(parents=True, exist_ok=True)
    images_archive = pets_path / "images.tar.gz"
    urllib.request.urlretrieve(
        "https://thor.robots.ox.ac.uk/~vgg/data/pets/images.tar.gz",
        str(images_archive))

    # The open mode belongs to tarfile.open, not to str(); the context
    # manager closes the archive even if extraction fails.
    with tarfile.open(str(images_archive), "r:gz") as tar:
        tar.extractall(path=str(pets_path))

In [None]:
# Read the space-separated annotation list; lines starting with "#" are
# treated as comments and the columns are named explicitly.
annotation_columns = ["image", "class_id", "species_id", "breed_id"]
annotations_file = data_path / "pets" / "annotations" / "list.txt"
df_annotations = pd.read_csv(
    annotations_file, sep=" ", comment="#", names=annotation_columns)

In [None]:
# The class is the image name with its final "_"-separated piece removed.
annotation_name_parts = df_annotations["image"].str.split("_")
df_annotations["class"] = annotation_name_parts.str[:-1].str.join("_")

In [None]:
# One row per distinct (class, class_id) pair.
class_labels = df_annotations[["class", "class_id"]].drop_duplicates()

In [None]:
# Map each class_id to its class name.
label_encoder = dict(zip(class_labels["class_id"], class_labels["class"]))

`class_id` is a globally unique class id from 1 to 37; `species_id` is either 1 for cat or 2 for dog; `breed_id` is a class id that is only unique within a given species.

### Embed

In [None]:
# Collect every .jpg in the images directory. Sorting gives a fixed order, so
# the rows of the embedding matrix line up the same way on every run.
image_paths = sorted((data_path / "pets" / "images").glob("*.jpg"))

In [None]:
clip_vectors = []
n = 64  # Batch size.
# Number of batches, rounded up, so tqdm can show progress.
total = len(image_paths) // n + (len(image_paths) % n > 0)
# Inference only: disable autograd so no graph is kept per batch.
with torch.no_grad():
    for batch in tqdm(batched(image_paths, n), total=total):
        image_input = []
        for x in batch:
            # Close each file as soon as it has been preprocessed instead of
            # leaving thousands of handles open.
            with Image.open(x) as image:
                image_input.append(preprocess(image))
        image_features = model.encode_image(torch.stack(image_input).to(device))
        # .numpy() raises on CUDA tensors, so copy to host memory first.
        # Cast to float32 so the stored vectors have the same dtype whether
        # the model ran on CPU or (possibly in half precision) on GPU.
        clip_vectors.append(image_features.float().cpu().numpy())
clip_vectors = np.vstack(clip_vectors)

In [None]:
# One row per image file: its path as a string and its name without extension.
path_strings = list(map(str, image_paths))
image_names = [image_path.stem for image_path in image_paths]
df = pd.DataFrame({"path": path_strings, "image": image_names})

In [None]:
# Store each embedding row as a plain Python list in the "clip" column.
# Rows of clip_vectors follow the order of image_paths, as do the rows of df.
df["clip"] = clip_vectors.tolist()

In [None]:
# Scale every embedding to unit Euclidean length (row-wise L2 normalization).
row_norms = np.linalg.norm(clip_vectors, axis=1, keepdims=True)
df["clip_norm"] = (clip_vectors / row_norms).tolist()

In [None]:
# The class is the image name with its final "_"-separated piece removed.
image_name_parts = df["image"].str.split("_")
df["class"] = image_name_parts.str[:-1].str.join("_")

In [None]:
# Class names starting with an upper-case letter are mapped to "cat",
# all others to "dog".
starts_upper = df["class"].str[0].str.isupper()
species_by_case = {True: "cat", False: "dog"}
df["species"] = starts_upper.map(species_by_case)

In [None]:
# The final "_"-separated piece of the image name, parsed as an integer.
image_number_text = df["image"].str.split("_").str[-1]
df["image_n"] = image_number_text.astype(int)

In [None]:
# Order rows by class, then by image number, and renumber them 0..N-1.
df = df.sort_values(by=["class", "image_n"], ignore_index=True)

In [None]:
# Write the image paths, derived labels and both embedding columns to parquet.
df.to_parquet(data_path / "pets" / "pets_clip.parquet")

### Incomplete Annotations

Some images were not listed in the given annotations. The given annotations were discarded and complete ones built instead.

In [None]:
# Compare how many rows the annotation list has with how many images were embedded.
n_annotations = df_annotations.shape[0]
n_images = clip_vectors.shape[0]
print(f"Number of annotations: {n_annotations}")
print(f"Number of images: {n_images}")

In [None]:
# Count the newline characters in each split file as a proxy for its number of entries.
test_listing = (data_path / "pets/annotations/test.txt").read_text()
trainval_listing = (data_path / "pets/annotations/trainval.txt").read_text()
n_test = test_listing.count("\n")
n_train = trainval_listing.count("\n")
print(n_test, n_train, n_test + n_train)

In [None]:
# Number of image files whose name does not appear in the annotation list.
is_annotated = df["image"].isin(df_annotations["image"])
len(df.loc[~is_annotated, "image"])

In [None]:
# Annotation entries that have no matching image file.
has_image_file = df_annotations["image"].isin(df["image"])
df_annotations.loc[~has_image_file, "image"]

In [None]:
# Rows per class in the given annotations, smallest class first.
df_annotations.groupby("class").size().sort_values()

In [None]:
# Rows per class in the frame built from the image files, smallest class first.
df.groupby("class").size().sort_values()