In [1]:
import math
from pytorch_lightning.loggers import WandbLogger
from tqdm import tqdm
import numpy as np
import torch
import pytorch_lightning as pl
from filelock import FileLock
from torch.utils.data import DataLoader, random_split
from torch.nn import functional as F
from torchvision.datasets import MNIST
from torchvision import transforms
import os
import torchvision
import clip
from clip_lt.utils.labels_names import labels_names
import torch.nn as nn
from PIL import Image
import random



In [2]:
wandb_logger = WandbLogger()

[34m[1mwandb[0m: Currently logged in as: [33mrotem98[0m (use `wandb login --relogin` to force relogin)


In [3]:
dataset_dir_path = '/Volumes/black_ssd/datasets/imagenet_lt/'
# dataset_dir_path = '/Users/rotemisraeli/Documents/datasets/imagenet_lt/'

class LT_Dataset(torch.utils.data.Dataset):
    def __init__(self, root, epoch_size = 128, transforms=None):
        self.root = root
        self.transform = transforms
        self.epoch_size = epoch_size
        self.images = []
        for i in range(1000):
            self.images.append(os.listdir(f'{root}{i}'))
    def __len__(self):
        return 1000*self.epoch_size

    def __getitem__(self, index):

        label = random.randint(0, 999)
        rand_image = random.randint(0,len(self.images[label])-1)
        # print(label, len(self.images),rand_image,len(self.images[label]))

        path = f'{self.root}{label}/{self.images[label][rand_image]}'

        with open(path, 'rb') as f:
            sample = Image.open(f).convert('RGB')

        if self.transform is not None:
            sample = self.transform(sample)
        return sample, label

TRANSFORM_IMG = transforms.Compose([
    transforms.Resize(224),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225] )
])


In [4]:


class ProjectionHead(nn.Module):
    def __init__(
        self,
        embedding_dim,
        projection_dim=1000,
        dropout=0.1
    ):
        super().__init__()
        self.projection = nn.Linear(embedding_dim, projection_dim)
        self.gelu = nn.GELU()
        self.fc = nn.Linear(projection_dim, projection_dim)
        self.dropout = nn.Dropout(dropout)
        self.layer_norm = nn.LayerNorm(projection_dim)

    def forward(self, x):
        # print(x.shape)
        projected = self.projection(x)
        x = self.gelu(projected)
        x = self.fc(x)
        x = self.dropout(x)
        x = x + projected
        x = self.layer_norm(x)
        return x


class LightningMNISTClassifier(pl.LightningModule):
    def __init__(self, config, data_dir=None):
        super(LightningMNISTClassifier, self).__init__()
        self.data_dir = data_dir
        self.lr = config['lr']
        self.batch_size = config['batch_size']
        self.clip_model, self.clip_preprocess = clip.load("ViT-B/32", device=self.device)
        self.text_features = torch.load('../text_features.pt')
        self.text_features = self.text_features / self.text_features.norm(dim=-1, keepdim=True)
        self.logit_scale = (nn.Parameter(torch.ones([]) * np.log(1 / 0.07))).exp()

        self.fc = ProjectionHead(512).to(self.device)

    def forward(self, x):
        with torch.no_grad():
            image_features = self.clip_model.encode_image(x)
        out = self.fc(image_features)
        out = out#.softmax(dim=-1)

        return out

    def old_forward(self, x):
        image_features = self.clip_model.encode_image(x)
        # normalized features
        image_features = image_features / image_features.norm(dim=-1, keepdim=True)

        # cosine similarity as logits
        logits_per_image = self.logit_scale * image_features @ self.text_features.t()
        logits_per_text = logits_per_image.t()
        probs = logits_per_image#.softmax(dim=-1)
        # print(probs.shape,probs)
        return probs

    def cross_entropy_loss(self, logits, labels):
        return F.cross_entropy(logits, labels)

    def accuracy(self, logits, labels):
        _, predicted = torch.max(logits.data, 1)
        correct = (predicted == labels).sum().item()
        accuracy = correct / len(labels)
        return torch.tensor(accuracy)

    def training_step(self, train_batch, batch_idx):
        x, y = train_batch
        logits = self.forward(x)
        loss = self.cross_entropy_loss(logits, y)
        accuracy = self.accuracy(logits, y)

        self.log("ptl/train_loss", loss)
        self.log("ptl/train_accuracy", accuracy,prog_bar=True)
        return loss

    def validation_step(self, val_batch, batch_idx):
        x, y = val_batch
        logits = self.forward(x)
        loss = self.cross_entropy_loss(logits, y)
        accuracy = self.accuracy(logits, y)
        return {"val_loss": loss, "val_accuracy": accuracy}

    def validation_epoch_end(self, outputs):
        avg_loss = torch.stack([x["val_loss"] for x in outputs]).mean()
        avg_acc = torch.stack([x["val_accuracy"] for x in outputs]).mean()
        self.log("ptl/val_loss", avg_loss)
        self.log("ptl/val_accuracy", avg_acc)

    def train_dataloader(self):
        train_transforms = transforms.Compose([
            transforms.RandomResizedCrop(size=(224,224)),
            transforms.TrivialAugmentWide(),
            transforms.RandomHorizontalFlip(p=0.5),
            transforms.ToTensor(),
            transforms.Normalize(mean=(0.48145466, 0.4578275, 0.40821073), std=(0.26862954, 0.26130258, 0.27577711))
        ])
        # train_data = torchvision.datasets.ImageFolder(self.data_dir+'train/',transform=self.clip_preprocess)
        train_data = LT_Dataset(self.data_dir+'train/',epoch_size=128,transforms=train_transforms)
        return DataLoader(train_data, batch_size=int(self.batch_size),num_workers=0,shuffle=True)

    def val_dataloader(self):
        train_data = torchvision.datasets.ImageFolder(self.data_dir+'val/',transform=self.clip_preprocess)
        # train_data = LT_Dataset(self.data_dir+'val/',epoch_size=8,transforms=self.clip_preprocess)
        return DataLoader(train_data, batch_size=int(self.batch_size),num_workers=0)

    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.fc.parameters(), lr=self.lr)
        return optimizer


def train_mnist(config):
    model = LightningMNISTClassifier(config,data_dir=dataset_dir_path)
    trainer = pl.Trainer(max_epochs=20, logger=wandb_logger)

    trainer.fit(model)

In [5]:
def train_mnist_no_tune():
    config = {
        "layer_1_size": 128,
        "layer_2_size": 256,
        "lr": 4e-3,
        "batch_size": 128
    }
    train_mnist(config)

In [None]:
train_mnist_no_tune()

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs

  | Name       | Type           | Params
----------------------------------------------
0 | clip_model | CLIP           | 151 M 
1 | fc         | ProjectionHead | 1.5 M 
----------------------------------------------
152 M     Trainable params
0         Non-trainable params
152 M     Total params
611.173   Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

  rank_zero_warn(
  rank_zero_warn(


Training: 0it [00:00, ?it/s]

In [13]:
self.clip_preprocess

Error in callback <function _WandbInit._resume_backend at 0x7f8c0adf5f70> (for pre_run_cell):


BrokenPipeError: [Errno 32] Broken pipe

NameError: name 'self' is not defined

Error in callback <function _WandbInit._pause_backend at 0x7f8c0adf5e50> (for post_run_cell):


BrokenPipeError: [Errno 32] Broken pipe

In [7]:
from tqdm import tqdm
device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load("ViT-B/32", device=device)
texts = []
for i in range(1000):
    label_name = labels_names[i].split(',')[0]
    if label_name[0] in 'aouie':
        texts.append(f'a photo of an {label_name}')
    else:
        texts.append(f'a photo of a {label_name}')

# texts2 = clip.tokenize(texts).to(device)
# with torch.no_grad():
#     text_features = model.encode_text(texts2)

In [None]:
torch.save(text_features,'text_features2.pt')

In [3]:
proj = ProjectionHead(512)

In [3]:
preprocess

Compose(
    Resize(size=224, interpolation=bicubic, max_size=None, antialias=None)
    CenterCrop(size=(224, 224))
    <function _convert_image_to_rgb at 0x7f9d99280e50>
    ToTensor()
    Normalize(mean=(0.48145466, 0.4578275, 0.40821073), std=(0.26862954, 0.26130258, 0.27577711))
)

In [3]:
aa = transforms.Compose([
    transforms.RandomResizedCrop(size=(224,224)),
    transforms.TrivialAugmentWide()
    transforms.RandomHorizontalFlip(p=0.5),
    transforms.ToTensor(),
    transforms.Normalize(mean=(0.48145466, 0.4578275, 0.40821073), std=(0.26862954, 0.26130258, 0.27577711))
])

# TRANSFORM_IMG = transforms.Compose([
#     transforms.Resize(224),
#     transforms.CenterCrop(224),
#     transforms.ToTensor(),
#     transforms.Normalize(mean=[0.485, 0.456, 0.406],
#                          std=[0.229, 0.224, 0.225] )
# ])
aa

Compose(
    Resize(size=224, interpolation=bicubic, max_size=None, antialias=None)
    CenterCrop(size=(224, 224))
    ToTensor()
    Normalize(mean=(0.48145466, 0.4578275, 0.40821073), std=(0.26862954, 0.26130258, 0.27577711))
)

In [8]:

preprocess(torch.zeros((3,533,544)))






AttributeError: 'Tensor' object has no attribute 'convert'