In [None]:
import os, shutil
# Copy the trained checkpoint from the `../input` dataset folder into
# `/kaggle/working/`; the model-loading cell later opens it by its bare file
# name (presumably relative to that working directory).
checkpoint_src = '../input/convnext-roberta-final-v1-20epochs/convnextroberta20.pt'
checkpoint_dst_dir = '/kaggle/working/'
shutil.copy(checkpoint_src, checkpoint_dst_dir)

Mounted at /content/gdrive


In [None]:
!pip install timm
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting timm
  Downloading timm-0.6.7-py3-none-any.whl (509 kB)
[K     |████████████████████████████████| 509 kB 33.9 MB/s 
Installing collected packages: timm
Successfully installed timm-0.6.7
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.21.1-py3-none-any.whl (4.7 MB)
[K     |████████████████████████████████| 4.7 MB 31.3 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 48.0 MB/s 
[?25hCollecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 52.4 MB/s 
Collecting huggingface-hub<1.0,>=

In [None]:
import os
import cv2
import gc
import numpy as np
import pandas as pd
import itertools
from tqdm.autonotebook import tqdm
import albumentations as A
import matplotlib.pyplot as plt

import torch
from torch import nn
import torch.nn.functional as F
import timm
from transformers import RobertaConfig, RobertaModel, RobertaTokenizer 

  import sys


In [None]:
class CFG:
    """Central configuration read by the model classes and the search cells."""

    # --- run / data settings -------------------------------------------------
    debug = False
    image_path = "../input/flickr-image-dataset/flickr30k_images/flickr30k_images"
    captions_path = "."
    batch_size = 32
    num_workers = 2

    # --- optimisation settings (not used by the cells in this notebook) -------
    head_lr = 1e-3
    image_encoder_lr = 1e-5
    text_encoder_lr = 1e-5
    weight_decay = 1e-3
    patience = 1   # presumably LR-scheduler patience -- confirm against training code
    factor = 0.8   # presumably LR-scheduler decay factor -- confirm against training code
    epochs = 5

    # Use the GPU when one is visible, otherwise fall back to the CPU
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # --- image encoder ---------------------------------------------------------
    # alternative backbone: model_name = 'vit_base_patch32_224_in21k'
    model_name = 'convnext_base_in22ft1k'
    image_embedding = 1024   # input width of the image projection head

    # --- text encoder ----------------------------------------------------------
    text_encoder_model = "roberta-base"
    text_embedding = 768     # input width of the text projection head
    text_tokenizer = "roberta-base"
    max_length = 200

    # Both flags apply to the image encoder and the text encoder alike
    pretrained = True
    trainable = True
    temperature = 1.0

    # Side length (pixels) used by the image preprocessing
    size = 224

    # --- projection head (shared by the image and text branches) ---------------
    num_projection_layers = 1
    projection_dim = 256
    dropout = 0.1

## Image Encoder

In [None]:
class ImageEncoder(nn.Module):
    """
    Encode images to a fixed size vector.

    Wraps a timm backbone created with ``num_classes=0`` and returns whatever
    that backbone's forward pass produces for the input batch.
    """

    def __init__(
        self, model_name=CFG.model_name, pretrained=CFG.pretrained, trainable=CFG.trainable
    ):
        super().__init__()
        backbone = timm.create_model(model_name, pretrained, num_classes=0)
        # Freeze or unfreeze every backbone weight according to `trainable`
        for weight in backbone.parameters():
            weight.requires_grad = trainable
        # Attribute name must stay `model`: saved checkpoints use it in their keys
        self.model = backbone

    def forward(self, x):
        features = self.model(x)
        return features

## Text Encoder

In [None]:
class TextEncoder(nn.Module):
    """Encode tokenized text into one vector per sequence using a RoBERTa model.

    The sentence embedding is the last hidden state at position
    ``target_token_idx`` (0, i.e. the first token of each sequence).
    """

    def __init__(self, model_name=CFG.text_encoder_model, pretrained=CFG.pretrained, trainable=CFG.trainable):
        super().__init__()
        # Pretrained weights when requested, otherwise a default-config RoBERTa
        self.model = (
            RobertaModel.from_pretrained(model_name)
            if pretrained
            else RobertaModel(config=RobertaConfig())
        )

        # Freeze or unfreeze every encoder weight according to `trainable`
        for weight in self.model.parameters():
            weight.requires_grad = trainable

        # Position of the token whose hidden state represents the whole sentence
        self.target_token_idx = 0

    def forward(self, input_ids, attention_mask):
        encoded = self.model(input_ids=input_ids, attention_mask=attention_mask)
        hidden_states = encoded.last_hidden_state
        sentence_embedding = hidden_states[:, self.target_token_idx, :]
        return sentence_embedding

## Projection Head

In [None]:
class ProjectionHead(nn.Module):
    """Map encoder features into the shared embedding space.

    Computes ``LayerNorm(Dropout(Linear(GELU(p))) + p)`` where
    ``p = Linear(x)`` projects from ``embedding_dim`` to ``projection_dim``.
    """

    def __init__(
        self,
        embedding_dim,
        projection_dim=CFG.projection_dim,
        dropout=CFG.dropout
    ):
        super().__init__()
        # Submodule names are part of the checkpoint's state_dict keys; keep them.
        self.projection = nn.Linear(embedding_dim, projection_dim)
        self.gelu = nn.GELU()
        self.fc = nn.Linear(projection_dim, projection_dim)
        self.dropout = nn.Dropout(dropout)
        self.layer_norm = nn.LayerNorm(projection_dim)

    def forward(self, x):
        shortcut = self.projection(x)
        # Residual branch: non-linearity, second linear layer, dropout
        branch = self.dropout(self.fc(self.gelu(shortcut)))
        return self.layer_norm(branch + shortcut)

## CLIP

In [None]:
class CLIPModel(nn.Module):
    """Two-tower image/text model whose forward pass returns a contrastive loss.

    Images and captions are encoded, projected into a common space, and
    compared pairwise; the loss is a soft-target cross entropy averaged over
    the text->image and image->text directions.
    """

    def __init__(
        self,
        temperature=CFG.temperature,
        image_embedding=CFG.image_embedding,
        text_embedding=CFG.text_embedding,
    ):
        super().__init__()
        # Submodule names are part of the checkpoint's state_dict keys; keep them.
        self.image_encoder = ImageEncoder()
        self.text_encoder = TextEncoder()
        self.image_projection = ProjectionHead(embedding_dim=image_embedding)
        self.text_projection = ProjectionHead(embedding_dim=text_embedding)
        self.temperature = temperature

    def forward(self, batch):
        """Return the mean loss for a batch dict with keys
        ``"image"``, ``"input_ids"`` and ``"attention_mask"``."""
        # Encoders run first, projections second (same order as before, so
        # dropout consumes random numbers in the same sequence).
        img_feats = self.image_encoder(batch["image"])
        txt_feats = self.text_encoder(
            input_ids=batch["input_ids"], attention_mask=batch["attention_mask"]
        )
        img_emb = self.image_projection(img_feats)
        txt_emb = self.text_projection(txt_feats)

        # Pairwise text-vs-image scores, scaled by the temperature
        logits = (txt_emb @ img_emb.T) / self.temperature

        # Soft targets: average of the within-modality similarity matrices
        img_sim = img_emb @ img_emb.T
        txt_sim = txt_emb @ txt_emb.T
        soft_targets = F.softmax(
            (img_sim + txt_sim) / 2 * self.temperature, dim=-1
        )

        per_text = cross_entropy(logits, soft_targets, reduction='none')
        per_image = cross_entropy(logits.T, soft_targets.T, reduction='none')
        per_sample = (per_image + per_text) / 2.0  # one value per batch element
        return per_sample.mean()


def cross_entropy(preds, targets, reduction='none'):
    """Cross entropy between logits and (soft) target distributions.

    Computes ``-(targets * log_softmax(preds, dim=-1))`` summed over
    dimension 1.

    Args:
        preds: Logits; log-softmax is taken over the last dimension.
        targets: Target weights with the same shape as ``preds``.
        reduction: ``"none"`` returns the per-row losses, ``"mean"`` returns
            their mean.

    Raises:
        ValueError: If ``reduction`` is neither ``"none"`` nor ``"mean"``
            (previously this case silently returned ``None``).
    """
    if reduction not in ("none", "mean"):
        raise ValueError(
            f"Unsupported reduction {reduction!r}; expected 'none' or 'mean'"
        )
    # Functional form avoids constructing a new nn.LogSoftmax module per call
    loss = (-targets * F.log_softmax(preds, dim=-1)).sum(1)
    if reduction == "mean":
        return loss.mean()
    return loss

In [None]:
def get_model(model_path):
    """Build a CLIPModel on ``CFG.device`` and load trained weights into it.

    Args:
        model_path: Path to a state_dict file saved with ``torch.save``.

    Returns:
        The model in evaluation mode. This notebook only runs inference, and
        ``torch.no_grad()`` alone does not switch off the dropout layers in
        the projection heads, so without ``eval()`` the embeddings would be
        noisy and non-deterministic. Call ``model.train()`` explicitly if the
        model is to be fine-tuned further.
    """
    model = CLIPModel().to(CFG.device)
    state_dict = torch.load(model_path, map_location=CFG.device)
    model.load_state_dict(state_dict)
    model.eval()
    return model
# Load the trained checkpoint and report how many parameters the model has
model = get_model("convnextroberta20.pt")
param_count = sum(int(weight.numel()) for weight in model.parameters())
print("Model parameters:", f"{param_count:,}")

Downloading: "https://dl.fbaipublicfiles.com/convnext/convnext_base_22k_1k_224.pth" to /root/.cache/torch/hub/checkpoints/convnext_base_22k_1k_224.pth


Downloading config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/478M [00:00<?, ?B/s]

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Model parameters: 212,803,968


# **EXAMPLE 1**

In [None]:
import pandas as pd
import clip
import torch
from PIL import Image
import os
from pathlib import Path
import urllib.request
from multiprocessing.pool import ThreadPool
import math
import numpy as np
from torchvision.transforms import Compose, Resize, CenterCrop, ToTensor, Normalize
from PIL import Image

100%|████████████████████████████████████████| 338M/338M [00:02<00:00, 151MiB/s]


In [None]:
# Image pipeline applied to every photo before it is fed to the image encoder:
# resize with bicubic interpolation, center-crop to a CFG.size square, and
# convert to a tensor.
# NOTE(review): there is no Normalize step here (Normalize is imported but
# unused) -- confirm this matches the preprocessing used during training.
preprocess_steps = [
    Resize(CFG.size, interpolation=Image.BICUBIC),
    CenterCrop(CFG.size),
    ToTensor(),
]
preprocess = Compose(preprocess_steps)

In [None]:
# Create the output folders. exist_ok=True keeps this cell re-runnable:
# without it a second run raises FileExistsError.
os.makedirs('./Unsplash/photos', exist_ok=True)
os.makedirs('./Unsplash/features', exist_ok=True)

In [None]:
# Root folder for everything this notebook stores about the Unsplash dataset
unsplash_dataset_path = Path("./Unsplash") 

In [None]:
# Load the Unsplash photo metadata table
photos = pd.read_csv("../input/unsplash-image-download-data/Unsplash/photos.csv", sep=',')

# Keep one [photo_id, photo_image_url] pair per photo for the downloader
photo_urls = photos.loc[:, ['photo_id', 'photo_image_url']].to_numpy().tolist()

# Report how many photos the table lists
print(f'Photos in the dataset: {len(photo_urls)}')

Photos in the dataset: 25000


In [None]:
# Path where the photos will be downloaded
photos_donwload_path = unsplash_dataset_path / "photos"

def download_photo(photo):
    """Download one photo into ``photos_donwload_path`` unless it already exists.

    Args:
        photo: A ``[photo_id, photo_image_url]`` pair.

    Failures are reported and skipped so that one bad URL does not abort the
    whole download; nothing is returned.
    """
    photo_id = photo[0]

    # Request the photo resized to a width of 640 pixels
    photo_url = photo[1] + "?w=640"

    # Path where the photo will be stored
    photo_path = photos_donwload_path / (photo_id + ".jpg")

    # Only download a photo if it doesn't exist
    if photo_path.exists():
        return

    try:
        urllib.request.urlretrieve(photo_url, photo_path)
    except Exception as exc:
        # `Exception` (not a bare except) lets Ctrl-C / kernel interrupts through,
        # and printing the reason makes failures diagnosable.
        print(f"Cannot download {photo_url}: {exc}")
        # Remove a partially written file; otherwise the exists() check above
        # would skip this photo on the next run and leave a corrupt image behind.
        if photo_path.exists():
            photo_path.unlink()

In [None]:
# Number of worker threads used for the download
threads_count = 16

# Start the download. map() blocks until every photo has been attempted, and
# the context manager then releases the worker threads.
with ThreadPool(threads_count) as pool:
    pool.map(download_photo, photo_urls)

# Display some statistics: count the files that are actually on disk rather
# than the rows of the table, since some downloads can fail.
photos_downloaded = sum(1 for _ in photos_donwload_path.glob("*.jpg"))
display(f'Photos downloaded: {photos_downloaded}')

Cannot download https://images.unsplash.company%20by%20Alessandro%20Desantis%20-%20Downloaded%20from%20500px_jpg.jpg?w=640
Cannot download https://images.unsplash.com/photo-1481930079977-24a345fcae85?w=640
Cannot download https://images.unsplash.com_TheBeach.jpg?w=640
Cannot download https://images.unsplash.com-grass-sun.jpg?w=640


'Photos downloaded: 25000'

In [None]:
# Set the path to the photos
photos_path = Path("./Unsplash/photos")
# List all JPGs in the folder. glob() gives no ordering guarantee, so sort the
# result: the feature-extraction loop caches results per batch index, and that
# cache is only valid if the same files land in the same batch on every run.
photos_files = sorted(photos_path.glob("*.jpg"))
# Print some statistics
print(f"Photos found: {len(photos_files)}")

Photos found: 24996


In [None]:
def compute_clip_features(photos_batch):
    """Compute L2-normalized image embeddings for a batch of photo files.

    Args:
        photos_batch: Iterable of image file paths.

    Returns:
        A numpy array with one row per photo, produced by the image encoder
        followed by the image projection head of the global ``model``.
    """
    # NOTE: `Image` must refer to PIL.Image here; a later cell rebinds the name
    # to IPython.display.Image, so re-running this after that cell will fail.
    photos_preprocessed = []
    for photo_file in photos_batch:
        # The context manager closes the file handle; convert("RGB") makes
        # grayscale / RGBA / palette images three-channel so they stack.
        with Image.open(photo_file) as photo:
            photos_preprocessed.append(preprocess(photo.convert("RGB")))

    # Use CFG.device throughout (the model lives there); the previous code also
    # referenced a global `device` that this notebook never defines.
    photos_preprocessed = torch.stack(photos_preprocessed).to(CFG.device)

    with torch.no_grad():
        # Encode the photos batch to compute the feature vectors and normalize them
        photos_features = model.image_encoder(photos_preprocessed)
        photos_features = model.image_projection(photos_features)
        photos_features = F.normalize(photos_features, p=2, dim=-1)

    # Transfer the feature vectors back to the CPU and convert to numpy
    return photos_features.cpu().numpy()

In [None]:
# Define the batch size so that it fits on your GPU. You can also do the processing on the CPU, but it will be slower.
batch_size = 16

# Path where the feature vectors will be stored
features_path = Path("./Unsplash") / "features"

# Compute how many batches are needed
batches = math.ceil(len(photos_files) / batch_size)

# Process each batch
for i in range(batches):
    print(f"Processing batch {i+1}/{batches}")

    batch_ids_path = features_path / f"{i:010d}.csv"
    batch_features_path = features_path / f"{i:010d}.npy"

    # Only do the processing if the batch wasn't processed yet
    if batch_features_path.exists():
        continue

    try:
        # Select the photos for the current batch
        batch_files = photos_files[i*batch_size : (i+1)*batch_size]

        # Compute the features for the batch
        batch_features = compute_clip_features(batch_files)

        # Save the photo IDs first. `.stem` is the file name without ".jpg",
        # i.e. the full photo ID even if the ID itself contains a dot.
        photo_ids = [photo_file.stem for photo_file in batch_files]
        photo_ids_data = pd.DataFrame(photo_ids, columns=['photo_id'])
        photo_ids_data.to_csv(batch_ids_path, index=False)

        # Save the features last: the .npy file is what marks a batch as done,
        # so it must only appear once the matching IDs file exists.
        np.save(batch_features_path, batch_features)
    except Exception as exc:
        # Keep going on a bad batch, but report why it failed. `Exception`
        # (not a bare except) still lets a kernel interrupt stop the loop.
        print(f'Problem with batch {i}: {exc!r}')

Processing batch 1/1563
Processing batch 2/1563
Processing batch 3/1563
Processing batch 4/1563
Processing batch 5/1563
Processing batch 6/1563
Processing batch 7/1563
Processing batch 8/1563
Processing batch 9/1563
Processing batch 10/1563
Processing batch 11/1563
Processing batch 12/1563
Processing batch 13/1563
Processing batch 14/1563
Processing batch 15/1563
Processing batch 16/1563
Processing batch 17/1563
Processing batch 18/1563
Processing batch 19/1563
Processing batch 20/1563
Processing batch 21/1563
Processing batch 22/1563
Processing batch 23/1563
Processing batch 24/1563
Processing batch 25/1563
Processing batch 26/1563
Processing batch 27/1563
Processing batch 28/1563
Processing batch 29/1563
Processing batch 30/1563
Processing batch 31/1563
Processing batch 32/1563
Processing batch 33/1563
Processing batch 34/1563
Processing batch 35/1563
Processing batch 36/1563
Processing batch 37/1563
Processing batch 38/1563
Processing batch 39/1563
Processing batch 40/1563
Processin

In [None]:
import numpy as np
import pandas as pd

# Per-batch files are named with a zero-padded 10-digit index (0000000000.npy,
# 0000000000.csv, ...). Match only that pattern: a plain "*.npy" / "*.csv" would
# also pick up the merged features.npy / photo_ids.csv written below when this
# cell is run a second time, duplicating every row.
batch_feature_files = sorted(features_path.glob("[0-9]" * 10 + ".npy"))

# Pair every features file with the IDs file of the same batch so that row k of
# the merged features always belongs to row k of the merged IDs.
missing_ids = [f for f in batch_feature_files if not f.with_suffix(".csv").exists()]
for features_file in missing_ids:
    print(f"Skipping {features_file.name}: no matching photo IDs file")
batch_feature_files = [f for f in batch_feature_files if f not in missing_ids]

# Load all numpy files and concatenate the features
features_list = [np.load(features_file) for features_file in batch_feature_files]
features = np.concatenate(features_list)

# Load all the photo IDs (read as strings so purely numeric IDs are not
# converted to integers)
photo_ids = pd.concat(
    [
        pd.read_csv(features_file.with_suffix(".csv"), dtype={"photo_id": str})
        for features_file in batch_feature_files
    ]
)
assert len(photo_ids) == len(features), "photo IDs and feature rows are out of sync"

# Store the merged files
np.save(features_path / "features.npy", features)
photo_ids.to_csv(features_path / "photo_ids.csv", index=False)

In [None]:
# Load the photo IDs (as strings, so purely numeric IDs are not converted)
photo_ids = pd.read_csv("./Unsplash/features/photo_ids.csv", dtype={"photo_id": str})
photo_ids = list(photo_ids['photo_id'])

# Load the features vectors
photo_features = np.load("./Unsplash/features/features.npy")
# Convert the features to a float32 tensor on CFG.device, the device the model
# and the encoded text queries live on. (The previous code branched on a global
# `device` that this notebook never defines.)
photo_features = torch.from_numpy(photo_features).float().to(CFG.device)

# Print some statistics
print(f"Photos loaded: {len(photo_ids)}")


Photos loaded: 24996


In [None]:
# Tokenizers already loaded, keyed by name, so repeated searches reuse them
_TOKENIZER_CACHE = {}


def _get_tokenizer(name):
    """Return the RobertaTokenizer for `name`, loading it only on first use."""
    if name not in _TOKENIZER_CACHE:
        _TOKENIZER_CACHE[name] = RobertaTokenizer.from_pretrained(name)
    return _TOKENIZER_CACHE[name]


def encode_search_query(search_query):
    """Embed a text query with the text branch of the global ``model``.

    Args:
        search_query: The query string.

    Returns:
        The L2-normalized output of the text encoder followed by the text
        projection head, as a tensor on ``CFG.device`` with one row (a single
        query is encoded).
    """
    # Loading the tokenizer is expensive and does not depend on the query,
    # so it is fetched from the cache instead of being rebuilt on every call.
    tokenizer = _get_tokenizer(CFG.text_tokenizer)
    query_text = tokenizer([search_query])
    batch = {
        key: torch.tensor(values).to(CFG.device)
        for key, values in query_text.items()
    }
    with torch.no_grad():
        text_encoded = model.text_encoder(
            input_ids=batch["input_ids"], attention_mask=batch["attention_mask"]
        )
        text_encoded = model.text_projection(text_encoded)
        text_encoded = F.normalize(text_encoded, p=2, dim=-1)

    # Retrieve the feature vector
    return text_encoded

In [None]:
def find_best_matches(text_features, photo_features, photo_ids, results_count=3):
    """Return the IDs of the photos most similar to the query.

    Similarity is the dot product between each row of ``photo_features`` and
    the single query row in ``text_features`` (cosine similarity when both
    are L2-normalized). At most ``results_count`` IDs are returned, best first.
    """
    # One score per photo: (num_photos, 1) -> (num_photos,)
    scores = photo_features @ text_features.T
    scores = scores.squeeze(1)

    # Ascending sort of the negated scores == descending sort of the scores
    ranking = (-scores).argsort()

    top_indices = ranking[:results_count].tolist()
    return [photo_ids[index] for index in top_indices]

In [None]:
from IPython.display import Image
from IPython.core.display import HTML

def display_photo(photo_id):
    """Show one Unsplash photo inline, followed by an empty line.

    The picture is referenced by URL (Unsplash download link with ``w=320``),
    using the IPython ``Image`` class imported at the top of this cell.
    """
    photo_image_url = f"https://unsplash.com/photos/{photo_id}/download?w=320"
    photo_widget = Image(url=photo_image_url)
    display(photo_widget)

    # Attribution link (HTML anchor to the photo page) is intentionally not shown
    print()

In [None]:
def search_in_unsplash(search_query, photo_features, photo_ids, results_count=3):
    """Run a text search over the photo embeddings and display the top hits.

    The query is embedded with ``encode_search_query``, ranked against
    ``photo_features`` with ``find_best_matches``, and each of the
    ``results_count`` best photos is shown with ``display_photo``.
    """
    query_embedding = encode_search_query(search_query)
    matched_ids = find_best_matches(
        query_embedding, photo_features, photo_ids, results_count
    )
    for matched_id in matched_ids:
        display_photo(matched_id)


In [None]:
search_query = "Two dogs playing in the snow"

search_in_unsplash(search_query, photo_features, photo_ids, 2)

Downloading:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]







In [None]:
search_query = "a man surfing on a blue green wave"

search_in_unsplash(search_query, photo_features, photo_ids, 2)







In [None]:
search_query = "Groups of people enjoying walking on a road in the forest"

search_in_unsplash(search_query, photo_features, photo_ids, 2)







In [None]:
search_query = "Friends taking a boat ride"
search_in_unsplash(search_query, photo_features, photo_ids, 2)







In [None]:
search_query = "A group of kittens sleeping next to each other"
search_in_unsplash(search_query, photo_features, photo_ids, 2)







In [None]:
search_query = "Two people walking on desert sands at sunset"
search_in_unsplash(search_query, photo_features, photo_ids, 2)







In [None]:
search_query = "Two friends standing in front of each other in the woods"
search_in_unsplash(search_query, photo_features, photo_ids, 2)







In [None]:
search_query = "Two friends sitting next to each other on the beach"
search_in_unsplash(search_query, photo_features, photo_ids, 2)







In [None]:
search_query = "A person standing on top of a high mountain"
search_in_unsplash(search_query, photo_features, photo_ids, 2)







In [None]:
search_query = "A person standing at the top of a high mountain covered in snow"
search_in_unsplash(search_query, photo_features, photo_ids, 2)







**Combine Text and Photo Search Queries**

In [None]:
def search_by_text_and_photo_in_unsplash(query_text, query_photo_id, photo_weight=0.5):
    """Search with a text query combined with a reference photo.

    The query vector is ``text_embedding + photo_weight * photo_embedding``,
    re-normalized to unit length. The reference photo is looked up by ID in
    the global ``photo_ids`` / ``photo_features``; the best match is displayed
    next to the query.

    Raises:
        ValueError: If ``query_photo_id`` is not in ``photo_ids``.
    """
    # Embed the text part of the query
    text_features = encode_search_query(query_text)

    # Look up the stored embedding of the reference photo
    reference_features = photo_features[photo_ids.index(query_photo_id)]

    # Blend the text and photo queries, then bring the result back to unit length
    blended = text_features + reference_features * photo_weight
    blended = blended / blended.norm(dim=-1, keepdim=True)

    # Only the single best match is requested
    best_photo_ids = find_best_matches(blended, photo_features, photo_ids, 1)

    # Show the query (text and reference photo) followed by the result
    print("Text query is", query_text)
    print("Photo query")
    reference_url = f"https://unsplash.com/photos/{query_photo_id}/download?w=320"
    display(Image(url=reference_url))

    print("Result for text query + photo query")
    display_photo(best_photo_ids[0])

In [None]:
search_by_text_and_photo_in_unsplash("In winter", "BeD3vjQ8SI0")

Text query is In winter
Photo query


Result for text query + photo query





In [None]:
search_by_text_and_photo_in_unsplash("In winter", "ocwmWiNAWGs")

Text query is In winter
Photo query


Result for text query + photo query





In [None]:
search_by_text_and_photo_in_unsplash("Early in the morning", "WKt6okFYtg4")

Text query is Early in the morning
Photo query


Result for text query + photo query





In [None]:
search_by_text_and_photo_in_unsplash("Many of them at night", "WKt6okFYtg4")

Text query is Many of them at night
Photo query


Result for text query + photo query





In [None]:
search_by_text_and_photo_in_unsplash("luminous at night", "WKt6okFYtg4")

Text query is luminous at night
Photo query


Result for text query + photo query





In [None]:
search_by_text_and_photo_in_unsplash("A long way", "9HDfRHhCxME")

Text query is A long way
Photo query


Result for text query + photo query





In [None]:
search_by_text_and_photo_in_unsplash("blue", "ZVcdVxjACGI")

Text query is blue
Photo query


Result for text query + photo query





In [None]:
search_by_text_and_photo_in_unsplash("savage", "8Ja70K5VvoI")

Text query is savage
Photo query


Result for text query + photo query





In [None]:
search_by_text_and_photo_in_unsplash("yellow", "Ud_H_sPU85I")

Text query is yellow
Photo query


Result for text query + photo query





In [None]:
search_by_text_and_photo_in_unsplash("blue", "Ud_H_sPU85I")

Text query is blue
Photo query


Result for text query + photo query





In [None]:
search_by_text_and_photo_in_unsplash("bird", "sYzFIusQp3Q")

Text query is bird
Photo query


Result for text query + photo query





In [None]:
search_by_text_and_photo_in_unsplash("big", "71xCMzMD1ts")

Text query is big
Photo query


Result for text query + photo query





In [None]:
search_by_text_and_photo_in_unsplash("in the sea", "BLTcKiKXbzU")

Text query is in the sea
Photo query


Result for text query + photo query





## Final words