# Project: Image Retrieval using CLIP Model 
### Datasets:
1. Web-scraped Google Images with captions generated by ChatGPT
2. Google's Conceptual Captions dataset

### Notebooks:

* Notebook 1: Dataset Collection
* Notebook 2: Training CLIP Model
* Notebook 3: Evaluation and Inference
* Model Deployment: Docker + FastAPI 

# Importing Libraries

In [1]:
!pip install datasets



In [2]:
import pandas as pd
from datasets import Dataset, load_dataset, load_from_disk, concatenate_datasets
import os
import threading
import requests
from PIL import Image
import numpy as np
import matplotlib.pyplot as plt
from io import BytesIO
import shutil

# Data Collection

In [3]:
# Read only the first 65,000 (caption, image_url) rows of the Conceptual
# Captions TSV. Limiting the read with `nrows` avoids parsing the full file
# and then running a per-row Python filter just to keep the leading rows.
captions_df = pd.read_csv(
    "/kaggle/input/gcc-clip/Train_GCC-training.tsv",
    sep="\t",
    header=None,  # the file has no header row; columns arrive as 0 and 1
    nrows=65000,
).rename(columns={0: "caption", 1: "image_url"})

# Convert to a Hugging Face Dataset so the images can be attached with `map`.
dataset = Dataset.from_pandas(captions_df)
dataset

  0%|          | 0/3319 [00:00<?, ?ba/s]

Dataset({
    features: ['caption', 'image_url'],
    num_rows: 65000
})

In [4]:
# Snapshot the runtime's parallelism: CPU count as reported by the OS and the
# number of Python threads currently alive (the main thread is counted too).
num_processors, num_threads = os.cpu_count(), threading.active_count()

print("Number of available processors:", num_processors)
print("Number of active threads:", num_threads)

Number of available processors: 4
Number of active threads: 8


In [5]:
def _mark_download_failed(instance):
    """Flag ``instance`` for removal and attach a placeholder image.

    A tiny all-black 8x8 RGB image is stored under ``"image_data"`` so every
    row carries an image value, and ``"drop_this_row"`` is set to ``"yes"``.
    """
    instance["image_data"] = Image.fromarray(np.zeros((8, 8, 3), dtype=np.uint8))
    instance["drop_this_row"] = "yes"
    return instance


def download_and_process_images(instance, timeout=1):
    """Download the image referenced by ``instance["image_url"]``.

    Parameters
    ----------
    instance : dict
        A single example; must provide the ``"image_url"`` key.
    timeout : float, optional
        Seconds passed to ``session.get`` as its ``timeout`` (default 1,
        the value previously hard-coded).

    Returns
    -------
    dict
        The same ``instance``, updated in place with two keys:
        ``"image_data"`` (the decoded PIL image, or an 8x8 black placeholder
        on failure) and ``"drop_this_row"`` (``"no"`` on success, ``"yes"``
        on any failure).

    Notes
    -----
    This function is best-effort: any ``Exception`` (timeout, connection
    error, non-image payload, missing key, ...) marks the row for dropping
    instead of propagating.
    """
    try:
        # The context manager closes the session even if an unexpected
        # error escapes the handlers below.
        with requests.Session() as session:
            response = session.get(instance["image_url"], timeout=timeout)

        if response.status_code != 200:
            return _mark_download_failed(instance)

        image = Image.open(BytesIO(response.content))
        # Image.open is lazy: force a full decode now so truncated or corrupt
        # downloads are rejected here rather than failing later when the
        # image data is actually read.
        image.load()
    except Exception:
        return _mark_download_failed(instance)

    instance["image_data"] = image
    instance["drop_this_row"] = "no"
    return instance

In [6]:
# Attach the downloaded image (or a failure flag) to every example, one
# example per call; `num_proc` is not passed, so the library default applies.
dataset = dataset.map(
    download_and_process_images,
    batched=False,
)

  0%|          | 0/65000 [00:00<?, ?ex/s]



In [7]:
# Keep only the rows whose download succeeded, then discard the helper flag
# and the source URL so that just the caption and the image remain.
dataset = (
    dataset
    .filter(lambda row: row["drop_this_row"] == "no", with_indices=False)
    .remove_columns(["drop_this_row", "image_url"])
)
dataset

  0%|          | 0/65 [00:00<?, ?ba/s]

Dataset({
    features: ['caption', 'image_data'],
    num_rows: 53540
})

In [8]:
# Persist the filtered image-caption dataset to the Kaggle working directory.
save_path = "/kaggle/working/Google_Conceptual_Caption_Dataset_Images_Captions"

try:
    dataset.save_to_disk(save_path)
except Exception as error:
    # Best-effort save: keep the notebook running, but report what actually
    # went wrong instead of hiding it (and do not swallow KeyboardInterrupt).
    print("Error in saving dataset locally:", repr(error))
else:
    # Only reported when the save really succeeded.
    print("Dataset saved to", save_path)

Flattening the indices:   0%|          | 0/54 [00:00<?, ?ba/s]

yo


In [9]:
# from huggingface_hub import notebook_login
# from huggingface_hub import login

# from kaggle_secrets import UserSecretsClient
# user_secrets = UserSecretsClient()
# secret_value_0 = user_secrets.get_secret("SHIRSH_HUGGINGFACE_API_KEY")

# login(token=secret_value_0, write_permission=True)
# flag = False
# try:
#     dataset.push_to_hub("shirsh10mall/Image_Captioning_Dataset")
#     flag=True
# except:
#     print("Error in pushing dataset to Hugging Face")

In [10]:
# if flag:
#     try:
#         dataset.save_to_disk("/kaggle/working/Google_Conceptual_Caption_Dataset_Images_Captions")
#     except:
#         print("Error in saving dataset locally")
        
# dataset

In [11]:
# source_path = "/kaggle/input/scraping-google-images-by-input-text-query"  # Replace with the actual source file path
# destination_path = "/kaggle/working/GoogleImage_ChatGPT_WebScraped_dataset"  # Replace with the actual destination folder path

# filenames = ["/dataset.arrow", "/dataset_info.json", "/state.json"]

# for file in filenames:
#     # Copy each file from source to destination
#     shutil.copy(source_path+file, destination_path)
    
# def convert_array_to_PIL_Image(instance):
#     instance["image_data"] = Image.fromarray(np.array( instance["image_data"], dtype=np.uint8 ) )
#     return instance

# caption_image_dataset_scraped_google_images = load_from_disk("/kaggle/working/GoogleImage_ChatGPT_WebScraped_dataset")
# caption_image_dataset_scraped_google_images = caption_image_dataset_scraped_google_images.map( convert_array_to_PIL_Image )
# caption_image_dataset_scraped_google_images

# # dataset = concatenate_datasets([dataset,caption_image_dataset_scraped_google_images])
# # dataset

# Import Data

In [12]:
# os.makedirs("/kaggle/working/Google_Conceptual_Caption_Dataset_Images_Captions", exist_ok=True)

# source_path = "/kaggle/input/image-retrieval-clip-training-conceptual-caption"  # Replace with the actual source file path
# destination_path = "/kaggle/working/Google_Conceptual_Caption_Dataset_Images_Captions"  # Replace with the actual destination folder path

# # filenames = ["/Google_Conceptual_Caption_Dataset_Images_Captions/dataset.arrow", "/Google_Conceptual_Caption_Dataset_Images_Captions/dataset_info.json", "/Google_Conceptual_Caption_Dataset_Images_Captions/state.json"]

# import os; filenames = os.listdir(source_path+"/Google_Conceptual_Caption_Dataset_Images_Captions")

# filenames = [os.path.join(source_path, "Google_Conceptual_Caption_Dataset_Images_Captions/"+file) for file in filenames]

# for file in filenames:
#     # Copy each file from source to destination
#     shutil.copy(file, destination_path)

In [13]:
# dataset = load_from_disk('/kaggle/working/Google_Conceptual_Caption_Dataset_Images_Captions')
# dataset

# Display a Few Image-Caption Pairs

In [14]:
# for index in range(10,20):
#     # Display the image using matplotlib
#     image_array = dataset[index]["image_data"]
#     caption = dataset[index]["caption"]
#     plt.imshow(image_array)
#     plt.axis('off')  # Turn off axes
#     print("\n\n\n Caption: ", caption )
#     plt.show()