In [None]:
# Install the packages listed in requirements.txt using the %pip magic
%pip install -r requirements.txt

In [None]:
import pandas as pd
import os
from dotenv import load_dotenv

# Pull environment variables in from the .env file one directory up
env_path = '../.env'
load_dotenv(dotenv_path=env_path)

# Load the tab-separated photos table; its first row supplies the column names
photos_csv = pd.read_csv("dataset/photos.tsv000", sep='\t', header=0)

# Columns carried forward for each photo, in the order later cells index them
photo_columns = [
    'photo_id',
    'photo_image_url',
    'photo_description',
    'photo_width',
    'photo_height',
    'photo_aspect_ratio',
    'photo_location_name',
    'ai_description',
]

# One plain Python list per photo, holding the values of the columns above
photos_info_list = photos_csv[photo_columns].values.tolist()

# Report the dataset size and show the first record as a sanity check
print(f'Photos in the dataset: {len(photos_info_list)}')
print(f'Sample photo list-item: {photos_info_list[0]}')


In [None]:
# Download the photos in the local folder
from pathlib import Path
import pandas as pd
from tqdm import tqdm
import urllib.request
from multiprocessing.pool import ThreadPool

# Local directory that receives the downloaded photos (created when missing)
photos_download_path = Path("photos")
photos_download_path.mkdir(exist_ok=True)

# IDs of the photos already on disk, taken from the .jpg file names without extension
existing_images = {jpg_file.stem for jpg_file in photos_download_path.glob('*.jpg')}

# Keep only the rows whose photo_id has no matching .jpg yet
already_downloaded = photos_csv['photo_id'].isin(existing_images)
images_to_download = photos_csv[~already_downloaded]

print(f"Total images in TSV: {len(photos_csv)}")
print(f"Already downloaded: {len(existing_images)}")
print(f"Images to download: {len(images_to_download)}")

def download_photo(photo, download_dir=None, width=640):
    """Download a single photo as ``<photo_id>.jpg`` unless it already exists.

    The image is first written to ``<photo_id>.jpg.part`` and only renamed to
    its final name once the transfer has finished, so an interrupted download
    never leaves a truncated ``.jpg`` that later runs would mistake for a
    completed one.

    Args:
        photo: ``(photo_id, photo_url)`` pair.
        download_dir: Directory to store the file in. Defaults to the
            module-level ``photos_download_path``.
        width: Value of the ``w`` query parameter appended to the URL.

    Returns:
        None. Failures are printed instead of raised so that one bad row
        cannot abort the surrounding thread-pool loop.
    """
    photo_id, photo_url = photo
    target_dir = photos_download_path if download_dir is None else Path(download_dir)
    photo_path = target_dir / f"{photo_id}.jpg"

    # Nothing to do when the photo is already on disk
    if photo_path.exists():
        return

    # Empty cells come out of pandas as NaN (a float); concatenating that with
    # a string would raise outside the try block below
    if not isinstance(photo_url, str) or not photo_url:
        print(f"Cannot download photo {photo_id}: missing URL")
        return

    # Add the width specification, respecting a query string that is already there
    separator = "&" if "?" in photo_url else "?"
    photo_url = f"{photo_url}{separator}w={width}"

    partial_path = target_dir / f"{photo_id}.jpg.part"
    try:
        urllib.request.urlretrieve(photo_url, partial_path)
        # Publish the file under its final name only after a complete transfer
        partial_path.replace(photo_path)
    except Exception as e:
        # Best-effort removal of whatever was written before the failure
        try:
            partial_path.unlink()
        except OSError:
            pass
        print(f"Cannot download {photo_url}: {str(e)}")

# Number of concurrent download threads
threads_count = 16

# Prepare the list of (photo_id, photo_image_url) tuples to download
photos_to_download = list(images_to_download[['photo_id', 'photo_image_url']].itertuples(index=False, name=None))

# Run the downloads with a progress bar. The with-block releases the worker
# threads even when the loop is interrupted or raises, instead of leaving the
# pool alive in the kernel.
with ThreadPool(threads_count) as pool:
    for _ in tqdm(pool.imap_unordered(download_photo, photos_to_download), total=len(photos_to_download)):
        pass

# Display statistics
print(f'Photos downloaded: {len(list(photos_download_path.glob("*.jpg")))}')

In [None]:
from pinecone import Pinecone, ServerlessSpec
import os

# Read the Pinecone settings from the environment
api_key = os.getenv('PINECONE_API_KEY')
index_name = os.getenv('PINECONE_INDEX_NAME')

# Fail early with a clear message when a required setting is missing
if not api_key:
    raise ValueError("PINECONE_API_KEY environment variable is not set")
if not index_name:
    raise ValueError("PINECONE_INDEX_NAME environment variable is not set")

# Dimension used when the index has to be created; it must match the length of
# the vectors that are upserted into the index later in this notebook
embedding_dimension = 768

# Initialize Pinecone with the API key from the environment variable
pc = Pinecone(api_key=api_key)

# List all indexes
existing_indexes = pc.list_indexes()

print(f"Existing indexes: {existing_indexes}")

# Create the index only when no index with that name exists yet
existing_names = {existing.name for existing in existing_indexes}
if index_name in existing_names:
    print(f"Index {index_name} already exists")
else:
    pc.create_index(
        name=index_name,
        dimension=embedding_dimension,
        metric="cosine",
        spec=ServerlessSpec(
            cloud='aws',
            region='us-east-1'
        )
    )
    print(f"Created new index: {index_name}")

# Handle used by the later cells to upsert vectors
index = pc.Index(index_name)

print(f"Index {index_name} ready")

In [None]:
import torch
from transformers import CLIPProcessor, CLIPModel
from pathlib import Path
from PIL import Image
from tqdm import tqdm
import os
import pandas as pd
from concurrent.futures import ProcessPoolExecutor, as_completed

# Load the CLIP model named by the CLIP_MODEL_ID environment variable
model_id = os.getenv('CLIP_MODEL_ID')

# Without this check a missing variable reaches from_pretrained as None
if not model_id:
    raise ValueError("CLIP_MODEL_ID environment variable is not set")

processor = CLIPProcessor.from_pretrained(model_id)
model = CLIPModel.from_pretrained(model_id)

# Pick the best available device: CUDA GPU, then Apple MPS, then CPU
if torch.cuda.is_available():
    device = 'cuda'
elif torch.backends.mps.is_available():
    device = 'mps'
else:
    device = 'cpu'
model.to(device)

def create_image_embeddings(image):
    """Compute the CLIP image embedding for ``image`` and return it as a list.

    The image is run through the module-level ``processor`` to obtain pixel
    values, which are moved to ``device`` and passed to
    ``model.get_image_features`` with gradient tracking disabled. Only the
    first row of the result is returned.
    """
    inputs = processor(text=None, images=image, return_tensors="pt")
    pixel_values = inputs["pixel_values"].to(device)

    # Inference only: no gradients are needed
    with torch.no_grad():
        features = model.get_image_features(pixel_values)

    first_row = features[0]
    return first_row.cpu().numpy().tolist()

# Folder holding the downloaded photos
photos_path = Path("photos")

# Lookup table from photo ID to its metadata; missing text fields get a placeholder
photo_info = {
    photo_id: {
        "url": url,
        "description": "No description available" if pd.isna(description) else description,
        "width": width,
        "height": height,
        "aspect_ratio": aspect_ratio,
        "location_name": "No location available" if pd.isna(location_name) else location_name,
        "ai_description": "No AI description available" if pd.isna(ai_description) else ai_description,
    }
    for (
        photo_id,
        url,
        description,
        width,
        height,
        aspect_ratio,
        location_name,
        ai_description,
    ) in photos_info_list
}

def process_image(img_path):
    """Embed one image file and build the record that is upserted for it.

    Args:
        img_path: ``pathlib.Path`` of the image; its stem is used as the
            record ID and as the key into ``photo_info``.

    Returns:
        A dict with ``id``, ``values`` (the embedding) and ``metadata``, or
        ``None`` when anything goes wrong. Errors are printed rather than
        raised so a single bad file does not stop the whole run.
    """
    try:
        # The context manager closes the underlying file as soon as the
        # embedding has been computed; otherwise every processed image keeps
        # a file handle open until garbage collection
        with Image.open(img_path) as image:
            embedding = create_image_embeddings(image)

        photo_id = img_path.stem
        # Photos without an entry in the lookup table get empty metadata values
        info = photo_info.get(photo_id, {})
        return {
            "id": photo_id,
            "values": embedding,
            "metadata": {
                "photo_image_url": info.get("url", ""),
                "photo_description": info.get("description", ""),
                "photo_width": info.get("width", ""),
                "photo_height": info.get("height", ""),
                "photo_aspect_ratio": info.get("aspect_ratio", ""),
                "photo_location_name": info.get("location_name", ""),
                "ai_description": info.get("ai_description", "")
            }
        }
    except Exception as e:
        print(f"Error processing {img_path}: {e}")
        return None


# Imported here so this cell does not rely on an import made in another cell
from multiprocessing.pool import ThreadPool

# Number of worker threads computing embeddings
threads_count = 32

# Prepare the list of photos to process
photos_to_process = list(photos_path.glob("*.jpg"))

# Batch size for upserting to Pinecone
batch_size = 500
# Pinecone namespace that receives every vector written by this cell
upsert_namespace = "unsplashlite"
vectors = []
total_processed = 0

# Process the photos with a progress bar. The with-block releases the worker
# threads even when embedding or upserting raises part-way through.
with ThreadPool(threads_count) as pool:
    for result in tqdm(pool.imap_unordered(process_image, photos_to_process), total=len(photos_to_process)):
        # process_image returns None for files it could not handle
        if not result:
            continue

        vectors.append(result)

        # Flush a full batch to the index and start collecting the next one
        if len(vectors) >= batch_size:
            index.upsert(vectors=vectors, namespace=upsert_namespace)
            total_processed += len(vectors)
            print(f"\nUpserted batch of {len(vectors)} vectors. Total processed: {total_processed}")
            print("Current index stats:")
            print(index.describe_index_stats())
            vectors = []

# Upsert any remaining vectors
if vectors:
    index.upsert(vectors=vectors, namespace=upsert_namespace)
    total_processed += len(vectors)
    print(f"\nUpserted final batch of {len(vectors)} vectors. Total processed: {total_processed}")
    print("Final index stats:")
    print(index.describe_index_stats())

print("\nProcessing complete!")
print(f'Total photos processed: {total_processed}')