In [2]:
# Initialize cloud storage
from google.cloud import storage
import csv
import requests
from time import sleep


BUCKET_NAME = "cloud-ai-platform-e215f7f7-a526-4a66-902d-eb69384ef0c4"
FILENAME = 'image_search/out-7123.csv'

def download_image(url, timeout=30):
  """Download *url* and return the response body as bytes.

  The request is attempted up to three times. Any ``requests`` failure
  (bad HTTP status, connection error, timeout, ...) counts as a failed
  attempt; between attempts the function sleeps ``2 ** attempt`` seconds.

  Args:
    url: Address of the image to fetch.
    timeout: Seconds to wait for the server before an attempt is abandoned.
      Without a timeout a stalled connection would block forever.

  Returns:
    The raw response content as ``bytes``, or ``None`` when every attempt
    failed.
  """
  max_retries = 3
  backoff_factor = 2  # Multiplier for increasing wait time between retries

  for attempt in range(1, max_retries + 1):
    try:
      response = requests.get(url, timeout=timeout)
      response.raise_for_status()  # Turn 4xx/5xx statuses into exceptions
      return response.content
    except requests.exceptions.RequestException as err:
      # RequestException is the base class of HTTPError, ConnectionError and
      # Timeout, so transient network failures are retried too.
      print(f"Error occurred. Attempt {attempt} of {max_retries}: {err}")

      if attempt < max_retries:
        # Wait before retrying again (with exponential backoff)
        sleep(backoff_factor ** attempt)

  print("Max retries reached. Unable to complete the request due to repeated errors.")
  return None

def upload_to_gcs(destination_blob_name, image_bytes, content_type='image/jpeg'):
    """Upload image data to the bucket named by ``BUCKET_NAME``.

    Args:
        destination_blob_name: Object name (path inside the bucket) to write.
        image_bytes: The data to store.
        content_type: MIME type recorded on the uploaded object. Defaults to
            ``'image/jpeg'``; pass another value when uploading other formats.
    """
    # Create a new blob in the bucket and upload the image
    blob = get_blob(destination_blob_name)
    blob.upload_from_string(image_bytes, content_type=content_type)

    print(f"Image successfully uploaded to gs://{BUCKET_NAME}/{destination_blob_name}")

_GCS_CLIENT = None  # Created on first use by get_blob and reused afterwards


def get_blob(blob_name):
  """Return a blob handle for *blob_name* inside the ``BUCKET_NAME`` bucket.

  A single ``storage.Client`` is created lazily and shared by all calls, so
  callers that invoke this once per object do not construct a new client
  each time.

  Args:
    blob_name: Object name (path inside the bucket).

  Returns:
    The object returned by ``bucket.blob(blob_name)``.
  """
  global _GCS_CLIENT
  if _GCS_CLIENT is None:
    _GCS_CLIENT = storage.Client()

  # The bucket is looked up by name on every call so a changed BUCKET_NAME
  # is still honoured; only the client is cached.
  bucket = _GCS_CLIENT.bucket(BUCKET_NAME)
  return bucket.blob(blob_name)


# Read the CSV stored at FILENAME in the bucket and, for every row, copy the
# corresponding IIIF-rendered JPEG into the bucket unless an object with the
# destination name already exists.
with get_blob(FILENAME).open() as csvfile:
  reader = csv.reader(csvfile)

  next(reader, None)  # skip headers; the default avoids StopIteration on an empty file
  for row in reader:
    if len(row) < 2:
      # csv.reader yields short/empty lists for blank or truncated lines;
      # both columns are needed to build the URL and destination name.
      continue

    # Column 0 is used as the first path segment of the IIIF URL and column 1
    # as the file name (with any '.jp2' suffix dropped).
    object_id = row[0]
    file = row[1].removesuffix('.jp2')
    stored_file = f'image_search/stacks/{object_id}/{file}.jpg'
    if get_blob(stored_file).exists():
      continue

    # Step 1: Download the image
    image_url = f'https://stacks.stanford.edu/image/iiif/{object_id}/{file}/full/!800,800/0/default.jpg'
    image_bytes = download_image(image_url)
    if image_bytes is None:
      # download_image returns None once its retries are exhausted; there is
      # nothing to upload, so move on instead of passing None to the upload.
      print(f"Skipping {image_url}: download failed.")
      continue

    # Step 2: Upload the image to GCS
    upload_to_gcs(stored_file, image_bytes)

    print(image_url)