In [None]:
import pandas as pd
import requests
from PIL import Image
from io import BytesIO
from transformers import BlipProcessor, BlipForConditionalGeneration
import time
import random


# Load the dataset
data_file_path = 'giphy.csv'
giphy_data = pd.read_csv(data_file_path)

# Keep only the rows from index label 2466 onward (label-based .loc slice).
# NOTE(review): presumably earlier rows were handled in a previous run — confirm.
# The explicit .copy() makes this an independent frame, so adding a column to
# it later does not trigger pandas' SettingWithCopyWarning.
filtered_df = giphy_data.loc[2466:].copy()

# Initialize BLIP processor and model (both loaded from the same checkpoint)
BLIP_CHECKPOINT = "Salesforce/blip-image-captioning-base"
processor = BlipProcessor.from_pretrained(BLIP_CHECKPOINT)
model = BlipForConditionalGeneration.from_pretrained(BLIP_CHECKPOINT)

def fetch_image_from_url(url, retries=3, delay=2, timeout=30):
    """Download the image at *url* and return it converted to RGB.

    Only HTTP 429 (Too Many Requests) responses are retried; the wait time
    doubles after every rate-limited attempt. Any other failure (other HTTP
    status, network error, undecodable image) is printed and ends the call
    immediately.

    Args:
        url: Address of the image to download.
        retries: Maximum number of download attempts.
        delay: Seconds to wait before the first retry; doubled on each retry.
        timeout: Seconds passed to ``requests.get`` as its ``timeout`` so a
            stalled connection cannot block the caller forever.

    Returns:
        A ``PIL.Image.Image`` in RGB mode, or ``None`` if the image could not
        be fetched or decoded.
    """
    for attempt in range(1, retries + 1):
        try:
            response = requests.get(url, timeout=timeout)
            response.raise_for_status()
            return Image.open(BytesIO(response.content)).convert("RGB")
        except requests.exceptions.HTTPError as e:
            # Compare against None explicitly: a Response object is falsy
            # for 4xx/5xx statuses, so a plain truthiness test would fail.
            status = e.response.status_code if e.response is not None else None
            if status != 429:
                print(f"HTTP error fetching image from {url}: {e}")
                return None
            # Rate limited: back off only if another attempt will follow,
            # otherwise the sleep just delays the failure report below.
            if attempt < retries:
                print(f"Rate limited. Retrying after {delay} seconds...")
                time.sleep(delay)
                delay *= 2  # Exponential backoff
        except requests.exceptions.RequestException as e:
            print(f"Error fetching image from {url}: {e}")
            return None
        except Exception as e:
            # Best-effort catch-all for failures while decoding the payload.
            print(f"General error with URL {url}: {e}")
            return None
    print(f"Failed to fetch image from {url} after {retries} retries.")
    return None

def generate_description_from_url(url):
    """Return a caption for the image at *url*, or ``None`` on failure.

    The image is obtained through ``fetch_image_from_url`` and captioned with
    the module-level ``processor`` and ``model``. ``None`` is returned when
    the image could not be fetched or when captioning raises; in the latter
    case the error is printed.
    """
    caption = None
    picture = fetch_image_from_url(url)
    if picture is not None:
        try:
            model_inputs = processor(picture, return_tensors="pt")
            generated = model.generate(**model_inputs)
            # Decode the first generated sequence into plain text.
            caption = processor.decode(generated[0], skip_special_tokens=True)
        except Exception as err:
            print(f"Error generating description for URL {url}: {err}")
    return caption

# Main processing with delay between requests
urls = filtered_df["Web Archive Link"]
total = len(urls)
descriptions = []
try:
    for position, url in enumerate(urls, start=1):
        print(f"Processing URL {position}/{total}: {url}")
        try:
            description = generate_description_from_url(url)
        except Exception as e:
            print(f"Skipping URL {url} due to unexpected error: {e}")
            description = None
        else:
            if description:
                print(f"Generated Description: {description}")
            else:
                print(f"No description generated for URL {url}")
        descriptions.append(description)
        if position < total:
            # Random 1-3 second pause between requests to avoid rate limiting;
            # skipped after the final URL, where nothing follows.
            time.sleep(random.uniform(1, 3))
except KeyboardInterrupt:
    # A manual stop should not discard the captions produced so far:
    # fall through and save what has been collected.
    print(f"Interrupted after {len(descriptions)}/{total} URLs; saving partial results.")

# Rows that were never reached get None so the column matches the frame length.
descriptions.extend([None] * (total - len(descriptions)))

# Add descriptions as a new column. assign() returns a new frame, so this
# never writes into a slice of the original dataset.
filtered_df = filtered_df.assign(Descriptions=descriptions)

# Save the updated dataset
output_file = "giphy_with_descriptions.csv"
filtered_df.to_csv(output_file, index=False)
print(f"Updated dataset saved to {output_file}")


Processing URL 1/8506: http://webarchive.loc.gov/all/20150318155641/https://media.giphy.com/media/12uPt6docW77iw/giphy.gif
Rate limited. Retrying after 2 seconds...
Rate limited. Retrying after 4 seconds...
Rate limited. Retrying after 8 seconds...
Failed to fetch image from http://webarchive.loc.gov/all/20150318155641/https://media.giphy.com/media/12uPt6docW77iw/giphy.gif after 3 retries.
No description generated for URL http://webarchive.loc.gov/all/20150318155641/https://media.giphy.com/media/12uPt6docW77iw/giphy.gif
Processing URL 2/8506: http://webarchive.loc.gov/all/20150318155641/http://media.giphy.com/media/12UR5AXD3JXoME/giphy.gif
Rate limited. Retrying after 2 seconds...
Rate limited. Retrying after 4 seconds...
Rate limited. Retrying after 8 seconds...
Failed to fetch image from http://webarchive.loc.gov/all/20150318155641/http://media.giphy.com/media/12UR5AXD3JXoME/giphy.gif after 3 retries.
No description generated for URL http://webarchive.loc.gov/all/20150318155641/http:

KeyboardInterrupt: 