# Procesamiento multimedia de imágenes

## a) Reddit Web Scraper

In [3]:
import praw
import requests
import os
import re
import time
from urllib.parse import urlparse

# Set up the Reddit client using your credentials
reddit = praw.Reddit(
    client_id="dGX9ww8jhAQhSLep15Puaw",
    client_secret="d1PrINQmWdE_NvpQ1PI7NT8DsuEG4g",
    user_agent="ImageScraperBot/0.1 by YOUR_REDDIT_USERNAME"
)

def sanitize_filename(filename):
    # Remove any characters that are not alphanumeric or underscores
    return re.sub(r'[\\/*?:"<>|]', "", filename)

def download_image(url, path, retries=3):
    for attempt in range(retries):
        try:
            response = requests.get(url, stream=True, timeout=10)
            if response.status_code == 200:
                with open(path, "wb") as file:
                    for chunk in response.iter_content(1024):
                        file.write(chunk)
                print(f"Downloaded {path}")
                return True
            else:
                print(f"Failed to download {url}, status code {response.status_code}")
        except requests.exceptions.RequestException as e:
            print(f"Request failed: {e}")
        time.sleep(2)  # Wait before retrying
    return False

def get_processed_posts(log_file):
    processed_posts = set()
    if os.path.exists(log_file):
        with open(log_file, "r") as log:
            for line in log:
                post_id = line.split(" - ")[0]
                processed_posts.add(post_id)
    return processed_posts

def is_image_url(url):
    parsed_url = urlparse(url)
    return parsed_url.path.lower().endswith(('.jpg', '.jpeg', '.png', '.gif'))

def download_images_from_subreddit(subreddit_name, image_limit=2000, download_dir="images", log_file="posts_log.txt"):
    if not os.path.exists(download_dir):
        os.makedirs(download_dir)

    processed_posts = get_processed_posts(log_file)
    image_count = 0  # Initialize image counter

    try:
        subreddit = reddit.subreddit(subreddit_name)
        # Check if subreddit exists and is accessible
        subreddit.id  # This will raise an exception if the subreddit doesn't exist
    except Exception as e:
        print(f"Error accessing subreddit: {e}")
        return

    with open(log_file, "a") as log:
        try:
            for submission in subreddit.top(limit=None):
                if submission.id in processed_posts:
                    continue  # Skip already processed posts

                if image_count >= image_limit:
                    break  # Stop when we reach the specified image limit

                if is_image_url(submission.url):
                    img_url = submission.url
                    img_name = f"{submission.id}_{sanitize_filename(submission.title[:50])}.jpg"
                    img_path = os.path.join(download_dir, img_name)

                    # Download the image with retry mechanism
                    if download_image(img_url, img_path):
                        log.write(f"{submission.id} - {submission.title}\n")
                        image_count += 1  # Increment the image counter after successful download
                else:
                    print(f"No image found in post: {submission.id} - {submission.title}")

                # Pause to respect API rate limits every 100 images
                if image_count % 100 == 0 and image_count > 0:
                    print("Pausing to respect API rate limits...")
                    time.sleep(60)  # Adjust sleep time as needed
        except Exception as e:
            print(f"Error during processing: {e}")

    print(f"Completed downloading {image_count} images from r/{subreddit_name}")

# Use the function to download images from a subreddit, e.g., 'memexico'
download_images_from_subreddit("dankgentina", image_limit=2000)


Downloaded images/i2z0e8_..jpg
Downloaded images/oifjwg_Un upvote y lo subo a ritaly.jpg
Downloaded images/l3lrjj_the virgin southamerica vs the chad africa.jpg
Downloaded images/kjhiwc_así de simple flaco.jpg
Downloaded images/k236u8_isi.jpg
Downloaded images/kmfj7q_Escuchando el chaqueño en miami.jpg
Downloaded images/h1792z_Como estaremos nosotros no .jpg
Downloaded images/icna1b_Saquenme de aca.jpg
Downloaded images/nfenuy_...jpg
No image found in post: v4cwig - Argentina Slander
Downloaded images/jidnrw_😈😈😈.jpg
Downloaded images/nsk9bi_Himno be like.jpg
Downloaded images/kv3nvo_Eeeeeeeeeeh.jpg
Downloaded images/lj5uyp_Otakus be like.jpg
Downloaded images/hhyiq5_Ojalá.jpg
Downloaded images/fz5c9c_Ahora que el bondi va vacío me doy cuenta que el d.jpg
Downloaded images/heg42s_FEBO ASOMA.jpg
Downloaded images/psa24l_chupala gallego.jpg
No image found in post: lk1l3d - Disculpen lo largo del video.
Downloaded images/khh0vx_._..jpg
Downloaded images/glh1cq_Mogul.jpg
Downloaded images/i

KeyboardInterrupt: 

## b) Captioning the images

In [5]:
import os
from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration
import torch

# Initialize the BLIP processor and model
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")

# Directory containing images
image_dir = "images/"
# Output file for captions
output_file = "image_captions.txt"

# Ensure the image directory exists
if not os.path.exists(image_dir):
    print(f"Directory '{image_dir}' does not exist.")
    exit()

# Get a list of image files in the directory
image_files = [f for f in os.listdir(image_dir) if f.lower().endswith(('.png', '.jpg', '.jpeg', '.gif', '.bmp'))]

if not image_files:
    print(f"No image files found in '{image_dir}'.")
    exit()

# Open the output file
with open(output_file, "w") as file:
    # Process each image
    for image_name in image_files:
        image_path = os.path.join(image_dir, image_name)
        try:
            # Open and process the image
            raw_image = Image.open(image_path).convert('RGB')
            inputs = processor(raw_image, return_tensors="pt")

            # Generate caption
            with torch.no_grad():
                outputs = model.generate(**inputs)
            caption = processor.decode(outputs[0], skip_special_tokens=True)

            # Write the caption to the file
            file.write(f"{image_name}: {caption}\n")
            print(f"Processed '{image_name}': {caption}")
        except Exception as e:
            print(f"Error processing '{image_name}': {e}")

print(f"Captions saved to '{output_file}'.")


  from .autonotebook import tqdm as notebook_tqdm


Processed '112xz0b_Rento salón para eventos....jpg': a building with a sign that says soleventosuals
Processed '1678mds_papá_ctm.jpg': a text message with the words ' i don '
Processed '13e2gpi_Republicanos _ctm.jpg': repulas de la guardia cona
Processed 'jbe0uj_No lo entenderías.jpg': a poster with two men in suits and ties
Processed '17u9c5c_yo_ctm.jpg': a laptop computer sitting on top of a desk
Processed 'yucn9s_Marchas Fifí XD.jpg': a poster with the faces of the actors
Processed '1e6avdi_Yo_ctm.jpg': a text message that reads, ` ` ' ' ' ' ' ' ' ' ' ' '
Processed 'iltx2v_Y si tengo que morir, moriré.jpg': a cartoon character with a capt that reads, ' el ques las pass de mad
Processed '1ctim8v_Yo_ctm.jpg': a tv on a wall
Processed 'luh4s8_Excelentes seres humanos ;).jpg': a screenshot of a computer screen with the words ' ' ' ' ' and ' ' '
Processed '128p7zk_😏😏.jpg': a screenshot of a text message from a woman who was not allowed to the text message
Processed '1gixbmy_yo👶ctm.jpg': 