In [None]:
import requests
import re
import os
from tqdm.auto import tqdm
from bs4 import BeautifulSoup
from PIL import Image
from io import BytesIO
from datetime import datetime, timedelta
from concurrent.futures import ThreadPoolExecutor, as_completed

In [2]:
# Turn off urllib3 warnings (urllib3 is reached here through requests.packages).
# NOTE(review): every request visible in this notebook passes verify=True, so this
# suppression may no longer be needed — confirm before removing.
requests.packages.urllib3.disable_warnings()

In [3]:
def get_apod(url, adir, sess):
    """Fetch one APOD page, download the image it links to and store a thumbnail.

    The page is parsed for the first ``<a>`` whose ``href`` starts with
    ``image``. That image is downloaded, shrunk in place to fit within
    256x256 pixels and written to ``adir`` under the image's own file name.
    An image whose file already exists in ``adir`` is not downloaded again.

    Args:
        url: Address of the APOD HTML page.
        adir: Directory that receives the thumbnails; created when missing.
        sess: Session-like object exposing ``get`` (a ``requests.Session`` in
            this notebook).

    Returns:
        The stripped text of the ``<p>`` element preceding the image link
        (presumably the picture's date - confirm against the page layout),
        an empty string when there is no such element, or ``None`` when the
        page could not be fetched or contains no image link. A failed image
        download is reported on stdout but does not change the return value.
    """
    headers = {
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; rv:84.0) Gecko/20100101 Firefox/84.0"
    }
    # exist_ok=True instead of a separate existence check: this function runs in
    # several threads at once, and check-then-create lets the losing threads
    # fail with FileExistsError.
    os.makedirs(adir, exist_ok=True)

    # The URL is split like a path to get the "directory" the page lives in;
    # relative image links are resolved against it.
    parenturl = os.path.split(url)[0]

    try:
        apod = sess.get(url, timeout=30, headers=headers, verify=True)
        apod.raise_for_status()
    except Exception as e:
        print(f"Error getting page {url}: {e}")
        return None

    apodsoup = BeautifulSoup(apod.text, features="lxml")
    imgelem = apodsoup.find_all("a", href=re.compile("^image"))

    if not imgelem:
        print(f"No image link found for {url}")
        return None

    imglink = imgelem[0]
    imgurl = parenturl + "/" + imglink.get("href")
    imgfilename = os.path.basename(imgurl)
    imgpath = os.path.join(adir, imgfilename)

    # find_previous returns None when no <p> precedes the link; fall back to an
    # empty string so the image is still downloaded instead of raising.
    datepara = imglink.find_previous("p")
    imgdate = datepara.getText(strip=True) if datepara is not None else ""

    # Skip the download when the thumbnail is already on disk.
    if not os.path.exists(imgpath):
        try:
            # The context manager releases the streamed connection even when
            # raise_for_status() or the image decoding fails.
            with sess.get(
                imgurl,
                headers=headers,
                timeout=30,
                stream=True,
                verify=True,
            ) as imageresp:
                imageresp.raise_for_status()
                payload = imageresp.content

            # Shrink in place so neither side exceeds 256 pixels.
            with Image.open(BytesIO(payload)) as img:
                img.thumbnail((256, 256))
                # Saving by path (not through a pre-opened "wb" handle) keeps
                # the format inferred from the file extension as before.
                img.save(imgpath)

        except Exception as e:
            print(f"Error downloading image {imgurl}: {e}")
            # Do not leave a partial file behind: the existence check above
            # would treat it as an already-downloaded image on the next run.
            if os.path.exists(imgpath):
                try:
                    os.remove(imgpath)
                except OSError:
                    pass

    return imgdate

In [4]:
def date_to_url(date):
    """Return the APOD page URL for ``date`` (any object with ``strftime``).

    The page name is ``ap`` followed by the two-digit year, month and day.
    """
    stamp = date.strftime("%y%m%d")
    return "https://apod.nasa.gov/apod/ap{}.html".format(stamp)

In [5]:
def download_apod_images(start_date, end_date, adir="apod-images", max_workers=5):
    """Download the APOD image of every day in an inclusive date range.

    Args:
        start_date: First day, as a "YYYY-MM-DD" string.
        end_date: Last day (included), as a "YYYY-MM-DD" string.
        adir: Directory handed to ``get_apod`` for storing the images.
        max_workers: Number of threads in the download pool.

    Nothing is returned. An exception raised by a worker is printed together
    with the page URL it was processing, and the remaining downloads continue.
    """
    start_date = datetime.strptime(start_date, "%Y-%m-%d")
    end_date = datetime.strptime(end_date, "%Y-%m-%d")

    # One page URL per calendar day, both endpoints included; an end date
    # before the start date yields an empty list.
    day_count = (end_date - start_date).days + 1
    urls = [
        date_to_url(start_date + timedelta(days=offset))
        for offset in range(day_count)
    ]

    print(f"Début du téléchargement des images APOD du {start_date} au {end_date} dans {adir} avec {max_workers} threads")
    with requests.Session() as sess, ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Map each future back to its page URL so failures can be attributed.
        pending = {}
        for page_url in urls:
            pending[executor.submit(get_apod, page_url, adir, sess)] = page_url

        progress = tqdm(
            as_completed(pending),
            total=len(pending),
            desc="Downloading APOD images",
            miniters=100,
        )
        for finished in progress:
            url = pending[finished]
            try:
                # The value is not used; result() is called to surface worker errors.
                finished.result()
            except Exception as exc:
                print(f"Error downloading {url}: {exc}")

In [None]:
# Default parameters
SAVEDIR = "download_test"
start_date = "2015-01-01"  # Start date, in YYYY-MM-DD format
end_date = "2022-12-05"    # End date, in YYYY-MM-DD format

# Run the download for the whole range, storing images in SAVEDIR with 10 threads
download_apod_images(start_date, end_date, adir=SAVEDIR, max_workers=10)

In [None]:
# Split every image in SAVEDIR into a grayscale copy and an RGB copy, then
# delete the original so that only the two sub-folders keep image data.
bw_dir = os.path.join(SAVEDIR, "train_black_and_white")
color_dir = os.path.join(SAVEDIR, "train_color")
os.makedirs(bw_dir, exist_ok=True)
os.makedirs(color_dir, exist_ok=True)

for filename in tqdm(os.listdir(SAVEDIR), desc="Converting images to black and white and color"):
    file_path = os.path.join(SAVEDIR, filename)
    if not os.path.isfile(file_path):  # skips the two output directories
        continue

    try:
        # Open the source once and close it before deleting it; an open handle
        # leaks and can make os.remove fail on Windows.
        with Image.open(file_path) as source_img:
            img = source_img.convert("L")
            img.save(os.path.join(bw_dir, filename))
            img = source_img.convert("RGB")
            img.save(os.path.join(color_dir, filename))
    except OSError as exc:
        # An unreadable or unsaveable file is reported and left in place
        # rather than aborting the whole batch.
        print(f"Skipping {file_path}: {exc}")
        continue

    os.remove(file_path)

In [None]:
# Convert every JPEG, BMP and GIF found under "dataset" to PNG and delete the
# source file. Extensions are compared case-insensitively.
PNG_SOURCE_EXTENSIONS = {".jpg", ".jpeg", ".bmp", ".gif"}

for root, dirs, files in os.walk("dataset"):
    for file in tqdm(files, desc="Converting images to png"):
        # splitext isolates the real suffix; str.replace on the whole name would
        # also rewrite the same text appearing earlier in the file name.
        stem, extension = os.path.splitext(file)
        if extension.lower() not in PNG_SOURCE_EXTENSIONS:
            continue

        source_path = os.path.join(root, file)
        # Close the image before removing its file.
        with Image.open(source_path) as img:
            if extension.lower() == ".gif":
                # Keep only the first frame of the gif
                img.seek(0)
            img.save(os.path.join(root, stem + ".png"))
        os.remove(source_path)