In [1]:
!pip install sparqlwrapper
!pip install pandas
%pip install aiohttp aiofiles nest_asyncio

Collecting aiohttp
  Downloading aiohttp-3.9.3-cp312-cp312-win_amd64.whl.metadata (7.6 kB)
Collecting aiofiles
  Downloading aiofiles-23.2.1-py3-none-any.whl.metadata (9.7 kB)
Collecting aiosignal>=1.1.2 (from aiohttp)
  Downloading aiosignal-1.3.1-py3-none-any.whl.metadata (4.0 kB)
Collecting frozenlist>=1.1.1 (from aiohttp)
  Downloading frozenlist-1.4.1-cp312-cp312-win_amd64.whl.metadata (12 kB)
Collecting multidict<7.0,>=4.5 (from aiohttp)
  Downloading multidict-6.0.5-cp312-cp312-win_amd64.whl.metadata (4.3 kB)
Collecting yarl<2.0,>=1.0 (from aiohttp)
  Downloading yarl-1.9.4-cp312-cp312-win_amd64.whl.metadata (32 kB)
Downloading aiohttp-3.9.3-cp312-cp312-win_amd64.whl (363 kB)
   ---------------------------------------- 0.0/363.4 kB ? eta -:--:--
   ---------- ----------------------------- 92.2/363.4 kB 2.6 MB/s eta 0:00:01
   --------------------------------- ------ 307.2/363.4 kB 3.2 MB/s eta 0:00:01
   ---------------------------------------- 363.4/363.4 kB 3.8 MB/s eta 0:00:0

In [None]:
import os
try:
    os.mkdir('images')
except:
    pass

In [13]:
import sys
from SPARQLWrapper import SPARQLWrapper, JSON
import requests
import shutil
import os
import pandas as pd
import nest_asyncio
import asyncio
import aiofiles
import aiohttp
from pandas import DataFrame

endpoint_url = "https://query.wikidata.org/sparql"

# Get mountains
query = """
    SELECT ?qid ?itemLabel ?part ?partLabel ?image WHERE {
      ?item wdt:P31 wd:Q8502 .
      ?item wdt:P361 ?part .
      ?item wdt:P18 ?image .

    BIND(STRAFTER(STR(?item), STR(wd:)) AS ?qid) .
    SERVICE wikibase:label { #BabelRainbow
        bd:serviceParam wikibase:language "[AUTO_LANGUAGE],fr"
    }

    }
    LIMIT 100
"""

nest_asyncio.apply()

dl_dir = "./images"
dl_sem = asyncio.Semaphore(4)

async def download_images(df: DataFrame):
    download_tasks = []
    print(f"Téléchargement de {len(df)} images...")
    for _, row in df.iterrows():
        filename = f"{row['qid']}.jpg"
        url = row["image"]
        download_tasks.append(download_image(url, filename))
    await asyncio.gather(*download_tasks)

def get_results(endpoint_url, query):
    user_agent = "WDQS-example Python/%s.%s" % (
        sys.version_info[0],
        sys.version_info[1],
    )
    sparql = SPARQLWrapper(endpoint_url, agent=user_agent)
    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)
    return sparql.query().convert()

async def download_image(url, filename):
    headers = {"User-Agent": "Mozilla/5.0"}
    if os.path.exists(f"{dl_dir}/{filename}"):
        return
    async with aiohttp.ClientSession() as session:
            async with dl_sem:
                async with session.get(url, allow_redirects=True, headers=headers) as res:
                    if res.status != 200:
                        print(f"Impossible de télécharger l'image à l'url {url} (code {res.status})")
                        return
                    async with aiofiles.open(f"{dl_dir}/{filename}", "wb") as f:
                        await f.write(await res.read())
                        await f.close()

array = []
results = get_results(endpoint_url, query)

for result in results["results"]["bindings"]:
    element = []
    element.append(result["qid"]["value"])
    element.append(result["itemLabel"]["value"])
    element.append(result["image"]["value"])
    element.append(result["partLabel"]["value"])
    array.append(element)
    
dataframe = pd.DataFrame(array, columns=["qid", "itemLabel", "image", "partLabel"])

dataframe

Unnamed: 0,qid,itemLabel,image,partLabel
0,Q16540,Breithorn,http://commons.wikimedia.org/wiki/Special:File...,frontière entre l'Italie et la Suisse
1,Q16546,Six Madun,http://commons.wikimedia.org/wiki/Special:File...,Q19284304
2,Q16585,Gäbris,http://commons.wikimedia.org/wiki/Special:File...,Préalpes suisses
3,Q16617,Piz Terri,http://commons.wikimedia.org/wiki/Special:File...,Ligne de partage des eaux entre mer Adriatique...
4,Q16632,Kronberg,http://commons.wikimedia.org/wiki/Special:File...,Q620982
...,...,...,...,...
95,Q30533,dôme de Rochefort,http://commons.wikimedia.org/wiki/Special:File...,ligne de partage des eaux entre mer Adriatique...
96,Q30541,dent du Géant,http://commons.wikimedia.org/wiki/Special:File...,frontière entre la France et l'Italie
97,Q30541,dent du Géant,http://commons.wikimedia.org/wiki/Special:File...,ligne de partage des eaux entre mer Adriatique...
98,Q35366,Q35366,http://commons.wikimedia.org/wiki/Special:File...,Alpes d'Arrochar


In [14]:
asyncio.run(download_images(dataframe))

Téléchargement de 100 images...


In [16]:
from concurrent.futures import Future, ThreadPoolExecutor, wait
from dataclasses import dataclass, field
import json
import math
from PIL import Image, ExifTags, TiffImagePlugin
from pprint import pprint
import numpy
from sklearn.cluster import KMeans
from pandas import DataFrame, Series
import asyncio
from enum import Enum

# Seuils d'une grande image
BIG_MIN_WIDTH = 1280
BIG_MIN_HEIGHT = 720

# Seuils d'une image moyenne
MEDIUM_MIN_WIDTH = 480
MEDIUM_MIN_HEIGHT = 360

executor = ThreadPoolExecutor(max_workers=8)

@dataclass
class SubjectMetadata(object):
    exif_tags: dict[str, str] = field(default_factory=dict)
    subject_specific_tags: dict[str, str] = field(default_factory=dict)
    width: int = field(default_factory=int)
    height: int = field(default_factory=int)
    color_mode: str = field(default_factory=str)
    dominant_colors: list[str] = field(default_factory=list)

images_metadata: dict[str, SubjectMetadata] = dict()
images_metadata_locks: dict[str, asyncio.Lock] = dict()

def parse_metadata_for_subjects(df: DataFrame):
    futures: list[Future] = [ ]
    for _, subject in df.iterrows():
        qid = subject["qid"]
        images_metadata[qid] = SubjectMetadata()
        images_metadata[qid].subject_specific_tags = subject["partLabel"]
        images_metadata_locks[qid] = asyncio.Lock()
        futures.append(executor.submit(async_wrapper_parse_metadata_for_subeject, subject))
    print(futures)
    wait(futures)

def async_wrapper_parse_metadata_for_subeject(subject: Series):
    asyncio.run(parse_metadata_for_subject(subject))

async def parse_metadata_for_subject(subject: Series):
    tasks = []
    qid = subject["qid"]
    filename = f"{qid}.jpg"
    full_path = f"{dl_dir}/{filename}"
    image = Image.open(full_path)
    tasks.append(parse_exif_tags_for_subject(qid, image))
    tasks.append(parse_image_metadata_for_subject(qid, image))
    await asyncio.gather(*tasks)

async def parse_image_metadata_for_subject(id: str, image: Image.Image) -> None:
    global images_metadata
    width = image.width
    height = image.height
    image = image.resize(size=(math.ceil(image.width/10), math.ceil(image.height/10))) # On réduit l'image, afin de fluidifier le calcul des couleurs dominantes
    color_mode = image.mode
    dominant_colors = get_image_dominant_colors(image)
    async with images_metadata_locks[id]:   
        images_metadata[id].color_mode = color_mode
        images_metadata[id].width = width
        images_metadata[id].height = height
        images_metadata[id].dominant_colors = dominant_colors

def get_image_dominant_colors(image: Image.Image, n: int = 3) -> list[str]:
    numarray = numpy.array(image.getdata(), numpy.uint16)
    clusters = KMeans(n_clusters=n, n_init=2)
    clusters.fit(numarray)
    results = [ "#%02x%02x%02x" % (int(r),int(g),int(b)) for r,g,b in clusters.cluster_centers_ ]
    return results

async def parse_exif_tags_for_subject(id: str, image: Image.Image) -> None:
    global images_metadata
    exif_tags = parse_exif_tags(image)
    async with images_metadata_locks[id]:   
        images_metadata[id].exif_tags = exif_tags

def parse_exif_tags(image: Image.Image) -> dict[str, str]:
    raw_exif_tags = image.getexif()
    exif_tags = dict()
    if len(raw_exif_tags) == 0:
        return exif_tags
    for exif_tag, val in raw_exif_tags.items():
        if exif_tag in ExifTags.TAGS:
            if isinstance(val, bytes):
                d = try_decode(val)
                if isinstance(d, str):
                    val = d
                else:
                    continue
            exif_tag_str = ExifTags.TAGS[exif_tag]
            if isinstance(val, TiffImagePlugin.IFDRational):
                val = float(val)
            exif_tags[exif_tag_str] = val
    return exif_tags

def try_decode(s: bytes) -> type[str | bytes]:
    try:
        return s.decode()
    except:
        return s

parse_metadata_for_subjects(dataframe)
pprint(images_metadata)

[<Future at 0x190218776e0 state=running>, <Future at 0x19014971ac0 state=running>, <Future at 0x19021838410 state=running>, <Future at 0x1902183be90 state=running>, <Future at 0x1902183b620 state=running>, <Future at 0x1902181e900 state=running>, <Future at 0x19021860b30 state=running>, <Future at 0x19021a84aa0 state=running>, <Future at 0x19021a85a00 state=pending>, <Future at 0x19021a84cb0 state=pending>, <Future at 0x19021a873b0 state=pending>, <Future at 0x19021a87140 state=pending>, <Future at 0x19021a874d0 state=pending>, <Future at 0x19021a60590 state=pending>, <Future at 0x19021a613a0 state=pending>, <Future at 0x19021a61970 state=pending>, <Future at 0x19021a62ba0 state=pending>, <Future at 0x19021a61850 state=pending>, <Future at 0x19021a617f0 state=pending>, <Future at 0x19021a61910 state=pending>, <Future at 0x19021a61ca0 state=pending>, <Future at 0x19021a60530 state=pending>, <Future at 0x19021a60c50 state=pending>, <Future at 0x19021a614c0 state=pending>, <Future at 0x19

In [25]:
import dataclasses
from fractions import Fraction
import json

class EnhancedJSONEncoder(json.JSONEncoder):
    def default(self, o):
        if dataclasses.is_dataclass(o):
            return dataclasses.asdict(o)
        if isinstance(o, TiffImagePlugin.IFDRational):
            return o.real
        if isinstance(o, Fraction):
            return int(o)
        return super().default(o)

with open(f"./data.json", "w", encoding="utf-8") as f:
    f.write(json.dumps(images_metadata, cls=EnhancedJSONEncoder, ensure_ascii=False))
