# Webscraping

In [None]:
import requests

In [None]:
# French Wikipedia main page, used as the example document for the scraping cells.
url = "https://fr.wikipedia.org/wiki/Wikip%C3%A9dia:Accueil_principal"

# Without an explicit timeout, requests waits indefinitely on an unresponsive server.
response = requests.get(url, timeout=10)

# The repr of a Response shows its HTTP status code.
print(response)

<Response [200]>


In [None]:
print(response.status_code)

if response.status_code == 200:
  # response.content is the raw body as bytes, response.text the same body decoded to str.
  # Only a short preview of each is printed: dumping the whole page twice floods the output.
  print(response.content[:500])
  print(response.text[:500])

## BeautifulSoup



In [None]:
!pip install beautifulsoup4



In [None]:
# import bs4
from bs4 import BeautifulSoup

In [None]:
# Parse the downloaded HTML with the "html.parser" backend.
page_web = BeautifulSoup(response.content, "html.parser")

# Show only the beginning of the document; printing the whole tree dumps the entire page.
print(str(page_web)[:1000])

In [None]:
# find_all collects every matching tag; the result printed below is a list holding one <h1>.
headings = page_web.find_all("h1")

print(headings)

[<h1 class="firstHeading mw-first-heading" id="firstHeading">Bienvenue sur Wikipédia</h1>]


In [None]:
# Look the element up by its id attribute; the printed result is a single tag, not a list.
main_heading = page_web.find(id = "firstHeading")

print(main_heading)

<h1 class="firstHeading mw-first-heading" id="firstHeading">Bienvenue sur Wikipédia</h1>


In [None]:
# class_ (with a trailing underscore) filters on the CSS class, because `class` is a Python keyword.
# NOTE(review): the name `headings` is reused here for elements selected by class, not for heading tags.
headings = page_web.find_all(class_ = "mw-content-ltr")

print(headings)

In [None]:
# First <h2> of the page and the <span> elements nested inside it.
h2 = page_web.find_all("h2")[0]
spans = h2.find_all("span")

# next_sibling is an attribute, not a method. Calling it (`h2.nextSibling()`) runs
# find_all() on the sibling when it is a tag, and raises TypeError when the sibling
# is a text node or missing. `nextSibling` is also the deprecated spelling.
next_element = h2.next_sibling

print(h2)
print(spans)

<h2 data-mw-thread-id="h-Wikipédia" id="Wikipédia"><span id="Wikip.C3.A9dia"></span><span data-mw-comment-start="" id="h-Wikipédia"></span>Wikipédia<span data-mw-comment-end="h-Wikipédia"></span></h2>
[<span id="Wikip.C3.A9dia"></span>, <span data-mw-comment-start="" id="h-Wikipédia"></span>, <span data-mw-comment-end="h-Wikipédia"></span>]


In [None]:
heading = page_web.find("h1")

# One value per output line, in this order: the tag itself, its .string,
# its id attribute, its list of CSS classes, and the text from get_text().
print(
  heading,
  heading.string,
  heading.get("id"),
  heading.get("class"),
  heading.get_text(),
  sep = "\n",
)

<h1 class="firstHeading mw-first-heading" id="firstHeading">Bienvenue sur Wikipédia</h1>
Bienvenue sur Wikipédia
firstHeading
['firstHeading', 'mw-first-heading']
Bienvenue sur Wikipédia


In [None]:
# Take the first <h2> of the page.
h2 = page_web.find_all("h2")[0]

print(h2)
# get_text() leaves out the nested <span> markup and keeps only the text (see the output below).
print(h2.get_text())

<h2 data-mw-thread-id="h-Wikipédia" id="Wikipédia"><span id="Wikip.C3.A9dia"></span><span data-mw-comment-start="" id="h-Wikipédia"></span>Wikipédia<span data-mw-comment-end="h-Wikipédia"></span></h2>
Wikipédia


In [None]:
# find returns a single tag (here the page's <h1>) rather than a list.
heading = page_web.find("h1")

print(heading)

<h1 class="firstHeading mw-first-heading" id="firstHeading">Bienvenue sur Wikipédia</h1>


## Wiki scraper

https://fr.wikipedia.org/wiki/Special:Random

In [None]:
# Special:Random redirects to a random article; rep.url holds the final address.
rep = requests.get("https://fr.wikipedia.org/wiki/Special:Random", timeout=10)

if rep.status_code == 200:
  print(rep.url)

  bs = BeautifulSoup(rep.content, "html.parser")
  # find() returns None when the element is absent; guard before calling get_text().
  heading_tag = bs.find(id = "firstHeading")
  heading = heading_tag.get_text() if heading_tag is not None else None

  print(heading)

https://fr.wikipedia.org/wiki/Sixte_Estko
Sixte Estko


## Télécharger une image

In [None]:
import requests

def telecharger_img(src, chemin):
  """Download the resource at `src` and write it to the file `chemin`.

  The body is streamed to disk in chunks. Nothing is written, and no error is
  raised, when the server answers with a status other than 200 or 206.

  Args:
    src: URL of the image to download.
    chemin: Path of the local file to create or overwrite.
  """
  # The context manager releases the connection even if writing fails; with
  # stream=True it would otherwise stay open until the body is fully consumed.
  with requests.get(src, stream = True, timeout = 10) as rep:
    if rep.status_code in (200, 206):
      with open(chemin, "wb") as f:
        # iter_content() defaults to 1-byte chunks, i.e. one write call per byte.
        for chunk in rep.iter_content(chunk_size = 8192):
          f.write(chunk)

In [None]:
import urllib

def download_image(url, outpath):
  """Download the resource at `url` and save it to the file `outpath`.

  Args:
    url: Address of the image to fetch.
    outpath: Path of the local file to create or overwrite.

  Raises:
    urllib.error.URLError: If the resource cannot be opened.
  """
  # `import urllib` alone does not load the `request` submodule; the attribute is
  # only present when some other import has already loaded it.
  import urllib.request

  # Close the response deterministically instead of relying on garbage collection.
  with urllib.request.urlopen(url) as img:
    with open(outpath, "wb") as f:
      f.write(img.read())

In [None]:
import os

# Example image, saved as "image_out_2.jpg" in the current working directory.
image_url = "https://facts.net/wp-content/uploads/2023/09/24-facts-about-hans-moleman-the-simpsons-1694245967.jpg"
chemin_de_sortie = os.path.join(os.getcwd(), "image_out_2.jpg")

telecharger_img(image_url, chemin_de_sortie)


In [None]:
# Swap in "https://fr.wikipedia.org/wiki/Special:Random" to scrape a random article instead.
rep = requests.get("https://fr.wikipedia.org/wiki/Cath%C3%A9drale_Notre-Dame_de_Paris", timeout=10)

if rep.status_code == 200:
  print(rep.url)

  bs = BeautifulSoup(rep.content, "html.parser")
  # find() returns None when the element is absent; guard before calling get_text().
  heading_tag = bs.find(id = "firstHeading")
  heading = heading_tag.get_text() if heading_tag is not None else None

  print(heading)

  imgs = bs.find_all("img")

  for i, image in enumerate(imgs):
    src_raw = image.get("src")
    # An <img> without a src attribute yields None, which cannot be prefix-tested.
    if not src_raw:
      continue
    # Skip addresses starting with "/static/", "/w/" or "https://". What remains is
    # presumably protocol-relative ("//host/...") and only lacks a scheme.
    if not src_raw.startswith(("/static/", "https://", "/w/")):
      src = "https:" + src_raw
      # The index keeps file names unique; every file gets a .jpg suffix regardless of its type.
      download_image(src, os.path.join(os.getcwd(), f"{i}.jpg"))

## Exercice pour la prochaine fois

- Créer un script qui télécharge l'image principale d'une page Wikipédia (si elle existe).
- Créer un script qui télécharge toutes les images d'une page Wikipédia.
- Refaire ces exercices pour le site de votre choix.

## API

4 opérations de base :

- **get** : récupérer des données existantes
- **post** : ajouter de nouvelles données
- **put** : modifier des données existantes
- **delete** : supprimer des données existantes

https://www.nakala.fr/10.34847/nkl.74ca53br

In [7]:
import requests

nakala_api_prefix = "https://api.nakala.fr"
data_id = "10.34847/nkl.74ca53br"

# Build the URL of the record:
# https://api.nakala.fr/datas/10.34847/nkl.74ca53br

# 1. String concatenation
url = nakala_api_prefix + "/datas/" + data_id
print(url)

# 2. f-string
url = f"{nakala_api_prefix}/datas/{data_id}"
print(url)

# 3. Joining the segments with "/". os.path.join is not suitable for URLs: it joins
#    with the platform's path separator, which is a backslash on Windows.
import os  # kept: later cells use os.getcwd() and os.path.join for local file paths
url = "/".join([nakala_api_prefix, "datas", data_id])
print(url)

https://api.nakala.fr/datas/10.34847/nkl.74ca53br
https://api.nakala.fr/datas/10.34847/nkl.74ca53br
https://api.nakala.fr/datas/10.34847/nkl.74ca53br


In [11]:
import json

def save_json(donnes, chemin):
  """Serialize `donnes` as indented JSON into the file `chemin`.

  Args:
    donnes: JSON-serializable object (dict, list, ...).
    chemin: Path of the file to create or overwrite.

  Raises:
    TypeError: If `donnes` contains a value that json cannot serialize.
  """
  # `with` closes the file even when json.dump raises; an explicit UTF-8 encoding
  # keeps the result independent of the platform's default encoding.
  with open(chemin, "w", encoding = "utf-8") as f:
    json.dump(donnes, f, indent = 4)

In [12]:
# Without an explicit timeout, requests waits indefinitely on an unresponsive server.
rep = requests.get(url, timeout=10)

print(rep.status_code)

# Parse and save the body only for a successful response: an error body may not be
# JSON, or would be written to output.json as if it were the record.
if rep.status_code == 200:
  data_json = rep.json()

  print(data_json)

  save_json(data_json, os.path.join(os.getcwd(), "output.json"))

200
{'version': 1, 'collectionsIds': ['10.34847/nkl.028ez5o3', '10.34847/nkl.8a0137dx', '10.34847/nkl.d9ee90eo', '10.34847/nkl.c26bn9o9', '10.34847/nkl.0c7bn47w', '10.34847/nkl.ce8fxt7l'], 'files': [{'name': 'IND-LAD-PHL-Zo01_Ma01.jpg', 'extension': 'jpg', 'size': '125706', 'mime_type': 'image/jpeg', 'sha1': 'f606408e7bc1e39ea3c35298204d76af80e4712c', 'embargoed': '2023-08-17T00:00:00+02:00', 'description': None, 'humanReadableEmbargoedDelay': [], 'puid': 'fmt/645'}], 'lastModerator': None, 'lastModerationDate': None, 'relations': [], 'status': 'published', 'fileEmbargoed': False, 'uri': 'https://doi.org/10.34847/nkl.74ca53br', 'identifier': '10.34847/nkl.74ca53br', 'metas': [{'value': 'IND-LAD-PHL-Zo01_Ma01, Himalayan Rock Art Database', 'lang': None, 'typeUri': None, 'propertyUri': 'http://nakala.fr/terms#title'}, {'value': '2008', 'lang': None, 'typeUri': None, 'propertyUri': 'http://nakala.fr/terms#created'}, {'value': 'CC-BY-NC-4.0', 'lang': None, 'typeUri': None, 'propertyUri': '

## Search

In [16]:
search_url = f"{nakala_api_prefix}/search?q=maps%20himalayas"

# Without an explicit timeout, requests waits indefinitely on an unresponsive server.
rep = requests.get(search_url, timeout=10)

if rep.status_code == 200:
  search_results = rep.json()
  save_json(search_results, os.path.join(os.getcwd(), "search.json"))

  # Fetch the full record of every hit and print its "status" field.
  for data in search_results["datas"]:
    # "identifier" holds the record id, not a URL. A dedicated endpoint name also avoids
    # overwriting the notebook-level `url` that the single-record cell requests.
    identifier = data["identifier"]
    data_endpoint = f"{nakala_api_prefix}/datas/{identifier}"

    rep_data = requests.get(data_endpoint, timeout=10)
    if rep_data.status_code == 200:
      data_data = rep_data.json()
      print(data_data["status"])

published
published
published
published
published
published
published
published
published
published
published
published
published
published
published
published
published
published
published
published
published
published
published
published
published
