<h1 style='text-align:center'> Scarping With Beautiful Soup</h1>

In [249]:
# Importation des bibliothèques nécessaires
import requests
from bs4 import BeautifulSoup
from lxml import etree
import json
import pandas as pd
import os
from IPython.display import HTML

In [250]:
# Fonction pour obtenir le contenu HTML d'une page web
def soup_func(url):
    header = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36 Edg/123.0.0.0'}
    r = requests.get(url, headers=header)
    return BeautifulSoup(r.text, 'lxml')

# Fonction pour rendre le contenu HTML plus lisible
def pretiffy(soup):
    return soup.prettify()

# Fonction pour extraire le texte d'un élément HTML en utilisant son chemin XPath
def xpath(soup, path):
    return etree.HTML(str(soup)).xpath(path)[0].text

# Fonction pour sauvegarder les données extraites dans un fichier JSON
def jump_data(path:str, data:list):
    with open(path, 'w') as f:
        json.dump(data, f, indent=4)

# Fonction pour vérifier si un fichier existe déjà
def file_exist(path:str):
    return os.path.isfile(path)


In [251]:
# Fonction pour obtenir les liens de pagination de la page de tendances
def get_pagination_links():
    URL = 'https://openlibrary.org/trending/forever'
    soup = soup_func(URL)
    pagination_links = [link['href'] for link in soup.find_all('a', attrs={'class': 'ChoosePage'})][:-1:]
    pagination_links = ['/trending/forever?page=1'] + pagination_links
    return pagination_links

# Fonction pour extraire les détails des livres à partir des pages de pagination
def scrape(pagination_links):
    for pagination_link in pagination_links:
        data = []
        URL = f"https://openlibrary.org{pagination_link}"
        pagination_soup = soup_func(URL)
        list_group = pagination_soup.find_all('li', attrs={'class': 'searchResultItem'})
        links_detail_book_page = [link.find('a', attrs={'itemprop': 'url'})['href'] for link in list_group]
        for books_detail in links_detail_book_page:
            NEW_URL = f"https://openlibrary.org{books_detail}"
            new_soup = soup_func(NEW_URL)
            publishers = [publisher.text for publisher in new_soup.find_all('a', attrs={'itemprop': 'publisher'})]
            try:
                isbn = [isbn.text for isbn in new_soup.find_all('dd', attrs={'class': 'object', 'itemprop': 'isbn'})]
                isbn10 = isbn[0]
                isbn15 = isbn[1]
            except:
                isbn10 = None
                isbn15 = None
            try:
                page = new_soup.find('span', attrs={'class':'edition-pages' ,'itemprop': 'numberOfPages'}).text
            except:
                page = None
            try:
                language = [language.a.text for language in new_soup.find_all('span', attrs={'itemprop':'inLanguage'})]
            except ValueError:
                language = None
            try:
                title = new_soup.find('h1', attrs={'class':'work-title', 'itemprop':'name'}).text
            except:
                title = None
            book_data = {
                    'title': title,
                    'author': new_soup.find('a', attrs={'itemprop': 'author'}).text,
                    'publication_date': xpath(new_soup, '//*[@id="contentBody"]/div[1]/div[3]/div[5]/div/div[1]/span'),
                    'page': page,
                    'language': ' '.join(language),
                    'publishers': f"{', '.join(publishers)}",
                    'description': xpath(new_soup, '//*[@id="contentBody"]/div[1]/div[3]/div[4]/div/p[1]'),
                    'isbn10': isbn10,
                    'isbn15': isbn15,
                    'image':new_soup.find('img', attrs={'itemprop': 'image'})['src'],
                    'genre':[a.text for a in new_soup.find_all('a', attrs={'data-ol-link-track' : "BookOverview|SubjectClick"})]
                    }
            data.append(book_data)
    return jump_data('data/bs4/books.json', data)

# Fonction pour vérifier si le fichier 'books.json' existe déjà, sinon il lance le scraping
def case_not_file_exist(path:str='data/bs4/books.json'):
    if not file_exist(path):
        pagination_links = get_pagination_links()
        return scrape(pagination_links)
    else:
        return print("books.json already exists")

In [252]:
#pagination_links = get_pagination_links()
#scrape(pagination_links)

<h1 style='text-align:center'> Cleaning The Data</h1>

In [253]:
#verifie si le fichier 'books.json' existe deja ou pas avant de passer au nettoyage des données
case_not_file_exist()

books.json already exists


In [254]:
with open('data/bs4/books.json', 'r') as file:
    data = json.load(file)
    df = pd.json_normalize(data)
df

Unnamed: 0,title,author,publication_date,page,language,publishers,description,isbn10,isbn15,image,genre
0,Me Before You,Jojo Moyes,2016,,,JoJo,Louisa Clark is an ordinary young woman living...,,,https://archive.org/download/Net/page/cover_w1...,"[Young women, FICTION / Contemporary Women, FI..."
1,Shatter Me Complete Collection,Tahereh Mafi,2014,1327.0,English,HarperCollins Publishers Limited,Juliette can kill with a touch—will she wield ...,,,//covers.openlibrary.org/b/id/12986820-M.jpg,"[Children's fiction, Love, fiction, Soldiers, ..."
2,Quê hương tan rã,Chinua Achebe,2007,,Vietnamese,VHSG,Things Fall Apart is the debut novel by Nigeri...,,,//covers.openlibrary.org/b/id/14533991-M.jpg,"[20th century literature, Christianity, mascul..."
3,Storm eira,Jeff Kinney,2016,217.0,Welsh,Rily,Greg Heffley is in big trouble. School propert...,\n 1849672385\n,\n 9781849672382\n,//covers.openlibrary.org/b/id/12686385-M.jpg,"[New York Times bestseller, nyt:series_books=2..."
4,The Wrong Bride,Catharina Maura,"Oct 15, 2022",380.0,English,Ichara Publishing,She’s his fiancée’s younger sister. He’s the m...,\n 1955981183\n,\n 9781955981187\n,//covers.openlibrary.org/b/id/13161562-M.jpg,"[Fiction, Contemporary Romance, The Windsors s..."
5,Le Crime De L'Orient-Express,Agatha Christie,2015,,French,Librairie des Champs-Elysees,,\n 2702436331\n,\n 9782702436332\n,//covers.openlibrary.org/b/id/14586379-M.jpg,"[Agatha Christie, Private investigator, P.I., ..."
6,L'Étranger,Albert Camus,1963,,French,Methuen & Co. Ltd.,,,,//covers.openlibrary.org/b/id/13610145-M.jpg,"[Murder, Fiction, Medicine in Literature, Fren..."
7,La Métamorphose,Franz Kafka,"January 4, 1994",183.0,French,Flammarion,Metamorphosis (German: Die Verwandlung) is a n...,\n 2080705105\n,\n 9782080705105\n,//covers.openlibrary.org/b/id/971344-M.jpg,"[Fantasy fiction, Children's fiction, Lectures..."
8,Double Down,Jeff Kinney,2016,224.0,English,"Turtleback Books Publishing, Limited",The pressure's really piling up on Greg Heffle...,,,//covers.openlibrary.org/b/id/14589618-M.jpg,"[Motion pictures, Humorous Stories, JUVENILE F..."
9,La pequeña oruga glotona cartoné pequeña,Eric Carle,"Sep 08, 2008",26.0,,Editorial Kókinos,"One sunny day, a caterpillar pops out of an eg...",\n 8488342330\n,\n 9788488342331\n,//covers.openlibrary.org/b/id/11201899-M.jpg,"[Children's fiction, Caterpillars, fiction, To..."


In [255]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20 entries, 0 to 19
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   title             20 non-null     object
 1   author            20 non-null     object
 2   publication_date  20 non-null     object
 3   page              14 non-null     object
 4   language          20 non-null     object
 5   publishers        20 non-null     object
 6   description       16 non-null     object
 7   isbn10            9 non-null      object
 8   isbn15            9 non-null      object
 9   image             20 non-null     object
 10  genre             20 non-null     object
dtypes: object(11)
memory usage: 1.8+ KB


In [256]:
new_df = df.copy()

In [257]:
def not_information(column:str):
    return new_df[column].str.replace('None', 'Not Enough Information')

def url_to_img_html(url):
    return f'<img src="{url}" width="200" >'  

In [258]:
new_df['genre'] = new_df['genre'].apply(lambda x: ', '.join(x))
new_df['image'] = new_df['image'].apply(lambda x: 'https:' + x if x.startswith('//') else x)
new_df['language'] = new_df['language'].apply(lambda x: 'Not Enough Information' if x == '' else x)
new_df['isbn10'] = new_df['isbn10'].apply(lambda x:  str(x).strip())
new_df['isbn15'] = new_df['isbn15'].apply(lambda x:  str(x).strip())
new_df['page'] = new_df['page'].astype(str)
new_df['description'] = not_information('description')
new_df['isbn10'] = not_information('isbn10')
new_df['isbn15'] = not_information('isbn15')

# Convert image URLs to HTML tags
new_df['image'] = new_df['image'].apply(url_to_img_html)


In [259]:
new_df

Unnamed: 0,title,author,publication_date,page,language,publishers,description,isbn10,isbn15,image,genre
0,Me Before You,Jojo Moyes,2016,,Not Enough Information,JoJo,Louisa Clark is an ordinary young woman living...,Not Enough Information,Not Enough Information,"<img src=""https://archive.org/download/Net/pag...","Young women, FICTION / Contemporary Women, FIC..."
1,Shatter Me Complete Collection,Tahereh Mafi,2014,1327.0,English,HarperCollins Publishers Limited,Juliette can kill with a touch—will she wield ...,Not Enough Information,Not Enough Information,"<img src=""https://covers.openlibrary.org/b/id/...","Children's fiction, Love, fiction, Soldiers, f..."
2,Quê hương tan rã,Chinua Achebe,2007,,Vietnamese,VHSG,Things Fall Apart is the debut novel by Nigeri...,Not Enough Information,Not Enough Information,"<img src=""https://covers.openlibrary.org/b/id/...","20th century literature, Christianity, masculi..."
3,Storm eira,Jeff Kinney,2016,217.0,Welsh,Rily,Greg Heffley is in big trouble. School propert...,1849672385,9781849672382,"<img src=""https://covers.openlibrary.org/b/id/...","New York Times bestseller, nyt:series_books=20..."
4,The Wrong Bride,Catharina Maura,"Oct 15, 2022",380.0,English,Ichara Publishing,She’s his fiancée’s younger sister. He’s the m...,1955981183,9781955981187,"<img src=""https://covers.openlibrary.org/b/id/...","Fiction, Contemporary Romance, The Windsors se..."
5,Le Crime De L'Orient-Express,Agatha Christie,2015,,French,Librairie des Champs-Elysees,,2702436331,9782702436332,"<img src=""https://covers.openlibrary.org/b/id/...","Agatha Christie, Private investigator, P.I., H..."
6,L'Étranger,Albert Camus,1963,,French,Methuen & Co. Ltd.,,Not Enough Information,Not Enough Information,"<img src=""https://covers.openlibrary.org/b/id/...","Murder, Fiction, Medicine in Literature, Frenc..."
7,La Métamorphose,Franz Kafka,"January 4, 1994",183.0,French,Flammarion,Metamorphosis (German: Die Verwandlung) is a n...,2080705105,9782080705105,"<img src=""https://covers.openlibrary.org/b/id/...","Fantasy fiction, Children's fiction, Lectures ..."
8,Double Down,Jeff Kinney,2016,224.0,English,"Turtleback Books Publishing, Limited",The pressure's really piling up on Greg Heffle...,Not Enough Information,Not Enough Information,"<img src=""https://covers.openlibrary.org/b/id/...","Motion pictures, Humorous Stories, JUVENILE FI..."
9,La pequeña oruga glotona cartoné pequeña,Eric Carle,"Sep 08, 2008",26.0,Not Enough Information,Editorial Kókinos,"One sunny day, a caterpillar pops out of an eg...",8488342330,9788488342331,"<img src=""https://covers.openlibrary.org/b/id/...","Children's fiction, Caterpillars, fiction, Toy..."


In [260]:
# Convert DataFrame to HTML and display
HTML(new_df.to_html(escape=False))

Unnamed: 0,title,author,publication_date,page,language,publishers,description,isbn10,isbn15,image,genre
0,Me Before You,Jojo Moyes,2016,,Not Enough Information,JoJo,"Louisa Clark is an ordinary young woman living an exceedingly ordinary life—steady boyfriend, close family—who has never been farther afield than their tiny village. She takes a badly needed job working for ex-Master of the Universe Will Traynor, who is wheelchair-bound after an accident. Will has always lived a huge life—big deals, extreme sports, worldwide travel—and now he’s pretty sure he cannot live the way he is.\n",Not Enough Information,Not Enough Information,,"Young women, FICTION / Contemporary Women, FICTION / Romance / Contemporary, Fiction, Paraplegics, Fiction, women, Fiction, romance, contemporary, Young women, fiction, People with disabilities, fiction, Contemporary Women, Entrepreneurship, Paralysis, Contemporary, Emotions, Caregivers, Romance, Business enterprises, People with disabilities, Wheelchairs, FICTION, Love stories, Quadriplegics, Large type books, nyt:trade-fiction-paperback=2013-08-18, New York Times bestseller, New York Times reviewed, Interpersonal relations, Man-woman relationships, Romance fiction, Man-woman relations, Life change events, nyt:mass-market-paperback=2016-05-15, Success, Relaciones hombre-mujer, Bossiness, English Romance fiction, Parapléjicos, Jóvenes (Mujeres), Ficción, Specimens, Exito, English language, Men with disabilities, Spanish language materials, Translations, Novela, Cuidadores, Paralytics"
1,Shatter Me Complete Collection,Tahereh Mafi,2014,1327.0,English,HarperCollins Publishers Limited,"Juliette can kill with a touch—will she wield her power for good, or will it turn her into the monster she’s always feared she truly is? Find out in the New York Times and USA Today bestselling Shatter Me series—all six novels are now available in this paperback box set!\n",Not Enough Information,Not Enough Information,,"Children's fiction, Love, fiction, Soldiers, fiction, Science fiction"
2,Quê hương tan rã,Chinua Achebe,2007,,Vietnamese,VHSG,"Things Fall Apart is the debut novel by Nigerian author Chinua Achebe, first published in 1958. It depicts pre-colonial life in the southeastern part of Nigeria and the arrival of Europeans during the late 19th century. It is seen as the archetypal modern African novel in English, and one of the first to receive global critical acclaim. It is a staple book in schools throughout Africa and is widely read and studied in English-speaking countries around the world. The novel was first published in the UK in 1962 by William Heinemann Ltd, and became the first work published in Heinemann's African Writers Series.\n",Not Enough Information,Not Enough Information,,"20th century literature, Christianity, masculinity, British colonialism, LANGUAGE & LITERARY STUDIES, CONTEMPORARY FICTION, literature, Race relations, Men, British, Fiction, African fiction, Nigerian fiction, colonization, historical fiction, Ficción, Hombres, Missionaries, Novela histórica, Readers (Adult), Relaciones raciales, Social life and customs, Tribes, open_syllabus_project, Long Now Manual for Civilization, African literature, audiobooks, Igbo (African people), Classics, Juvenile Fiction, Children: Young Adult (Gr. 10-12), Skönlitteratur, Briten, Missionärer, Igbo (folk), FICTION / Political, Kolonialismus, FICTION / Literary, Fiction (fictional works by one author), Nigeria, fiction, Fiction, general, Romans, Hommes, Relations raciales, Britanniques, Fiction, historical, general, Criticism and interpretation, Readers (Secondary), History, Wrestlers, Igbo (african people)--fiction, Wrestlers--nigeria--fiction, 823, Literary, Political, Cultural heritage, Historical, English & college success -> english -> fiction"
3,Storm eira,Jeff Kinney,2016,217.0,Welsh,Rily,"Greg Heffley is in big trouble. School property has been damaged, and Greg is the prime suspect. But the crazy thing is, he’s innocent. Or at least sort of.\n",1849672385,9781849672382,,"New York Times bestseller, nyt:series_books=2009-01-10, Diary Of A Wimpy Kid Book 06 Cabin Fever, Families, Middle schools, Diary fiction, Juvenile fiction, Holidays, Blizzards, Boredom, Children's stories, Greg Heffley, Fiction, Friendship, Child and youth fiction, Children's fiction, Diaries, fiction, Humorous stories, Behavior, fiction, Schools, fiction, Family life, fiction, Holidays, fiction, Friendship, fiction, Diaries, JUVENILE FICTION / Humorous Stories, Family life, Behavior, JUVENILE FICTION / Comics & Graphic Novels / General, Schools, Christmas, Humor, Vandalism, Fear, Comics & Graphic Novels, Juvenile Wit and humor, General, Christmas stories"
4,The Wrong Bride,Catharina Maura,"Oct 15, 2022",380.0,English,Ichara Publishing,She’s his fiancée’s younger sister. He’s the man she’s always loved.\n,1955981183,9781955981187,,"Fiction, Contemporary Romance, The Windsors series"
5,Le Crime De L'Orient-Express,Agatha Christie,2015,,French,Librairie des Champs-Elysees,,2702436331,9782702436332,,"Agatha Christie, Private investigator, P.I., Hercule Poirot, Detective, Murder, Mystery, Juvenile, Humour, Fiction, Literature, Stories, Private investigators, Private investigators in fiction, Travel, Railroad, Trains, Express trains, Orient Express, Littérature anglaise, Roman anglais, Orient Express (Express train), Regény, Angol irodalom, Hercule Poirot (Fictitious character), Detective and mystery stories, Railroad stories, Railroad travel, Poirot, Hercule (Personaje literario), Novela policíaca, Ficción, Investigation, Juvenile fiction, Poirot, hercule (fictitious character), fiction, Private investigators, fiction, Fiction, mystery & detective, traditional, French imprints, Translations into French, English Detective and mystery stories, French fiction, English fiction, English literature, Readers, English language, textbooks for foreign speakers, Hercule Poirot (Fictional character), Mystery fiction, Large type books, Dorian. Grey (Fictitious character), Détectives, Romans, nouvelles, Orient-Express (Train rapide), Meurtre, Enquêtes, Spanish language materials, Novela, Detectives privados, Comics & graphic novels, crime & mystery"
6,L'Étranger,Albert Camus,1963,,French,Methuen & Co. Ltd.,,Not Enough Information,Not Enough Information,,"Murder, Fiction, Medicine in Literature, French, Novelas francesas, Asesinato, Ficción, Relatos de aventuras, Philosophical Novels, Critique et interprétation, Social conditions, Camus, Albert, 1913-1960. L'étranger, Criticism and interpretation, French language materials, Algeria, fiction, Fiction, general, Homicide, Adventure stories, Action and adventure fiction, Continental european fiction (fictional works by one author), Trials (Murder), Young men, History, Fictional Works Publication Type, Nobel Prize for Literature laureate, Literary, Translations into English, Psychological, Fictional Works [Publication Type], Fictional Works, Classics, French fiction, Fiction, psychological, Large type books, Argelia, Novela, Relatos de aventura, Social Marginality, Romance literature, Death, Juvenile fiction, Happiness, Life, French language, Murder in fiction, Algeria in fiction, Angestellter, Mord, Todesstrafe, Étranger (Camus, Albert), Comics & graphic novels, literary, Novela francesa, Français, Romans, nouvelles, England, fiction, French Philosophy, Murder--fiction, Pq2605.a3734 e813 1989, 843/.914, World literature, Fiction subjects, Peoples & cultures - fiction"
7,La Métamorphose,Franz Kafka,"January 4, 1994",183.0,French,Flammarion,"Metamorphosis (German: Die Verwandlung) is a novella written by Franz Kafka which was first published in 1915. One of Kafka's best-known works, Metamorphosis tells the story of salesman Gregor Samsa, who wakes one morning to find himself inexplicably transformed into a huge insect (German: ungeheueres Ungeziefer, lit. ""monstrous vermin"") and subsequently struggles to adjust to this new condition. The novella has been widely discussed among literary critics, with differing interpretations being offered. In popular culture and adaptations of the novella, the insect is commonly depicted as a cockroach.\n",2080705105,9782080705105,,"Fantasy fiction, Children's fiction, Lectures et morceaux choisis, Fantastique, Nouvelles, Métamorphose, Romans, Famille, Metamorphosis, Die Verwandlung, an interpretation, Kikuyu (African people), Native races, Afforestation, Indigenous peoples, Readers, Textbooks for foreign speakers, Fiction, German fiction, Texts, Social problems, English, German language, Continental european fiction (fictional works by one author), Fiction, psychological, Alienación (Psicología social), Alienation (Social psychology), Criticism and interpretation, Crítica e interpretación, Familia, Family, Ficción, Insectos, Insects, Sales personnel, Vendedores, Translations into English, Families, Fiction, short stories (single author), Kafka, franz, 1883-1924, Korean, Dictionaries, English language, Psychological fiction, German, Trials, Spanish, French, Chinese, Fiction, general, FICTION / Classics, LITERARY CRITICISM / General, FICTION / Literary, German language, readers, Classic Literature, Nouvelles fantastiques, Germanic literature, Fiction, science fiction, general, Alienation (Psychology), Fiction, historical, general, Verwandlung (Kafka, Franz), Allemand (Langue), Textes, Manuels pour anglophones, Problèmes sociaux, Romans, nouvelles, Austrian Short stories, Fiction In Translation, Kafka, franz , 1883-1924, Criticism and interpretationkafka, franz , 1883-1924, Metamorphosis--fiction, Alienation (social psychology)--fiction, Working class families, Working class families--fiction, German fiction--20th century"
8,Double Down,Jeff Kinney,2016,224.0,English,"Turtleback Books Publishing, Limited","The pressure's really piling up on Greg Heffley. His mom thinks video games are turning his brain to mush, so she wnats her son to put down the controller and explore his ""creative side"".\n",Not Enough Information,Not Enough Information,,"Motion pictures, Humorous Stories, JUVENILE FICTION, JUVENILE FICTION / Humorous Stories, Comics & Graphic Novels, Video games, Family life, Fiction, Comics & Graphic Novels - General, JUVENILE FICTION / Social Themes / Friendship, Families, Diaries, Halloween, Children's Books/Ages 9-12 Fiction, General, JUVENILE FICTION / Comics & Graphic Novels / General, Production and direction, Cloth or Hardcover, Children's fiction, Halloween, fiction, Games, fiction, Family, fiction, Diaries, fiction, Motion pictures, fiction, Schools, Friendship, Children's stories, Diary Fiction, Drama, Middle school, Diary Of A Wimpy Kid Book 11 Double Down, Family life, fiction, Literature and fiction, juvenile, Comedy, Heffley, Greg -- Juvenile fiction, Heffley, Greg, Large type books, Diaries -- Fiction, Family life -- Fiction, Motion pictures -- Production and direction -- Fiction, Halloween -- Fiction, Humorous stories, JUVENILE FICTION -- Humorous Stories, JUVENILE FICTION -- Comics & Graphic Novels -- General"
9,La pequeña oruga glotona cartoné pequeña,Eric Carle,"Sep 08, 2008",26.0,Not Enough Information,Editorial Kókinos,"One sunny day, a caterpillar pops out of an egg. He is very hungry and begins searching for food. He eats his way through ten very sweet pages and gets a tummy ache before finally finding a good, healthy leaf, which makes him sleepy. Then something really amazing happens. But you will have to read it your self to find out what!\n",8488342330,9788488342331,,"Children's fiction, Caterpillars, fiction, Toy and movable books, Fiction, Caterpillars, German language materials, Italian language materials, Specimens, Translations into Chinese, Chinese language edition, Caterpillar, Translations into Vietnamese, Metamorphosis, Spanish language materials, Children's stories, American, Open Library Staff Picks, Children's stories, English, Butterflies, Juvenile, Juvenile fiction, Board books, Translations into Urdu, Children's stories, Chinese, Libros de juguete y movibles, Muestras, Orugas, Especímenes, Novela juvenil, Libros de juguete y móviles, Butterflies, fiction, Picture books for children, Flowers, fiction, Fiction.., Littérature de jeunesse anglaise, Chenilles, Caterpillers, Papillons, Metamorphis, Ouvrages illustrés, Métamorphose, nyt:picture-books=2009-04-05, New York Times bestseller, Metamorfosis (Biología), Mariposas, Ficción juvenil, Bilingual, Child and youth fiction, Picture books, Animals, Children's stories, Pictorial works, Touch, Textured books, Histoires pour enfants, Livres tout carton, Toucher, Livres tactiles, Children's Books/Baby-Preschool, Espagnol (langue), Ouvrages pour la jeunesse, Children's stories, Kyrgyz"


<h1 style='text-align:center'> Scarping With Selenium</h1>

In [None]:
from selenium import webdriver
from selenium.webdriver.edge.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import StaleElementReferenceException
import pandas as pd

In [None]:

# from selenium.webdriver.edge.options import Options

options = webdriver.ChromeOptions() #.EdgeOptions()

# options.add_argument('--headless')
service = Service(executable_path='driver/chrome/chromedriver-linux64/chromedriver')
driver = webdriver.Chrome(service=service, options=options)#.Edge(service=service, options=options)
driver.get('https://openlibrary.org/')

wait = WebDriverWait(driver, 50)

# selection des navbars à explorer par le bot
explorer_menu = wait.until(EC.element_to_be_clickable((By.XPATH, '//summary[contains(text(), "Explorer")]')))
en_vogue_navbar = driver.find_element(By.XPATH, '//*[@id="header-bar"]/ul[1]/li[2]/div/details/div/ul/li[2]/a')

# cliquer sur les navigations
explorer_menu.click()
en_vogue_navbar.click()

driver.implicitly_wait(7)
# selectionner la navigation 'de tous les temps'
de_tous_temps_tri = wait.until(EC.element_to_be_clickable(driver.find_element(By.XPATH, '//*[@id="contentBody"]/ul/li[6]/a')))
de_tous_temps_tri.click()

# Liste pour stocker les données des livres
data = []

pagination_pages = len(driver.find_elements(By.CLASS_NAME, 'ChoosePage')[:-1])
print(pagination_pages)
counter = 0

# Remplacez 'books_link' par la méthode appropriée pour obtenir les liens des livres
while True:
    books_link = driver.find_elements(By.CSS_SELECTOR, '.results[itemprop="url"]')
    for i in range(len(books_link)):
        try:
            books_link[i].click()
        except StaleElementReferenceException:
            books_link = driver.find_elements(By.CSS_SELECTOR, '.results[itemprop="url"]')
            books_link[i].click()
        # Extraction des informations
        try:
            title = driver.find_element(By.XPATH, '/html/body/div[4]/div[2]/div[1]/div[3]/div[2]/span/h1').text
        except:
            title = None
        
        try:
            author = driver.find_element(By.CSS_SELECTOR, '[itemprop="author"]').get_attribute('textContent').strip()#.text
        except:
            author = None

        try:
            publication_date = driver.find_element(By.XPATH, '//*[@id="contentBody"]/div[1]/div[3]/div[5]/div/div[1]/span').text
        except:
            publication_date = None

        try:
            page = driver.find_element(By.CSS_SELECTOR, '[itemprop="numberOfPages"]').text
        except:
            page = None

        try:
            language = [language.get_attribute('textContent').strip() for language in driver.find_elements(By.XPATH, '//*[@id="contentBody"]/div[1]/div[3]/div[5]/div/div[3]/span/a')]
        except:
            language = None

        try:
            publishers = [publisher.get_attribute('textContent').strip() for publisher in driver.find_elements(By.CSS_SELECTOR, '[itemprop="publisher"]')]
        except:
            publishers = None

        try:
            description = driver.find_element(By.XPATH, '//*[@id="contentBody"]/div[1]/div[3]/div[4]/div/p[1]').text
        except:
            description = None

        try:
            isbn = [isbn.text for isbn in driver.find_elements(By.CSS_SELECTOR, '[itemprop="isbn"]')]
            isbn10 = isbn[0] if len(isbn) > 0 else None
            isbn15 = isbn[1] if len(isbn) > 1 else None
        except:
            isbn10 = None
            isbn15 = None

        try:
            image = driver.find_element(By.CSS_SELECTOR, '[itemprop="image"]').get_attribute('src')
        except:
            image = None

        try:
            genre = [a.get_attribute('textContent').strip() for a in driver.find_elements(By.CSS_SELECTOR, '[data-ol-link-track="BookOverview|SubjectClick"]')]
        except:
            genre = None

        # Création du dictionnaire de données du livre
        book_data = {
            'title': title,
            'author': author,
            'publication_date': publication_date,
            'page': page,
            'language': ' '.join(language) if language else None,
            'publishers': ', '.join(publishers) if publishers else None,
            'description': description,
            'isbn10': isbn10,
            'isbn15': isbn15,
            'image': image,
            'genre': genre
        }

        # Ajout des données du livre à la liste
        data.append(book_data)
        driver.back()
    if counter < pagination_pages:
        driver.find_element(By.XPATH, '/html/body/div[4]/div[2]/div[2]/div[2]/a[last()]').click()
        wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, '.results[itemprop="url"]')))
        counter += 1
    else:
        break
driver.quit()
print(data)
