In [4]:
# Library imports
import time
import random
import re
import os
import logging
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

In [1]:
# Paths and URLs

# Folder that holds the local tooling; both paths below live under it.
# NOTE(review): absolute, machine-specific Windows path — adjust per workstation.
_DEPENDENCY_DIR = r'C:\Users\lucie\OneDrive\Documents\dependance'

# chromedriver executable handed to Selenium's Service() by the scraper.
DRIVER_PATH = _DEPENDENCY_DIR + r'\chromedriver_win32\chromedriver'
# Presumably the output directory for scraped data — not used in the visible cells.
SAVING_PATH = _DEPENDENCY_DIR + r'\scraping'

# Google Maps place pages to scrape (one full URL per entry).
OBJECT_URLS = [
    (
        "https://www.google.com/maps/place/Rectorat+de+l'acad%C3%A9mie+de+Lille/"
        "@50.6261288,3.0786046,17z/"
        "data=!3m1!4b1!4m6!3m5!1s0x47c32a77b52d7745:0x6571dfb526d328d9"
        "!8m2!3d50.6261254!4d3.0811795!16s%2Fg%2F1tvq38jm"
    ),
]

In [5]:
# Logging

# Configure the root logger's output as "[HH:MM:SS] [LEVEL] - message".
# (logging.basicConfig does nothing if the root logger already has handlers.)
logging.basicConfig(
    datefmt='%H:%M:%S',
    format='[%(asctime)s] [%(levelname)s] - %(message)s',
)

# Logger used by the scraping code; DEBUG so every step is traced.
logger = logging.getLogger('main')
logger.setLevel(logging.DEBUG)

In [6]:
# The scraper

def _parse_review_count(rating_text: str) -> int:
    """Extract the number of reviews from the text of the rating summary element.

    The parenthesised group is preferred when there is one; otherwise everything
    after the first whitespace-separated token (the rating itself) is used.
    All non-digit characters are dropped before conversion, so thousands
    separators (comma, dot, regular or no-break space) do not truncate the count.

    Raises:
        ValueError: if no digits can be found for the review count.
    """
    parenthesised = re.findall(r'\(([^)]*)\)', rating_text)
    if parenthesised:
        candidate = parenthesised[-1]
    else:
        # str.split() also splits on no-break spaces, so re-join the remainder
        # instead of picking a single token.
        candidate = ' '.join(rating_text.split()[1:])

    digits = re.sub(r'\D', '', candidate)
    if not digits:
        raise ValueError(f'No review count found in {rating_text!r}')
    return int(digits)


def scrape_an_object(object_url: str) -> tuple:
    """Scrape the summary data and the review blocks of one Google Maps place.

    Opens ``object_url`` in a Chrome instance driven by Selenium, accepts the
    cookie dialog when its button is present, reads the place name, address,
    overall rating and review count, opens the reviews panel, scrolls it to
    load the reviews, and parses the resulting page with BeautifulSoup.

    Args:
        object_url: URL of the place page to open.

    Returns:
        A ``(store_main_data, reviews_source)`` tuple:
        ``store_main_data`` is a dict with the keys ``object_name``,
        ``object_address``, ``overall_rating`` (string, as displayed),
        ``review_num`` (int) and ``object_url``;
        ``reviews_source`` is the result of ``find_all('div', class_='jJc9Ad')``
        on the parsed page.

    Raises:
        NoSuchElementException: if an expected element is missing in both
            known page layouts.
        ValueError: if the review count cannot be parsed.
    """
    # setting the chrome driver for selenium
    driver = webdriver.Chrome(service=Service(DRIVER_PATH))

    # try/finally guarantees that the browser and the chromedriver process are
    # shut down even when one of the lookups below raises.
    try:
        # opening the given URL
        logger.debug("Opening the given URL")
        driver.get(object_url)

        # accepting the cookies; find_elements returns an empty list instead of
        # raising, so a page served without the button no longer aborts the run
        logger.debug("Accepting the cookies")
        consent_buttons = driver.find_elements(By.CLASS_NAME, "lssxud")
        if consent_buttons:
            consent_buttons[0].click()
        else:
            logger.debug("No cookie button found, continuing without it")

        # waiting some random seconds
        time.sleep(random.uniform(4, 6))

        # CSS selectors
        object_name = driver.find_element(
            By.CSS_SELECTOR,
            'h1.DUwDvf.fontHeadlineLarge'
        ).text
        logger.debug(f'Object_name OK : {object_name}')

        object_address = driver.find_element(
            By.CSS_SELECTOR,
            'div.Io6YTe.fontBodyMedium'
        ).text
        logger.debug(f'Object_address OK : {object_address}')

        # for some reason sometimes google full randomly loads the page
        # with a slightly different page structure. to be able to handle this,
        # the except branch scrapes the right objects in that scenario
        try:
            # rating and review count come from the same element: read it once
            rating_text = driver.find_element(
                By.CSS_SELECTOR,
                'div.F7nice.mmu3tf'
            ).text
            overall_rating = rating_text.split()[0]
            logger.debug(f'Overall_rating OK : {overall_rating}')

            review_number = _parse_review_count(rating_text)
            logger.debug(f'Review_number OK : {review_number}')

            # click to load further reviews
            driver.find_element(
                By.XPATH,
                '//*[@id="QA0Szd"]/div/div/div[1]/div[2]/div/div[1]/div/div/div[2]/div[1]/div[1]/div[2]/div/div[1]/div[2]/span[2]/span[1]/span'
            ).click()
            logger.debug('Clicked to load further reviews')

            time.sleep(random.uniform(0.1, 0.5))

            # find scroll layout
            scrollable_div = driver.find_element(
                By.XPATH,
                '//*[@id="QA0Szd"]/div/div/div[1]/div[2]/div/div[1]/div/div/div[2]'
            )
            logger.debug('Scroll div OK')

        except NoSuchElementException:

            logger.debug('Except branch')

            div_num_rating = driver.find_element(
                By.CSS_SELECTOR,
                'div.F7nice'
            ).text
            overall_rating = div_num_rating.split()[0]
            logger.debug(f'Overall_rating OK : {overall_rating}')

            review_number = _parse_review_count(div_num_rating)
            logger.debug(f'Review_number OK : {review_number}')

            # click on the review tab
            driver.find_element(
                By.XPATH,
                '/html/body/div[3]/div[9]/div[9]/div/div/div[1]/div[2]/div/div[1]/div/div/div[3]/div[1]'
            ).click()
            logger.debug('clicked to load further reviews')

            time.sleep(random.uniform(0.1, 0.5))

            # find scroll layout
            scrollable_div = driver.find_element(
                By.XPATH,
                '//*[@id="QA0Szd"]/div/div/div[1]/div[2]/div/div[1]/div/div/div[3]'
            )
            logger.debug('Scroll div OK')

        time.sleep(random.uniform(2, 4))

        # scroll as many times as necessary to load all reviews
        # (same count as before: one scroll per ~5 reviews; a non-positive
        # count simply yields an empty range)
        scroll_count = round(review_number / 5 - 1) + 1
        for _ in range(scroll_count):
            driver.execute_script(
                'arguments[0].scrollTop = arguments[0].scrollHeight',
                scrollable_div
            )
            time.sleep(random.uniform(1, 2))

        # parse the html with a bs object
        response = BeautifulSoup(driver.page_source, 'html.parser')
        reviews_source = response.find_all('div', class_='jJc9Ad')
        logger.debug('Source code has been parsed!')

        # storing the data in a dict
        store_main_data = {'object_name': object_name,
                           'object_address': object_address,
                           'overall_rating': overall_rating,
                           'review_num': review_number,
                           'object_url': object_url}

        return store_main_data, reviews_source

    finally:
        # quit() ends the whole WebDriver session (close() only closes the
        # current window and leaves the driver process running)
        driver.quit()

"""

'\n    # I use CSS selectors where I can, because its more robust than XPATH\n    object_name = driver.find_element(\n        By.CSS_SELECTOR,\n        \'h1.DUwDvf.fontHeadlineLarge\'\n    ).text\n    logger.debug(f\'Object_name OK : {object_name}\')\n\n    object_address = driver.find_element(\n        By.CSS_SELECTOR,\n        \'div.Io6YTe.fontBodyMedium\'\n    ).text\n    logger.debug(f\'Object_address OK : {object_address}\')\n\n\n    # for some reason sometimes google full randomly loads the page\n    # with a slightly different page structure. to be able to handle this,\n    # I created an except branch that scrapes the right objects in that scenario\n    try:\n\n        overall_rating = driver.find_element(\n            By.CSS_SELECTOR,\n            \'div.F7nice.mmu3tf\'\n        ).text.split()[0]\n        logger.debug(f\'Overall_rating OK : {overall_rating}\')\n\n        review_number = driver.find_element(\n            By.CSS_SELECTOR,\n            \'div.F7nice.mmu3tf\'\n     