In [1]:
# Les imports des bibliothèques
import time
import random
import re
import os
import logging
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys

In [2]:
# Paths and URLs
#
# The driver and output locations default to the original machine-specific
# paths, but each can be overridden with an environment variable so the
# notebook can run on another machine without editing this cell.

DRIVER_PATH = os.environ.get(
    'CHROMEDRIVER_PATH',
    r'C:\Users\lucie\OneDrive\Documents\dependance\chromedriver_win32\chromedriver',
)
SAVING_PATH = os.environ.get(
    'SCRAPING_OUTPUT_DIR',
    r'C:\Users\lucie\OneDrive\Documents\dependance\scraping',
)

# Pages to open. A direct place URL can be listed instead of the generic
# Google Maps entry page, for example:
#   "https://www.google.com/maps/place/Rectorat+de+l'acad%C3%A9mie+de+Lille/@50.6261288,3.0786046,17z/data=!3m1!4b1!4m6!3m5!1s0x47c32a77b52d7745:0x6571dfb526d328d9!8m2!3d50.6261254!4d3.0811795!16s%2Fg%2F1tvq38jm"
OBJECT_URLS = [
    "https://www.google.com/maps/",
]

In [3]:
# Logging setup: the root handler prints "[HH:MM:SS] [LEVEL] - message",
# and the notebook's own 'main' logger lets everything from DEBUG upwards through.

logging.basicConfig(
    datefmt='%H:%M:%S',
    format='[%(asctime)s] [%(levelname)s] - %(message)s',
)

logger = logging.getLogger('main')
logger.setLevel(logging.DEBUG)

In [4]:
# Scraper: opens a Google Maps page, searches for the given location and collects its details and reviews

def _accept_cookies(driver) -> None:
    """Dismiss the cookie-consent dialog if the page shows one."""
    logger.debug("Accepting the cookies")
    try:
        driver.find_element(By.CLASS_NAME, "lssxud").click()
    except NoSuchElementException:
        # No consent dialog on this load: nothing to dismiss, keep going.
        logger.debug("No cookie dialog found, continuing")


def _open_reviews_panel(driver) -> tuple:
    """Read the rating summary, open the review list and locate its scroll container.

    Returns a tuple ``(overall_rating, review_number, scrollable_div)``.
    """
    # Google sometimes serves the page with a slightly different structure.
    # The first layout is tried here; if one of its elements is missing,
    # the except branch below handles the alternative layout.
    try:
        summary_text = driver.find_element(
            By.CSS_SELECTOR,
            'div.F7nice.mmu3tf'
        ).text

        overall_rating = summary_text.split()[0]
        logger.debug(f'Overall_rating OK : {overall_rating}')

        # Spaces are removed first so that a count written as "1 234"
        # is read as a single number; the last number found is the count.
        review_number = int(re.findall(r'\d+', summary_text.replace(' ', ''))[-1])
        logger.debug(f'Review_number OK : {review_number}')

        # click to load further reviews
        driver.find_element(
            By.XPATH,
            '//*[@id="QA0Szd"]/div/div/div[1]/div[2]/div/div[1]/div/div/div[2]/div[1]/div[1]/div[2]/div/div[1]/div[2]/span[2]/span[1]/span'
        ).click()
        logger.debug('Clicked to load further reviews')

        time.sleep(random.uniform(0.1, 0.5))

        # find scroll layout
        scrollable_div = driver.find_element(
            By.XPATH,
            '//*[@id="QA0Szd"]/div/div/div[1]/div[2]/div/div[1]/div/div/div[2]'
        )
        logger.debug('Scroll div OK')

    except NoSuchElementException:

        logger.debug('Except branch')

        div_num_rating = driver.find_element(
            By.CSS_SELECTOR,
            'div.F7nice'
        ).text
        overall_rating = div_num_rating.split()[0]
        logger.debug(f'Overall_rating OK : {overall_rating}')

        review_number = int(div_num_rating.split()[1].replace('(', '').replace(')', ''))
        logger.debug(f'Review_number OK : {review_number}')

        # click on the review tab
        driver.find_element(
            By.XPATH,
            '/html/body/div[3]/div[9]/div[9]/div/div/div[1]/div[2]/div/div[1]/div/div/div[3]/div[1]'
        ).click()
        logger.debug('clicked to load further reviews')

        time.sleep(random.uniform(0.1, 0.5))

        # find scroll layout
        scrollable_div = driver.find_element(
            By.XPATH,
            '//*[@id="QA0Szd"]/div/div/div[1]/div[2]/div/div[1]/div/div/div[3]'
        )
        logger.debug('Scroll div OK')

    return overall_rating, review_number, scrollable_div


def scrape_an_object(object_url: str, location: str = '') -> tuple:
    """Open a Google Maps page in Chrome and collect a place's details and review HTML.

    Args:
        object_url: URL to open in the browser.
        location: Text typed into the Maps search box before scraping. When
            empty (the default) the search step is skipped and the page
            opened by ``object_url`` is scraped directly.

    Returns:
        A tuple ``(store_main_data, reviews_source)`` where ``store_main_data``
        is a dict with the keys ``object_name``, ``object_address``,
        ``overall_rating``, ``review_num`` and ``object_url``, and
        ``reviews_source`` holds the parsed ``div.jJc9Ad`` elements, one per
        review found in the page source.

    Raises:
        NoSuchElementException: if an element the scraper relies on is not
            present in either of the two handled page layouts.
    """
    # setting the chrome driver for selenium
    driver = webdriver.Chrome(service=Service(DRIVER_PATH))

    # Everything that touches the browser runs inside try/finally so the
    # browser and its driver process are released even when a step fails.
    try:
        # opening the given URL
        logger.debug("Opening the given URL")
        driver.get(object_url)

        _accept_cookies(driver)

        # writing the location in the search box
        time.sleep(2)
        if location:
            search_box = driver.find_element(By.XPATH, '//*[@id="searchboxinput"]')
            search_box.send_keys(location)
            search_box.send_keys(Keys.ENTER)

        # waiting some random seconds
        time.sleep(random.uniform(4, 6))

        object_name = driver.find_element(
            By.CSS_SELECTOR,
            'h1.DUwDvf.fontHeadlineLarge'
        ).text
        logger.debug(f'Object_name OK : {object_name}')

        object_address = driver.find_element(
            By.CSS_SELECTOR,
            'div.Io6YTe.fontBodyMedium'
        ).text
        logger.debug(f'Object_address OK : {object_address}')

        overall_rating, review_number, scrollable_div = _open_reviews_panel(driver)

        time.sleep(random.uniform(2, 4))

        # scroll as many times as necessary to load all reviews
        # NOTE(review): the divisor presumably reflects how many reviews load
        # per scroll - confirm against the live page.
        for _ in range(round(review_number / 5 - 1) + 1):
            driver.execute_script(
                'arguments[0].scrollTop = arguments[0].scrollHeight',
                scrollable_div
            )
            time.sleep(random.uniform(1, 2))

        # Expand the truncated reviews ("lire plus" buttons). This is done
        # after scrolling so that the buttons of reviews loaded by the
        # scrolling are expanded too, not only those of the first batch.
        for more_button in driver.find_elements(By.CLASS_NAME, 'w8nwRe.kyuRq'):
            more_button.click()

        # parse the html with a bs object
        response = BeautifulSoup(driver.page_source, 'html.parser')
        reviews_source = response.find_all('div', class_='jJc9Ad')
        logger.debug('Source code has been parsed!')

    finally:
        # quit() ends the whole WebDriver session, not just the current window
        driver.quit()

    # storing the data in a dict
    store_main_data = {'object_name': object_name,
                       'object_address': object_address,
                       'overall_rating': overall_rating,
                       'review_num': review_number,
                       'object_url': object_url}

    return store_main_data, reviews_source


In [5]:
def _text_or(node, default: str, strip: bool = False) -> str:
    """Return the text of a parsed node, or ``default`` when the node is missing."""
    if node is None:
        return default
    return node.text.strip() if strip else node.text


def extract_reviews(reviews_source: list) -> list:

    r"""
    Turn the parsed review elements into a list of plain dictionaries.

    Each element of ``reviews_source`` is searched for the author, date,
    star container, review text and owner reply. Every lookup tolerates a
    missing node, so one incomplete review does not abort the extraction
    of all the others.

    Args:
        reviews_source: parsed review elements exposing
            ``find(tag, class_=...)``.

    Returns:
        One dict per review with the keys ``name``, ``date``, ``rate``,
        ``review_text`` and ``reply``. Missing name, date or text become
        ``''``, a missing reply becomes ``'-'`` and a missing star
        container gives ``rate = None``.
    """

    review_list = []

    logger.debug('Starting iterate trough the reviews...')
    for review in reviews_source:

        # extract the relevant pieces of information
        user_node = review.find('div', class_='d4r55')
        date_node = review.find('span', class_='rsqaWe')
        stars_node = review.find('span', class_='kvMYJc')
        text_node = review.find('span', class_='wiI7pd')
        reply_node = review.find('div', class_='CDe7pd')

        # The rating is the number of direct children of the star container.
        # NOTE(review): confirm that only the filled stars are children of
        # this span; otherwise this count is the same for every review.
        rate = None if stars_node is None else len(stars_node)

        review_list.append({'name': _text_or(user_node, '', strip=True),
                            'date': _text_or(date_node, '', strip=True),
                            'rate': rate,
                            'review_text': _text_or(text_node, ''),
                            'reply': _text_or(reply_node, '-')})

    return review_list


In [9]:

def main():
    """Scrape every entry of ``OBJECT_URLS`` and export all reviews to a CSV file.

    Each entry of ``OBJECT_URLS`` is either a URL string or a
    ``(url, location)`` pair; for a bare URL an empty location is passed to
    ``scrape_an_object``. Exactly one record is stored per entry: the scraped
    data with its reviews, or an ``'Error'`` placeholder when scraping or
    extraction fails. The records are flattened to one row per review and
    written to ``google_result.csv`` inside ``SAVING_PATH``.
    """

    scraped_data = []

    # loop through the urls and call the necessary functions to populate the scraped_data list
    for i, entry in enumerate(OBJECT_URLS):
        if isinstance(entry, str):
            url, location = entry, ''
        else:
            url, location = entry

        try:
            time.sleep(random.uniform(3,10))

            store_main_data, reviews_source = scrape_an_object(url, location)

            # The record is built locally and appended once after the
            # try/except, so a failure during extraction cannot leave two
            # entries for the same URL.
            record = dict(store_main_data)
            record['reviews'] = extract_reviews(reviews_source)
            print (record['review_num'], len(record['reviews']))

            if record['review_num'] != len(record['reviews']):
                logger.warning(f'For some reason not all the reviews had been scraped for the following object: {store_main_data["object_name"]}')

        except Exception as exception:
            # exc_info keeps the traceback in the log next to the failing URL
            logger.error(f'{url} \n {exception}', exc_info=True)
            record = {'object_name': 'Error',
                      'object_address': 'Error',
                      'overall_rating': 'None',
                      'review_num': 'None',
                      'object_url': url,
                      'reviews': [{}]
                      }

        scraped_data.append(record)

        logger.info(f' {i+1} URL has been finished from the total of {len(OBJECT_URLS)}')


    # flatten the records with pandas: one row per review, with the
    # place-level fields repeated on every row
    result_df = pd.json_normalize(
                scraped_data,
                record_path = ['reviews'],
                errors='ignore',
                meta=['object_name', 'object_address', 'overall_rating', 'review_num', 'object_url']
                )

    # TODO: reorder the columns

    # Saving the result into a CSV file (the target folder is created if missing)
    os.makedirs(SAVING_PATH, exist_ok=True)
    save_path = os.path.join(SAVING_PATH,'google_result.csv')
    result_df.to_csv(
        save_path,
        index= False
    )

    logger.info(f'Successfully exported the result file in the following folder: {save_path}')
    logger.info('Finished!')


if __name__ == '__main__':
    main()

KeyboardInterrupt: 