In [2]:
import time
import random
import re
import os
import logging
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys


# path of the chromedriver binary handed to selenium's Service() in scrape_an_object
# NOTE(review): this is a Linux home path while SAVING_PATH below is a macOS
# home path -- confirm both exist on the machine that runs the scraper
DRIVER_PATH = r'/home/oli/Projects/Google-review-scraper/chromedriver_linux64/chromedriver'
# directory in which main() writes google_reviews.csv
SAVING_PATH = '/Users/camille/repo/Hetic/projet_gouv/scraping/Data'

# page opened in the browser for every location; despite the plural name this
# is a single URL string (not a list) -- main() passes it as object_url
OBJECT_URLS = "https://www.google.com/maps/"


# setting up the logging object: messages are printed as
# "[HH:MM:SS] [LEVEL] - message"
logger = logging.getLogger('main')
logging.basicConfig(
    format='[%(asctime)s] [%(levelname)s] - %(message)s',
    datefmt='%H:%M:%S'
    )

# logging level of this script's logger; the scraping steps are reported with
# logger.debug, so a higher level (e.g. logging.INFO) silences them
logger.setLevel(logging.DEBUG)


def scrape_an_object(object_url, location):
    """Search Google Maps for ``location`` and collect the HTML of its reviews.

    A new Chrome session is started, ``object_url`` is opened, the cookie
    banner is accepted and ``location`` is typed into the search box. The
    review panel of the resulting place page is then scrolled repeatedly
    before the page source is parsed with BeautifulSoup. The browser is always
    closed before returning, including when a lookup fails.

    Args:
        object_url: URL opened first (``main`` passes ``OBJECT_URLS``).
        location: Free-text query sent to the Maps search box.

    Returns:
        A ``(store_main_data, reviews_source)`` tuple. ``store_main_data`` is
        a dict with the keys ``object_name``, ``object_address``,
        ``overall_rating``, ``review_num`` and ``object_url``.
        ``reviews_source`` is the result of
        ``find_all('div', class_='jJc9Ad')`` on the parsed page.

    Raises:
        NoSuchElementException: If the cookie button, the search box, the
            place name/address or the elements of the fallback layout cannot
            be found.
    """
    # Local import so the narrowed ``except`` below does not depend on the
    # file-level import list.
    from selenium.common.exceptions import WebDriverException

    # setting the chrome driver for selenium
    driver = webdriver.Chrome(service=Service(DRIVER_PATH))

    # try/finally guarantees the browser is closed even when an element is
    # missing; otherwise every location would leave a Chrome process behind.
    try:
        # opening the given URL
        logger.debug("Opening the given URL")
        driver.get(object_url)

        # accepting the cookies
        logger.debug("Accepting the cookies")
        driver.find_element(By.CLASS_NAME, "lssxud").click()

        # waiting some random seconds
        time.sleep(random.uniform(4, 6))
        select_box = driver.find_element(By.XPATH, '//*[@id="searchboxinput"]')
        select_box.send_keys(location)
        select_box.send_keys(Keys.ENTER)
        time.sleep(2)

        object_name = driver.find_element(
            By.CSS_SELECTOR,
            'h1.DUwDvf.fontHeadlineLarge'
        ).text
        logger.debug(f'Object_name OK : {object_name}')

        object_address = driver.find_element(
            By.CSS_SELECTOR,
            'div.Io6YTe.fontBodyMedium'
        ).text
        logger.debug(f'Object_address OK : {object_address}')

        # Two page layouts are handled: the first one is tried and, when one
        # of its elements is missing, the except branch reads the other one.
        try:
            # the rating block is read once; both values come from its text
            rating_text = driver.find_element(
                By.CSS_SELECTOR,
                'div.F7nice.mmu3tf'
            ).text

            overall_rating = rating_text.split()[0]
            logger.debug(f'Overall_rating OK : {overall_rating}')

            # the last run of digits (spaces removed) is the review count
            review_number = int(
                re.findall(r'\d+', rating_text.replace(' ', ''))[-1]
            )
            logger.debug(f'Review_number OK : {review_number}')

            # click to load further reviews
            driver.find_element(
                By.XPATH,
                '//*[@id="QA0Szd"]/div/div/div[1]/div[2]/div/div[1]/div/div/div[2]/div[1]/div[1]/div[2]/div/div[1]/div[2]/span[2]/span[1]/span'
            ).click()
            logger.debug('Clicked to load further reviews')

            time.sleep(random.uniform(0.1, 0.5))

            # find scroll layout
            scrollable_div = driver.find_element(
                By.XPATH,
                '//*[@id="QA0Szd"]/div/div/div[1]/div[2]/div/div[1]/div/div/div[2]'
            )
            logger.debug('Scroll div OK')

        except NoSuchElementException:
            logger.debug('Except branch')

            # here the text is split as "<rating> (<count>)"
            div_num_rating = driver.find_element(
                By.CSS_SELECTOR,
                'div.F7nice'
            ).text
            overall_rating = div_num_rating.split()[0]
            logger.debug(f'Overall_rating OK : {overall_rating}')

            review_number = int(
                div_num_rating.split()[1].replace('(', '').replace(')', '')
            )
            logger.debug(f'Review_number OK : {review_number}')

            # click on the review tab
            driver.find_element(
                By.XPATH,
                '/html/body/div[3]/div[9]/div[9]/div/div/div[1]/div[2]/div/div[1]/div/div/div[3]/div[1]'
            ).click()
            logger.debug('clicked to load further reviews')

            time.sleep(random.uniform(0.1, 0.5))

            # find scroll layout
            scrollable_div = driver.find_element(
                By.XPATH,
                '//*[@id="QA0Szd"]/div/div/div[1]/div[2]/div/div[1]/div/div/div[3]'
            )
            logger.debug('Scroll div OK')

        time.sleep(random.uniform(2, 4))

        # scroll as many times as necessary to load all reviews; the number
        # of scrolls is derived from the review count (about one per five)
        for _ in range(round(review_number / 5 - 1) + 1):
            driver.execute_script(
                'arguments[0].scrollTop = arguments[0].scrollHeight',
                scrollable_div
            )
            # click on the 'more' button if it appears
            try:
                driver.find_element(
                    By.CSS_SELECTOR,
                    'button.w8nwRe.kyuRq'
                ).click()
            except WebDriverException:
                # Best effort: the button is optional, so a failed lookup or
                # click is ignored. Only selenium errors are swallowed, so
                # Ctrl-C and programming errors still propagate.
                pass
            time.sleep(random.uniform(1, 2))

        # parse the html with a bs object
        response = BeautifulSoup(driver.page_source, 'html.parser')
        reviews_source = response.find_all('div', class_='jJc9Ad')
        logger.debug('Source code has been parsed!')
    finally:
        # closing the browser
        driver.quit()

    # storing the data in a dict
    store_main_data = {'object_name': object_name,
                       'object_address': object_address,
                       'overall_rating': overall_rating,
                       'review_num': review_number,
                       'object_url': object_url}

    return store_main_data, reviews_source

def extract_reviews(reviews_source: list) -> list:
    """
    Convert the parsed review elements into a list of plain dicts.

    Every element of ``reviews_source`` is searched for three spans:
    ``rsqaWe`` (date), ``kvMYJc`` (rating) and ``wiI7pd`` (review text).
    The returned dicts have the keys ``date``, ``rate`` and ``review_text``,
    in the same order as the input. The reviewer name and the owner's reply
    are not extracted.
    """

    def _to_record(node):
        # date text with surrounding whitespace removed
        when = node.find('span', class_='rsqaWe').text.strip()
        # len() of the rating span; presumably one child per star -- confirm
        stars = len(node.find('span', class_='kvMYJc'))
        # a review may have no text span at all, which yields ''
        body = node.find('span', class_='wiI7pd')
        return {
            'date': when,
            'rate': stars,
            'review_text': body.text if body is not None else '',
        }

    logger.debug('Starting iterate trough the reviews...')
    return [_to_record(node) for node in reviews_source]




def main(locations_csv='/Users/camille/repo/Hetic/projet_gouv/scraping/location.csv'):
    """Scrape every location of ``locations_csv`` and save all reviews.

    The input file is a ``;``-separated CSV with a ``location`` column. Each
    location is scraped with ``scrape_an_object`` and its reviews are parsed
    with ``extract_reviews``. The result is written to
    ``google_reviews.csv`` inside ``SAVING_PATH`` with one row per review;
    every row repeats the data of the place the review belongs to. A place
    without any parsed review still gets one row, with empty review columns.

    Args:
        locations_csv: Path of the CSV file listing the locations to scrape.
    """
    # Read the locations from a CSV file
    locations_df = pd.read_csv(locations_csv, sep=';')

    # One dict per output row (place data merged with one review)
    all_rows = []

    # Iterate through each location
    for location in locations_df['location']:
        store_main_data, reviews_source = scrape_an_object(OBJECT_URLS, location)
        reviews = extract_reviews(reviews_source)

        # Keep the place data next to every review. Previously the place dict
        # was overwritten by the review list, and the column filter below
        # then matched none of the review keys, so the CSV held only empty
        # cells.
        if reviews:
            all_rows.extend({**store_main_data, **review} for review in reviews)
        else:
            all_rows.append(store_main_data)
        logger.debug(f'{location} is done!')

    # Create a dataframe from the list of dictionaries; the column list fixes
    # the column order and keeps the header even when no row was collected
    df = pd.DataFrame(
        all_rows,
        columns=['object_name', 'object_address', 'overall_rating',
                 'review_num', 'object_url',
                 'date', 'rate', 'review_text'],
    )

    # Save the dataframe to a single CSV file
    df.to_csv(os.path.join(SAVING_PATH, 'google_reviews.csv'), index=False)

    logger.debug('All locations are saved to a single CSV file!')



if __name__ == '__main__':
    main()

[07:30:26] [DEBUG] - Opening the given URL
[07:30:27] [DEBUG] - Accepting the cookies
[07:30:34] [DEBUG] - Object_name OK : Pôle emploi
[07:30:34] [DEBUG] - Object_address OK : 10 Rue Brancion, 75015 Paris
[07:30:34] [DEBUG] - Except branch
[07:30:34] [DEBUG] - Overall_rating OK : 2.8
[07:30:34] [DEBUG] - Review_number OK : 62
[07:30:34] [DEBUG] - clicked to load further reviews
[07:30:34] [DEBUG] - Scroll div OK
[07:30:58] [DEBUG] - Source code has been parsed!
[07:30:58] [DEBUG] - Starting iterate trough the reviews...
[07:30:58] [DEBUG] - pole emploi AVS Placement Artistes 10 RUE BRANCION 75015 Paris is done!
[07:30:58] [DEBUG] - All locations are saved to a single CSV file!


In [1]:
import time
import random
import re
import os
import logging
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.keys import Keys

# setting up the logging object: messages are printed as
# "[HH:MM:SS] [LEVEL] - message"
logger = logging.getLogger('main')
logging.basicConfig(
    format='[%(asctime)s] [%(levelname)s] - %(message)s',
    datefmt='%H:%M:%S'
)
# logging level of this script's logger; main() reports its progress with
# logger.debug, so a higher level (e.g. logging.INFO) silences it
logger.setLevel(logging.DEBUG)
# path of the chromedriver binary handed to selenium's Service()
DRIVER_PATH = '/Users/camille/repo/Hetic/projet_gouv/scraping/driver/chromedriver'
# directory in which main() writes google_reviews.csv
SAVING_PATH = '/Users/camille/repo/Hetic/projet_gouv/scraping/Data'
# page opened in the browser for every location; despite the plural name this
# is a single URL string (not a list) -- main() passes it as object_url
OBJECT_URLS = "https://www.google.com/maps/"

def scrape_an_object(object_url, location):
    """Search Google Maps for ``location`` and collect the HTML of its reviews.

    Opens ``object_url`` in a new Chrome session, accepts the cookie banner,
    submits ``location`` through the search box, scrolls the review panel of
    the resulting place page and parses the page source with BeautifulSoup.
    The browser is closed in every case, including when a lookup fails.

    Args:
        object_url: URL opened first (``main`` passes ``OBJECT_URLS``).
        location: Free-text query sent to the Maps search box.

    Returns:
        A ``(store_main_data, reviews_source)`` tuple. ``store_main_data`` is
        a dict with the keys ``object_name``, ``object_address``,
        ``overall_rating``, ``review_num`` and ``object_url``.
        ``reviews_source`` is the result of
        ``find_all('div', class_='jJc9Ad')`` on the parsed page.

    Raises:
        NoSuchElementException: If the cookie button, the search box, the
            place name/address, the fallback rating block or the scroll
            container cannot be found.
    """
    # Local import so the narrowed ``except`` below does not depend on the
    # file-level import list.
    from selenium.common.exceptions import WebDriverException

    driver = webdriver.Chrome(service=Service(DRIVER_PATH))

    # try/finally: previously driver.quit() was skipped whenever a lookup
    # raised, leaving the Chrome process running.
    try:
        driver.get(object_url)

        # accepting the cookies
        driver.find_element(By.CLASS_NAME, "lssxud").click()
        time.sleep(random.uniform(4, 6))

        # type the query into the search box and submit it
        select_box = driver.find_element(By.XPATH, '//*[@id="searchboxinput"]')
        select_box.send_keys(location)
        select_box.send_keys(Keys.ENTER)
        time.sleep(2)

        object_name = driver.find_element(By.CSS_SELECTOR, 'h1.DUwDvf.fontHeadlineLarge').text
        object_address = driver.find_element(By.CSS_SELECTOR, 'div.Io6YTe.fontBodyMedium').text

        # Two page layouts are handled: the first one is tried and, when one
        # of its elements is missing, the except branch reads the other one.
        try:
            # the rating block is read once; both values come from its text
            rating_text = driver.find_element(By.CSS_SELECTOR, 'div.F7nice.mmu3tf').text
            overall_rating = rating_text.split()[0]
            # the last run of digits (spaces removed) is the review count
            review_number = int(re.findall(r'\d+', rating_text.replace(' ', ''))[-1])
            # click to load further reviews
            driver.find_element(By.XPATH, '//*[@id="QA0Szd"]/div/div/div[1]/div[2]/div/div[1]/div/div/div[2]/div[1]/div[1]/div[2]/div/div[1]/div[2]/span[2]/span[1]/span').click()
        except NoSuchElementException:
            # here the text is split as "<rating> (<count>)"
            div_num_rating = driver.find_element(By.CSS_SELECTOR, 'div.F7nice').text
            overall_rating = div_num_rating.split()[0]
            review_number = int(div_num_rating.split()[1].replace('(', '').replace(')', ''))
            # click on the review tab
            driver.find_element(By.XPATH, '/html/body/div[3]/div[9]/div[9]/div/div/div[1]/div[2]/div/div[1]/div/div/div[3]/div[1]').click()

        time.sleep(random.uniform(0.1, 0.5))
        # NOTE(review): the same container XPath is used for both layouts --
        # confirm it also matches the page handled by the except branch.
        scrollable_div = driver.find_element(By.XPATH, '//*[@id="QA0Szd"]/div/div/div[1]/div[2]/div/div[1]/div/div/div[2]')

        # the number of scrolls is derived from the review count
        # (about one scroll per five reviews)
        for _ in range(round(review_number / 5 - 1) + 1):
            driver.execute_script('arguments[0].scrollTop = arguments[0].scrollHeight', scrollable_div)
            try:
                driver.find_element(By.CSS_SELECTOR, 'button.w8nwRe.kyuRq').click()
            except WebDriverException:
                # Best effort: the button is optional, so a failed lookup or
                # click is ignored. Only selenium errors are swallowed, so
                # Ctrl-C and programming errors still propagate.
                pass
            time.sleep(random.uniform(1, 2))

        response = BeautifulSoup(driver.page_source, 'html.parser')
        reviews_source = response.find_all('div', class_='jJc9Ad')
    finally:
        driver.quit()

    store_main_data = {'object_name': object_name,
                       'object_address': object_address,
                       'overall_rating': overall_rating,
                       'review_num': review_number,
                       'object_url': object_url}

    return store_main_data, reviews_source


def extract_reviews(reviews_source: list) -> list:
    """Convert the parsed review elements into a list of plain dicts.

    Every element of ``reviews_source`` is searched for three spans:
    ``rsqaWe`` (date), ``kvMYJc`` (rating) and ``wiI7pd`` (review text).
    The returned dicts have the keys ``date``, ``rate`` and ``review_text``
    and keep the order of the input.
    """
    records = []

    for node in reviews_source:
        record = {
            # date text with surrounding whitespace removed
            'date': node.find('span', class_='rsqaWe').text.strip(),
            # len() of the rating span; presumably one child per star -- confirm
            'rate': len(node.find('span', class_='kvMYJc')),
        }

        # a review may have no text span at all, which yields ''
        body_span = node.find('span', class_='wiI7pd')
        if body_span is None:
            record['review_text'] = ''
        else:
            record['review_text'] = body_span.text

        records.append(record)

    return records


def main():
    """Scrape every location of the locations CSV and save all reviews.

    The input file is a ``;``-separated CSV with a ``location`` column. Each
    location is scraped with ``scrape_an_object`` and its reviews are parsed
    with ``extract_reviews``. After every location the accumulated result is
    written to ``google_reviews.csv`` inside ``SAVING_PATH``, so an
    interrupted run keeps what was already scraped. The file has one row per
    review; every row repeats the data of the place the review belongs to. A
    place without any parsed review still gets one row, with empty review
    columns.
    """
    # No browser is opened here: scrape_an_object starts (and closes) its own
    # session, so the driver previously created in main() was never used and
    # never quit.
    locations_df = pd.read_csv('/Users/camille/repo/Hetic/projet_gouv/scraping/location.csv', sep=';')

    output_path = os.path.join(SAVING_PATH, 'google_reviews.csv')
    columns = ['object_name', 'object_address', 'overall_rating',
               'review_num', 'object_url',
               'date', 'rate', 'review_text']

    # Accumulated across ALL locations. Resetting this list inside the loop
    # made every save overwrite the file with only the latest location.
    all_rows = []

    for location in locations_df['location']:
        store_main_data, reviews_source = scrape_an_object(OBJECT_URLS, location)
        reviews_list = extract_reviews(reviews_source)

        # Flatten to one row per review. A nested 'reviews' list was dropped
        # by the column filter, so no review ever reached the CSV.
        if reviews_list:
            all_rows.extend({**store_main_data, **review} for review in reviews_list)
        else:
            all_rows.append(store_main_data)

        logger.debug(f'{location} is done!')

        # rewrite the whole file with everything collected so far
        df = pd.DataFrame(all_rows, columns=columns)
        df.to_csv(output_path, index=False)

        logger.debug(f'{location} is saved to CSV!')


if __name__ == '__main__':
    main()




[07:28:29] [DEBUG] - pole emploi AVS Placement Artistes 10 RUE BRANCION 75015 Paris is done!
[07:28:29] [DEBUG] - pole emploi AVS Placement Artistes 10 RUE BRANCION 75015 Paris is saved to CSV!
