In [1]:
# Import Libraries

import csv
import os
import time

import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager

In [2]:
# Create Google Map Scraper Class
class GoogleMapScraper:
    """Scrape review cards from a Google Maps page into a CSV file.

    For every review card the scraper records the reviewer name, the number
    of filled stars, the review text and the date text, and appends one row
    per previously unseen review to ``output_file_name``.

    Attributes:
        output_file_name: CSV file that rows are appended to.
        headless: when True, Chrome is started with ``--headless``.
        driver: the Chrome WebDriver; ``None`` until ``config_driver`` runs.
        unique_check: list of keys (name + rating + review + date) of the
            reviews already written; used to skip duplicates.
    """

    def __init__(self, output_file_name="temp.csv", headless=False):
        """Store the settings; the browser is not started here.

        Args:
            output_file_name: path of the CSV file to append to.
            headless: whether ``config_driver`` should run Chrome headless.
        """
        self.output_file_name = output_file_name
        self.headless = headless
        self.driver = None
        self.unique_check = []

    def config_driver(self):
        """Create the Chrome WebDriver and store it on ``self.driver``."""
        options = webdriver.ChromeOptions()
        options.add_argument("--lang=id")
        if self.headless:
            options.add_argument("--headless")
            options.add_argument('--ignore-ssl-errors=yes')
            options.add_argument('--ignore-certificate-errors')

        # Set a specific port for remote debugging
        options.add_argument("--remote-debugging-port=9222")

        # Create a new instance of the Chrome WebDriver
        self.driver = webdriver.Chrome(options=options)

    def load_companies(self, url, page_load_wait=5, scroll_pause=5):
        """Open ``url``, scroll the review panel to its end and scrape as it grows.

        After every scroll step ``get_business_info`` is called, so reviews
        are written incrementally; duplicates are filtered there.

        Args:
            url: address of the Google Maps page to open.
            page_load_wait: seconds to sleep after opening the page.
            scroll_pause: seconds to sleep after each scroll step.

        Returns:
            None. Returns early (after printing a message) when the
            scrollable panel cannot be located.
        """
        print("Getting business info", url)
        self.driver.get(url)
        time.sleep(page_load_wait)
        panel_xpath = "//div[contains(@class, 'm6QErb') and contains(@class, 'DxyBCb') and contains(@class, 'kA9KIf') and contains(@class, 'dS8AEf')]"
        try:
            scrollable_div = self.driver.find_element(By.XPATH, panel_xpath)
        except NoSuchElementException:
            print("Scrollable panel not found.")
            return

        page = 2
        while True:
            print(f"Scrolling to page {page}")
            # Panel height before and after the scroll; an unchanged height
            # means no further reviews were loaded.
            last_height = self.driver.execute_script('return arguments[0].scrollHeight', scrollable_div)
            self.driver.execute_script('arguments[0].scrollTop = arguments[0].scrollTop + 6500', scrollable_div)
            time.sleep(scroll_pause)
            new_height = self.driver.execute_script('return arguments[0].scrollHeight', scrollable_div)

            # NOTE(review): the end-of-list marker is matched in English while
            # the browser is started with --lang=id, so the height comparison
            # is presumably the condition that actually stops the loop.
            reached_end = (
                "You've reached the end of the list." in self.driver.page_source
                or new_height == last_height
            )

            self.get_business_info()
            if reached_end:
                break
            page += 1

    def get_business_info(self):
        """Scrape every review card currently on the page and save the new ones.

        A card without a name element is reported and skipped; the remaining
        cards are still processed. Missing review text is stored as
        ``"No review found"``, a missing rating as an empty CSV field and a
        missing date as an empty string, so a value is never carried over
        from the previous card.
        """
        # Wait for the review cards; until() raises TimeoutException when
        # none show up within the timeout.
        try:
            WebDriverWait(self.driver, 10).until(
                EC.presence_of_all_elements_located((By.CLASS_NAME, 'jJc9Ad'))
            )
        except TimeoutException:
            print("No review cards found on the page.")
            return

        for business in self.driver.find_elements(By.CLASS_NAME, 'jJc9Ad'):
            try:
                name = business.find_element(By.CLASS_NAME, 'd4r55').text
            except NoSuchElementException as e:
                print(f"An error occurred: {e}")
                continue

            try:
                review = business.find_element(By.CLASS_NAME, 'wiI7pd').text
            except NoSuchElementException:
                review = "No review found"

            rating = self._read_rating(business)
            date = self._read_date(business)

            unique_id = "".join([name, str(rating), review, date])
            if unique_id not in self.unique_check:
                data = [name, rating, review, date]
                # save_data derives the row ID from len(self.unique_check),
                # so the key is appended only after the row is written.
                self.save_data(data)
                self.unique_check.append(unique_id)
                print(unique_id)

    @staticmethod
    def _read_rating(business):
        """Return the number of filled stars on a card, or None if there are no stars."""
        try:
            stars_element = WebDriverWait(business, 10).until(
                EC.presence_of_element_located((By.CLASS_NAME, 'kvMYJc'))
            )
        except (NoSuchElementException, TimeoutException):
            print("Stars element not found for this business.")
            return None

        rated_stars = stars_element.find_elements(By.CLASS_NAME, 'hCCjke.google-symbols.NhBTye.elGi1d')
        rating = len(rated_stars)
        print(f'Rating: {rating} stars')
        return rating

    @staticmethod
    def _read_date(business):
        """Return the date text of a card, or an empty string if it has none."""
        date_elements = business.find_elements(By.CLASS_NAME, 'rsqaWe')
        if not date_elements:
            print('Date element not found.')
            return ""

        date_text = date_elements[0].text
        print(f'Date text: {date_text}')
        # When the text contains ", " only the part after the first one is kept.
        date_parts = date_text.split(", ")
        date = date_parts[1] if len(date_parts) > 1 else date_text
        print(date)
        return date

    def save_data(self, data):
        """Append one review row to ``output_file_name``.

        The header row is written first when the file does not exist yet.

        Args:
            data: ``[name, rating, review, date]``; the row ID
                (``len(self.unique_check)``) is prepended.
        """
        header = ['ID', 'Name', 'Rating', 'Reviews', 'Date']
        file_exists = os.path.isfile(self.output_file_name)
        with open(self.output_file_name, 'a', newline='', encoding="utf-8") as csvfile:
            writer = csv.writer(csvfile)
            if not file_exists:
                writer.writerow(header)
            writer.writerow([len(self.unique_check)] + data)

In [3]:
# Read the list of locations; the 'Company', 'Region' and 'URL' columns are used
df = pd.read_csv('warehouse_list.csv')

# The per-location result files live in reviews/; create the folder up front
# so both the "already scraped" check and the final to_csv work on a fresh
# checkout.
os.makedirs('reviews', exist_ok=True)

# company -> region -> URL for every location that has no result file yet
company_region_links = {}
for _, row in df.iterrows():
    company = row['Company']
    region = row['Region']
    url = row['URL']
    filename = f'reviews/{company}_{region}.csv'

    # A result file means this location was scraped in an earlier run
    if os.path.exists(filename):
        print(f"Skipping {company} in {region} as {filename} already exists.")
        continue

    company_region_links.setdefault(company, {})[region] = url

# Initialize the GoogleMapScraper
business_scraper = GoogleMapScraper()
temp_file = business_scraper.output_file_name

try:
    business_scraper.config_driver()

    # Iterate over the companies and regions
    for company, regions in company_region_links.items():
        for region, url in regions.items():
            print(f"Processing {company} in {region} with URL: {url}")

            # Start every location from a clean slate: forget the reviews
            # seen at other locations (otherwise an identical name/rating/
            # text/date there would be dropped as a duplicate) and discard
            # a temp file left behind by an interrupted run.
            business_scraper.unique_check = []
            if os.path.exists(temp_file):
                os.remove(temp_file)

            # Load the company URL and scrape the data
            business_scraper.load_companies(url)

            # The scraper only creates the temp file when it saved a review
            if not os.path.exists(temp_file):
                print(f"No reviews scraped for {company} in {region}; skipping.")
                continue

            # Keep rating and review text, tag the rows with their location
            reviews = (
                pd.read_csv(temp_file)
                .drop(columns=['ID', 'Name', 'Date'])
                .assign(Company=company, Province=region)
            )
            reviews.to_csv(f'reviews/{company}_{region}.csv', index=False)

            print(f"Data for {company} in {region} saved to {company}_{region}.csv")
            os.remove(temp_file)
finally:
    # Always close the browser, including when a location fails midway
    if business_scraper.driver is not None:
        business_scraper.driver.quit()


Skipping JNE in Aceh as reviews/JNE_Aceh.csv already exists.
Skipping JNE in Sumatera Utara as reviews/JNE_Sumatera Utara.csv already exists.
Skipping JNE in Sumatera Barat as reviews/JNE_Sumatera Barat.csv already exists.
Skipping JNE in Riau as reviews/JNE_Riau.csv already exists.
Skipping JNE in Jambi as reviews/JNE_Jambi.csv already exists.
Skipping JNE in Bengkulu as reviews/JNE_Bengkulu.csv already exists.
Skipping JNE in Lampung as reviews/JNE_Lampung.csv already exists.
Skipping JNE in Sumatera Selatan as reviews/JNE_Sumatera Selatan.csv already exists.
Skipping JNE in Bangka Belitung as reviews/JNE_Bangka Belitung.csv already exists.
Skipping JNE in Kepulauan Riau as reviews/JNE_Kepulauan Riau.csv already exists.
Skipping JNE in Banten as reviews/JNE_Banten.csv already exists.
Skipping JNE in DKI Jakarta as reviews/JNE_DKI Jakarta.csv already exists.
Skipping JNE in Jawa Barat as reviews/JNE_Jawa Barat.csv already exists.
Skipping JNE in Jawa Tengah as reviews/JNE_Jawa Tengah.