In [1]:
!pip install pandas selenium webdriver-manager pyarrow s3fs



In [None]:
import time
import datetime
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
import os
import re
from dotenv import load_dotenv

# Load environment variables from .env file
load_dotenv()

class EGATRealTimeScraper:
    """Scrape one real-time reading from the browser console of a web page.

    A headless Chrome session loads ``url`` with browser-console logging
    enabled. The page is expected to log a line of the form
    ``updateMessageArea: <date id>, <H:MM>, <value>, <temperature>``; the most
    recent such line is parsed into a dict.

    Call :meth:`close` when finished to shut the Chrome process down.
    """

    # Compiled once at class creation instead of once per inspected log entry.
    _UPDATE_MESSAGE_PATTERN = re.compile(
        r'updateMessageArea:\s*(\d+)\s*,\s*(\d{1,2}:\d{2})\s*,\s*([\d,]+\.?\d*)\s*,\s*(\d+\.?\d*)'
    )

    def __init__(self, url="https://www.sothailand.com/sysgen/egat/"):
        """Remember the target ``url`` and start the headless Chrome driver."""
        self.url = url
        # Assigned before start-up so close() stays safe if Chrome fails to launch.
        self.driver = None
        self._initialize_driver()

    def _initialize_driver(self):
        """Create the headless Chrome WebDriver with console logging enabled."""
        chrome_options = Options()
        chrome_options.add_argument("--headless")
        chrome_options.add_argument("--no-sandbox")
        chrome_options.add_argument("--disable-dev-shm-usage")
        chrome_options.add_argument("--disable-gpu")
        chrome_options.add_argument("--window-size=1920,1080")
        # Without this capability get_log('browser') has no console output to return.
        chrome_options.set_capability('goog:loggingPrefs', {'browser': 'ALL'})

        service = Service(ChromeDriverManager().install())
        self.driver = webdriver.Chrome(service=service, options=chrome_options)

    def extract_data_from_console(self):
        """Parse the newest ``updateMessageArea:`` console line into a record.

        Returns:
            A dict with keys ``scrape_timestamp_utc`` (``YYYY-MM-DD HH:MM:SS``
            string, UTC), ``display_date_id`` (str), ``display_time`` (str),
            ``current_value_MW`` (float) and ``temperature_C`` (float), or
            ``None`` when no parsable line is present in the browser log.
        """
        logs = self.driver.get_log('browser')

        # Walk newest-first so the latest reading wins.
        for log_entry in reversed(logs):
            match = self._UPDATE_MESSAGE_PATTERN.search(log_entry.get('message', ''))
            if match is None:
                continue

            date_id, clock, value_text, temperature_text = match.groups()
            try:
                # The value may carry thousands separators, e.g. "25,123.4".
                current_value_mw = float(value_text.replace(',', ''))
                temperature_c = float(temperature_text)
            except ValueError:
                # A degenerate field such as "," matches the pattern but is not
                # a number; fall back to an older entry instead of crashing.
                continue

            # Timezone-aware replacement for the deprecated datetime.utcnow();
            # the formatted string is identical.
            scraped_at = datetime.datetime.now(datetime.timezone.utc)
            return {
                'scrape_timestamp_utc': scraped_at.strftime('%Y-%m-%d %H:%M:%S'),
                'display_date_id': date_id,
                'display_time': clock,
                'current_value_MW': current_value_mw,
                'temperature_C': temperature_c
            }
        return None

    def scrape_once(self, wait_seconds=10):
        """Load the page, wait for its scripts to log, and parse the console.

        Args:
            wait_seconds: How long to sleep after navigation before reading
                the console log. Defaults to the previous fixed value of 10.

        Returns:
            The record dict from :meth:`extract_data_from_console`, or ``None``.
        """
        self.driver.get(self.url)
        time.sleep(wait_seconds)
        return self.extract_data_from_console()

    def close(self):
        """Quit the browser if one is running; safe to call more than once."""
        driver = getattr(self, 'driver', None)
        # Drop the reference first so a failing quit() cannot leave a stale driver behind.
        self.driver = None
        if driver is not None:
            driver.quit()

def perform_scrape_and_update(scraper, lakefs_s3_path, storage_options):
    """Scrape one reading and merge it into the Parquet history file.

    The scrape runs first; when it yields nothing the function returns without
    touching storage. Otherwise the existing history is read (a missing file
    is treated as an empty history), the new row is appended, rows are
    de-duplicated on ``(display_date_id, display_time)`` keeping the most
    recently scraped one, sorted chronologically, and written back.

    Args:
        scraper: Object exposing ``scrape_once()`` that returns a record dict
            or a falsy value when nothing could be scraped.
        lakefs_s3_path: Path/URL of the Parquet file to read and overwrite.
        storage_options: Passed through to ``pd.read_parquet`` and
            ``DataFrame.to_parquet``.

    Returns:
        None.
    """
    new_data_dict = scraper.scrape_once()
    if not new_data_dict:
        # Nothing to append, so skip downloading the whole history.
        return

    new_df = pd.DataFrame([new_data_dict])
    new_df['scrape_timestamp_utc'] = pd.to_datetime(new_df['scrape_timestamp_utc'])

    try:
        existing_df = pd.read_parquet(lakefs_s3_path, storage_options=storage_options)
    except FileNotFoundError:
        # First run: the history file does not exist yet.
        existing_df = pd.DataFrame()

    if 'scrape_timestamp_utc' in existing_df.columns:
        existing_df['scrape_timestamp_utc'] = pd.to_datetime(existing_df['scrape_timestamp_utc'])

    if existing_df.empty:
        combined_df = new_df
    else:
        combined_df = pd.concat([existing_df, new_df], ignore_index=True)

    # Newest scrape first so keep='first' retains the latest copy of each
    # (date, time) reading. Non-inplace calls avoid SettingWithCopyWarning.
    deduplicated_df = (
        combined_df
        .sort_values('scrape_timestamp_utc', ascending=False)
        .drop_duplicates(subset=['display_date_id', 'display_time'], keep='first')
    )

    # Order by the timestamp shown on the page, using a temporary helper column
    # that is dropped again before writing.
    display_datetime = pd.to_datetime(
        deduplicated_df['display_date_id'] + ' ' + deduplicated_df['display_time'],
        format='%Y%m%d %H:%M'
    )
    deduplicated_df = (
        deduplicated_df
        .assign(temp_datetime_sort=display_datetime)
        .sort_values(by=['temp_datetime_sort', 'scrape_timestamp_utc'], ascending=[True, True])
        .drop(columns=['temp_datetime_sort'])
    )

    deduplicated_df.to_parquet(
        lakefs_s3_path,
        storage_options=storage_options,
        index=False,
        engine='pyarrow',
        compression='snappy'
    )

def run_scraper_periodically():
    """Scrape and update the Parquet history forever at a fixed interval.

    Configuration is read from environment variables:
    ``LAKEFS_ACCESS_KEY_ID``, ``LAKEFS_SECRET_ACCESS_KEY``,
    ``LAKEFS_ENDPOINT_URL``, ``REPO_NAME``, ``BRANCH_NAME``,
    ``TARGET_PARQUET_FILE_PATH`` and ``SCRAPE_INTERVAL_MINUTES`` (default 5).

    Raises:
        ValueError: If ``REPO_NAME``, ``BRANCH_NAME`` or
            ``TARGET_PARQUET_FILE_PATH`` is unset or empty, since the target
            path cannot be built without them.
    """
    access_key = os.getenv("LAKEFS_ACCESS_KEY_ID")
    secret_key = os.getenv("LAKEFS_SECRET_ACCESS_KEY")
    lakefs_endpoint = os.getenv("LAKEFS_ENDPOINT_URL")
    repo_name = os.getenv("REPO_NAME")
    branch_name = os.getenv("BRANCH_NAME")
    target_parquet_file_path = os.getenv("TARGET_PARQUET_FILE_PATH")

    # Fail fast instead of silently targeting "s3a://None/None/None".
    path_parts = {
        "REPO_NAME": repo_name,
        "BRANCH_NAME": branch_name,
        "TARGET_PARQUET_FILE_PATH": target_parquet_file_path,
    }
    missing = [name for name, value in path_parts.items() if not value]
    if missing:
        raise ValueError(
            f"Missing required environment variable(s): {', '.join(missing)}"
        )

    interval_minutes = int(os.getenv("SCRAPE_INTERVAL_MINUTES", "5"))

    lakefs_s3_path = f"s3a://{repo_name}/{branch_name}/{target_parquet_file_path}"

    storage_options = {
        "key": access_key,
        "secret": secret_key,
        "client_kwargs": {
            "endpoint_url": lakefs_endpoint
        }
    }

    scraper = EGATRealTimeScraper()

    # The loop only ends via an exception or interrupt; `finally` guarantees
    # the Chrome process is shut down in that case.
    try:
        while True:
            perform_scrape_and_update(scraper, lakefs_s3_path, storage_options)
            time.sleep(interval_minutes * 60)
    finally:
        scraper.close()

# Start the periodic scrape loop only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    run_scraper_periodically()

2025-05-18 13:08:32,235 - INFO - 835430248.py:208 - Configured LakeFS Target Path: s3a://dataset/main/egat_datascraping/egat_realtime_power_history.parquet
2025-05-18 13:08:32,236 - INFO - 835430248.py:209 - Scraping interval set to 5 minutes.
2025-05-18 13:08:32,237 - INFO - 835430248.py:27 - Initializing WebDriver...
2025-05-18 13:08:32,336 - INFO - logger.py:11 - Get LATEST chromedriver version for google-chrome
2025-05-18 13:08:32,482 - INFO - logger.py:11 - Get LATEST chromedriver version for google-chrome
2025-05-18 13:08:32,584 - INFO - logger.py:11 - There is no [linux64] chromedriver "136.0.7103.94" for browser google-chrome "136.0.7103" in cache
2025-05-18 13:08:32,585 - INFO - logger.py:11 - Get LATEST chromedriver version for google-chrome
2025-05-18 13:08:33,149 - INFO - logger.py:11 - WebDriver version 136.0.7103.94 selected
2025-05-18 13:08:33,155 - INFO - logger.py:11 - Modern chrome version https://storage.googleapis.com/chrome-for-testing-public/136.0.7103.94/linux64/