In [1]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException, TimeoutException
import pandas as pd
import time
import csv
import os
import sys

class MetaScraper:
    def __init__(self, chrome_driver_path=None, headless=False):
        """
        Initialize the scraper with browser settings
        
        Args:
            chrome_driver_path (str): Path to chromedriver
            headless (bool): Run browser in headless mode if True
        """
        self.options = Options()
        
        # Configure browser options
        if headless:
            self.options.add_argument("--headless=new")
            
        self.options.add_argument("--disable-gpu")
        self.options.add_argument("--no-sandbox")
        self.options.add_argument("--disable-dev-shm-usage")
        self.options.add_argument("--window-size=1920,1080")
        
        # Set user agent
        self.options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36")
        
        # Disable images for faster loading
        self.options.add_experimental_option("prefs", {"profile.managed_default_content_settings.images": 2})
        
        # Setup the driver
        try:
            service = Service(executable_path=chrome_driver_path)
            self.driver = webdriver.Chrome(service=service, options=self.options)
            self.driver.set_page_load_timeout(30)
        except Exception as e:
            print(f"Error initializing Chrome: {str(e)}")
            raise
    
    def get_meta_data(self, url):
        """
        Scrape meta data from a given URL
        
        Args:
            url (str): Website URL to scrape
            
        Returns:
            dict: Dictionary containing the meta data
        """
        meta_data = {
            'url': url,
            'title': '',
            'meta_description': '',
            'meta_keywords': '',
            'status': 'success'
        }
        
        try:
            self.driver.get(url)
            # Add a small delay to ensure page loads properly
            time.sleep(2)
            
            # Get title
            meta_data['title'] = self.driver.title
            
            # Get meta description
            try:
                meta_description = self.driver.find_element(By.XPATH, "//meta[@name='description']")
                meta_data['meta_description'] = meta_description.get_attribute('content')
            except NoSuchElementException:
                meta_data['meta_description'] = 'Not found'
            
            # Get meta keywords
            try:
                meta_keywords = self.driver.find_element(By.XPATH, "//meta[@name='keywords']")
                meta_data['meta_keywords'] = meta_keywords.get_attribute('content')
            except NoSuchElementException:
                meta_data['meta_keywords'] = 'Not found'
            
        except TimeoutException:
            meta_data['status'] = 'timeout'
        except Exception as e:
            meta_data['status'] = f'error: {str(e)}'
        
        return meta_data
    
    def scrape_sites(self, urls, output_csv='meta_data_results.csv'):
        """
        Scrape multiple URLs and save results to CSV
        
        Args:
            urls (list): List of URLs to scrape
            output_csv (str): Output CSV filename
        """
        results = []
        total_urls = len(urls)
        
        for index, url in enumerate(urls):
            print(f"Scraping ({index+1}/{total_urls}): {url}")
            meta_data = self.get_meta_data(url)
            results.append(meta_data)
            
        # Save results to CSV
        with open(output_csv, 'w', newline='', encoding='utf-8') as csvfile:
            fieldnames = ['url', 'title', 'meta_description', 'meta_keywords', 'status']
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            writer.writeheader()
            writer.writerows(results)
            
        print(f"Scraping complete. Results saved to {output_csv}")
        
        # Convert to DataFrame for easy viewing
        df = pd.DataFrame(results)
        return df
    
    def load_urls_from_csv(self, csv_file):
        """
        Load URLs from a CSV file
        
        Args:
            csv_file (str): Path to CSV file containing URLs
            
        Returns:
            list: List of URLs from the CSV
        """
        urls = []
        try:
            with open(csv_file, 'r', encoding='utf-8') as file:
                # Try to determine if the CSV has headers and which column contains URLs
                sample = file.read(1024)
                file.seek(0)
                
                if ',' in sample:  # Likely a CSV with multiple columns
                    reader = csv.reader(file)
                    headers = next(reader, None)
                    
                    # Try to find a column that might contain URLs
                    url_col_idx = 0
                    if headers:
                        for i, header in enumerate(headers):
                            if header.lower() in ['url', 'link', 'website', 'site', 'address']:
                                url_col_idx = i
                                break
                    
                    for row in reader:
                        if row and len(row) > url_col_idx:
                            url = row[url_col_idx].strip()
                            if url and not url.startswith('#'):
                                if not url.startswith(('http://', 'https://')):
                                    url = 'https://' + url
                                urls.append(url)
                else:  # Likely a simple list of URLs, one per line
                    for line in file:
                        url = line.strip()
                        if url and not url.startswith('#'):
                            if not url.startswith(('http://', 'https://')):
                                url = 'https://' + url
                            urls.append(url)
            
            return urls
        except Exception as e:
            print(f"Error loading URLs from CSV: {str(e)}")
            return []
    
    def close(self):
        """Close the browser"""
        self.driver.quit()


# Main program
if __name__ == "__main__":
    print("=== Website Meta Data Scraper ===")
    
    # Set default ChromeDriver path
    driver_path = "/Users/rakesh/Downloads/chromedriver-mac-arm64/chromedriver"
    
    # Make sure chromedriver is executable on Mac
    if os.path.exists(driver_path) and sys.platform == 'darwin':  # macOS
        try:
            import stat
            os.chmod(driver_path, os.stat(driver_path).st_mode | stat.S_IXUSR)
        except Exception:
            pass
    
    # CSV file path - CHANGE THIS TO YOUR CSV FILE PATH
    csv_path = "/Users/rakesh/Downloads/cleaning_company.csv"  # <-- MODIFY THIS LINE with your actual CSV file path
    
    if not os.path.exists(csv_path):
        print(f"Error: File not found: {csv_path}")
        exit(1)
    
    # Initialize the scraper
    try:
        print("Starting Chrome...")
        scraper = MetaScraper(chrome_driver_path=driver_path)
        print("Chrome started successfully!")
    except Exception as e:
        print(f"Error starting Chrome: {str(e)}")
        exit(1)
    
    try:
        # Load URLs from CSV
        print(f"Loading URLs from {csv_path}...")
        websites = scraper.load_urls_from_csv(csv_path)
        
        if not websites:
            print("No valid URLs found in the CSV file.")
            exit(1)
        
        print(f"Loaded {len(websites)} URLs.")
        
        # Scrape the websites
        output_filename = "scraping_results.csv"
        print(f"\nScraping {len(websites)} URLs...")
        results_df = scraper.scrape_sites(websites, output_csv=output_filename)
        
        # Show completion message
        print(f"\nScraping complete! Results saved to {output_filename}")
    
    except KeyboardInterrupt:
        print("\nScraping stopped by user.")
    finally:
        # Clean up
        print("Closing Chrome...")
        scraper.close()

=== Website Meta Data Scraper ===
Starting Chrome...
Chrome started successfully!
Loading URLs from /Users/rakesh/Downloads/cleaning_company.csv...
Loaded 5 URLs.

Scraping 5 URLs...
Scraping (1/5): https://﻿https://dubaihousekeeping.com/
Scraping (2/5): https://elitemaids.ae/
Scraping (3/5): https://justmaid.ae/
Scraping (4/5): https://servicemarket.com/en/dubai/cleaning-maid-services
Scraping (5/5): https://dubaiclean.com/
Scraping complete. Results saved to scraping_results.csv

Scraping complete! Results saved to scraping_results.csv
Closing Chrome...
