In [12]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
import pandas as pd
import time

def scrape_socpa_contacts_direct():
    url = "https://socpa.org.sa/SOCPA/files/53/530b7b50-1633-474d-a3af-6a72157fc260.html"
    
    # Set up Chrome options
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    
    try:
        driver = webdriver.Chrome(options=chrome_options)
        driver.get(url)
        print("Page loaded. Waiting for content...")
        time.sleep(5)
        
        # Find all visible tables
        tables = driver.find_elements(By.TAG_NAME, "table")
        print(f"Found {len(tables)} tables")
        
        # Lists to store our data
        phones = []
        emails = []
        
        # Approach 1: Look through each table for the specific columns
        for table_idx, table in enumerate(tables):
            print(f"Examining table {table_idx+1}...")
            
            # Try to find any cell containing phone and email headers
            phone_header_cells = table.find_elements(By.XPATH, ".//th[contains(text(), 'رقم الهاتف')] | .//td[contains(text(), 'رقم الهاتف')]")
            email_header_cells = table.find_elements(By.XPATH, ".//th[contains(text(), 'البريد الالكتروني')] | .//td[contains(text(), 'البريد الالكتروني')]")
            
            if phone_header_cells and email_header_cells:
                print(f"Found headers in table {table_idx+1}")
                
                # Get all rows
                rows = table.find_elements(By.TAG_NAME, "tr")
                
                # Skip header row(s)
                for row_idx in range(1, len(rows)):
                    cells = rows[row_idx].find_elements(By.TAG_NAME, "td")
                    
                    # Based on your screenshot, try to identify which cells contain phone and email
                    # This will depend on the exact structure of your table
                    if len(cells) >= 5:  # Assume we need at least 5 cells based on screenshot
                        # Try different indices based on the table structure we saw
                        possible_phone_indices = [3, 4, 0]  # Try these column indices for phone
                        possible_email_indices = [4, 5, 1]  # Try these column indices for email
                        
                        for ph_idx, em_idx in zip(possible_phone_indices, possible_email_indices):
                            if ph_idx < len(cells) and em_idx < len(cells):
                                phone_text = cells[ph_idx].text.strip()
                                email_text = cells[em_idx].text.strip()
                                
                                # Validate data
                                if phone_text and email_text and '@' in email_text:
                                    print(f"Found: {phone_text} - {email_text}")
                                    phones.append(phone_text)
                                    emails.append(email_text)
                                    break  # Found valid data, no need to try other indices
        
        # If the above approach didn't work, try a more direct method
        if not phones or not emails:
            print("Trying alternative approach...")
            
            # Look for spans containing digits (likely phone numbers)
            phone_spans = driver.find_elements(By.XPATH, "//span[string-length(text()) >= 6 and string-length(translate(text(), '0123456789', '')) = 0]")
            
            # Look for spans containing email addresses
            email_spans = driver.find_elements(By.XPATH, "//span[contains(text(), '@') and contains(text(), '.')]")
            
            print(f"Found {len(phone_spans)} potential phone spans and {len(email_spans)} potential email spans")
            
            # Extract text
            phones = [span.text.strip() for span in phone_spans if span.text.strip()]
            emails = [span.text.strip() for span in email_spans if span.text.strip()]
            
            # If we have unequal numbers, use the minimum length
            min_len = min(len(phones), len(emails))
            phones = phones[:min_len]
            emails = emails[:min_len]
        
        # Create data pairs
        contacts = []
        for i in range(len(phones)):
            contacts.append({"Phone": phones[i], "Email": emails[i]})
        
        return pd.DataFrame(contacts) if contacts else None
    
    except Exception as e:
        print(f"Error: {e}")
        return None
    
    finally:
        if 'driver' in locals():
            driver.quit()

# Run the function
print("Starting web scraping...")
contacts_df = scrape_socpa_contacts_direct()

if contacts_df is not None and not contacts_df.empty:
    print(f"\nExtracted {len(contacts_df)} contacts:")
    print(contacts_df)
    
    # Save to CSV
    contacts_df.to_csv('socpa_contacts_direct.csv', index=False, encoding='utf-8-sig')
    print(f"Data saved to socpa_contacts_direct.csv - {len(contacts_df)} records saved")
else:
    print("Failed to extract contact information")

Starting web scraping...
Page loaded. Waiting for content...
Found 172 tables
Examining table 1...
Examining table 2...
Examining table 3...
Examining table 4...
Examining table 5...
Examining table 6...
Examining table 7...
Examining table 8...
Examining table 9...
Examining table 10...
Examining table 11...
Examining table 12...
Examining table 13...
Examining table 14...
Examining table 15...
Examining table 16...
Examining table 17...
Examining table 18...
Examining table 19...
Examining table 20...
Examining table 21...
Examining table 22...
Examining table 23...
Examining table 24...
Examining table 25...
Examining table 26...
Examining table 27...
Examining table 28...
Examining table 29...
Examining table 30...
Examining table 31...
Examining table 32...
Examining table 33...
Examining table 34...
Examining table 35...
Examining table 36...
Examining table 37...
Examining table 38...
Examining table 39...
Examining table 40...
Examining table 41...
Examining table 42...
Examini

In [14]:
import pandas as pd

def rearrange_csv(input_file, output_file, delete_prefixes=None, priority_order=None):
    """
    Rearrange a CSV file based on custom rules for the 'Phone' column.
    
    Parameters:
    - input_file: Path to the input CSV file
    - output_file: Path to save the output CSV file
    - delete_prefixes: List of prefixes to remove (e.g., ['100'])
    - priority_order: List of prefixes in order of priority (e.g., ['5', '4', '2'])
    """
    # Read the CSV file
    df = pd.read_csv(input_file)
    
    # Filter out numbers with specified prefixes
    if delete_prefixes:
        for prefix in delete_prefixes:
            df = df[~df['Phone'].astype(str).str.startswith(prefix)]
    
    # Create a column for sorting based on priority
    if priority_order:
        # Initialize with a large number (low priority)
        df['sort_priority'] = len(priority_order) + 1
        
        # Assign priorities based on the list
        for i, prefix in enumerate(priority_order):
            mask = df['Phone'].astype(str).str.startswith(prefix)
            df.loc[mask, 'sort_priority'] = i
        
        # Sort by priority
        df = df.sort_values(by='sort_priority')
        
        # Remove the temporary column
        df = df.drop(columns=['sort_priority'])
    
    # Reset the index
    df = df.reset_index(drop=True)
    
    # Save the modified data
    df.to_csv(output_file, index=False)
    
    return df

# Example usage:
if __name__ == "__main__":
    # Define your custom rules
    delete_prefixes = ['100']  # Remove phone numbers starting with '100'
    priority_order = ['5', '4', '2', '3']  # Priority order (top to bottom)
    
    # Run the function
    result = rearrange_csv(
        input_file='socpa_contacts_direct.csv',
        output_file='socpa_contacts_direct_rearranged.csv',
        delete_prefixes=delete_prefixes,
        priority_order=priority_order
    )
    
    print("CSV file has been rearranged successfully!")
    print(f"Removed {len(delete_prefixes)} types of numbers and prioritized {len(priority_order)} types.")

CSV file has been rearranged successfully!
Removed 1 types of numbers and prioritized 4 types.


In [15]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
import time
import csv

# Set up Chrome options
chrome_options = Options()
# Uncomment the line below if you want to run headless
# chrome_options.add_argument("--headless")
chrome_options.add_argument("--window-size=1920,1080")

# Initialize the Chrome driver
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)

try:
    # Navigate to the URL
    url = "https://socpa.org.sa/SOCPA/files/53/530b7b50-1633-474d-a3af-6a72157fc260.html"
    print(f"Navigating to {url}")
    driver.get(url)
    
    # Wait for the page to load
    print("Waiting for page to load...")
    time.sleep(5)
    
    # Wait for tables to be present
    WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.TAG_NAME, "table"))
    )
    
    # Find all the rows in the table
    rows = driver.find_elements(By.TAG_NAME, "tr")
    print(f"Found {len(rows)} rows in total")
    
    # Create a list to store contact information
    contacts = []
    contact_count = 0
    max_contacts = 5  # Limit to 5 contacts

    # Look for rows that contain contact information
    for i, row in enumerate(rows):
        if contact_count >= max_contacts:
            break
            
        try:
            # Get the HTML of the row to check for phone numbers and emails
            row_html = row.get_attribute('innerHTML')
            
            # Skip rows without contact information
            if '@' not in row_html:
                continue
                
            # Find phone number span (numbers that are 9-10 digits)
            phone = ""
            phone_spans = row.find_elements(By.XPATH, ".//span[string-length(text()) >= 9 and string-length(text()) <= 10 and not(contains(text(), '@'))]")
            for span in phone_spans:
                span_text = span.text.strip()
                if span_text.isdigit():
                    phone = span_text
                    break
            
            # Find email address
            email = ""
            email_spans = row.find_elements(By.XPATH, ".//span[contains(text(), '@')]")
            for span in email_spans:
                span_text = span.text.strip()
                if '@' in span_text:
                    email = span_text
                    break
            
            # Find name (assuming it's in a specific column)
            # This may need adjustment based on the actual structure
            cells = row.find_elements(By.TAG_NAME, "td")
            name = cells[6].text.strip() if len(cells) > 6 else ""
            
            # Only add if we have at least an email
            if email:
                contact = {
                    "Name": name,
                    "Phone": phone,
                    "Email": email
                }
                contacts.append(contact)
                contact_count += 1
                print(f"Added contact #{contact_count}: {contact}")
        
        except Exception as e:
            print(f"Error processing row {i}: {e}")
            continue
    
    # If we didn't find enough contacts, try an alternative approach
    if contact_count < max_contacts:
        print("Trying alternative approach...")
        
        # Try to find email cells directly
        email_cells = driver.find_elements(By.XPATH, "//td[.//span[contains(text(), '@')]]")
        
        for cell in email_cells:
            if contact_count >= max_contacts:
                break
                
            try:
                # Get the email
                email_span = cell.find_element(By.XPATH, ".//span[contains(text(), '@')]")
                email = email_span.text.strip()
                
                # Find the parent row
                parent_row = cell.find_element(By.XPATH, "./ancestor::tr")
                
                # Find phone number in the same row
                phone = ""
                phone_spans = parent_row.find_elements(By.XPATH, ".//span[string-length(text()) >= 9 and string-length(text()) <= 10 and not(contains(text(), '@'))]")
                for span in phone_spans:
                    span_text = span.text.strip()
                    if span_text.isdigit():
                        phone = span_text
                        break
                
                # Find name
                cells = parent_row.find_elements(By.TAG_NAME, "td")
                name = cells[6].text.strip() if len(cells) > 6 else ""
                
                contact = {
                    "Name": name,
                    "Phone": phone,
                    "Email": email
                }
                
                # Check if this contact is already in our list
                if not any(c["Email"] == email for c in contacts):
                    contacts.append(contact)
                    contact_count += 1
                    print(f"Added contact #{contact_count} (alt): {contact}")
            
            except Exception as e:
                print(f"Error processing email cell: {e}")
    
    # Save to CSV
    if contacts:
        csv_file = "socpa_contacts_5.csv"
        with open(csv_file, 'w', newline='', encoding='utf-8') as file:
            fieldnames = ["Name", "Phone", "Email"]
            writer = csv.DictWriter(file, fieldnames=fieldnames)
            
            writer.writeheader()
            for contact in contacts:
                writer.writerow(contact)
        
        print(f"\nSaved {len(contacts)} contacts to {csv_file}")
    else:
        print("No contacts found to save")
    
except Exception as e:
    print(f"An error occurred: {e}")
    
finally:
    print("Taking screenshot for debugging...")
    driver.save_screenshot("socpa_page.png")
    
    print("Closing browser...")
    driver.quit()

Navigating to https://socpa.org.sa/SOCPA/files/53/530b7b50-1633-474d-a3af-6a72157fc260.html
Waiting for page to load...
Found 1340 rows in total
Added contact #1: {'Name': '', 'Phone': '1009001508', 'Email': 'adeeb@abanumi-cpa.com'}
Added contact #2: {'Name': '', 'Phone': '1009001508', 'Email': 'adeeb@abanumi-cpa.com'}
Added contact #3: {'Name': '', 'Phone': '1009001508', 'Email': 'adeeb@abanumi-cpa.com'}
Added contact #4: {'Name': '11413', 'Phone': '1009001508', 'Email': 'adeeb@abanumi-cpa.com'}
Added contact #5: {'Name': '12281', 'Phone': '555692969', 'Email': 'info@ota.sa'}

Saved 5 contacts to socpa_contacts_5.csv
Taking screenshot for debugging...
Closing browser...
