In [5]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

chrome_options = Options()
chrome_options.add_argument("--disable-javascript")
driver = webdriver.Chrome(options=chrome_options)
driver.get("https://www.mca.gov.in/content/mca/global/en/mca/master-data/MDS.html")

In [6]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities

caps = DesiredCapabilities.CHROME
caps['goog:loggingPrefs'] = {'performance': 'ALL'}

driver = webdriver.Chrome(desired_capabilities=caps)

driver.execute_cdp_cmd(
    'Network.setBlockedURLs', {
        'urls': ['*devtools.js']  # Add the script you want to block
    }
)

driver.get("https://www.mca.gov.in/content/mca/global/en/mca/master-data/MDS.html")


TypeError: WebDriver.__init__() got an unexpected keyword argument 'desired_capabilities'

In [10]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options

# Create Chrome options
chrome_options = Options()
chrome_options.set_capability('goog:loggingPrefs', {'performance': 'ALL'})

# Initialize WebDriver with Chrome options
driver = webdriver.Chrome(options=chrome_options)

# Use Chrome DevTools Protocol (CDP) to block URLs
driver.execute_cdp_cmd(
    'Network.setBlockedURLs', {
        'urls': ['*clientlib-devtool.js']  # Block the devtools.js file
    }
)
#
# Load the webpage
driver.get("https://www.mca.gov.in/content/mca/global/en/mca/master-data/MDS.html")

In [13]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
from PIL import Image
import time
import pandas as pd
from concurrent.futures import ThreadPoolExecutor
import google.generativeai as genai
import os

# Chrome setup with blocking unwanted URLs
chrome_options = Options()
chrome_options.add_argument("--headless")  # Headless mode
chrome_options.set_capability('goog:loggingPrefs', {'performance': 'ALL'})
driver = webdriver.Chrome(options=chrome_options)
driver.execute_cdp_cmd('Network.setBlockedURLs', {'urls': ['*clientlib-devtool.js']})  # Block dev tools
driver.get("https://www.mca.gov.in/content/mca/global/en/mca/master-data/MDS.html")

# CAPTCHA handling function
def handle_captcha():
    try:
        driver.implicitly_wait(10)
        time.sleep(5)

        screenshot = driver.get_screenshot_as_png()
        with open("full_page_screenshot.png", "wb") as file:
            file.write(screenshot)

        image = Image.open("full_page_screenshot.png")
        width, height = image.size
        left, top = width * 0.50, height * 0.20
        right, bottom = left + width * 0.20, top + height * 0.28
        padding = 10
        region_image = image.crop((left - padding, top - padding, right + padding, bottom + padding))
        region_image.save('captcha_regionS.png')

        genai.configure(api_key="AIzaSyDCITTatIubdbdbHSlNWgXrNI-RghcogJc")
        def solve_captcha(image_path):
            model = genai.GenerativeModel("gemini-1.5-flash")
            image = Image.open(image_path)
            response = model.generate_content(["Perform the math operation in the picture and give output of the math operation only", image])
            return response.text.strip()

        captcha_solution = solve_captcha("captcha_regionS.png")
        captcha_input = driver.find_element(By.ID, "customCaptchaInput")
        captcha_input.send_keys(captcha_solution)
        
        submit_button = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.ID, "check")))
        submit_button.click()
        print(f"CAPTCHA solved: {captcha_solution}")

        time.sleep(5)
    except Exception as e:
        print(f"Error handling CAPTCHA: {e}")

# Function to scrape director details
def scrape_director_details(uid):
    try:
        table = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.ID, "directorTableHeadings")))
        rows = driver.find_elements(By.XPATH, "//tbody[@id='content']/tr")

        director_data = []
        for row in rows:
            columns = row.find_elements(By.TAG_NAME, "td")
            director_data.append([col.text for col in columns])

        df = pd.DataFrame(director_data, columns=["Sr. No", "DIN/PAN", "Name", "Designation", "Date of Appointment", "Cessation Date", "Signatory"])
        df.to_excel(f"{uid}_director_details.xlsx", index=False)
        print(f"Director details saved for UID: {uid}")

    except Exception as e:
        print(f"Error scraping director details for {uid}: {e}")

# Search and click company name
def search_master_data(uids):
    search_box = WebDriverWait(driver, 20).until(EC.presence_of_element_located((By.ID, "masterdata-search-box")))
    for uid in uids:
        search_box.clear()
        search_box.send_keys(uid)
        search_box.send_keys(Keys.RETURN)
        time.sleep(5)

        company_names = driver.find_elements(By.CSS_SELECTOR, "td.companyname")
        if company_names:
            for company in company_names:
                company.click()
                time.sleep(3)
                director_button = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.CSS_SELECTOR, "button.tablinks.directorData")))
                director_button.click()
                time.sleep(3)
                scrape_director_details(uid)
        else:
            print(f"No company found for CIN: {uid}")

# Function to handle batches and avoid blocks
def handle_batches(uids):
    batch_size = 5
    wait_time = 10
    for i in range(0, len(uids), batch_size):
        batch = uids[i:i + batch_size]
        try:
            search_master_data(batch)
        except Exception as e:
            print(f"Blocked. Retrying after {wait_time} seconds. Error: {e}")
            time.sleep(wait_time)
            wait_time += 5
        time.sleep(10)

# Parallel processing of CIN batches
def process_in_parallel(cins):
    with ThreadPoolExecutor() as executor:
        batch_size = 5
        for i in range(0, len(cins), batch_size):
            batch = cins[i:i + batch_size]
            executor.submit(search_master_data, batch)
            time.sleep(10)

# Read CIN values from Excel file
def read_cin_from_excel(file_path):
    df = pd.read_excel(file_path)
    return df['CIN'].head(7).tolist()

# Main function
def main():
    input_file = "input_file.xlsx"
    cin_values = read_cin_from_excel(input_file)
    
    # Handle captcha first
    handle_captcha()
    
    # Process CINs in batches of 5
    process_in_parallel(cin_values)

if __name__ == "__main__":
    main()

CAPTCHA solved: 132


In [None]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
from PIL import Image
import time
import pandas as pd
from concurrent.futures import ThreadPoolExecutor
import google.generativeai as genai
import os

# Chrome setup with blocking unwanted URLs
chrome_options = Options()
chrome_options.add_argument("--headless")  # Headless mode
chrome_options.set_capability('goog:loggingPrefs', {'performance': 'ALL'})
driver = webdriver.Chrome(options=chrome_options)
driver.execute_cdp_cmd('Network.setBlockedURLs', {'urls': ['*clientlib-devtool.js']})  # Block dev tools
driver.get("https://www.mca.gov.in/content/mca/global/en/mca/master-data/MDS.html")

# CAPTCHA handling function
def handle_captcha():
    try:
        driver.implicitly_wait(10)
        time.sleep(5)

        screenshot = driver.get_screenshot_as_png()
        with open("full_page_screenshot.png", "wb") as file:
            file.write(screenshot)

        image = Image.open("full_page_screenshot.png")
        width, height = image.size
        left, top = width * 0.50, height * 0.20
        right, bottom = left + width * 0.20, top + height * 0.28
        padding = 10
        region_image = image.crop((left - padding, top - padding, right + padding, bottom + padding))
        region_image.save('captcha_regionS.png')

        genai.configure(api_key="AIzaSyDCITTatIubdbdbHSlNWgXrNI-RghcogJc")
        def solve_captcha(image_path):
            model = genai.GenerativeModel("gemini-1.5-flash")
            image = Image.open(image_path)
            response = model.generate_content(["Perform the math operation in the picture and give output of the math operation only", image])
            return response.text.strip()

        captcha_solution = solve_captcha("captcha_regionS.png")
        captcha_input = driver.find_element(By.ID, "customCaptchaInput")
        captcha_input.send_keys(captcha_solution)
        
        submit_button = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.ID, "check")))
        submit_button.click()
        print(f"CAPTCHA solved: {captcha_solution}")

        time.sleep(5)
    except Exception as e:
        print(f"Error handling CAPTCHA: {e}")

# Function to scrape director details
def scrape_director_details(uid):
    try:
        table = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.ID, "directorTableHeadings")))
        rows = driver.find_elements(By.XPATH, "//tbody[@id='content']/tr")

        director_data = []
        for row in rows:
            columns = row.find_elements(By.TAG_NAME, "td")
            director_data.append([col.text for col in columns])

        # Convert data to a DataFrame and save to Excel
        df = pd.DataFrame(director_data, columns=["Sr. No", "DIN/PAN", "Name", "Designation", "Date of Appointment", "Cessation Date", "Signatory"])
        output_file = f"{uid}_director_details.xlsx"
        df.to_excel(output_file, index=False)
        print(f"Director details saved for UID: {uid}")
        print(df)  # Print the data to the console

        return df

    except Exception as e:
        print(f"Error scraping director details for {uid}: {e}")
        return pd.DataFrame()

# Search and click company name
def search_master_data(uids, all_data):
    search_box = WebDriverWait(driver, 20).until(EC.presence_of_element_located((By.ID, "masterdata-search-box")))
    for uid in uids:
        search_box.clear()
        search_box.send_keys(uid)
        search_box.send_keys(Keys.RETURN)
        time.sleep(5)

        company_names = driver.find_elements(By.CSS_SELECTOR, "td.companyname")
        if company_names:
            for company in company_names:
                company.click()
                time.sleep(3)
                director_button = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.CSS_SELECTOR, "button.tablinks.directorData")))
                director_button.click()
                time.sleep(3)
                # Scrape director details and append to the all_data DataFrame
                df = scrape_director_details(uid)
                all_data = pd.concat([all_data, df], ignore_index=True)
                
                # Navigate back to the main URL after scraping
                driver.get("https://www.mca.gov.in/content/mca/global/en/mca/master-data/MDS.html")
                time.sleep(3)
        else:
            print(f"No company found for CIN: {uid}")

    return all_data

# Function to handle batches and avoid blocks
def handle_batches(uids, all_data):
    batch_size = 5
    wait_time = 10
    for i in range(0, len(uids), batch_size):
        batch = uids[i:i + batch_size]
        try:
            all_data = search_master_data(batch, all_data)
        except Exception as e:
            print(f"Blocked. Retrying after {wait_time} seconds. Error: {e}")
            time.sleep(wait_time)
            wait_time += 5
        time.sleep(10)
    return all_data

# Parallel processing of CIN batches
def process_in_parallel(cins, all_data):
    with ThreadPoolExecutor() as executor:
        batch_size = 5
        for i in range(0, len(cins), batch_size):
            batch = cins[i:i + batch_size]
            executor.submit(search_master_data, batch, all_data)
            time.sleep(10)

# Read CIN values from Excel file
def read_cin_from_excel(file_path):
    df = pd.read_excel(file_path)
    return df['CIN'].head(7).tolist()

# Main function
def main():
    input_file = "input_file.xlsx"
    cin_values = read_cin_from_excel(input_file)

    # Initialize an empty DataFrame to store all the scraped data
    all_data = pd.DataFrame()

    # Handle captcha first
    handle_captcha()

    # Process CINs in batches of 5
    all_data = handle_batches(cin_values, all_data)

    # Save final output to an Excel file
    output_file = "output.xlsx"
    all_data.to_excel(output_file, index=False)
    print(f"All data saved to {output_file}")

if __name__ == "__main__":
    main()

#NO CAPTCHA MULTIPLE TIMES,JUST SAVING TO OUTPUT EXCEL SHEET 

In [25]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
from PIL import Image
import time
import pandas as pd
from concurrent.futures import ThreadPoolExecutor
import google.generativeai as genai
import os

# Chrome setup with blocking unwanted URLs
chrome_options = Options()
#chrome_options.add_argument("--headless")  # Headless mode
chrome_options.set_capability('goog:loggingPrefs', {'performance': 'ALL'})
driver = webdriver.Chrome(options=chrome_options)
driver.execute_cdp_cmd('Network.setBlockedURLs', {'urls': ['*clientlib-devtool.js']})  # Block dev tools
driver.get("https://www.mca.gov.in/content/mca/global/en/mca/master-data/MDS.html")

# CAPTCHA handling function
def handle_captcha():
    try:
        driver.implicitly_wait(10)
        time.sleep(5)

        screenshot = driver.get_screenshot_as_png()
        with open("full_page_screenshot.png", "wb") as file:
            file.write(screenshot)

        image = Image.open("full_page_screenshot.png")
        width, height = image.size
        left, top = width * 0.50, height * 0.20
        right, bottom = left + width * 0.20, top + height * 0.28
        padding = 10
        region_image = image.crop((left - padding, top - padding, right + padding, bottom + padding))
        region_image.save('captcha_regionS.png')

        genai.configure(api_key="AIzaSyDCITTatIubdbdbHSlNWgXrNI-RghcogJc")
        def solve_captcha(image_path):
            model = genai.GenerativeModel("gemini-1.5-flash")
            image = Image.open(image_path)
            response = model.generate_content(["Perform the math operation in the picture and give output of the math operation only", image])
            return response.text.strip()

        captcha_solution = solve_captcha("captcha_regionS.png")
        captcha_input = driver.find_element(By.ID, "customCaptchaInput")
        captcha_input.send_keys(captcha_solution)
        
        submit_button = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.ID, "check")))
        submit_button.click()
        print(f"CAPTCHA solved: {captcha_solution}")

        time.sleep(5)
    except Exception as e:
        print(f"Error handling CAPTCHA: {e}")

# Function to check if CAPTCHA is present
def check_for_captcha():
    try:
        # Look for CAPTCHA input element on the page
        captcha_element = driver.find_element(By.ID, "customCaptchaInput")
        if captcha_element.is_displayed():
            print("CAPTCHA detected. Solving CAPTCHA...")
            handle_captcha()
            return True
        return False
    except:
        # CAPTCHA not present
        return False

# Function to scrape director details
# Scrape director details after clicking Director/Signatory button
def scrape_director_details(uid):
    try:
        # Check for CAPTCHA again before scraping
        if check_for_captcha():
            handle_captcha()

        # Wait for the director data to load (assuming 'content' is the tbody for director data)
        table = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.ID, "content")))
        rows = driver.find_elements(By.XPATH, "//tbody[@id='content']/tr")

        director_data = []
        for row in rows:
            columns = row.find_elements(By.TAG_NAME, "td")
            director_data.append([col.text for col in columns])

        # Convert the scraped data to a DataFrame and save to Excel
        df = pd.DataFrame(director_data, columns=["Sr. No", "DIN/PAN", "Name", "Designation", "Date of Appointment", "Cessation Date", "Signatory"])
        output_file = f"{uid}_director_details.xlsx"
        df.to_excel(output_file, index=False)
        print(f"Director details saved for UID: {uid}")
        print(df)  # Print the data to the console

        return df

    except Exception as e:
        print(f"Error scraping director details for {uid}: {e}")
        return pd.DataFrame()

# Search and click company name, then scrape director details
def search_master_data(uids, all_data):
    search_box = WebDriverWait(driver, 20).until(EC.presence_of_element_located((By.ID, "masterdata-search-box")))
    for uid in uids:
        search_box.clear()
        search_box.send_keys(uid)
        search_box.send_keys(Keys.RETURN)
        time.sleep(5)

        # Check if CAPTCHA appears after entering CIN
        if check_for_captcha():
            handle_captcha()
            # Re-enter CIN after solving CAPTCHA
            search_box.clear()
            search_box.send_keys(uid)
            search_box.send_keys(Keys.RETURN)
            time.sleep(5)

        company_names = driver.find_elements(By.CSS_SELECTOR, "td.companyname")
        for company in company_names:
                company.click()
                time.sleep(3)

                # Check for CAPTCHA after clicking the company name
                if check_for_captcha():
                    handle_captcha()
                    company.click()  # Re-click the company after solving CAPTCHA
                company.click() 
                if check_for_captcha():
                    handle_captcha()

                # Click the Director/Signatory button
                director_button = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.CSS_SELECTOR, "button.tablinks.directorData")))
                director_button.click()
                time.sleep(3)

                # Check for CAPTCHA after clicking Director/Signatory
                if check_for_captcha():
                    handle_captcha()
                    director_button.click()  # Re-click the director button after solving CAPTCHA

                # Scrape the director details and append to all_data DataFrame
                df = scrape_director_details(uid)
                all_data = pd.concat([all_data, df], ignore_index=True)

                # Navigate back to the main URL after scraping
                driver.get("https://www.mca.gov.in/content/mca/global/en/mca/master-data/MDS.html")
                time.sleep(3)
        else:
            print(f"No company found for CIN: {uid}")

    return all_data

# Function to handle batches and avoid blocks
def handle_batches(uids, all_data):
    batch_size = 5
    wait_time = 10
    for i in range(0, len(uids), batch_size):
        batch = uids[i:i + batch_size]
        try:
            all_data = search_master_data(batch, all_data)
        except Exception as e:
            print(f"Blocked. Retrying after {wait_time} seconds. Error: {e}")
            time.sleep(wait_time)
            wait_time += 5
        time.sleep(10)
    return all_data

# Parallel processing of CIN batches
def process_in_parallel(cins, all_data):
    with ThreadPoolExecutor() as executor:
        batch_size = 5
        for i in range(0, len(cins), batch_size):
            batch = cins[i:i + batch_size]
            executor.submit(search_master_data, batch, all_data)
            time.sleep(10)

# Read CIN values from Excel file
def read_cin_from_excel(file_path):
    df = pd.read_excel(file_path)
    return df['CIN'].head(7).tolist()

# Main function
def main():
    input_file = "input_file.xlsx"
    cin_values = read_cin_from_excel(input_file)

    # Initialize an empty DataFrame to store all the scraped data
    all_data = pd.DataFrame()

    # Handle captcha first
    handle_captcha()

    # Process CINs in batches of 5
    all_data = handle_batches(cin_values, all_data)

    # Save final output to an Excel file
    output_file = "output.xlsx"
    all_data.to_excel(output_file, index=False)
    print(f"All data saved to {output_file}")

if __name__ == "__main__":
    main()

CAPTCHA solved: 131
CAPTCHA detected. Solving CAPTCHA...
CAPTCHA solved: 127
Error handling CAPTCHA: Message: element not interactable
  (Session info: chrome=128.0.6613.137)
Stacktrace:
0   chromedriver                        0x000000010277d208 cxxbridge1$str$ptr + 1927396
1   chromedriver                        0x000000010277566c cxxbridge1$str$ptr + 1895752
2   chromedriver                        0x0000000102370670 cxxbridge1$string$len + 89156
3   chromedriver                        0x00000001023af758 cxxbridge1$string$len + 347436
4   chromedriver                        0x00000001023ae154 cxxbridge1$string$len + 341800
5   chromedriver                        0x00000001023ab064 cxxbridge1$string$len + 329272
6   chromedriver                        0x00000001023ee228 cxxbridge1$string$len + 604156
7   chromedriver                        0x00000001023a9698 cxxbridge1$string$len + 322668
8   chromedriver                        0x00000001023aa310 cxxbridge1$string$len + 325860
9   chro

In [None]:
<button class="tablinks directorData" onclick="tabAction(event, 'details')" style="font-size: 14px;"> Director/Signatory details </button>
<tbody id="content" style="text-align: center; font-size: 16px;"><tr style="font-size: 16px;"><td style="font-size: 14px;">1</td><td style="font-size: 14px;"><span class="redirect-Text" onclick="directorClickHandler(&quot;10728664&quot;,&quot;DIN&quot;)" style="font-size: 14px;">10728664</span></td><td style="font-size: 14px;">SWAPNA MADHURI  SENAPATHI</td><td style="font-size: 14px;">Director</td><td style="font-size: 14px;">02/08/2024</td><td style="font-size: 14px;">-</td><td style="font-size: 14px;">Yes</td></tr></tbody>

In [19]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
from PIL import Image
import time
import pandas as pd
from concurrent.futures import ThreadPoolExecutor
import google.generativeai as genai
import os

# Chrome setup with blocking unwanted URLs
chrome_options = Options()
#chrome_options.add_argument("--headless")  # Headless mode
chrome_options.set_capability('goog:loggingPrefs', {'performance': 'ALL'})
driver = webdriver.Chrome(options=chrome_options)
driver.execute_cdp_cmd('Network.setBlockedURLs', {'urls': ['*clientlib-devtool.js']})  # Block dev tools
driver.get("https://www.mca.gov.in/content/mca/global/en/mca/master-data/MDS.html")

# CAPTCHA handling function
def handle_captcha():
    try:
        driver.implicitly_wait(10)
        time.sleep(5)

        screenshot = driver.get_screenshot_as_png()
        with open("full_page_screenshot.png", "wb") as file:
            file.write(screenshot)

        image = Image.open("full_page_screenshot.png")
        width, height = image.size
        left, top = width * 0.50, height * 0.20
        right, bottom = left + width * 0.20, top + height * 0.28
        padding = 10
        region_image = image.crop((left - padding, top - padding, right + padding, bottom + padding))
        region_image.save('captcha_regionS.png')

        genai.configure(api_key="AIzaSyDCITTatIubdbdbHSlNWgXrNI-RghcogJc")
        def solve_captcha(image_path):
            model = genai.GenerativeModel("gemini-1.5-flash")
            image = Image.open(image_path)
            response = model.generate_content(["Perform the math operation in the picture and give output of the math operation only", image])
            return response.text.strip()

        captcha_solution = solve_captcha("captcha_regionS.png")
        captcha_input = driver.find_element(By.ID, "customCaptchaInput")
        captcha_input.send_keys(captcha_solution)
        
        submit_button = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.ID, "check")))
        submit_button.click()
        print(f"CAPTCHA solved: {captcha_solution}")

        time.sleep(5)
    except Exception as e:
        print(f"Error handling CAPTCHA: {e}")

# Function to check if CAPTCHA is present
def check_for_captcha():
    try:
        # Look for CAPTCHA input element on the page
        captcha_element = driver.find_element(By.ID, "customCaptchaInput")
        if captcha_element.is_displayed():
            print("CAPTCHA detected. Solving CAPTCHA...")
            handle_captcha()
            return True
        return False
    except:
        # CAPTCHA not present
        return False

# Function to scrape director details
# Scrape director details after clicking Director/Signatory button
def scrape_director_details(uid):
    try:
        # Check for CAPTCHA again before scraping
        if check_for_captcha():
            handle_captcha()

        # Wait for the director data to load (assuming 'content' is the tbody for director data)
        table = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.ID, "content")))
        rows = driver.find_elements(By.XPATH, "//tbody[@id='content']/tr")

        director_data = []
        for row in rows:
            columns = row.find_elements(By.TAG_NAME, "td")
            director_data.append([col.text for col in columns])

        # Convert the scraped data to a DataFrame and save to Excel
        df = pd.DataFrame(director_data, columns=["Sr. No", "DIN/PAN", "Name", "Designation", "Date of Appointment", "Cessation Date", "Signatory"])
        output_file = f"{uid}_director_details.xlsx"
        df.to_excel(output_file, index=False)
        print(f"Director details saved for UID: {uid}")
        print(df)  # Print the data to the console

        return df

    except Exception as e:
        print(f"Error scraping director details for {uid}: {e}")
        return pd.DataFrame()

# Search and click company name, then scrape director details
def search_master_data(uids, all_data):
    search_box = WebDriverWait(driver, 20).until(EC.presence_of_element_located((By.ID, "masterdata-search-box")))
    for uid in uids:
        search_box.clear()
        search_box.send_keys(uid)
        search_box.send_keys(Keys.RETURN)
        time.sleep(5)

        # Check if CAPTCHA appears after entering CIN
        if check_for_captcha():
            handle_captcha()
            # Re-enter CIN after solving CAPTCHA
            search_box.clear()
            search_box.send_keys(uid)
            search_box.send_keys(Keys.RETURN)
            time.sleep(5)

        # Locate the table and find the company name using the class 'companyname'
        try:
            company_names = WebDriverWait(driver, 10).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, "td.companyname")))
            
            # Click the first company name in the search result
            if company_names:
                print(f"Company found: {company_names[0].text}. Clicking on the company name...")
                company_names[0].click()
                time.sleep(3)

                # Check for CAPTCHA after clicking the company name
                if check_for_captcha():
                    handle_captcha()
                    company_names[0].click()  # Re-click the company after solving CAPTCHA

                # Click the Director/Signatory button
                director_button = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.CSS_SELECTOR, "button.tablinks.directorData")))
                director_button.click()
                time.sleep(3)

                # Check for CAPTCHA after clicking Director/Signatory
                if check_for_captcha():
                    handle_captcha()
                    director_button.click()  # Re-click the director button after solving CAPTCHA

                # Scrape the director details and append to all_data DataFrame
                df = scrape_director_details(uid)
                all_data = pd.concat([all_data, df], ignore_index=True)

                # Navigate back to the main URL after scraping
                driver.get("https://www.mca.gov.in/content/mca/global/en/mca/master-data/MDS.html")
                time.sleep(3)
            else:
                print(f"No company found for CIN: {uid}")

        except Exception as e:
            print(f"Error while trying to click on company name for {uid}: {e}")

    return all_data

# Function to handle batches and avoid blocks
def handle_batches(uids, all_data):
    batch_size = 5
    wait_time = 10
    for i in range(0, len(uids), batch_size):
        batch = uids[i:i + batch_size]
        try:
            all_data = search_master_data(batch, all_data)
        except Exception as e:
            print(f"Blocked. Retrying after {wait_time} seconds. Error: {e}")
            time.sleep(wait_time)
            wait_time += 5
        time.sleep(10)
    return all_data

# Parallel processing of CIN batches
def process_in_parallel(cins, all_data):
    with ThreadPoolExecutor() as executor:
        batch_size = 5
        for i in range(0, len(cins), batch_size):
            batch = cins[i:i + batch_size]
            executor.submit(search_master_data, batch, all_data)
            time.sleep(10)

# Read CIN values from Excel file
def read_cin_from_excel(file_path):
    df = pd.read_excel(file_path)
    return df['CIN'].head(7).tolist()

# Main function
def main():
    input_file = "input_file.xlsx"
    cin_values = read_cin_from_excel(input_file)

    # Initialize an empty DataFrame to store all the scraped data
    all_data = pd.DataFrame()

    # Handle captcha first
    handle_captcha()

    # Process CINs in batches of 5
    all_data = handle_batches(cin_values, all_data)

    # Save final output to an Excel file
    output_file = "output.xlsx"
    all_data.to_excel(output_file, index=False)
    print(f"All data saved to {output_file}")

if __name__ == "__main__":
    main()

#THIS CODE ISNT CLICKING NO COMPANY NAME YAAR

CAPTCHA solved: 141
CAPTCHA detected. Solving CAPTCHA...
CAPTCHA solved: 133
Error handling CAPTCHA: Message: element not interactable
  (Session info: chrome=128.0.6613.137)
Stacktrace:
0   chromedriver                        0x000000010045d208 cxxbridge1$str$ptr + 1927396
1   chromedriver                        0x000000010045566c cxxbridge1$str$ptr + 1895752
2   chromedriver                        0x0000000100050670 cxxbridge1$string$len + 89156
3   chromedriver                        0x000000010008f758 cxxbridge1$string$len + 347436
4   chromedriver                        0x000000010008e154 cxxbridge1$string$len + 341800
5   chromedriver                        0x000000010008b064 cxxbridge1$string$len + 329272
6   chromedriver                        0x00000001000ce228 cxxbridge1$string$len + 604156
7   chromedriver                        0x0000000100089698 cxxbridge1$string$len + 322668
8   chromedriver                        0x000000010008a310 cxxbridge1$string$len + 325860
9   chro

In [20]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
from PIL import Image
import time
import pandas as pd
from concurrent.futures import ThreadPoolExecutor
import google.generativeai as genai
import os

# Chrome setup with blocking unwanted URLs
chrome_options = Options()
#chrome_options.add_argument("--headless")  # Headless mode
chrome_options.set_capability('goog:loggingPrefs', {'performance': 'ALL'})
driver = webdriver.Chrome(options=chrome_options)
driver.execute_cdp_cmd('Network.setBlockedURLs', {'urls': ['*clientlib-devtool.js']})  # Block dev tools
driver.get("https://www.mca.gov.in/content/mca/global/en/mca/master-data/MDS.html")

# CAPTCHA handling function
def handle_captcha(max_attempts=3):
    """
    Handles CAPTCHA by trying up to `max_attempts` to solve it.
    """
    try:
        for attempt in range(max_attempts):
            try:
                driver.implicitly_wait(10)
                time.sleep(5)

                screenshot = driver.get_screenshot_as_png()
                with open("full_page_screenshot.png", "wb") as file:
                    file.write(screenshot)

                image = Image.open("full_page_screenshot.png")
                width, height = image.size
                left, top = width * 0.50, height * 0.20
                right, bottom = left + width * 0.20, top + height * 0.28
                padding = 10
                region_image = image.crop((left - padding, top - padding, right + padding, bottom + padding))
                region_image.save('captcha_regionS.png')

                genai.configure(api_key="AIzaSyDCITTatIubdbdbHSlNWgXrNI-RghcogJc")
                def solve_captcha(image_path):
                    model = genai.GenerativeModel("gemini-1.5-flash")
                    image = Image.open(image_path)
                    response = model.generate_content(["Perform the math operation in the picture and give output of the math operation only", image])
                    return response.text.strip()

                captcha_solution = solve_captcha("captcha_regionS.png")
                captcha_input = driver.find_element(By.ID, "customCaptchaInput")
                captcha_input.send_keys(captcha_solution)

                submit_button = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.ID, "check")))
                submit_button.click()
                print(f"CAPTCHA solved: {captcha_solution}")

                time.sleep(5)
                return True  # CAPTCHA solved
            except Exception as e:
                print(f"Attempt {attempt + 1}: Error handling CAPTCHA: {e}")
        return False  # Max attempts reached, CAPTCHA not solved
    except Exception as e:
        print(f"Error handling CAPTCHA: {e}")
        return False

def check_for_captcha():
    """
    Checks if CAPTCHA is present and solves it if necessary.
    Tries multiple times to confirm if CAPTCHA is gone.
    """
    for _ in range(3):
        try:
            captcha_element = driver.find_element(By.ID, "customCaptchaInput")
            if captcha_element.is_displayed():
                print("CAPTCHA detected. Solving CAPTCHA...")
                handle_captcha()
                return True
        except:
            # CAPTCHA not present
            return False
    return False

# Function to click company name after verifying the CAPTCHA is resolved
def click_company_name():
    attempts = 3  # Number of attempts to retry clicking the company name
    for attempt in range(attempts):
        try:
            company_names = WebDriverWait(driver, 10).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, "td.companyname")))
            if company_names:
                print(f"Company found: {company_names[0].text}. Clicking on the company name...")
                company_names[0].click()
                time.sleep(3)

                if check_for_captcha():
                    handle_captcha()
                    company_names[0].click()  # Re-click if CAPTCHA appears after first click

                return True  # Click successful
            else:
                print("No company found in this search result.")
                return False  # No company name found
        except Exception as e:
            print(f"Attempt {attempt + 1}: Error clicking on company name: {e}")
    return False  # Failed after max attempts

# Search and click company name, then scrape director details
def search_master_data(uids, all_data):
    search_box = WebDriverWait(driver, 20).until(EC.presence_of_element_located((By.ID, "masterdata-search-box")))
    for uid in uids:
        search_box.clear()
        search_box.send_keys(uid)
        search_box.send_keys(Keys.RETURN)
        time.sleep(5)

        # Check if CAPTCHA appears after entering CIN
        if check_for_captcha():
            handle_captcha()
            # Re-enter CIN after solving CAPTCHA
            search_box.clear()
            search_box.send_keys(uid)
            search_box.send_keys(Keys.RETURN)
            time.sleep(5)

        # Locate and click the company name
        if click_company_name():
            # Click the Director/Signatory button
            director_button = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.CSS_SELECTOR, "button.tablinks.directorData")))
            director_button.click()
            time.sleep(3)

            # Check for CAPTCHA after clicking Director/Signatory
            if check_for_captcha():
                handle_captcha()
                director_button.click()  # Re-click the director button after solving CAPTCHA

            # Scrape the director details and append to all_data DataFrame
            df = scrape_director_details(uid)
            all_data = pd.concat([all_data, df], ignore_index=True)

            # Navigate back to the main URL after scraping
            driver.get("https://www.mca.gov.in/content/mca/global/en/mca/master-data/MDS.html")
            time.sleep(3)
        else:
            print(f"Failed to click on company name for CIN: {uid}")
    
    return all_data

# Function to handle batches and avoid blocks
def handle_batches(uids, all_data):
    batch_size = 5
    wait_time = 10
    for i in range(0, len(uids), batch_size):
        batch = uids[i:i + batch_size]
        try:
            all_data = search_master_data(batch, all_data)
        except Exception as e:
            print(f"Blocked. Retrying after {wait_time} seconds. Error: {e}")
            time.sleep(wait_time)
            wait_time += 5
        time.sleep(10)
    return all_data

# Parallel processing of CIN batches
def process_in_parallel(cins, all_data):
    with ThreadPoolExecutor() as executor:
        batch_size = 5
        for i in range(0, len(cins), batch_size):
            batch = cins[i:i + batch_size]
            executor.submit(search_master_data, batch, all_data)
            time.sleep(10)

# Read CIN values from Excel file
def read_cin_from_excel(file_path):
    df = pd.read_excel(file_path)
    return df['CIN'].head(7).tolist()

# Main function
def main():
    input_file = "input_file.xlsx"
    cin_values = read_cin_from_excel(input_file)

    # Initialize an empty DataFrame to store all the scraped data
    all_data = pd.DataFrame()

    # Handle captcha first
    handle_captcha()

    # Process CINs in batches of 5
    all_data = handle_batches(cin_values, all_data)

    # Save final output to an Excel file
    output_file = "output.xlsx"
    all_data.to_excel(output_file, index=False)
    print(f"All data saved to {output_file}")

if __name__ == "__main__":
    main()

CAPTCHA solved: 119
CAPTCHA detected. Solving CAPTCHA...
CAPTCHA solved: 139
Attempt 1: Error handling CAPTCHA: Message: element not interactable
  (Session info: chrome=128.0.6613.137)
Stacktrace:
0   chromedriver                        0x0000000104b51208 cxxbridge1$str$ptr + 1927396
1   chromedriver                        0x0000000104b4966c cxxbridge1$str$ptr + 1895752
2   chromedriver                        0x0000000104744670 cxxbridge1$string$len + 89156
3   chromedriver                        0x0000000104783758 cxxbridge1$string$len + 347436
4   chromedriver                        0x0000000104782154 cxxbridge1$string$len + 341800
5   chromedriver                        0x000000010477f064 cxxbridge1$string$len + 329272
6   chromedriver                        0x00000001047c2228 cxxbridge1$string$len + 604156
7   chromedriver                        0x000000010477d698 cxxbridge1$string$len + 322668
8   chromedriver                        0x000000010477e310 cxxbridge1$string$len + 3258

KeyboardInterrupt: 

In [2]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
from PIL import Image
import time
import pandas as pd
from concurrent.futures import ThreadPoolExecutor
import google.generativeai as genai
import os

# Chrome setup with blocking unwanted URLs
chrome_options = Options()
chrome_options.add_argument("--headless")  # Headless mode
chrome_options.set_capability('goog:loggingPrefs', {'performance': 'ALL'})
driver = webdriver.Chrome(options=chrome_options)
driver.execute_cdp_cmd('Network.setBlockedURLs', {'urls': ['*clientlib-devtool.js']})  # Block dev tools
driver.get("https://www.mca.gov.in/content/mca/global/en/mca/master-data/MDS.html")

# CAPTCHA handling function
def handle_captcha():
    try:
        driver.implicitly_wait(10)
        time.sleep(5)

        screenshot = driver.get_screenshot_as_png()
        with open("full_page_screenshot.png", "wb") as file:
            file.write(screenshot)

        image = Image.open("full_page_screenshot.png")
        width, height = image.size
        left, top = width * 0.50, height * 0.20
        right, bottom = left + width * 0.20, top + height * 0.28
        padding = 10
        region_image = image.crop((left - padding, top - padding, right + padding, bottom + padding))
        region_image.save('captcha_regionS.png')

        genai.configure(api_key="AIzaSyDCITTatIubdbdbHSlNWgXrNI-RghcogJc")
        def solve_captcha(image_path):
            model = genai.GenerativeModel("gemini-1.5-flash")
            image = Image.open(image_path)
            response = model.generate_content(["Perform the math operation in the picture and give output of the math operation only", image])
            return response.text.strip()

        captcha_solution = solve_captcha("captcha_regionS.png")
        captcha_input = driver.find_element(By.ID, "customCaptchaInput")
        captcha_input.send_keys(captcha_solution)
        
        submit_button = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.ID, "check")))
        submit_button.click()
        print(f"CAPTCHA solved: {captcha_solution}")

        time.sleep(5)
    except Exception as e:
        print(f"Error handling CAPTCHA: {e}")

# Function to check if CAPTCHA is present
def check_for_captcha():
    try:
        captcha_element = driver.find_element(By.ID, "customCaptchaInput")
        if captcha_element.is_displayed():
            print("CAPTCHA detected. Solving CAPTCHA...")
            handle_captcha()
            return True
        return False
    except:
        return False

# Function to scrape director details
def scrape_director_details(uid):
    try:
        if check_for_captcha():
            handle_captcha()

        table = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.ID, "content")))
        rows = driver.find_elements(By.XPATH, "//tbody[@id='content']/tr")

        director_data = []
        for row in rows:
            columns = row.find_elements(By.TAG_NAME, "td")
            director_data.append([col.text for col in columns])

        df = pd.DataFrame(director_data, columns=["Sr. No", "DIN/PAN", "Name", "Designation", "Date of Appointment", "Cessation Date", "Signatory"])
        output_file = f"{uid}_director_details.xlsx"
        df.to_excel(output_file, index=False)
        print(f"Director details saved for UID: {uid}")
        print(df)

        return df

    except Exception as e:
        print(f"Error scraping director details for {uid}: {e}")
        return pd.DataFrame()

# Function to handle retrying clicks
def search_master_data(uids, all_data):
    search_box = WebDriverWait(driver, 20).until(EC.presence_of_element_located((By.ID, "masterdata-search-box")))
    
    for uid in uids:
        search_box.clear()
        search_box.send_keys(uid)
        search_box.send_keys(Keys.RETURN)
        time.sleep(5)

        # Handle CAPTCHA if it appears
        if check_for_captcha():
            handle_captcha()
        search_box.clear()
        search_box.send_keys(uid)
        search_box.send_keys(Keys.RETURN)
        time.sleep(5)

        # Retry clicking the company name if necessary
        company_names = WebDriverWait(driver, 10).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, "td.companyname")))
        print(f"Company found: {company_names[0].text}. Clicking on the company name...")
            
        if retry_click((By.CSS_SELECTOR, "td.companyname")):
            time.sleep(3)

                # Handle CAPTCHA again if it appears
        if check_for_captcha():
            handle_captcha()
        if retry_click((By.CSS_SELECTOR, "td.companyname")):
            time.sleep(3)

                # Click on the "Directory Data" button and handle CAPTCHA if needed
        director_button = (By.CSS_SELECTOR, "button.tablinks.directorData")
        if retry_click(director_button):
            time.sleep(3)

        if check_for_captcha():
            handle_captcha()
        if retry_click(director_button):
            time.sleep(3)

                    # Scrape the director details
        df = scrape_director_details(uid)
        print(df)
        all_data = pd.concat([all_data, df], ignore_index=True)

                    # Navigate back to the main URL
        driver.get("https://www.mca.gov.in/content/mca/global/en/mca/master-data/MDS.html")
        time.sleep(3)

    return all_data

def retry_click(by_locator, max_attempts=3):
    """
    Attempts to click an element located by `by_locator` up to `max_attempts` times.
    """
    for attempt in range(max_attempts):
        try:
            element = WebDriverWait(driver, 10).until(EC.element_to_be_clickable(by_locator))
            element.click()
            return True
        except Exception as e:
            print(f"Attempt {attempt + 1}: Error clicking element: {e}")
            time.sleep(2)
    return False

# Function to handle batches and avoid blocks
def handle_batches(uids, all_data):
    batch_size = 5
    wait_time = 10
    for i in range(0, len(uids), batch_size):
        batch = uids[i:i + batch_size]
        try:
            all_data = search_master_data(batch, all_data)
        except Exception as e:
            print(f"Blocked. Retrying after {wait_time} seconds. Error: {e}")
            time.sleep(wait_time)
            wait_time += 10
        time.sleep(10)
    return all_data

# Parallel processing of CIN batches
def process_in_parallel(cins, all_data):
    with ThreadPoolExecutor() as executor:
        batch_size = 5
        for i in range(0, len(cins), batch_size):
            batch = cins[i:i + batch_size]
            executor.submit(search_master_data, batch, all_data)
            time.sleep(10)

# Read CIN values from Excel file
def read_cin_from_excel(file_path):
    df = pd.read_excel(file_path)
    return df['CIN'].head(7).tolist()

# Main function
def main():
    input_file = "input_file.xlsx"
    cin_values = read_cin_from_excel(input_file)

    all_data = pd.DataFrame()

    handle_captcha()

    all_data = handle_batches(cin_values, all_data)

    output_file = "output.xlsx"
    all_data.to_excel(output_file, index=False)
    print(f"All data saved to {output_file}")

if __name__ == "__main__":
    main()
#CODE RUNS

KeyboardInterrupt: 

In [3]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
from PIL import Image
import time
import pandas as pd
from concurrent.futures import ThreadPoolExecutor, as_completed
import google.generativeai as genai
import os

# Chrome setup with blocking unwanted URLs
def setup_driver():
    chrome_options = Options()
    chrome_options.add_argument("--headless")  # Headless mode
    chrome_options.set_capability('goog:loggingPrefs', {'performance': 'ALL'})
    driver = webdriver.Chrome(options=chrome_options)
    driver.execute_cdp_cmd('Network.setBlockedURLs', {'urls': ['*clientlib-devtool.js']})  # Block dev tools
    return driver

# CAPTCHA handling function
def handle_captcha(driver):
    try:
        driver.implicitly_wait(10)
        time.sleep(5)

        screenshot = driver.get_screenshot_as_png()
        with open("full_page_screenshot.png", "wb") as file:
            file.write(screenshot)

        image = Image.open("full_page_screenshot.png")
        width, height = image.size
        left, top = width * 0.50, height * 0.20
        right, bottom = left + width * 0.20, top + height * 0.28
        padding = 10
        region_image = image.crop((left - padding, top - padding, right + padding, bottom + padding))
        region_image.save('captcha_regionS.png')

        genai.configure(api_key="AIzaSyDCITTatIubdbdbHSlNWgXrNI-RghcogJc")
        def solve_captcha(image_path):
            model = genai.GenerativeModel("gemini-1.5-flash")
            image = Image.open(image_path)
            response = model.generate_content(["Perform the math operation in the picture and give output of the math operation only", image])
            return response.text.strip()

        captcha_solution = solve_captcha("captcha_regionS.png")
        captcha_input = driver.find_element(By.ID, "customCaptchaInput")
        captcha_input.send_keys(captcha_solution)
        
        submit_button = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.ID, "check")))
        submit_button.click()
        print(f"CAPTCHA solved: {captcha_solution}")

        time.sleep(5)
    except Exception as e:
        print(f"Error handling CAPTCHA: {e}")

# Function to check if CAPTCHA is present
def check_for_captcha(driver):
    try:
        captcha_element = driver.find_element(By.ID, "customCaptchaInput")
        if captcha_element.is_displayed():
            print("CAPTCHA detected. Solving CAPTCHA...")
            handle_captcha(driver)
            return True
        return False
    except:
        return False

# Function to scrape director details
def scrape_director_details(driver, uid):
    try:
        if check_for_captcha(driver):
            handle_captcha(driver)

        table = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.ID, "content")))
        rows = driver.find_elements(By.XPATH, "//tbody[@id='content']/tr")

        director_data = []
        for row in rows:
            columns = row.find_elements(By.TAG_NAME, "td")
            director_data.append([col.text for col in columns])

        df = pd.DataFrame(director_data, columns=["Sr. No", "DIN/PAN", "Name", "Designation", "Date of Appointment", "Cessation Date", "Signatory"])
        output_file = f"{uid}_director_details.xlsx"
        df.to_excel(output_file, index=False)
        print(f"Director details saved for UID: {uid}")
        print(df)

        return df

    except Exception as e:
        print(f"Error scraping director details for {uid}: {e}")
        return pd.DataFrame()

# Function to handle retrying clicks
def retry_click(driver, by_locator, max_attempts=3):
    """
    Attempts to click an element located by `by_locator` up to `max_attempts` times.
    """
    for attempt in range(max_attempts):
        try:
            element = WebDriverWait(driver, 10).until(EC.element_to_be_clickable(by_locator))
            element.click()
            return True
        except Exception as e:
            print(f"Attempt {attempt + 1}: Error clicking element: {e}")
            time.sleep(2)
    return False

def search_master_data(driver, uid):
    try:
        driver.get("https://www.mca.gov.in/content/mca/global/en/mca/master-data/MDS.html")
        search_box = WebDriverWait(driver, 20).until(EC.presence_of_element_located((By.ID, "masterdata-search-box")))
        
        search_box.clear()
        search_box.send_keys(uid)
        search_box.send_keys(Keys.RETURN)
        time.sleep(5)

        # Handle CAPTCHA if it appears
        if check_for_captcha(driver):
            handle_captcha(driver)

        # Retry clicking the company name if necessary
        company_names = WebDriverWait(driver, 10).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, "td.companyname")))
        print(f"Company found: {company_names[0].text}. Clicking on the company name...")
            
        if retry_click(driver, (By.CSS_SELECTOR, "td.companyname")):
            time.sleep(3)

        # Handle CAPTCHA again if it appears
        if check_for_captcha(driver):
            handle_captcha(driver)
        if retry_click(driver, (By.CSS_SELECTOR, "td.companyname")):
            time.sleep(3)

        # Click on the "Directory Data" button and handle CAPTCHA if needed
        director_button = (By.CSS_SELECTOR, "button.tablinks.directorData")
        if retry_click(driver, director_button):
            time.sleep(3)

        if check_for_captcha(driver):
            handle_captcha(driver)
        if retry_click(driver, director_button):
            time.sleep(3)

        # Scrape the director details
        df = scrape_director_details(driver, uid)
        print(df)
        return df

    except Exception as e:
        print(f"Error processing UID {uid}: {e}")
        return pd.DataFrame()

def process_in_parallel(cins):
    all_data = pd.DataFrame()
    with ThreadPoolExecutor(max_workers=50) as executor:
        futures = {executor.submit(search_master_data, setup_driver(), uid): uid for uid in cins}
        for future in as_completed(futures):
            uid = futures[future]
            try:
                df = future.result()
                all_data = pd.concat([all_data, df], ignore_index=True)
            except Exception as e:
                print(f"Error processing future for UID {uid}: {e}")
    return all_data

# Read CIN values from Excel file
def read_cin_from_excel(file_path):
    df = pd.read_excel(file_path)
    return df['CIN']head(7).tolist()

# Main function
def main():
    input_file = "input_file.xlsx"
    cin_values = read_cin_from_excel(input_file)

    all_data = process_in_parallel(cin_values)

    output_file = "output.xlsx"
    all_data.to_excel(output_file, index=False)
    print(f"All data saved to {output_file}")

if __name__ == "__main__":
    main()

CAPTCHA detected. Solving CAPTCHA...
CAPTCHA detected. Solving CAPTCHA...
CAPTCHA detected. Solving CAPTCHA...
CAPTCHA detected. Solving CAPTCHA...
CAPTCHA detected. Solving CAPTCHA...
CAPTCHA solved: 56
CAPTCHA solved: 91
CAPTCHA solved: 61
CAPTCHA solved: 183
CAPTCHA solved: 25
Error handling CAPTCHA: Message: element not interactable
  (Session info: chrome=128.0.6613.137)
Stacktrace:
0   chromedriver                        0x0000000104b79208 cxxbridge1$str$ptr + 1927396
1   chromedriver                        0x0000000104b7166c cxxbridge1$str$ptr + 1895752
2   chromedriver                        0x000000010476c670 cxxbridge1$string$len + 89156
3   chromedriver                        0x00000001047ab758 cxxbridge1$string$len + 347436
4   chromedriver                        0x00000001047aa154 cxxbridge1$string$len + 341800
5   chromedriver                        0x00000001047a7064 cxxbridge1$string$len + 329272
6   chromedriver                        0x00000001047ea228 cxxbridge1$stri

In [3]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
from PIL import Image
import pandas as pd
from concurrent.futures import ThreadPoolExecutor, as_completed
import google.generativeai as genai
import os,time

# Chrome setup with blocking unwanted URLs
def setup_driver():
    chrome_options = Options()
    chrome_options.add_argument("--headless")  # Headless mode
    chrome_options.set_capability('goog:loggingPrefs', {'performance': 'ALL'})
    driver = webdriver.Chrome(options=chrome_options)
    driver.execute_cdp_cmd('Network.setBlockedURLs', {'urls': ['*clientlib-devtool.js']})  # Block dev tools
    return driver

# CAPTCHA handling function
def handle_captcha(driver):
    try:
        driver.implicitly_wait(7)
        time.sleep(5)
        screenshot = driver.get_screenshot_as_png()
        with open("full_page_screenshot.png", "wb") as file:
            file.write(screenshot)

        image = Image.open("full_page_screenshot.png")
        width, height = image.size
        left, top = width * 0.50, height * 0.20
        right, bottom = left + width * 0.20, top + height * 0.28
        padding = 10
        region_image = image.crop((left - padding, top - padding, right + padding, bottom + padding))
        region_image.save('captcha_regionS.png')

        genai.configure(api_key="AIzaSyDCITTatIubdbdbHSlNWgXrNI-RghcogJc")
        def solve_captcha(image_path):
            model = genai.GenerativeModel("gemini-1.5-flash")
            image = Image.open(image_path)
            response = model.generate_content(["Perform the math operation in the picture and give output of the math operation only", image])
            return response.text.strip()

        captcha_solution = solve_captcha("captcha_regionS.png")
        captcha_input = driver.find_element(By.ID, "customCaptchaInput")
        captcha_input.send_keys(captcha_solution)
        
        submit_button = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.ID, "check")))
        submit_button.click()
        print(f"CAPTCHA solved: {captcha_solution}")
    except Exception as e:
        print(f"Error handling CAPTCHA: {e}")

# Function to check if CAPTCHA is present
def check_for_captcha(driver):
    try:
        captcha_element = driver.find_element(By.ID, "customCaptchaInput")
        if captcha_element.is_displayed():
            print("CAPTCHA detected. Solving CAPTCHA...")
            handle_captcha(driver)
            return True
        return False
    except:
        return False

# Function to scrape director details
def scrape_director_details(driver, uid):
    try:
        if check_for_captcha(driver):
            handle_captcha(driver)
        time.sleep(3)
        table = WebDriverWait(driver, 5).until(EC.presence_of_element_located((By.ID, "content")))
        rows = driver.find_elements(By.XPATH, "//tbody[@id='content']/tr")

        director_data = []
        for row in rows:
            columns = row.find_elements(By.TAG_NAME, "td")
            director_data.append([col.text for col in columns])

        df = pd.DataFrame(director_data, columns=["Sr. No", "DIN/PAN", "Name", "Designation", "Date of Appointment", "Cessation Date", "Signatory"])
        df['CIN'] = uid
        return df

    except Exception as e:
        print(f"Error scraping director details for {uid}: {e}")
        return pd.DataFrame()

# Function to handle retrying clicks
def retry_click(driver, by_locator, max_attempts=3):
    """
    Attempts to click an element located by `by_locator` up to `max_attempts` times.
    """
    for attempt in range(max_attempts):
        try:
            element = WebDriverWait(driver, 10).until(EC.element_to_be_clickable(by_locator))
            element.click()
            return True
        except Exception as e:
            print(f"Attempt {attempt + 1}: Error clicking element: {e}")
    return False

def search_master_data(driver, uid):
    try:
        driver.get("https://www.mca.gov.in/content/mca/global/en/mca/master-data/MDS.html")
        search_box = WebDriverWait(driver,5).until(EC.presence_of_element_located((By.ID, "masterdata-search-box")))
        search_box.clear()
        search_box.send_keys(uid)
        search_box.send_keys(Keys.RETURN)
        if check_for_captcha(driver):
            handle_captcha(driver)

        # Retry clicking the company name if necessary
        company_name_locator = (By.CSS_SELECTOR, "td.companyname")
        if retry_click(driver, company_name_locator):
            # Handle CAPTCHA if necessary
            if check_for_captcha(driver):
                handle_captcha(driver)
            retry_click(driver, company_name_locator)

        # Click on the "Directory Data" button
        director_button = (By.CSS_SELECTOR, "button.tablinks.directorData")
        retry_click(driver, director_button)

        # Scrape the director details
        df = scrape_director_details(driver, uid)
        return df

    except Exception as e:
        print(f"Error processing UID {uid}: {e}")
        return pd.DataFrame()

def process_in_parallel(cins):
    all_data = pd.DataFrame()
    with ThreadPoolExecutor(max_workers=5) as executor:
        futures = {executor.submit(search_master_data, setup_driver(), uid): uid for uid in cins}
        for future in as_completed(futures):
            uid = futures[future]
            try:
                df = future.result()
                if not df.empty:
                    all_data = pd.concat([all_data, df], ignore_index=True)
            except Exception as e:
                print(f"Error processing future for UID {uid}: {e}")
    return all_data

# Read CIN values from Excel file
def read_cin_from_excel(file_path):
    df = pd.read_excel(file_path)
    return df['CIN'].head(7).tolist()

# Main function
def main():
    input_file = "input_file.xlsx"
    cin_values = read_cin_from_excel(input_file)

    all_data = process_in_parallel(cin_values)

    output_file = "output.xlsx"
    all_data.to_excel(output_file, index=False)
    print(f"All data saved to {output_file}")

if __name__ == "__main__":
    main()
#PARALLELISATION , WORKING FINE(OKAYISH)

CAPTCHA detected. Solving CAPTCHA...
CAPTCHA detected. Solving CAPTCHA...
CAPTCHA detected. Solving CAPTCHA...
CAPTCHA detected. Solving CAPTCHA...
CAPTCHA detected. Solving CAPTCHA...
CAPTCHA solved: 130
CAPTCHA solved: 110
CAPTCHA solved: 54
CAPTCHA solved: 44
CAPTCHA solved: 35
Error handling CAPTCHA: Message: element not interactable
  (Session info: chrome=128.0.6613.138)
Stacktrace:
0   chromedriver                        0x00000001024f5208 cxxbridge1$str$ptr + 1927396
1   chromedriver                        0x00000001024ed66c cxxbridge1$str$ptr + 1895752
2   chromedriver                        0x00000001020e8670 cxxbridge1$string$len + 89156
3   chromedriver                        0x0000000102127758 cxxbridge1$string$len + 347436
4   chromedriver                        0x0000000102126154 cxxbridge1$string$len + 341800
5   chromedriver                        0x0000000102123064 cxxbridge1$string$len + 329272
6   chromedriver                        0x0000000102166228 cxxbridge1$str

In [2]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
from PIL import Image
import pandas as pd
from concurrent.futures import ThreadPoolExecutor, as_completed
import google.generativeai as genai
import os, time

# Chrome setup with blocking unwanted URLs
def setup_driver():
    chrome_options = Options()
    #chrome_options.add_argument("--headless")  # Headless mode
    chrome_options.set_capability('goog:loggingPrefs', {'performance': 'ALL'})
    driver = webdriver.Chrome(options=chrome_options)
    driver.execute_cdp_cmd('Network.setBlockedURLs', {'urls': ['*clientlib-devtool.js']})  # Block dev tools
    return driver

# CAPTCHA handling function
def handle_captcha(driver):
    try:
        driver.implicitly_wait(7)
        time.sleep(5)
        screenshot = driver.get_screenshot_as_png()
        with open("full_page_screenshot.png", "wb") as file:
            file.write(screenshot)

        image = Image.open("full_page_screenshot.png")
        width, height = image.size
        left, top = width * 0.50, height * 0.20
        right, bottom = left + width * 0.20, top + height * 0.28
        padding = 10
        region_image = image.crop((left - padding, top - padding, right + padding, bottom + padding))
        region_image.save('captcha_regionS.png')

        genai.configure(api_key="AIzaSyDCITTatIubdbdbHSlNWgXrNI-RghcogJc")
        def solve_captcha(image_path):
            model = genai.GenerativeModel("gemini-1.5-flash")
            image = Image.open(image_path)
            response = model.generate_content(["Perform the math operation in the picture and give output of the math operation only", image])
            return response.text.strip()

        captcha_solution = solve_captcha("captcha_regionS.png")
        captcha_input = driver.find_element(By.XPATH, "//*[@id='customCaptchaInput']")  # Updated XPath
        captcha_input.send_keys(captcha_solution)
        
        submit_button = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.ID, "check")))
        submit_button.click()
        print(f"CAPTCHA solved: {captcha_solution}")
    except Exception as e:
        print(f"Error handling CAPTCHA: {e}")

# Function to check if CAPTCHA is present
def check_for_captcha(driver):
    try:
        captcha_element = driver.find_element(By.XPATH, "//*[@id='customCaptchaInput']")  # Updated XPath
        if captcha_element.is_displayed():
            print("CAPTCHA detected. Solving CAPTCHA...")
            handle_captcha(driver)
            return True
        return False
    except:
        return False

# Function to scrape director details
def scrape_director_details(driver, uid):
    try:
        if check_for_captcha(driver):
            handle_captcha(driver)
        time.sleep(3)
        table = WebDriverWait(driver, 5).until(EC.presence_of_element_located((By.ID, "content")))
        rows = driver.find_elements(By.XPATH, "//tbody[@id='content']/tr")

        director_data = []
        for row in rows:
            columns = row.find_elements(By.TAG_NAME, "td")
            director_data.append([col.text for col in columns])

        df = pd.DataFrame(director_data, columns=["Sr. No", "DIN/PAN", "Name", "Designation", "Date of Appointment", "Cessation Date", "Signatory"])
        df['CIN'] = uid
        return df

    except Exception as e:
        print(f"Error scraping director details for {uid}: {e}")
        return pd.DataFrame()

# Function to handle retrying clicks
def retry_click(driver, by_locator, max_attempts=3):
    """
    Attempts to click an element located by `by_locator` up to `max_attempts` times.
    """
    for attempt in range(max_attempts):
        try:
            element = WebDriverWait(driver, 10).until(EC.element_to_be_clickable(by_locator))
            element.click()
            return True
        except Exception as e:
            print(f"Attempt {attempt + 1}: Error clicking element: {e}")
    return False

def search_master_data(driver, uid):
    try:
        driver.get("https://www.mca.gov.in/content/mca/global/en/mca/master-data/MDS.html")
        search_box = WebDriverWait(driver,5).until(EC.presence_of_element_located((By.XPATH, "//*[@id='masterdata-search-box']")))  # Updated XPath for CIN field
        search_box.clear()
        search_box.send_keys(uid)
        search_box.send_keys(Keys.RETURN)
        if check_for_captcha(driver):
            handle_captcha(driver)

        # Retry clicking the company name if necessary
        company_name_locator = (By.XPATH, "//*[@id='fohomepage-037f88dfd8']/div/div/div[3]/div/div[1]/div[1]/div/table[1]/tbody/tr/td[2]")  # Updated XPath for company name
        if retry_click(driver, company_name_locator):
            if check_for_captcha(driver):
                handle_captcha(driver)
            retry_click(driver, company_name_locator)

        # Click on the "Directory Data" button
        director_button = (By.XPATH, "//*[@id='formId']/button[3]")  # Updated XPath for Directory Data button
        retry_click(driver, director_button)

        # Scrape the director details
        df = scrape_director_details(driver, uid)
        return df

    except Exception as e:
        print(f"Error processing UID {uid}: {e}")
        return pd.DataFrame()

def process_in_parallel(cins):
    all_data = pd.DataFrame()
    with ThreadPoolExecutor(max_workers=5) as executor:
        futures = {executor.submit(search_master_data, setup_driver(), uid): uid for uid in cins}
        for future in as_completed(futures):
            uid = futures[future]
            try:
                df = future.result()
                if not df.empty:
                    all_data = pd.concat([all_data, df], ignore_index=True)
            except Exception as e:
                print(f"Error processing future for UID {uid}: {e}")
    return all_data

# Read CIN values from Excel file
def read_cin_from_excel(file_path):
    df = pd.read_excel(file_path)
    return df['CIN'].head(20).tolist()

# Main function
def main():
    input_file = "input_file.xlsx"
    cin_values = read_cin_from_excel(input_file)

    all_data = process_in_parallel(cin_values)

    output_file = "output.xlsx"
    all_data.to_excel(output_file, index=False)
    print(f"All data saved to {output_file}")

if __name__ == "__main__":
    main()
#X-PATHS CODE -BEST CODE UNTIL NOW

NameError: name 'all_data' is not defined

In [8]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
from PIL import Image
import pandas as pd
from concurrent.futures import ThreadPoolExecutor, as_completed
import google.generativeai as genai
import os, time
from selenium.webdriver.common.action_chains import ActionChains
# Chrome setup with blocking unwanted URLs
def setup_driver():
    chrome_options = Options()
    #chrome_options.add_argument("--headless")  # Headless mode
    chrome_options.set_capability('goog:loggingPrefs', {'performance': 'ALL'})
    driver = webdriver.Chrome(options=chrome_options)
    driver.execute_cdp_cmd('Network.setBlockedURLs', {'urls': ['*clientlib-devtool.js']})  # Block dev tools
    return driver

# CAPTCHA handling function
def handle_captcha(driver):
    try:
        driver.implicitly_wait(7)
        time.sleep(5)
        screenshot = driver.get_screenshot_as_png()
        with open("full_page_screenshot.png", "wb") as file:
            file.write(screenshot)

        image = Image.open("full_page_screenshot.png")
        width, height = image.size
        left, top = width * 0.50, height * 0.20
        right, bottom = left + width * 0.20, top + height * 0.28
        padding = 10
        region_image = image.crop((left - padding, top - padding, right + padding, bottom + padding))
        region_image.save('captcha_regionS.png')

        genai.configure(api_key="AIzaSyDCITTatIubdbdbHSlNWgXrNI-RghcogJc")
        def solve_captcha(image_path):
            model = genai.GenerativeModel("gemini-1.5-flash")
            image = Image.open(image_path)
            response = model.generate_content(["Perform the math operation in the picture and give output of the math operation only", image])
            return response.text.strip()

        captcha_solution = solve_captcha("captcha_regionS.png")
        captcha_input = driver.find_element(By.XPATH, "//*[@id='customCaptchaInput']")  # Updated XPath
        captcha_input.send_keys(captcha_solution)
        
        submit_button = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.ID, "check")))
        submit_button.click()
        print(f"CAPTCHA solved: {captcha_solution}")
    except Exception as e:
        print(f"Error handling CAPTCHA: {e}")

# Function to check if CAPTCHA is present
def check_for_captcha(driver):
    try:
        captcha_element = driver.find_element(By.XPATH, "//*[@id='customCaptchaInput']")  # Updated XPath
        if captcha_element.is_displayed():
            print("CAPTCHA detected. Solving CAPTCHA...")
            handle_captcha(driver)
            return True
        return False
    except:
        return False

# Function to scrape director details
def scrape_director_details(driver, uid):
    try:
        if check_for_captcha(driver):
            handle_captcha(driver)
        time.sleep(3)
        table = WebDriverWait(driver, 5).until(EC.presence_of_element_located((By.ID, "content")))
        rows = driver.find_elements(By.XPATH, "//tbody[@id='content']/tr")

        director_data = []
        for row in rows:
            columns = row.find_elements(By.TAG_NAME, "td")
            director_data.append([col.text for col in columns])

        df = pd.DataFrame(director_data, columns=["Sr. No", "DIN/PAN", "Name", "Designation", "Date of Appointment", "Cessation Date", "Signatory"])
        df['CIN'] = uid
        return df

    except Exception as e:
        print(f"Error scraping director details for {uid}: {e}")
        return pd.DataFrame()

# Function to handle retrying clicks
def retry_click(driver, by_locator, max_attempts=3):
    """
    Attempts to click an element located by `by_locator` up to `max_attempts` times.
    """
    for attempt in range(max_attempts):
        try:
            element = WebDriverWait(driver, 10).until(EC.element_to_be_clickable(by_locator))
            element.click()
            return True
        except Exception as e:
            print(f"Attempt {attempt + 1}: Error clicking element: {e}")
    return False

def search_master_data(driver, uid):
    try:
        driver.get("https://www.mca.gov.in/content/mca/global/en/mca/master-data/MDS.html")
        search_box = WebDriverWait(driver,5).until(EC.presence_of_element_located((By.XPATH, "//*[@id='masterdata-search-box']")))  # Updated XPath for CIN field
        search_box.clear()
        search_box.send_keys(uid)
        search_box.send_keys(Keys.RETURN)
        if check_for_captcha(driver):
            handle_captcha(driver)
        '''company_name_locator = (By.XPATH,'//*[@id="fohomepage-037f88dfd8"]/div/div/div[3]/div/div[1]/div[1]/div/table[1]/tbody/tr/td[2]')  # Updated XPath for company name
        company_name_locator.click()'''
        company_name_element = WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable((By.XPATH, "//td[@class='companyname']"))
        )
        driver.execute_script("arguments[0].click();", company_name_element)
        if check_for_captcha(driver):
                handle_captcha(driver)
        '''if retry_click(driver, company_name_locator):
            if check_for_captcha(driver):
                handle_captcha(driver)
            retry_click(driver, company_name_locator)'''

        # Click on the "Directory Data" button
        director_button = (By.XPATH, "//*[@id='formId']/button[3]")  # Updated XPath for Directory Data button
        retry_click(driver, director_button)

        # Scrape the director details
        df = scrape_director_details(driver, uid)
        return df

    except Exception as e:
        print(f"Error processing UID {uid}: {e}")
        return pd.DataFrame()

def process_in_parallel(cins):
    all_data = pd.DataFrame()
    with ThreadPoolExecutor(max_workers=5) as executor:
        futures = {executor.submit(search_master_data, setup_driver(), uid): uid for uid in cins}
        for future in as_completed(futures):
            uid = futures[future]
            try:
                df = future.result()
                if not df.empty:
                    all_data = pd.concat([all_data, df], ignore_index=True)
            except Exception as e:
                print(f"Error processing future for UID {uid}: {e}")
    return all_data

# Read CIN values from Excel file
def read_cin_from_excel(file_path):
    df = pd.read_excel(file_path)
    return df['CIN'].head(20).tolist()

def main():
    # Example CIN for testing, you can replace this with any CIN value you want to test
    test_cin = "U28262TZ2024PTC032190"  # Replace this with a valid CIN for testing
    
    # Alternatively, read CIN from the Excel file (uncomment if needed)
    # input_file = "input_file.xlsx"
    # cin_values = read_cin_from_excel(input_file)
    # test_cin = cin_values[0]  # Take the first CIN for testing

    driver = setup_driver()
    df = search_master_data(driver, test_cin)

    if not df.empty:
        output_file = f"output_{test_cin}.xlsx"
        df.to_excel(output_file, index=False)
        print(f"Data for CIN {test_cin} saved to {output_file}")
    else:
        print(f"No data found for CIN {test_cin}")
    
    driver.quit()

if __name__ == "__main__":
    main()

Error processing UID U28262TZ2024PTC032190: Message: no such window: target window already closed
from unknown error: web view not found
  (Session info: chrome=128.0.6613.138)
Stacktrace:
0   chromedriver                        0x00000001033a5208 cxxbridge1$str$ptr + 1927396
1   chromedriver                        0x000000010339d66c cxxbridge1$str$ptr + 1895752
2   chromedriver                        0x0000000102f98808 cxxbridge1$string$len + 89564
3   chromedriver                        0x0000000102f73b0c core::str::slice_error_fail::h6c488016ada29016 + 3776
4   chromedriver                        0x00000001030034d8 cxxbridge1$string$len + 527020
5   chromedriver                        0x0000000103015c90 cxxbridge1$string$len + 602724
6   chromedriver                        0x0000000102fd1698 cxxbridge1$string$len + 322668
7   chromedriver                        0x0000000102fd2310 cxxbridge1$string$len + 325860
8   chromedriver                        0x000000010336be78 cxxbridge1$str

In [None]:
/html/body/div/div/div[3]/div/div[1]/div[1]/div/table[1]/tbody/tr/td[2]
//*[@id="fohomepage-037f88dfd8"]/div/div/div[3]/div/div[1]/div[1]/div/table[1]/tbody/tr/td[2]
//*[@id="fohomepage-037f88dfd8"]/div/div/div[3]/div/div[1]/div[1]/div/table[1]/tbody/tr/td[2]
//*[@id="customCaptchaInput"]
//*[@id="fohomepage-037f88dfd8"]/div/div/div[3]/div/div[1]/div[1]/div/table[1]/tbody/tr/td[2]
<td class="companyname"> LANZER TEXTILE SOLUTIONS PRIVATE LIMITED </td>