# In [None]:
# Use this code to stop the website from going back to the home page.
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options

# Values used below, named once so they are easy to spot and change.
BLOCKED_URL_PATTERNS = ['*clientlib-devtool.js']  # the devtool script to block
MASTER_DATA_URL = "https://www.mca.gov.in/content/mca/global/en/mca/master-data/MDS.html"

# Chrome options with the performance logging preference set to ALL.
chrome_options = Options()
chrome_options.set_capability('goog:loggingPrefs', {'performance': 'ALL'})

# Start Chrome with those options.
driver = webdriver.Chrome(options=chrome_options)

# Ask Chrome, through the DevTools Protocol (CDP), not to load the blocked
# patterns.  This is done before the page is requested.
driver.execute_cdp_cmd('Network.setBlockedURLs', {'urls': BLOCKED_URL_PATTERNS})

# Open the master-data search page.
driver.get(MASTER_DATA_URL)

# In [None]:
#CODE USING PARALLELISM
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
from PIL import Image
import pandas as pd
from concurrent.futures import ThreadPoolExecutor, as_completed
import google.generativeai as genai
import os,time

# Chrome setup with blocking unwanted URLs
def setup_driver():
    """Create a headless Chrome WebDriver with the devtool script blocked.

    Returns:
        A ``webdriver.Chrome`` instance started with ``--headless``, the
        ``goog:loggingPrefs`` performance preference set to ``ALL``, and URLs
        matching ``*clientlib-devtool.js`` blocked through the Chrome
        DevTools Protocol.
    """
    blocked_patterns = ['*clientlib-devtool.js']
    logging_prefs = {'performance': 'ALL'}

    opts = Options()
    opts.add_argument("--headless")
    opts.set_capability('goog:loggingPrefs', logging_prefs)

    browser = webdriver.Chrome(options=opts)
    # Register the block list before any page is requested.
    browser.execute_cdp_cmd('Network.setBlockedURLs', {'urls': blocked_patterns})
    return browser

# CAPTCHA handling function
def handle_captcha(driver):
    """Solve the arithmetic CAPTCHA on the current page and submit the answer.

    A screenshot of the page is cropped to the fixed region where the CAPTCHA
    is expected, Gemini is asked to evaluate the arithmetic expression in the
    crop, and the answer is typed into the ``customCaptchaInput`` field
    before the ``check`` button is clicked.

    The Gemini API key is read from the ``GOOGLE_API_KEY`` environment
    variable so that no credential lives in the source.  Images are kept in
    memory rather than written to fixed file names, so drivers running in
    parallel threads cannot overwrite each other's screenshots.

    Args:
        driver: Selenium WebDriver whose current page shows the CAPTCHA.

    Returns:
        None.  This is best effort: any failure is printed and swallowed.
    """
    import io

    try:
        api_key = os.environ.get("GOOGLE_API_KEY")
        if not api_key:
            raise RuntimeError("GOOGLE_API_KEY environment variable is not set")

        # NOTE(review): the implicit wait stays active on the driver after
        # this call and is combined with explicit waits elsewhere.
        driver.implicitly_wait(7)
        time.sleep(5)  # give the CAPTCHA image time to render

        page_image = Image.open(io.BytesIO(driver.get_screenshot_as_png()))
        width, height = page_image.size
        # Fixed fractions of the screenshot; presumably tuned to the site's
        # layout at the window size in use -- verify if the crop misses.
        left, top = width * 0.50, height * 0.20
        right, bottom = left + width * 0.20, top + height * 0.28
        padding = 10
        captcha_image = page_image.crop(
            (left - padding, top - padding, right + padding, bottom + padding)
        )

        genai.configure(api_key=api_key)
        model = genai.GenerativeModel("gemini-1.5-flash")
        response = model.generate_content(
            ["Perform the math operation in the picture and give output of the math operation only", captcha_image]
        )
        captcha_solution = response.text.strip()

        captcha_input = driver.find_element(By.ID, "customCaptchaInput")
        captcha_input.clear()  # drop any text left by an earlier attempt
        captcha_input.send_keys(captcha_solution)

        submit_button = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.ID, "check")))
        submit_button.click()
        print(f"CAPTCHA solved: {captcha_solution}")
    except Exception as e:
        print(f"Error handling CAPTCHA: {e}")

# Function to check if CAPTCHA is present
def check_for_captcha(driver):
    """Report whether the CAPTCHA input is currently visible on the page.

    This function only detects the CAPTCHA.  Solving it is left to the
    caller, which is expected to call ``handle_captcha`` when this returns
    True; solving here as well would handle every CAPTCHA twice.

    Args:
        driver: Selenium WebDriver to inspect.

    Returns:
        True if the ``customCaptchaInput`` element exists and is displayed.
        False if it is absent, hidden, or the lookup fails for any reason.
    """
    try:
        captcha_element = driver.find_element(By.ID, "customCaptchaInput")
        if captcha_element.is_displayed():
            print("CAPTCHA detected. Solving CAPTCHA...")
            return True
        return False
    except Exception:
        # A missing element is the normal "no CAPTCHA" case.
        return False

# Function to scrape director details
def scrape_director_details(driver, uid):
    """Read the director table of the current page into a DataFrame.

    Args:
        driver: Selenium WebDriver showing the director data tab.
        uid: CIN of the company; stored in the ``CIN`` column of every row.

    Returns:
        A DataFrame with the seven director columns plus ``CIN``.  Rows whose
        cell count does not match the expected columns are skipped and
        reported, so one odd row cannot discard or distort the whole table.
        An empty DataFrame is returned if anything fails.
    """
    column_names = ["Sr. No", "DIN/PAN", "Name", "Designation", "Date of Appointment", "Cessation Date", "Signatory"]
    try:
        if check_for_captcha(driver):
            handle_captcha(driver)
        time.sleep(3)
        # Wait until the table body exists before reading its rows.
        WebDriverWait(driver, 5).until(EC.presence_of_element_located((By.ID, "content")))
        rows = driver.find_elements(By.XPATH, "//tbody[@id='content']/tr")

        director_data = []
        for row in rows:
            cells = [col.text for col in row.find_elements(By.TAG_NAME, "td")]
            if len(cells) != len(column_names):
                print(f"Skipping unexpected director row for {uid}: {cells}")
                continue
            director_data.append(cells)

        df = pd.DataFrame(director_data, columns=column_names)
        df['CIN'] = uid
        return df

    except Exception as e:
        print(f"Error scraping director details for {uid}: {e}")
        return pd.DataFrame()

# Function to handle retrying clicks
def retry_click(driver, by_locator, max_attempts=3):
    """Click the element found by ``by_locator``, retrying when it fails.

    Each attempt waits up to 10 seconds for the element to become clickable.

    Args:
        driver: Selenium WebDriver to act on.
        by_locator: ``(By, value)`` tuple identifying the element.
        max_attempts: Number of attempts before giving up.

    Returns:
        True as soon as one click succeeds, False if every attempt failed
        (or if ``max_attempts`` allows no attempt at all).
    """
    attempt_no = 0
    while attempt_no < max_attempts:
        attempt_no += 1
        try:
            WebDriverWait(driver, 10).until(EC.element_to_be_clickable(by_locator)).click()
        except Exception as exc:
            print(f"Attempt {attempt_no}: Error clicking element: {exc}")
        else:
            return True
    return False

def search_master_data(driver, uid):
    """Search the master-data page for ``uid`` and scrape its directors.

    Args:
        driver: Selenium WebDriver to drive.
        uid: CIN typed into the search box.

    Returns:
        The DataFrame produced by ``scrape_director_details``, or an empty
        DataFrame if any step raises.
    """
    master_data_url = "https://www.mca.gov.in/content/mca/global/en/mca/master-data/MDS.html"
    try:
        driver.get(master_data_url)

        query_field = WebDriverWait(driver, 5).until(
            EC.presence_of_element_located((By.ID, "masterdata-search-box"))
        )
        query_field.clear()
        query_field.send_keys(uid)
        query_field.send_keys(Keys.RETURN)
        if check_for_captcha(driver):
            handle_captcha(driver)

        # Open the company from the result list.
        company_cell = (By.CSS_SELECTOR, "td.companyname")
        first_click_ok = retry_click(driver, company_cell)
        if first_click_ok:
            if check_for_captcha(driver):
                handle_captcha(driver)
            # NOTE(review): this second click runs whether or not a CAPTCHA
            # appeared -- confirm that is intended.
            retry_click(driver, company_cell)

        # Switch to the director data tab, then read the table.
        retry_click(driver, (By.CSS_SELECTOR, "button.tablinks.directorData"))
        return scrape_director_details(driver, uid)

    except Exception as err:
        print(f"Error processing UID {uid}: {err}")
        return pd.DataFrame()

def process_in_parallel(cins):
    """Scrape director data for several CINs with up to five worker threads.

    Each task creates its own browser inside the worker thread and always
    quits it, so no more than five browsers are open at once and none is
    left running afterwards.

    Args:
        cins: Iterable of CIN values to look up.

    Returns:
        One DataFrame holding the rows of every CIN that produced data, in
        completion order.  An empty DataFrame if there was no data at all.
    """
    def _scrape_one(uid):
        # Runs in a worker thread: one browser per task, closed in all cases.
        driver = setup_driver()
        try:
            return search_master_data(driver, uid)
        finally:
            driver.quit()

    frames = []
    with ThreadPoolExecutor(max_workers=5) as executor:
        futures = {executor.submit(_scrape_one, uid): uid for uid in cins}
        for future in as_completed(futures):
            uid = futures[future]
            try:
                df = future.result()
                if not df.empty:
                    frames.append(df)
            except Exception as e:
                print(f"Error processing future for UID {uid}: {e}")

    # Concatenate once at the end; pd.concat rejects an empty list.
    if not frames:
        return pd.DataFrame()
    return pd.concat(frames, ignore_index=True)

# Read CIN values from Excel file
def read_cin_from_excel(file_path, limit=7):
    """Read CIN values from the ``CIN`` column of an Excel workbook.

    Args:
        file_path: Path of the workbook; it must contain a ``CIN`` column.
        limit: Maximum number of leading rows to read.  Defaults to 7; pass
            ``None`` to read every row.

    Returns:
        A list of CIN values in sheet order.  Blank cells within the selected
        rows are skipped, because pandas loads them as NaN, which is not a
        usable CIN.

    Raises:
        KeyError: If the workbook has no ``CIN`` column.
    """
    cin_column = pd.read_excel(file_path)['CIN']
    if limit is not None:
        cin_column = cin_column.head(limit)
    return cin_column.dropna().tolist()

# Main function
def main():
    """Scrape the CINs listed in ``input_file.xlsx`` and write ``output.xlsx``."""
    source_path = "input_file.xlsx"
    target_path = "output.xlsx"

    # Read the CINs, scrape them in parallel, then save everything at once.
    scraped = process_in_parallel(read_cin_from_excel(source_path))
    scraped.to_excel(target_path, index=False)
    print(f"All data saved to {target_path}")


if __name__ == "__main__":
    main()
# Parallelised version; works fine (okay-ish).

# In [None]:
#CODE USING PARALLELISM WITH XPATH 
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
from PIL import Image
import pandas as pd
from concurrent.futures import ThreadPoolExecutor, as_completed
import google.generativeai as genai
import os, time

# Chrome setup with blocking unwanted URLs
def setup_driver():
    """Create a Chrome WebDriver with the devtool script blocked.

    Returns:
        A ``webdriver.Chrome`` instance with the ``goog:loggingPrefs``
        performance preference set to ``ALL`` and URLs matching
        ``*clientlib-devtool.js`` blocked through the Chrome DevTools
        Protocol.  The browser window is visible (headless mode is off).
    """
    blocked_patterns = ['*clientlib-devtool.js']
    logging_prefs = {'performance': 'ALL'}

    opts = Options()
    # opts.add_argument("--headless")  # enable to run without a window
    opts.set_capability('goog:loggingPrefs', logging_prefs)

    browser = webdriver.Chrome(options=opts)
    # Register the block list before any page is requested.
    browser.execute_cdp_cmd('Network.setBlockedURLs', {'urls': blocked_patterns})
    return browser

# CAPTCHA handling function
def handle_captcha(driver):
    """Solve the arithmetic CAPTCHA on the current page and submit the answer.

    A screenshot of the page is cropped to the fixed region where the CAPTCHA
    is expected, Gemini is asked to evaluate the arithmetic expression in the
    crop, and the answer is typed into the ``customCaptchaInput`` field
    before the ``check`` button is clicked.

    The Gemini API key is read from the ``GOOGLE_API_KEY`` environment
    variable so that no credential lives in the source.  Images are kept in
    memory rather than written to fixed file names, so drivers running in
    parallel threads cannot overwrite each other's screenshots.

    Args:
        driver: Selenium WebDriver whose current page shows the CAPTCHA.

    Returns:
        None.  This is best effort: any failure is printed and swallowed.
    """
    import io

    try:
        api_key = os.environ.get("GOOGLE_API_KEY")
        if not api_key:
            raise RuntimeError("GOOGLE_API_KEY environment variable is not set")

        # NOTE(review): the implicit wait stays active on the driver after
        # this call and is combined with explicit waits elsewhere.
        driver.implicitly_wait(7)
        time.sleep(5)  # give the CAPTCHA image time to render

        page_image = Image.open(io.BytesIO(driver.get_screenshot_as_png()))
        width, height = page_image.size
        # Fixed fractions of the screenshot; presumably tuned to the site's
        # layout at the window size in use -- verify if the crop misses.
        left, top = width * 0.50, height * 0.20
        right, bottom = left + width * 0.20, top + height * 0.28
        padding = 10
        captcha_image = page_image.crop(
            (left - padding, top - padding, right + padding, bottom + padding)
        )

        genai.configure(api_key=api_key)
        model = genai.GenerativeModel("gemini-1.5-flash")
        response = model.generate_content(
            ["Perform the math operation in the picture and give output of the math operation only", captcha_image]
        )
        captcha_solution = response.text.strip()

        captcha_input = driver.find_element(By.XPATH, "//*[@id='customCaptchaInput']")
        captcha_input.clear()  # drop any text left by an earlier attempt
        captcha_input.send_keys(captcha_solution)

        submit_button = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.ID, "check")))
        submit_button.click()
        print(f"CAPTCHA solved: {captcha_solution}")
    except Exception as e:
        print(f"Error handling CAPTCHA: {e}")

# Function to check if CAPTCHA is present
def check_for_captcha(driver):
    """Report whether the CAPTCHA input is currently visible on the page.

    This function only detects the CAPTCHA.  Solving it is left to the
    caller, which is expected to call ``handle_captcha`` when this returns
    True; solving here as well would handle every CAPTCHA twice.

    Args:
        driver: Selenium WebDriver to inspect.

    Returns:
        True if the ``customCaptchaInput`` element exists and is displayed.
        False if it is absent, hidden, or the lookup fails for any reason.
    """
    try:
        captcha_element = driver.find_element(By.XPATH, "//*[@id='customCaptchaInput']")
        if captcha_element.is_displayed():
            print("CAPTCHA detected. Solving CAPTCHA...")
            return True
        return False
    except Exception:
        # A missing element is the normal "no CAPTCHA" case.
        return False

# Function to scrape director details
def scrape_director_details(driver, uid):
    """Read the director table of the current page into a DataFrame.

    Args:
        driver: Selenium WebDriver showing the director data tab.
        uid: CIN of the company; stored in the ``CIN`` column of every row.

    Returns:
        A DataFrame with the seven director columns plus ``CIN``.  Rows whose
        cell count does not match the expected columns are skipped and
        reported, so one odd row cannot discard or distort the whole table.
        An empty DataFrame is returned if anything fails.
    """
    column_names = ["Sr. No", "DIN/PAN", "Name", "Designation", "Date of Appointment", "Cessation Date", "Signatory"]
    try:
        if check_for_captcha(driver):
            handle_captcha(driver)
        time.sleep(3)
        # Wait until the table body exists before reading its rows.
        WebDriverWait(driver, 5).until(EC.presence_of_element_located((By.ID, "content")))
        rows = driver.find_elements(By.XPATH, "//tbody[@id='content']/tr")

        director_data = []
        for row in rows:
            cells = [col.text for col in row.find_elements(By.TAG_NAME, "td")]
            if len(cells) != len(column_names):
                print(f"Skipping unexpected director row for {uid}: {cells}")
                continue
            director_data.append(cells)

        df = pd.DataFrame(director_data, columns=column_names)
        df['CIN'] = uid
        return df

    except Exception as e:
        print(f"Error scraping director details for {uid}: {e}")
        return pd.DataFrame()

# Function to handle retrying clicks
def retry_click(driver, by_locator, max_attempts=3):
    """Click the element found by ``by_locator``, retrying when it fails.

    Each attempt waits up to 10 seconds for the element to become clickable.

    Args:
        driver: Selenium WebDriver to act on.
        by_locator: ``(By, value)`` tuple identifying the element.
        max_attempts: Number of attempts before giving up.

    Returns:
        True as soon as one click succeeds, False if every attempt failed
        (or if ``max_attempts`` allows no attempt at all).
    """
    attempt_no = 0
    while attempt_no < max_attempts:
        attempt_no += 1
        try:
            WebDriverWait(driver, 10).until(EC.element_to_be_clickable(by_locator)).click()
        except Exception as exc:
            print(f"Attempt {attempt_no}: Error clicking element: {exc}")
        else:
            return True
    return False

def search_master_data(driver, uid):
    """Search the master-data page for ``uid`` and scrape its directors.

    Elements are located by XPath in this variant.

    Args:
        driver: Selenium WebDriver to drive.
        uid: CIN typed into the search box.

    Returns:
        The DataFrame produced by ``scrape_director_details``, or an empty
        DataFrame if any step raises.
    """
    master_data_url = "https://www.mca.gov.in/content/mca/global/en/mca/master-data/MDS.html"
    try:
        driver.get(master_data_url)

        query_field = WebDriverWait(driver, 5).until(
            EC.presence_of_element_located((By.XPATH, "//*[@id='masterdata-search-box']"))
        )
        query_field.clear()
        query_field.send_keys(uid)
        query_field.send_keys(Keys.RETURN)
        if check_for_captcha(driver):
            handle_captcha(driver)

        # Open the company from the result list (absolute XPath to the cell).
        company_cell = (By.XPATH, "//*[@id='fohomepage-037f88dfd8']/div/div/div[3]/div/div[1]/div[1]/div/table[1]/tbody/tr/td[2]")
        first_click_ok = retry_click(driver, company_cell)
        if first_click_ok:
            if check_for_captcha(driver):
                handle_captcha(driver)
            # NOTE(review): this second click runs whether or not a CAPTCHA
            # appeared -- confirm that is intended.
            retry_click(driver, company_cell)

        # Switch to the director data tab, then read the table.
        retry_click(driver, (By.XPATH, "//*[@id='formId']/button[3]"))
        return scrape_director_details(driver, uid)

    except Exception as err:
        print(f"Error processing UID {uid}: {err}")
        return pd.DataFrame()

def process_in_parallel(cins):
    """Scrape director data for several CINs with up to five worker threads.

    Each task creates its own browser inside the worker thread and always
    quits it, so no more than five browsers are open at once and none is
    left running afterwards.

    Args:
        cins: Iterable of CIN values to look up.

    Returns:
        One DataFrame holding the rows of every CIN that produced data, in
        completion order.  An empty DataFrame if there was no data at all.
    """
    def _scrape_one(uid):
        # Runs in a worker thread: one browser per task, closed in all cases.
        driver = setup_driver()
        try:
            return search_master_data(driver, uid)
        finally:
            driver.quit()

    frames = []
    with ThreadPoolExecutor(max_workers=5) as executor:
        futures = {executor.submit(_scrape_one, uid): uid for uid in cins}
        for future in as_completed(futures):
            uid = futures[future]
            try:
                df = future.result()
                if not df.empty:
                    frames.append(df)
            except Exception as e:
                print(f"Error processing future for UID {uid}: {e}")

    # Concatenate once at the end; pd.concat rejects an empty list.
    if not frames:
        return pd.DataFrame()
    return pd.concat(frames, ignore_index=True)

# Read CIN values from Excel file
def read_cin_from_excel(file_path, limit=20):
    """Read CIN values from the ``CIN`` column of an Excel workbook.

    Args:
        file_path: Path of the workbook; it must contain a ``CIN`` column.
        limit: Maximum number of leading rows to read.  Defaults to 20; pass
            ``None`` to read every row.

    Returns:
        A list of CIN values in sheet order.  Blank cells within the selected
        rows are skipped, because pandas loads them as NaN, which is not a
        usable CIN.

    Raises:
        KeyError: If the workbook has no ``CIN`` column.
    """
    cin_column = pd.read_excel(file_path)['CIN']
    if limit is not None:
        cin_column = cin_column.head(limit)
    return cin_column.dropna().tolist()

# Main function
def main():
    """Scrape the CINs listed in ``input_file.xlsx`` and write ``output.xlsx``."""
    source_path = "input_file.xlsx"
    target_path = "output.xlsx"

    # Read the CINs, scrape them in parallel, then save everything at once.
    scraped = process_in_parallel(read_cin_from_excel(source_path))
    scraped.to_excel(target_path, index=False)
    print(f"All data saved to {target_path}")


if __name__ == "__main__":
    main()
# XPath-based version; the best version so far.

# In [None]:
# Test code for a single CIN; kept as a backup copy.
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
from PIL import Image
import pandas as pd
from concurrent.futures import ThreadPoolExecutor, as_completed
import google.generativeai as genai
import os, time
from selenium.webdriver.common.action_chains import ActionChains
# Chrome setup with blocking unwanted URLs
def setup_driver():
    """Create a Chrome WebDriver with the devtool script blocked.

    Returns:
        A ``webdriver.Chrome`` instance with the ``goog:loggingPrefs``
        performance preference set to ``ALL`` and URLs matching
        ``*clientlib-devtool.js`` blocked through the Chrome DevTools
        Protocol.  The browser window is visible (headless mode is off).
    """
    blocked_patterns = ['*clientlib-devtool.js']
    logging_prefs = {'performance': 'ALL'}

    opts = Options()
    # opts.add_argument("--headless")  # enable to run without a window
    opts.set_capability('goog:loggingPrefs', logging_prefs)

    browser = webdriver.Chrome(options=opts)
    # Register the block list before any page is requested.
    browser.execute_cdp_cmd('Network.setBlockedURLs', {'urls': blocked_patterns})
    return browser

# CAPTCHA handling function
def handle_captcha(driver):
    """Solve the arithmetic CAPTCHA on the current page and submit the answer.

    A screenshot of the page is cropped to the fixed region where the CAPTCHA
    is expected, Gemini is asked to evaluate the arithmetic expression in the
    crop, and the answer is typed into the ``customCaptchaInput`` field
    before the ``check`` button is clicked.

    The Gemini API key is read from the ``GOOGLE_API_KEY`` environment
    variable so that no credential lives in the source.  Images are kept in
    memory rather than written to fixed file names, so drivers running in
    parallel threads cannot overwrite each other's screenshots.

    Args:
        driver: Selenium WebDriver whose current page shows the CAPTCHA.

    Returns:
        None.  This is best effort: any failure is printed and swallowed.
    """
    import io

    try:
        api_key = os.environ.get("GOOGLE_API_KEY")
        if not api_key:
            raise RuntimeError("GOOGLE_API_KEY environment variable is not set")

        # NOTE(review): the implicit wait stays active on the driver after
        # this call and is combined with explicit waits elsewhere.
        driver.implicitly_wait(7)
        time.sleep(5)  # give the CAPTCHA image time to render

        page_image = Image.open(io.BytesIO(driver.get_screenshot_as_png()))
        width, height = page_image.size
        # Fixed fractions of the screenshot; presumably tuned to the site's
        # layout at the window size in use -- verify if the crop misses.
        left, top = width * 0.50, height * 0.20
        right, bottom = left + width * 0.20, top + height * 0.28
        padding = 10
        captcha_image = page_image.crop(
            (left - padding, top - padding, right + padding, bottom + padding)
        )

        genai.configure(api_key=api_key)
        model = genai.GenerativeModel("gemini-1.5-flash")
        response = model.generate_content(
            ["Perform the math operation in the picture and give output of the math operation only", captcha_image]
        )
        captcha_solution = response.text.strip()

        captcha_input = driver.find_element(By.XPATH, "//*[@id='customCaptchaInput']")
        captcha_input.clear()  # drop any text left by an earlier attempt
        captcha_input.send_keys(captcha_solution)

        submit_button = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.ID, "check")))
        submit_button.click()
        print(f"CAPTCHA solved: {captcha_solution}")
    except Exception as e:
        print(f"Error handling CAPTCHA: {e}")

# Function to check if CAPTCHA is present
def check_for_captcha(driver):
    """Report whether the CAPTCHA input is currently visible on the page.

    This function only detects the CAPTCHA.  Solving it is left to the
    caller, which is expected to call ``handle_captcha`` when this returns
    True; solving here as well would handle every CAPTCHA twice.

    Args:
        driver: Selenium WebDriver to inspect.

    Returns:
        True if the ``customCaptchaInput`` element exists and is displayed.
        False if it is absent, hidden, or the lookup fails for any reason.
    """
    try:
        captcha_element = driver.find_element(By.XPATH, "//*[@id='customCaptchaInput']")
        if captcha_element.is_displayed():
            print("CAPTCHA detected. Solving CAPTCHA...")
            return True
        return False
    except Exception:
        # A missing element is the normal "no CAPTCHA" case.
        return False

# Function to scrape director details
def scrape_director_details(driver, uid):
    """Read the director table of the current page into a DataFrame.

    Args:
        driver: Selenium WebDriver showing the director data tab.
        uid: CIN of the company; stored in the ``CIN`` column of every row.

    Returns:
        A DataFrame with the seven director columns plus ``CIN``.  Rows whose
        cell count does not match the expected columns are skipped and
        reported, so one odd row cannot discard or distort the whole table.
        An empty DataFrame is returned if anything fails.
    """
    column_names = ["Sr. No", "DIN/PAN", "Name", "Designation", "Date of Appointment", "Cessation Date", "Signatory"]
    try:
        if check_for_captcha(driver):
            handle_captcha(driver)
        time.sleep(3)
        # Wait until the table body exists before reading its rows.
        WebDriverWait(driver, 5).until(EC.presence_of_element_located((By.ID, "content")))
        rows = driver.find_elements(By.XPATH, "//tbody[@id='content']/tr")

        director_data = []
        for row in rows:
            cells = [col.text for col in row.find_elements(By.TAG_NAME, "td")]
            if len(cells) != len(column_names):
                print(f"Skipping unexpected director row for {uid}: {cells}")
                continue
            director_data.append(cells)

        df = pd.DataFrame(director_data, columns=column_names)
        df['CIN'] = uid
        return df

    except Exception as e:
        print(f"Error scraping director details for {uid}: {e}")
        return pd.DataFrame()

# Function to handle retrying clicks
def retry_click(driver, by_locator, max_attempts=3):
    """Click the element found by ``by_locator``, retrying when it fails.

    Each attempt waits up to 10 seconds for the element to become clickable.

    Args:
        driver: Selenium WebDriver to act on.
        by_locator: ``(By, value)`` tuple identifying the element.
        max_attempts: Number of attempts before giving up.

    Returns:
        True as soon as one click succeeds, False if every attempt failed
        (or if ``max_attempts`` allows no attempt at all).
    """
    attempt_no = 0
    while attempt_no < max_attempts:
        attempt_no += 1
        try:
            WebDriverWait(driver, 10).until(EC.element_to_be_clickable(by_locator)).click()
        except Exception as exc:
            print(f"Attempt {attempt_no}: Error clicking element: {exc}")
        else:
            return True
    return False

def search_master_data(driver, uid):
    """Search the master-data page for ``uid`` and scrape its directors.

    The company-name cell is clicked through JavaScript in this variant.

    Args:
        driver: Selenium WebDriver to drive.
        uid: CIN typed into the search box.

    Returns:
        The DataFrame produced by ``scrape_director_details``, or an empty
        DataFrame if any step raises.
    """
    master_data_url = "https://www.mca.gov.in/content/mca/global/en/mca/master-data/MDS.html"
    try:
        driver.get(master_data_url)

        query_field = WebDriverWait(driver, 5).until(
            EC.presence_of_element_located((By.XPATH, "//*[@id='masterdata-search-box']"))
        )
        query_field.clear()
        query_field.send_keys(uid)
        query_field.send_keys(Keys.RETURN)
        if check_for_captcha(driver):
            handle_captcha(driver)

        # Open the company from the result list.  An earlier variant clicked
        # the cell through retry_click with an absolute XPath; the JavaScript
        # click on the class-based XPath below replaced it.
        company_cell = WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable((By.XPATH, "//td[@class='companyname']"))
        )
        driver.execute_script("arguments[0].click();", company_cell)
        if check_for_captcha(driver):
            handle_captcha(driver)

        # Switch to the director data tab, then read the table.
        retry_click(driver, (By.XPATH, "//*[@id='formId']/button[3]"))
        return scrape_director_details(driver, uid)

    except Exception as err:
        print(f"Error processing UID {uid}: {err}")
        return pd.DataFrame()

def process_in_parallel(cins):
    """Scrape director data for several CINs with up to five worker threads.

    Each task creates its own browser inside the worker thread and always
    quits it, so no more than five browsers are open at once and none is
    left running afterwards.

    Args:
        cins: Iterable of CIN values to look up.

    Returns:
        One DataFrame holding the rows of every CIN that produced data, in
        completion order.  An empty DataFrame if there was no data at all.
    """
    def _scrape_one(uid):
        # Runs in a worker thread: one browser per task, closed in all cases.
        driver = setup_driver()
        try:
            return search_master_data(driver, uid)
        finally:
            driver.quit()

    frames = []
    with ThreadPoolExecutor(max_workers=5) as executor:
        futures = {executor.submit(_scrape_one, uid): uid for uid in cins}
        for future in as_completed(futures):
            uid = futures[future]
            try:
                df = future.result()
                if not df.empty:
                    frames.append(df)
            except Exception as e:
                print(f"Error processing future for UID {uid}: {e}")

    # Concatenate once at the end; pd.concat rejects an empty list.
    if not frames:
        return pd.DataFrame()
    return pd.concat(frames, ignore_index=True)

# Read CIN values from Excel file
def read_cin_from_excel(file_path, limit=20):
    """Read CIN values from the ``CIN`` column of an Excel workbook.

    Args:
        file_path: Path of the workbook; it must contain a ``CIN`` column.
        limit: Maximum number of leading rows to read.  Defaults to 20; pass
            ``None`` to read every row.

    Returns:
        A list of CIN values in sheet order.  Blank cells within the selected
        rows are skipped, because pandas loads them as NaN, which is not a
        usable CIN.

    Raises:
        KeyError: If the workbook has no ``CIN`` column.
    """
    cin_column = pd.read_excel(file_path)['CIN']
    if limit is not None:
        cin_column = cin_column.head(limit)
    return cin_column.dropna().tolist()

def main():
    """Scrape director data for one hard-coded test CIN and save it to Excel.

    The result is written to ``output_<CIN>.xlsx`` when any rows were found.
    The browser is closed in every case, including when scraping or writing
    the workbook raises.
    """
    # Example CIN for testing; replace it with any valid CIN.
    test_cin = "U28262TZ2024PTC032190"

    # To test with the first CIN of the workbook instead:
    #     test_cin = read_cin_from_excel("input_file.xlsx")[0]

    driver = setup_driver()
    try:
        df = search_master_data(driver, test_cin)

        if not df.empty:
            output_file = f"output_{test_cin}.xlsx"
            df.to_excel(output_file, index=False)
            print(f"Data for CIN {test_cin} saved to {output_file}")
        else:
            print(f"No data found for CIN {test_cin}")
    finally:
        # Always release the browser, even if saving the workbook failed.
        driver.quit()


if __name__ == "__main__":
    main()

# In [None]:
# Scratch notes kept as an unused string literal (it is never assigned or
# read): candidate XPaths for the company-name cell and for the CAPTCHA
# input, followed by a sample of the company-name cell's HTML.
'''/html/body/div/div/div[3]/div/div[1]/div[1]/div/table[1]/tbody/tr/td[2]
//*[@id="fohomepage-037f88dfd8"]/div/div/div[3]/div/div[1]/div[1]/div/table[1]/tbody/tr/td[2]
//*[@id="fohomepage-037f88dfd8"]/div/div/div[3]/div/div[1]/div[1]/div/table[1]/tbody/tr/td[2]
//*[@id="customCaptchaInput"]
//*[@id="fohomepage-037f88dfd8"]/div/div/div[3]/div/div[1]/div[1]/div/table[1]/tbody/tr/td[2]
<td class="companyname"> LANZER TEXTILE SOLUTIONS PRIVATE LIMITED </td>'''

# Saved XPaths for reference.