In [1]:
import tkinter as tk
from tkinter import filedialog, messagebox
from tkinter.ttk import Progressbar
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
from PIL import Image
import pandas as pd
import time
import threading
from concurrent.futures import ThreadPoolExecutor, as_completed
import google.generativeai as genai

# Chrome setup with blocking unwanted URLs
def setup_driver():
    chrome_options = Options()
    chrome_options.add_argument("--headless")  # Headless mode
    chrome_options.set_capability('goog:loggingPrefs', {'performance': 'ALL'})
    driver = webdriver.Chrome(options=chrome_options)
    driver.execute_cdp_cmd('Network.setBlockedURLs', {'urls': ['*clientlib-devtool.js']})  # Block dev tools
    return driver

# CAPTCHA handling function
def handle_captcha(driver):
    try:
        driver.implicitly_wait(7)
        time.sleep(5)
        screenshot = driver.get_screenshot_as_png()
        with open("full_page_screenshot.png", "wb") as file:
            file.write(screenshot)

        image = Image.open("full_page_screenshot.png")
        width, height = image.size
        left, top = width * 0.50, height * 0.20
        right, bottom = left + width * 0.20, top + height * 0.28
        padding = 10
        region_image = image.crop((left - padding, top - padding, right + padding, bottom + padding))
        region_image.save('captcha_regionS.png')

        genai.configure(api_key="AIzaSyDCITTatIubdbdbHSlNWgXrNI-RghcogJc")
        def solve_captcha(image_path):
            model = genai.GenerativeModel("gemini-1.5-flash")
            image = Image.open(image_path)
            response = model.generate_content(["Perform the math operation in the picture and give output of the math operation only", image])
            return response.text.strip()

        captcha_solution = solve_captcha("captcha_regionS.png")
        captcha_input = driver.find_element(By.XPATH, "//*[@id='customCaptchaInput']")
        captcha_input.send_keys(captcha_solution)
        
        submit_button = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.ID, "check")))
        submit_button.click()
        print(f"CAPTCHA solved: {captcha_solution}")
    except Exception as e:
        print(f"Error handling CAPTCHA: {e}")

# Function to check if CAPTCHA is present
def check_for_captcha(driver):
    try:
        captcha_element = driver.find_element(By.XPATH, "//*[@id='customCaptchaInput']")
        if captcha_element.is_displayed():
            print("CAPTCHA detected. Solving CAPTCHA...")
            handle_captcha(driver)
            return True
        return False
    except:
        return False

# Function to scrape director details
def scrape_director_details(driver, uid):
    try:
        if check_for_captcha(driver):
            handle_captcha(driver)
        time.sleep(3)
        table = WebDriverWait(driver, 5).until(EC.presence_of_element_located((By.ID, "content")))
        rows = driver.find_elements(By.XPATH, "//tbody[@id='content']/tr")

        director_data = []
        for row in rows:
            columns = row.find_elements(By.TAG_NAME, "td")
            director_data.append([col.text for col in columns])

        df = pd.DataFrame(director_data, columns=["Sr. No", "DIN/PAN", "Name", "Designation", "Date of Appointment", "Cessation Date", "Signatory"])
        df['CIN'] = uid
        return df

    except Exception as e:
        print(f"Error scraping director details for {uid}: {e}")
        return pd.DataFrame()

# Function to handle retrying clicks
def retry_click(driver, by_locator, max_attempts=3):
    for attempt in range(max_attempts):
        try:
            element = WebDriverWait(driver, 10).until(EC.element_to_be_clickable(by_locator))
            element.click()
            return True
        except Exception as e:
            print(f"Attempt {attempt + 1}: Error clicking element: {e}")
    return False

def search_master_data(driver, uid):
    try:
        driver.get("https://www.mca.gov.in/content/mca/global/en/mca/master-data/MDS.html")
        search_box = WebDriverWait(driver, 5).until(EC.presence_of_element_located((By.XPATH, "//*[@id='masterdata-search-box']")))
        search_box.clear()
        search_box.send_keys(uid)
        search_box.send_keys(Keys.RETURN)
        if check_for_captcha(driver):
            handle_captcha(driver)

        # Retry clicking the company name if necessary
        company_name_locator = (By.XPATH, "//*[@id='fohomepage-037f88dfd8']/div/div/div[3]/div/div[1]/div[1]/div/table[1]/tbody/tr/td[2]")
        if retry_click(driver, company_name_locator):
            if check_for_captcha(driver):
                handle_captcha(driver)
            retry_click(driver, company_name_locator)

        # Click on the "Directory Data" button
        director_button = (By.XPATH, "//*[@id='formId']/button[3]")
        retry_click(driver, director_button)

        # Scrape the director details
        df = scrape_director_details(driver, uid)
        return df

    except Exception as e:
        print(f"Error processing UID {uid}: {e}")
        return pd.DataFrame()

def process_in_parallel(cins, progress_bar, status_label, batch_size=5):
    all_data = pd.DataFrame()
    total_cins = len(cins)
    for i in range(0, total_cins, batch_size):
        batch = cins[i:i + batch_size]
        print(f"Processing batch {i // batch_size + 1} with {len(batch)} CINs.")
        with ThreadPoolExecutor(max_workers=batch_size) as executor:
            futures = {executor.submit(search_master_data, setup_driver(), uid): uid for uid in batch}
            for future in as_completed(futures):
                uid = futures[future]
                try:
                    df = future.result()
                    if not df.empty:
                        all_data = pd.concat([all_data, df], ignore_index=True)
                except Exception as e:
                    print(f"Error processing future for UID {uid}: {e}")
        progress = min(100, ((i + len(batch)) / total_cins) * 100)
        progress_bar['value'] = progress
        status_label.config(text=f"Processed {i + len(batch)} of {total_cins} CINs")
        window.update_idletasks()
    return all_data

def read_cin_from_excel(file_path):
    df = pd.read_excel(file_path)
    return df['CIN'].head(20).tolist()

def process_excel(input_file, output_file, progress_bar, status_label):
    cin_values = read_cin_from_excel(input_file)
    all_data = process_in_parallel(cin_values, progress_bar, status_label)
    all_data.to_excel(output_file, index=False)
    messagebox.showinfo("Success", f"All data saved to {output_file}")

def start_scraping_thread(input_file, output_file, progress_bar, status_label):
    thread = threading.Thread(target=process_excel, args=(input_file, output_file, progress_bar, status_label))
    thread.start()

def select_input_file():
    file_path = filedialog.askopenfilename(filetypes=[("Excel files", "*.xlsx")])
    input_entry.delete(0, tk.END)
    input_entry.insert(0, file_path)

def select_output_folder():
    folder_path = filedialog.askdirectory()
    output_entry.delete(0, tk.END)
    output_entry.insert(0, folder_path)

def on_submit():
    input_file = input_entry.get()
    output_folder = output_entry.get()
    
    if not input_file or not output_folder:
        messagebox.showerror("Error", "Please select both input file and output folder.")
        return
    
    output_file = f"{output_folder}/output_directors.xlsx"
    start_scraping_thread(input_file, output_file, progress_bar, status_label)

window = tk.Tk()
window.title("CIN Data Scraper")
window.geometry("400x300")

input_label = tk.Label(window, text="Select Input Excel File:")
input_label.pack(pady=5)
input_entry = tk.Entry(window, width=40)
input_entry.pack(pady=5)
input_button = tk.Button(window, text="Browse", command=select_input_file)
input_button.pack(pady=5)

output_label = tk.Label(window, text="Select Output Folder:")
output_label.pack(pady=5)
output_entry = tk.Entry(window, width=40)
output_entry.pack(pady=5)
output_button = tk.Button(window, text="Browse", command=select_output_folder)
output_button.pack(pady=5)

submit_button = tk.Button(window, text="Submit", command=on_submit)
submit_button.pack(pady=10)

progress_bar = Progressbar(window, length=300, mode='determinate')
progress_bar.pack(pady=10)

status_label = tk.Label(window, text="")
status_label.pack(pady=5)

window.mainloop()
#CODE ISNT WORKING PROPERLY-CHECK IT!!(ONLY SOME OUTPUT-AND NOT GOING THROUGH ALL 20)



Processing batch 1 with 5 CINs.
Processing batch 1 with 5 CINs.
CAPTCHA detected. Solving CAPTCHA...
CAPTCHA detected. Solving CAPTCHA...
CAPTCHA detected. Solving CAPTCHA...
CAPTCHA detected. Solving CAPTCHA...
CAPTCHA detected. Solving CAPTCHA...
CAPTCHA detected. Solving CAPTCHA...
CAPTCHA detected. Solving CAPTCHA...
CAPTCHA detected. Solving CAPTCHA...
CAPTCHA detected. Solving CAPTCHA...
CAPTCHA detected. Solving CAPTCHA...
CAPTCHA solved: 66
CAPTCHA solved: 33
CAPTCHA solved: 45
CAPTCHA solved: 116
CAPTCHA solved: 149
CAPTCHA solved: 79
CAPTCHA solved: 132
CAPTCHA solved: 109
Error handling CAPTCHA: Message: element not interactable
  (Session info: chrome=128.0.6613.138)
Stacktrace:
0   chromedriver                        0x0000000104c21208 cxxbridge1$str$ptr + 1927396
1   chromedriver                        0x0000000104c1966c cxxbridge1$str$ptr + 1895752
2   chromedriver                        0x0000000104814670 cxxbridge1$string$len + 89156
3   chromedriver                   

In [1]:
import tkinter as tk
from tkinter import filedialog, messagebox
from tkinter.ttk import Progressbar
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
from PIL import Image
import pandas as pd
import time
import threading
from concurrent.futures import ThreadPoolExecutor, as_completed
import google.generativeai as genai
from requests.exceptions import HTTPError
import requests

# Chrome setup with blocking unwanted URLs
def setup_driver():
    chrome_options = Options()
    chrome_options.add_argument("--headless")  # Headless mode
    chrome_options.set_capability('goog:loggingPrefs', {'performance': 'ALL'})
    driver = webdriver.Chrome(options=chrome_options)
    driver.execute_cdp_cmd('Network.setBlockedURLs', {'urls': ['*clientlib-devtool.js']})  # Block dev tools
    return driver

# CAPTCHA handling function
def handle_captcha(driver):
    try:
        driver.implicitly_wait(7)
        time.sleep(5)
        screenshot = driver.get_screenshot_as_png()
        with open("full_page_screenshot.png", "wb") as file:
            file.write(screenshot)

        image = Image.open("full_page_screenshot.png")
        width, height = image.size
        left, top = width * 0.50, height * 0.20
        right, bottom = left + width * 0.20, top + height * 0.28
        padding = 10
        region_image = image.crop((left - padding, top - padding, right + padding, bottom + padding))
        region_image.save('captcha_regionS.png')

        genai.configure(api_key="AIzaSyDCITTatIubdbdbHSlNWgXrNI-RghcogJc")
        def solve_captcha(image_path):
            model = genai.GenerativeModel("gemini-1.5-flash")
            image = Image.open(image_path)
            response = model.generate_content(["Perform the math operation in the picture and give output of the math operation only", image])
            return response.text.strip()

        captcha_solution = solve_captcha("captcha_regionS.png")
        captcha_input = driver.find_element(By.XPATH, "//*[@id='customCaptchaInput']")
        captcha_input.send_keys(captcha_solution)
        
        submit_button = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.ID, "check")))
        submit_button.click()
        print(f"CAPTCHA solved: {captcha_solution}")
    except Exception as e:
        print(f"Error handling CAPTCHA: {e}")

# Function to check if CAPTCHA is present
def check_for_captcha(driver):
    try:
        captcha_element = driver.find_element(By.XPATH, "//*[@id='customCaptchaInput']")
        if captcha_element.is_displayed():
            print("CAPTCHA detected. Solving CAPTCHA...")
            handle_captcha(driver)
            return True
        return False
    except:
        return False

# Function to scrape director details
def scrape_director_details(driver, uid):
    try:
        if check_for_captcha(driver):
            handle_captcha(driver)
        time.sleep(3)
        table = WebDriverWait(driver, 5).until(EC.presence_of_element_located((By.ID, "content")))
        rows = driver.find_elements(By.XPATH, "//tbody[@id='content']/tr")

        director_data = []
        for row in rows:
            columns = row.find_elements(By.TAG_NAME, "td")
            director_data.append([col.text for col in columns])

        df = pd.DataFrame(director_data, columns=["Sr. No", "DIN/PAN", "Name", "Designation", "Date of Appointment", "Cessation Date", "Signatory"])
        df['CIN'] = uid
        return df

    except Exception as e:
        print(f"Error scraping director details for {uid}: {e}")
        return pd.DataFrame()

# Function to handle retrying clicks
def retry_click(driver, by_locator, max_attempts=3):
    for attempt in range(max_attempts):
        try:
            element = WebDriverWait(driver, 10).until(EC.element_to_be_clickable(by_locator))
            element.click()
            return True
        except Exception as e:
            print(f"Attempt {attempt + 1}: Error clicking element: {e}")
    return False

def search_master_data(driver, uid):
    try:
        driver.get("https://www.mca.gov.in/content/mca/global/en/mca/master-data/MDS.html")
        search_box = WebDriverWait(driver, 5).until(EC.presence_of_element_located((By.XPATH, "//*[@id='masterdata-search-box']")))
        search_box.clear()
        search_box.send_keys(uid)
        search_box.send_keys(Keys.RETURN)
        if check_for_captcha(driver):
            handle_captcha(driver)

        # Retry clicking the company name if necessary
        company_name_locator = (By.XPATH, "//*[@id='fohomepage-037f88dfd8']/div/div/div[3]/div/div[1]/div[1]/div/table[1]/tbody/tr/td[2]")
        if retry_click(driver, company_name_locator):
            if check_for_captcha(driver):
                handle_captcha(driver)
            retry_click(driver, company_name_locator)

        # Click on the "Directory Data" button
        director_button = (By.XPATH, "//*[@id='formId']/button[3]")
        retry_click(driver, director_button)

        # Scrape the director details
        df = scrape_director_details(driver, uid)
        return df

    except Exception as e:
        print(f"Error processing UID {uid}: {e}")
        return pd.DataFrame()

# Function to handle website blocking with exponential backoff
def wait_if_blocked(attempt):
    wait_time = min(3600, 10 * (2 ** attempt))  # Max wait time of 1 hour
    print(f"Blocked. Waiting for {wait_time} seconds...")
    time.sleep(wait_time)

def process_in_parallel(cins, progress_bar, status_label, batch_size=5):
    all_data = pd.DataFrame()
    total_cins = len(cins)
    attempt = 0
    
    for i in range(0, total_cins, batch_size):
        batch = cins[i:i + batch_size]
        print(f"Processing batch {i // batch_size + 1} with {len(batch)} CINs.")
        with ThreadPoolExecutor(max_workers=batch_size) as executor:
            futures = {executor.submit(search_master_data, setup_driver(), uid): uid for uid in batch}
            for future in as_completed(futures):
                uid = futures[future]
                try:
                    df = future.result()
                    if not df.empty:
                        all_data = pd.concat([all_data, df], ignore_index=True)
                except HTTPError as e:
                    if e.response.status_code == 404:
                        print(f"Website blocked for UID {uid}.")
                        wait_if_blocked(attempt)
                        attempt += 1
                        continue
                    else:
                        print(f"HTTP error for UID {uid}: {e}")
                except Exception as e:
                    print(f"Error processing future for UID {uid}: {e}")
        progress = min(100, ((i + len(batch)) / total_cins) * 100)
        progress_bar['value'] = progress
        status_label.config(text=f"Processed {i + len(batch)} of {total_cins} CINs")
        window.update_idletasks()
    return all_data

def read_cin_from_excel(file_path):
    df = pd.read_excel(file_path)
    return df['CIN'].head(20).tolist()

def process_excel(input_file, output_file, progress_bar, status_label):
    cin_values = read_cin_from_excel(input_file)
    all_data = process_in_parallel(cin_values, progress_bar, status_label)
    all_data.to_excel(output_file, index=False)
    messagebox.showinfo("Success", f"All data saved to {output_file}")
    window.quit()  # Close the window after successful data scraping

def start_scraping_thread(input_file, output_file, progress_bar, status_label):
    thread = threading.Thread(target=process_excel, args=(input_file, output_file, progress_bar, status_label))
    thread.start()

def select_input_file():
    file_path = filedialog.askopenfilename(filetypes=[("Excel files", "*.xlsx")])
    input_entry.delete(0, tk.END)
    input_entry.insert(0, file_path)

def select_output_folder():
    folder_path = filedialog.askdirectory()
    output_entry.delete(0, tk.END)
    output_entry.insert(0, folder_path)

def on_submit():
    input_file = input_entry.get()
    output_folder = output_entry.get()
    
    if not input_file or not output_folder:
        messagebox.showerror("Error", "Please select both input file and output folder.")
        return
    
    output_file = f"{output_folder}/output_directors.xlsx"
    start_scraping_thread(input_file, output_file, progress_bar, status_label)

window = tk.Tk()
window.title("CIN Data Scraper")
window.geometry("400x300")

input_label = tk.Label(window, text="Select Input Excel File:")
input_label.pack(pady=5)
input_entry = tk.Entry(window, width=40)
input_entry.pack(pady=5)
input_button = tk.Button(window, text="Browse", command=select_input_file)
input_button.pack(pady=5)

output_label = tk.Label(window, text="Select Output Folder:")
output_label.pack(pady=5)
output_entry = tk.Entry(window, width=40)
output_entry.pack(pady=5)
output_button = tk.Button(window, text="Browse", command=select_output_folder)
output_button.pack(pady=5)

submit_button = tk.Button(window, text="Submit", command=on_submit)
submit_button.pack(pady=10)

progress_bar = Progressbar(window, length=300, mode='determinate')
progress_bar.pack(pady=10)

status_label = tk.Label(window, text="")
status_label.pack(pady=5)
window.mainloop()
#WORKING BUT NOT FULLY SCRAPING



Processing batch 1 with 5 CINs.


: 

In [1]:
import tkinter as tk
from tkinter import filedialog, messagebox
from tkinter.ttk import Progressbar
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
from PIL import Image
import pandas as pd
import time
import threading
from concurrent.futures import ThreadPoolExecutor, as_completed
import google.generativeai as genai

# Chrome setup with blocking unwanted URLs
def setup_driver():
    chrome_options = Options()
    chrome_options.add_argument("--headless")  # Headless mode
    chrome_options.set_capability('goog:loggingPrefs', {'performance': 'ALL'})
    driver = webdriver.Chrome(options=chrome_options)
    driver.execute_cdp_cmd('Network.setBlockedURLs', {'urls': ['*clientlib-devtool.js']})  # Block dev tools
    return driver

# CAPTCHA handling function
def handle_captcha(driver):
    try:
        driver.implicitly_wait(7)
        screenshot = driver.get_screenshot_as_png()
        with open("full_page_screenshot.png", "wb") as file:
            file.write(screenshot)

        image = Image.open("full_page_screenshot.png")
        width, height = image.size
        left, top = width * 0.50, height * 0.20
        right, bottom = left + width * 0.20, top + height * 0.28
        padding = 10
        region_image = image.crop((left - padding, top - padding, right + padding, bottom + padding))
        region_image.save('captcha_regionS.png')

        genai.configure(api_key="AIzaSyDCITTatIubdbdbHSlNWgXrNI-RghcogJc")
        def solve_captcha(image_path):
            model = genai.GenerativeModel("gemini-1.5-flash")
            image = Image.open(image_path)
            response = model.generate_content(["Perform the math operation in the picture and give output of the math operation only", image])
            return response.text.strip()

        captcha_solution = solve_captcha("captcha_regionS.png")
        captcha_input = driver.find_element(By.XPATH, "//*[@id='customCaptchaInput']")
        captcha_input.send_keys(captcha_solution)
        
        submit_button = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.ID, "check")))
        submit_button.click()
        print(f"CAPTCHA solved: {captcha_solution}")
    except Exception as e:
        print(f"Error handling CAPTCHA: {e}")

# Function to check if CAPTCHA is present
def check_for_captcha(driver):
    try:
        captcha_element = driver.find_element(By.XPATH, "//*[@id='customCaptchaInput']")
        if captcha_element.is_displayed():
            print("CAPTCHA detected. Solving CAPTCHA...")
            handle_captcha(driver)
            return True
        return False
    except:
        return False

# Function to scrape director details
def scrape_director_details(driver, uid):
    try:
        if check_for_captcha(driver):
            handle_captcha(driver)
        table = WebDriverWait(driver, 5).until(EC.presence_of_element_located((By.ID, "content")))
        rows = driver.find_elements(By.XPATH, "//tbody[@id='content']/tr")

        director_data = []
        for row in rows:
            columns = row.find_elements(By.TAG_NAME, "td")
            director_data.append([col.text for col in columns])

        df = pd.DataFrame(director_data, columns=["Sr. No", "DIN/PAN", "Name", "Designation", "Date of Appointment", "Cessation Date", "Signatory"])
        df['CIN'] = uid
        return df

    except Exception as e:
        print(f"Error scraping director details for {uid}: {e}")
        return pd.DataFrame()

# Function to handle retrying clicks
def retry_click(driver, by_locator, max_attempts=3):
    for attempt in range(max_attempts):
        try:
            element = WebDriverWait(driver, 10).until(EC.element_to_be_clickable(by_locator))
            element.click()
            return True
        except Exception as e:
            print(f"Attempt {attempt + 1}: Error clicking element: {e}")
    return False

def search_master_data(driver, uid):
    try:
        driver.get("https://www.mca.gov.in/content/mca/global/en/mca/master-data/MDS.html")
        search_box = WebDriverWait(driver, 5).until(EC.presence_of_element_located((By.XPATH, "//*[@id='masterdata-search-box']")))
        search_box.clear()
        search_box.send_keys(uid)
        search_box.send_keys(Keys.RETURN)
        if check_for_captcha(driver):
            handle_captcha(driver)

        # Retry clicking the company name if necessary
        company_name_locator = (By.XPATH, "//*[@id='fohomepage-037f88dfd8']/div/div/div[3]/div/div[1]/div[1]/div/table[1]/tbody/tr/td[2]")
        if retry_click(driver, company_name_locator):
            if check_for_captcha(driver):
                handle_captcha(driver)
            retry_click(driver, company_name_locator)

        # Click on the "Directory Data" button
        director_button = (By.XPATH, "//*[@id='formId']/button[3]")
        retry_click(driver, director_button)

        # Scrape the director details
        df = scrape_director_details(driver, uid)
        return df

    except Exception as e:
        print(f"Error processing UID {uid}: {e}")
        return pd.DataFrame()

def process_in_parallel(cins, progress_bar, status_label, batch_size=5):
    all_data = pd.DataFrame()
    total_cins = len(cins)
    num_batches = (total_cins + batch_size - 1) // batch_size

    def process_batch(start_index, end_index):
        nonlocal all_data
        batch = cins[start_index:end_index]
        print(f"Processing batch {start_index // batch_size + 1} with {len(batch)} CINs.")
        with ThreadPoolExecutor(max_workers=batch_size) as executor:
            futures = {executor.submit(search_master_data, setup_driver(), uid): uid for uid in batch}
            for future in as_completed(futures):
                uid = futures[future]
                try:
                    df = future.result()
                    if not df.empty:
                        all_data = pd.concat([all_data, df], ignore_index=True)
                except Exception as e:
                    print(f"Error processing future for UID {uid}: {e}")
        progress = min(100, ((end_index / total_cins) * 100))
        progress_bar['value'] = progress
        status_label.config(text=f"Processed {end_index} of {total_cins} CINs")
        window.update_idletasks()

    # Process each batch in parallel
    batch_indices = [(i * batch_size, min((i + 1) * batch_size, total_cins)) for i in range(num_batches)]
    threads = [threading.Thread(target=process_batch, args=(start, end)) for start, end in batch_indices]
    for t in threads:
        t.start()
    for t in threads:
        t.join()

    return all_data

def read_cin_from_excel(file_path):
    df = pd.read_excel(file_path)
    return df['CIN'].head(20).tolist()

def process_excel(input_file, output_file, progress_bar, status_label):
    cin_values = read_cin_from_excel(input_file)
    all_data = process_in_parallel(cin_values, progress_bar, status_label)
    all_data.to_excel(output_file, index=False)
    messagebox.showinfo("Success", f"All data saved to {output_file}")

def start_scraping_thread(input_file, output_file, progress_bar, status_label):
    thread = threading.Thread(target=process_excel, args=(input_file, output_file, progress_bar, status_label))
    thread.start()

def select_input_file():
    file_path = filedialog.askopenfilename(filetypes=[("Excel files", "*.xlsx")])
    input_entry.delete(0, tk.END)
    input_entry.insert(0, file_path)

def select_output_folder():
    folder_path = filedialog.askdirectory()
    output_entry.delete(0, tk.END)
    output_entry.insert(0, folder_path)
def on_submit():
    input_file = input_entry.get()
    output_folder = output_entry.get()
    
    if not input_file or not output_folder:
        messagebox.showerror("Error", "Please select both input file and output folder.")
        return
    
    output_file = f"{output_folder}/output_directors.xlsx"
    start_scraping_thread(input_file, output_file, progress_bar, status_label)

window = tk.Tk()
window.title("CIN Data Scraper")
window.geometry("400x300")

input_label = tk.Label(window, text="Select Input Excel File:")
input_label.pack(pady=5)
input_entry = tk.Entry(window, width=40)
input_entry.pack(pady=5)
input_button = tk.Button(window, text="Browse", command=select_input_file)
input_button.pack(pady=5)

output_label = tk.Label(window, text="Select Output Folder:")
output_label.pack(pady=5)
output_entry = tk.Entry(window, width=40)
output_entry.pack(pady=5)
output_button = tk.Button(window, text="Browse", command=select_output_folder)
output_button.pack(pady=5)

submit_button = tk.Button(window, text="Submit", command=on_submit)
submit_button.pack(pady=10)

progress_bar = Progressbar(window, length=300, mode='determinate')
progress_bar.pack(pady=10)

status_label = tk.Label(window, text="")
status_label.pack(pady=5)
window.mainloop()
##NOT WORKING



Processing batch 1 with 5 CINs.Processing batch 2 with 5 CINs.

Processing batch 3 with 5 CINs.
Processing batch 4 with 5 CINs.


Exception in thread Thread-7 (process_batch):
Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/threading.py", line 1038, in _bootstrap_inner
Exception in thread Thread-5 (process_batch):
Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/threading.py", line 1038, in _bootstrap_inner
    self.run()
  File "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/threading.py", line 975, in run
    self._target(*self._args, **self._kwargs)
  File "/var/folders/x4/f0yhttf57678gq_cpd4czrfm0000gn/T/ipykernel_10340/3965966994.py", line 142, in process_batch
    self.run()
  File "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/threading.py", line 975, in run
    self._target(*self._args, **self._kwargs)
  File "/var/folders/x4/f0yhttf57678gq_cpd4czrfm0000gn/T/ipykernel_10340/3965966994.py", line 142, in process_batch
  File "/var/folders/x4/f0yhttf5

: 

In [1]:
import tkinter as tk
from tkinter import filedialog, messagebox
from tkinter.ttk import Progressbar
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
from PIL import Image
import pandas as pd
import time
import threading
from concurrent.futures import ThreadPoolExecutor, as_completed
import google.generativeai as genai
from requests.exceptions import HTTPError
import requests
import os
# Chrome setup with blocking unwanted URLs
def setup_driver():
    chrome_options = Options()
    chrome_options.add_argument("--headless")  # Headless mode
    chrome_options.set_capability('goog:loggingPrefs', {'performance': 'ALL'})
    driver = webdriver.Chrome(options=chrome_options)
    driver.execute_cdp_cmd('Network.setBlockedURLs', {'urls': ['*clientlib-devtool.js']})  # Block dev tools
    return driver

# CAPTCHA handling function with retry functionalitydef handle_captcha(driver):
    try:
        driver.implicitly_wait(7)
        time.sleep(5)
        screenshot = driver.get_screenshot_as_png()
        with open("full_page_screenshot.png", "wb") as file:
            file.write(screenshot)

        image = Image.open("full_page_screenshot.png")
        width, height = image.size
        left, top = width * 0.50, height * 0.20
        right, bottom = left + width * 0.20, top + height * 0.28
        padding = 10
        region_image = image.crop((left - padding, top - padding, right + padding, bottom + padding))
        region_image.save('captcha_regionS.png')

        genai.configure(api_key="AIzaSyDCITTatIubdbdbHSlNWgXrNI-RghcogJc")
        def solve_captcha(image_path):
            model = genai.GenerativeModel("gemini-1.5-flash")
            image = Image.open(image_path)
            response = model.generate_content(["Perform the math operation in the picture and give output of the math operation only", image])
            return response.text.strip()

        captcha_solution = solve_captcha("captcha_regionS.png")
        captcha_input = driver.find_element(By.XPATH, "//*[@id='customCaptchaInput']")  # Updated XPath
        captcha_input.send_keys(captcha_solution)
        
        submit_button = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.ID, "check")))
        submit_button.click()
        print(f"CAPTCHA solved: {captcha_solution}")
    except Exception as e:
        print(f"Error handling CAPTCHA: {e}")

# Function to check if CAPTCHA is present
def check_for_captcha(driver):
    try:
        captcha_element = driver.find_element(By.XPATH, "//*[@id='customCaptchaInput']")
        if captcha_element.is_displayed():
            print("CAPTCHA detected. Solving CAPTCHA...")
            handle_captcha(driver)
            return True
        return False
    except:
        return False


# Function to scrape director details
def scrape_director_details(driver, uid):
    try:
        if check_for_captcha(driver):
            handle_captcha(driver)
        time.sleep(3)
        table = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.ID, "content")))
        rows = driver.find_elements(By.XPATH, "//tbody[@id='content']/tr")

        director_data = []
        for row in rows:
            columns = row.find_elements(By.TAG_NAME, "td")
            director_data.append([col.text for col in columns])

        df = pd.DataFrame(director_data, columns=["Sr. No", "DIN/PAN", "Name", "Designation", "Date of Appointment", "Cessation Date", "Signatory"])
        df['CIN'] = uid
        return df

    except Exception as e:
        print(f"Error scraping director details for {uid}: {e}")
        return pd.DataFrame()

# Function to handle retrying clicks
def retry_click(driver, by_locator, max_attempts=3):
    for attempt in range(max_attempts):
        try:
            element = WebDriverWait(driver, 10).until(EC.element_to_be_clickable(by_locator))
            element.click()
            return True
        except Exception as e:
            print(f"Attempt {attempt + 1}: Error clicking element: {e}")
    return False

def search_master_data(driver, uid):
    try:
        driver.get("https://www.mca.gov.in/content/mca/global/en/mca/master-data/MDS.html")
        search_box = WebDriverWait(driver, 5).until(EC.presence_of_element_located((By.XPATH, "//*[@id='masterdata-search-box']")))
        search_box.clear()
        search_box.send_keys(uid)
        search_box.send_keys(Keys.RETURN)
        if check_for_captcha(driver):
            handle_captcha(driver)

        # Retry clicking the company name if necessary
        company_name_locator = (By.XPATH, "//*[@id='fohomepage-037f88dfd8']/div/div/div[3]/div/div[1]/div[1]/div/table[1]/tbody/tr/td[2]")
        if retry_click(driver, company_name_locator):
            if check_for_captcha(driver):
                handle_captcha(driver)
            retry_click(driver, company_name_locator)

        # Click on the "Directory Data" button
        director_button = (By.XPATH, "//*[@id='formId']/button[3]")
        retry_click(driver, director_button)

        # Scrape the director details
        df = scrape_director_details(driver, uid)
        return df

    except Exception as e:
        print(f"Error processing UID {uid}: {e}")
        return pd.DataFrame()

# Function to handle website blocking with exponential backoff
def wait_if_blocked(attempt):
    wait_time = min(3600, 10 * (2 ** attempt))  # Max wait time of 1 hour
    print(f"Blocked. Waiting for {wait_time} seconds...")
    time.sleep(wait_time)

def process_in_parallel(cins, progress_bar, status_label, batch_size=5):
    all_data = pd.DataFrame()
    total_cins = len(cins)
    attempt = 0
    
    for i in range(0, total_cins, batch_size):
        batch = cins[i:i + batch_size]
        print(f"Processing batch {i // batch_size + 1} with {len(batch)} CINs.")
        with ThreadPoolExecutor(max_workers=batch_size) as executor:
            futures = {executor.submit(search_master_data, setup_driver(), uid): uid for uid in batch}
            for future in as_completed(futures):
                uid = futures[future]
                try:
                    df = future.result()
                    if not df.empty:
                        all_data = pd.concat([all_data, df], ignore_index=True)
                except HTTPError as e:
                    if e.response.status_code == 404:
                        print(f"Website blocked for UID {uid}.")
                        wait_if_blocked(attempt)
                        attempt += 1
                        continue
                    else:
                        print(f"HTTP error for UID {uid}: {e}")
                except Exception as e:
                    print(f"Error processing future for UID {uid}: {e}")
        progress = min(100, ((i + len(batch)) / total_cins) * 100)
        progress_bar['value'] = progress
        status_label.config(text=f"Processed {i + len(batch)} of {total_cins} CINs")
        window.update_idletasks()
    return all_data

def read_cin_from_excel(file_path):
    df = pd.read_excel(file_path)
    return df['CIN'].head(20).tolist()

def process_excel(input_file, output_file, progress_bar, status_label):
    cin_values = read_cin_from_excel(input_file)
    all_data = process_in_parallel(cin_values, progress_bar, status_label)
    all_data.to_excel(output_file, index=False)
    messagebox.showinfo("Success", f"All data saved to {output_file}")
    window.quit()  # Close the window after successful data scraping

def start_scraping_thread(input_file, output_file, progress_bar, status_label):
    thread = threading.Thread(target=process_excel, args=(input_file, output_file, progress_bar, status_label))
    thread.start()

def select_input_file():
    file_path = filedialog.askopenfilename(filetypes=[("Excel files", "*.xlsx")])
    input_entry.delete(0, tk.END)
    input_entry.insert(0, file_path)

def select_output_folder():
    folder_path = filedialog.askdirectory()
    output_entry.delete(0, tk.END)
    output_entry.insert(0, folder_path)

def on_submit():
    input_file = input_entry.get()
    output_folder = output_entry.get()
    
    if not input_file or not output_folder:
        messagebox.showerror("Error", "Please select both input file and output folder.")
        return
    
    output_file = f"{output_folder}/output_directors.xlsx"
    start_scraping_thread(input_file, output_file, progress_bar, status_label)

window = tk.Tk()
window.title("CIN Data Scraper")
window.geometry("400x300")

input_label = tk.Label(window, text="Select Input Excel File:")
input_label.pack(pady=5)
input_entry = tk.Entry(window, width=40)
input_entry.pack(pady=5)
input_button = tk.Button(window, text="Browse", command=select_input_file)
input_button.pack(pady=5)

output_label = tk.Label(window, text="Select Output Folder:")
output_label.pack(pady=5)
output_entry = tk.Entry(window, width=40)
output_entry.pack(pady=5)
output_button = tk.Button(window, text="Browse", command=select_output_folder)
output_button.pack(pady=5)

submit_button = tk.Button(window, text="Submit", command=on_submit)
submit_button.pack(pady=10)

progress_bar = Progressbar(window, length=300, mode='determinate')
progress_bar.pack(pady=10)

status_label = tk.Label(window, text="")
status_label.pack(pady=5)
window.mainloop()
#WORKING BUT NOT FULLY SCRAPING--TRY



Processing batch 1 with 5 CINs.
Error processing UID U47912TS2024PTC188120: Message: unknown error: net::ERR_NAME_NOT_RESOLVED
  (Session info: chrome=128.0.6613.138)
Stacktrace:
0   chromedriver                        0x000000010485d208 cxxbridge1$str$ptr + 1927396
1   chromedriver                        0x000000010485566c cxxbridge1$str$ptr + 1895752
2   chromedriver                        0x0000000104450808 cxxbridge1$string$len + 89564
3   chromedriver                        0x0000000104449cf0 cxxbridge1$string$len + 62148
4   chromedriver                        0x000000010443bbd8 cxxbridge1$string$len + 4524
5   chromedriver                        0x000000010443d628 cxxbridge1$string$len + 11260
6   chromedriver                        0x000000010443bf8c cxxbridge1$string$len + 5472
7   chromedriver                        0x000000010443b800 cxxbridge1$string$len + 3540
8   chromedriver                        0x000000010443b74c cxxbridge1$string$len + 3360
9   chromedriver          

Exception in thread Thread-4 (process_excel):
Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/threading.py", line 1038, in _bootstrap_inner
    self.run()
  File "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/threading.py", line 975, in run
    self._target(*self._args, **self._kwargs)
  File "/var/folders/x4/f0yhttf57678gq_cpd4czrfm0000gn/T/ipykernel_9583/3595595276.py", line 180, in process_excel
  File "/var/folders/x4/f0yhttf57678gq_cpd4czrfm0000gn/T/ipykernel_9583/3595595276.py", line 151, in process_in_parallel
  File "/var/folders/x4/f0yhttf57678gq_cpd4czrfm0000gn/T/ipykernel_9583/3595595276.py", line 151, in <dictcomp>
  File "/var/folders/x4/f0yhttf57678gq_cpd4czrfm0000gn/T/ipykernel_9583/3595595276.py", line 24, in setup_driver
  File "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/selenium/webdriver/chrome/webdriver.py", line 45, in __init__
    super().__init__(

Error processing UID U86100PN2024NPL233244: Message: unknown error: net::ERR_NAME_NOT_RESOLVED
  (Session info: chrome=128.0.6613.138)
Stacktrace:
0   chromedriver                        0x0000000100c1d208 cxxbridge1$str$ptr + 1927396
1   chromedriver                        0x0000000100c1566c cxxbridge1$str$ptr + 1895752
2   chromedriver                        0x0000000100810808 cxxbridge1$string$len + 89564
3   chromedriver                        0x0000000100809cf0 cxxbridge1$string$len + 62148
4   chromedriver                        0x00000001007fbbd8 cxxbridge1$string$len + 4524
5   chromedriver                        0x00000001007fd628 cxxbridge1$string$len + 11260
6   chromedriver                        0x00000001007fbf8c cxxbridge1$string$len + 5472
7   chromedriver                        0x00000001007fb800 cxxbridge1$string$len + 3540
8   chromedriver                        0x00000001007fb74c cxxbridge1$string$len + 3360
9   chromedriver                        0x00000001007f9ae8

: 

In [1]:
import tkinter as tk
from tkinter import filedialog, messagebox
from tkinter.ttk import Progressbar
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
from PIL import Image
import pandas as pd
import time
import threading
from concurrent.futures import ThreadPoolExecutor, as_completed
import google.generativeai as genai
from requests.exceptions import HTTPError
import requests
import os
# Chrome setup with blocking unwanted URLs
def setup_driver():
    chrome_options = Options()
    chrome_options.add_argument("--headless")  # Headless mode
    chrome_options.set_capability('goog:loggingPrefs', {'performance': 'ALL'})
    driver = webdriver.Chrome(options=chrome_options)
    driver.execute_cdp_cmd('Network.setBlockedURLs', {'urls': ['*clientlib-devtool.js']})  # Block dev tools
    return driver

# CAPTCHA handling function
def handle_captcha(driver):
    try:
        driver.implicitly_wait(7)
        time.sleep(5)
        screenshot = driver.get_screenshot_as_png()
        with open("full_page_screenshot.png", "wb") as file:
            file.write(screenshot)

        image = Image.open("full_page_screenshot.png")
        width, height = image.size
        left, top = width * 0.50, height * 0.20
        right, bottom = left + width * 0.20, top + height * 0.28
        padding = 10
        region_image = image.crop((left - padding, top - padding, right + padding, bottom + padding))
        region_image.save('captcha_regionS.png')

        genai.configure(api_key="AIzaSyDCITTatIubdbdbHSlNWgXrNI-RghcogJc")
        def solve_captcha(image_path):
            model = genai.GenerativeModel("gemini-1.5-flash")
            image = Image.open(image_path)
            response = model.generate_content(["Perform the math operation in the picture and give output of the math operation only", image])
            return response.text.strip()

        captcha_solution = solve_captcha("captcha_regionS.png")
        captcha_input = driver.find_element(By.XPATH, "//*[@id='customCaptchaInput']")
        captcha_input.send_keys(captcha_solution)
        
        submit_button = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.ID, "check")))
        submit_button.click()
        print(f"CAPTCHA solved: {captcha_solution}")
    except Exception as e:
        print(f"Error handling CAPTCHA: {e}")

# Function to check if CAPTCHA is present
def check_for_captcha(driver):
    try:
        captcha_element = driver.find_element(By.XPATH, "//*[@id='customCaptchaInput']")
        if captcha_element.is_displayed():
            print("CAPTCHA detected. Solving CAPTCHA...")
            handle_captcha(driver)
            return True
        return False
    except:
        return False

# Function to scrape director details
def scrape_director_details(driver, uid):
    try:
        if check_for_captcha(driver):
            handle_captcha(driver)
        time.sleep(3)
        table = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.ID, "content")))
        rows = driver.find_elements(By.XPATH, "//tbody[@id='content']/tr")

        director_data = []
        for row in rows:
            columns = row.find_elements(By.TAG_NAME, "td")
            director_data.append([col.text for col in columns])

        df = pd.DataFrame(director_data, columns=["Sr. No", "DIN/PAN", "Name", "Designation", "Date of Appointment", "Cessation Date", "Signatory"])
        df['CIN'] = uid
        return df

    except Exception as e:
        print(f"Error scraping director details for {uid}: {e}")
        return pd.DataFrame()

# Function to handle retrying clicks
def retry_click(driver, by_locator, max_attempts=3):
    for attempt in range(max_attempts):
        try:
            element = WebDriverWait(driver, 10).until(EC.element_to_be_clickable(by_locator))
            element.click()
            return True
        except Exception as e:
            print(f"Attempt {attempt + 1}: Error clicking element: {e}")
    return False

def search_master_data(driver, uid):
    try:
        driver.get("https://www.mca.gov.in/content/mca/global/en/mca/master-data/MDS.html")
        search_box = WebDriverWait(driver, 5).until(EC.presence_of_element_located((By.XPATH, "//*[@id='masterdata-search-box']")))
        search_box.clear()
        search_box.send_keys(uid)
        search_box.send_keys(Keys.RETURN)
        if check_for_captcha(driver):
            handle_captcha(driver)

        # Retry clicking the company name if necessary
        company_name_locator = (By.XPATH, "//*[@id='fohomepage-037f88dfd8']/div/div/div[3]/div/div[1]/div[1]/div/table[1]/tbody/tr/td[2]")
        if retry_click(driver, company_name_locator):
            if check_for_captcha(driver):
                handle_captcha(driver)
            retry_click(driver, company_name_locator)

        # Click on the "Directory Data" button
        director_button = (By.XPATH, "//*[@id='formId']/button[3]")
        retry_click(driver, director_button)

        # Scrape the director details
        df = scrape_director_details(driver, uid)
        return df

    except Exception as e:
        print(f"Error processing UID {uid}: {e}")
        return pd.DataFrame()

# Function to handle website blocking with exponential backoff
def wait_if_blocked(attempt):
    wait_time = min(3600, 10 * (2 ** attempt))  # Max wait time of 1 hour
    print(f"Blocked. Waiting for {wait_time} seconds...")
    time.sleep(wait_time)

def process_in_parallel(cins, progress_bar, status_label, batch_size=5):
    all_data = pd.DataFrame()
    total_cins = len(cins)
    attempt = 0
    
    for i in range(0, total_cins, batch_size):
        batch = cins[i:i + batch_size]
        print(f"Processing batch {i // batch_size + 1} with {len(batch)} CINs.")
        with ThreadPoolExecutor(max_workers=batch_size) as executor:
            futures = {executor.submit(search_master_data, setup_driver(), uid): uid for uid in batch}
            for future in as_completed(futures):
                uid = futures[future]
                try:
                    df = future.result()
                    if not df.empty:
                        all_data = pd.concat([all_data, df], ignore_index=True)
                except HTTPError as e:
                    if e.response.status_code == 404:
                        print(f"Website blocked for UID {uid}.")
                        wait_if_blocked(attempt)
                        attempt += 1
                        continue
                    else:
                        print(f"HTTP error for UID {uid}: {e}")
                except Exception as e:
                    print(f"Error processing future for UID {uid}: {e}")
        progress = min(100, ((i + len(batch)) / total_cins) * 100)
        progress_bar['value'] = progress
        status_label.config(text=f"Processed {i + len(batch)} of {total_cins} CINs")
        window.update_idletasks()
    return all_data

def read_cin_from_excel(file_path):
    df = pd.read_excel(file_path)
    return df['CIN'].head(20).tolist()

def process_excel(input_file, output_file, progress_bar, status_label):
    cin_values = read_cin_from_excel(input_file)
    all_data = process_in_parallel(cin_values, progress_bar, status_label)
    all_data.to_excel(output_file, index=False)
    messagebox.showinfo("Success", f"All data saved to {output_file}")
    window.quit()  # Close the window after successful data scraping

def start_scraping_thread(input_file, output_file, progress_bar, status_label):
    thread = threading.Thread(target=process_excel, args=(input_file, output_file, progress_bar, status_label))
    thread.start()

def select_input_file():
    file_path = filedialog.askopenfilename(filetypes=[("Excel files", "*.xlsx")])
    input_entry.delete(0, tk.END)
    input_entry.insert(0, file_path)

def select_output_folder():
    folder_path = filedialog.askdirectory()
    output_entry.delete(0, tk.END)
    output_entry.insert(0, folder_path)

def on_submit():
    input_file = input_entry.get()
    output_folder = output_entry.get()
    
    if not input_file or not output_folder:
        messagebox.showerror("Error", "Please select both input file and output folder.")
        return
    
    output_file = f"{output_folder}/output_directors.xlsx"
    start_scraping_thread(input_file, output_file, progress_bar, status_label)

window = tk.Tk()
window.title("CIN Data Scraper")
window.geometry("400x300")

input_label = tk.Label(window, text="Select Input Excel File:")
input_label.pack(pady=5)
input_entry = tk.Entry(window, width=40)
input_entry.pack(pady=5)
input_button = tk.Button(window, text="Browse", command=select_input_file)
input_button.pack(pady=5)

output_label = tk.Label(window, text="Select Output Folder:")
output_label.pack(pady=5)
output_entry = tk.Entry(window, width=40)
output_entry.pack(pady=5)
output_button = tk.Button(window, text="Browse", command=select_output_folder)
output_button.pack(pady=5)

submit_button = tk.Button(window, text="Submit", command=on_submit)
submit_button.pack(pady=10)

progress_bar = Progressbar(window, length=300, mode='determinate')
progress_bar.pack(pady=10)

status_label = tk.Label(window, text="")
status_label.pack(pady=5)
window.mainloop()
#WORKING BUT NOT FULLY SCRAPING



Processing batch 1 with 5 CINs.
CAPTCHA detected. Solving CAPTCHA...
CAPTCHA solved: 132


: 

In [1]:
import tkinter as tk
from tkinter import filedialog, messagebox
from tkinter.ttk import Progressbar
import threading
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
from PIL import Image
import pandas as pd
from concurrent.futures import ThreadPoolExecutor, as_completed
import google.generativeai as genai
import os, time

# Chrome setup with blocking unwanted URLs
def setup_driver():
    chrome_options = Options()
    chrome_options.add_argument("--headless")  # Headless mode
    chrome_options.set_capability('goog:loggingPrefs', {'performance': 'ALL'})
    driver = webdriver.Chrome(options=chrome_options)
    driver.execute_cdp_cmd('Network.setBlockedURLs', {'urls': ['*clientlib-devtool.js']})  # Block dev tools
    return driver

# CAPTCHA handling function (you can add your genai code if needed)
def handle_captcha(driver):
    try:
        driver.implicitly_wait(7)
        time.sleep(5)
        screenshot = driver.get_screenshot_as_png()
        with open("full_page_screenshot.png", "wb") as file:
            file.write(screenshot)

        image = Image.open("full_page_screenshot.png")
        width, height = image.size
        left, top = width * 0.50, height * 0.20
        right, bottom = left + width * 0.20, top + height * 0.28
        padding = 10
        region_image = image.crop((left - padding, top - padding, right + padding, bottom + padding))
        region_image.save('captcha_regionS.png')

        genai.configure(api_key="AIzaSyDCITTatIubdbdbHSlNWgXrNI-RghcogJc")
        def solve_captcha(image_path):
            model = genai.GenerativeModel("gemini-1.5-flash")
            image = Image.open(image_path)
            response = model.generate_content(["Perform the math operation in the picture and give output of the math operation only", image])
            return response.text.strip()

        captcha_solution = solve_captcha("captcha_regionS.png")
        captcha_input = driver.find_element(By.XPATH, "//*[@id='customCaptchaInput']")
        captcha_input.send_keys(captcha_solution)
        
        submit_button = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.ID, "check")))
        submit_button.click()
        print(f"CAPTCHA solved: {captcha_solution}")
    except Exception as e:
        print(f"Error handling CAPTCHA: {e}")

# Function to check if CAPTCHA is present
def check_for_captcha(driver):
    try:
        captcha_element = driver.find_element(By.XPATH, "//*[@id='customCaptchaInput']")
        if captcha_element.is_displayed():
            print("CAPTCHA detected. Solving CAPTCHA...")
            handle_captcha(driver)
            return True
        return False
    except:
        return False

# Scraping functions
def scrape_director_details(driver, uid):
    try:
        if check_for_captcha(driver):
            handle_captcha(driver)
        time.sleep(3)
        table = WebDriverWait(driver, 5).until(EC.presence_of_element_located((By.ID, "content")))
        rows = driver.find_elements(By.XPATH, "//tbody[@id='content']/tr")

        director_data = []
        for row in rows:
            columns = row.find_elements(By.TAG_NAME, "td")
            director_data.append([col.text for col in columns])

        df = pd.DataFrame(director_data, columns=["Sr. No", "DIN/PAN", "Name", "Designation", "Date of Appointment", "Cessation Date", "Signatory"])
        df['CIN'] = uid
        return df

    except Exception as e:
        print(f"Error scraping director details for {uid}: {e}")
        return pd.DataFrame()

def retry_click(driver, by_locator, max_attempts=3):
    for attempt in range(max_attempts):
        try:
            element = WebDriverWait(driver, 10).until(EC.element_to_be_clickable(by_locator))
            element.click()
            return True
        except Exception as e:
            print(f"Attempt {attempt + 1}: Error clicking element: {e}")
    return False

def search_master_data(driver, uid):
    try:
        driver.get("https://www.mca.gov.in/content/mca/global/en/mca/master-data/MDS.html")
        search_box = WebDriverWait(driver, 5).until(EC.presence_of_element_located((By.XPATH, "//*[@id='masterdata-search-box']")))
        search_box.clear()
        search_box.send_keys(uid)
        search_box.send_keys(Keys.RETURN)
        if check_for_captcha(driver):
            handle_captcha(driver)

        company_name_locator = (By.XPATH, "//*[@id='fohomepage-037f88dfd8']/div/div/div[3]/div/div[1]/div[1]/div/table[1]/tbody/tr/td[2]")
        if retry_click(driver, company_name_locator):
            if check_for_captcha(driver):
                handle_captcha(driver)
            retry_click(driver, company_name_locator)

        director_button = (By.XPATH, "//*[@id='formId']/button[3]")
        retry_click(driver, director_button)

        df = scrape_director_details(driver, uid)
        return df

    except Exception as e:
        print(f"Error processing UID {uid}: {e}")
        return pd.DataFrame()

def process_in_parallel(cins, progress_bar, status_label):
    all_data = pd.DataFrame()
    with ThreadPoolExecutor(max_workers=5) as executor:
        futures = {executor.submit(search_master_data, setup_driver(), uid): uid for uid in cins}
        for future in as_completed(futures):
            uid = futures[future]
            try:
                df = future.result()
                if not df.empty:
                    all_data = pd.concat([all_data, df], ignore_index=True)
                progress_bar['value'] += (100 / len(cins))
                status_label.config(text=f"Processed {uid}")
            except Exception as e:
                print(f"Error processing future for UID {uid}: {e}")
    return all_data

def read_cin_from_excel(file_path):
    df = pd.read_excel(file_path)
    return df['CIN'].tolist()

# Function to start scraping
def process_excel(input_file, output_file, progress_bar, status_label):
    cin_values = read_cin_from_excel(input_file)
    all_data = process_in_parallel(cin_values, progress_bar, status_label)
    all_data.to_excel(output_file, index=False)
    messagebox.showinfo("Success", f"All data saved to {output_file}")
    window.quit()

# Thread to avoid blocking GUI
def start_scraping_thread(input_file, output_file, progress_bar, status_label):
    thread = threading.Thread(target=process_excel, args=(input_file, output_file, progress_bar, status_label))
    thread.start()

def select_input_file():
    file_path = filedialog.askopenfilename(filetypes=[("Excel files", "*.xlsx")])
    input_entry.delete(0, tk.END)
    input_entry.insert(0, file_path)

def select_output_folder():
    folder_path = filedialog.askdirectory()
    output_entry.delete(0, tk.END)
    output_entry.insert(0, folder_path)

def on_submit():
    input_file = input_entry.get()
    output_folder = output_entry.get()
    
    if not input_file or not output_folder:
        messagebox.showerror("Error", "Please select both input file and output folder.")
        return
    
    output_file = f"{output_folder}/output_directors.xlsx"
    progress_bar['value'] = 0  # Reset progress bar
    status_label.config(text="Starting...")
    start_scraping_thread(input_file, output_file, progress_bar, status_label)

# Tkinter GUI setup
window = tk.Tk()
window.title("CIN Data Scraper")
window.geometry("400x300")

input_label = tk.Label(window, text="Select Input Excel File:")
input_label.pack(pady=5)
input_entry = tk.Entry(window, width=40)
input_entry.pack(pady=5)
input_button = tk.Button(window, text="Browse", command=select_input_file)
input_button.pack(pady=5)

output_label = tk.Label(window, text="Select Output Folder:")
output_label.pack(pady=5)
output_entry = tk.Entry(window, width=40)
output_entry.pack(pady=5)
output_button = tk.Button(window, text="Browse", command=select_output_folder)
output_button.pack(pady=5)

submit_button = tk.Button(window, text="Submit", command=on_submit)
submit_button.pack(pady=10)

progress_bar = Progressbar(window, length=300, mode='determinate')
progress_bar.pack(pady=10)

status_label = tk.Label(window, text="")
status_label.pack(pady=5)

window.mainloop()
#CODE NOTS WORKING



CAPTCHA detected. Solving CAPTCHA...
CAPTCHA detected. Solving CAPTCHA...
CAPTCHA detected. Solving CAPTCHA...
CAPTCHA detected. Solving CAPTCHA...
CAPTCHA detected. Solving CAPTCHA...
CAPTCHA solved: 67
CAPTCHA solved: 107
CAPTCHA solved: 144
Error handling CAPTCHA: Message: element not interactable
  (Session info: chrome=128.0.6613.138)
Stacktrace:
0   chromedriver                        0x00000001032a5208 cxxbridge1$str$ptr + 1927396
1   chromedriver                        0x000000010329d66c cxxbridge1$str$ptr + 1895752
2   chromedriver                        0x0000000102e98670 cxxbridge1$string$len + 89156
3   chromedriver                        0x0000000102ed7758 cxxbridge1$string$len + 347436
4   chromedriver                        0x0000000102ed6154 cxxbridge1$string$len + 341800
5   chromedriver                        0x0000000102ed3064 cxxbridge1$string$len + 329272
6   chromedriver                        0x0000000102f16228 cxxbridge1$string$len + 604156
7   chromedriver     

: 