In [50]:
# Mount Google Drive into the Colab filesystem at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [54]:
import os
import time
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, UnexpectedAlertPresentException

def setup_chrome_options(download_directory):
    """Create headless Chrome options with download preferences applied.

    Args:
        download_directory: Value stored under Chrome's
            ``download.default_directory`` preference.

    Returns:
        The configured ``Options`` instance.
    """
    options = Options()

    # Command-line switches, added in a fixed order.
    for switch in ('--headless', '--no-sandbox', '--disable-dev-shm-usage'):
        options.add_argument(switch)

    # Browser preferences handed to Chrome through the "prefs" option.
    options.add_experimental_option("prefs", {
        "download.default_directory": download_directory,
        "download.prompt_for_download": False,
        "download.directory_upgrade": True,
        "safebrowsing.enabled": True,
        "safebrowsing.disable_download_protection": True,
    })

    return options

def initialize_webdriver(chrome_options):
    """Start a Chrome WebDriver session using the given options.

    Args:
        chrome_options: Options object passed to ``webdriver.Chrome``.

    Returns:
        The newly created ``webdriver.Chrome`` instance.
    """
    driver = webdriver.Chrome(options=chrome_options)
    return driver

def read_sequences(file_path):
    """Load the sequence table from an Excel workbook.

    Args:
        file_path: Path of the Excel file to read.

    Returns:
        The ``pandas.DataFrame`` returned by ``pd.read_excel`` with its
        default settings.
    """
    sequences_df = pd.read_excel(file_path)
    return sequences_df

def process_sequence(driver, sequence, dna_type, wait):
    """Submit one sequence to the SCFBio DNA builder form and click its output link.

    Loads the builder page, types the sequence into the form, selects the
    DNA-type radio button, submits, and then clicks the link whose text
    contains "outputfile".

    Args:
        driver: Selenium WebDriver used to load the page.
        sequence: Sequence string; all whitespace is removed before it is
            typed into the form.
        dna_type: ``'B'`` selects the second radio input; any other value
            selects the first one (treated as A-DNA).
        wait: ``WebDriverWait`` used for every element lookup.

    Returns:
        True if the output link appeared and was clicked, False if waiting
        for that link timed out.

    Raises:
        TimeoutException: If one of the form elements does not become
            available within the wait's timeout.
        AttributeError: If ``sequence`` is not a string.
    """
    driver.get('http://www.scfbio-iitd.res.in/software/drugdesign/bdna.jsp')

    # Strip every kind of whitespace, not just spaces: a stray tab or newline
    # coming from a spreadsheet cell would otherwise be typed into the form
    # (and a newline acts as the Enter key).
    sequence = "".join(sequence.split())

    # All form controls live under the same absolute path; build it once.
    form_xpath = (
        '//*[@id="main"]/tbody/tr[2]/td/table[2]/tbody/tr/td/table/tbody/tr[1]'
        '/th/table/tbody/tr[2]/th/span/fieldset/div/form'
    )

    sequence_box = wait.until(
        EC.presence_of_element_located((By.XPATH, f'{form_xpath}/p[1]/input[1]'))
    )
    sequence_box.send_keys(sequence)

    # Radio inputs: index 2 is B-DNA, index 1 is A-DNA.
    radio_index = 2 if dna_type == 'B' else 1
    dna_check = wait.until(
        EC.element_to_be_clickable(
            (By.XPATH, f'{form_xpath}/p[2]/span/input[{radio_index}]')
        )
    )
    dna_check.click()

    submit_element = wait.until(
        EC.element_to_be_clickable((By.XPATH, f'{form_xpath}/p[3]/input[1]'))
    )
    submit_element.click()

    try:
        # Wait until the link is actually clickable (not merely present in the
        # DOM) before clicking, matching how the other controls are handled.
        output_file = wait.until(
            EC.element_to_be_clickable((By.PARTIAL_LINK_TEXT, "outputfile"))
        )
        output_file.click()
        return True
    except TimeoutException:
        print(f"Timeout waiting for output file link for sequence: {sequence}")
        return False

def wait_for_download(download_directory, min_wait=10, timeout=120, poll_interval=0.5):
    """Wait until Chrome has finished writing downloads into a directory.

    The function always sleeps for ``min_wait`` seconds first (the original
    fixed grace period, which gives the download time to start). It then keeps
    polling while any in-progress ``.crdownload`` file remains in
    ``download_directory``, so a slow download is not picked up half-written.

    Args:
        download_directory: Directory Chrome downloads into.
        min_wait: Seconds to wait unconditionally before polling.
        timeout: Maximum total seconds to wait, including ``min_wait``.
        poll_interval: Seconds between directory checks.

    Returns:
        True if no partial download files remain, False if partial files were
        still present when ``timeout`` elapsed.
    """
    def has_partial_downloads():
        # A missing/unreadable directory simply has nothing pending.
        try:
            names = os.listdir(download_directory)
        except OSError:
            return False
        # Chrome names in-progress downloads with a ".crdownload" suffix.
        return any(name.endswith('.crdownload') for name in names)

    started = time.monotonic()
    time.sleep(max(0, min_wait))

    while has_partial_downloads():
        if time.monotonic() - started >= timeout:
            return False
        time.sleep(poll_interval)
    return True

def rename_file(download_directory, sequence_no):
    """Rename the most recently modified download to ``<sequence_no>.pdb``.

    Only regular files are considered. Sub-directories and Chrome's
    in-progress ``.crdownload`` files are skipped so that a partial download
    (or a folder) is never given a ``.pdb`` name.

    Args:
        download_directory: Directory to search and rename within.
        sequence_no: Base name (without extension) for the renamed file.

    Returns:
        The new file path, or None when the directory holds no eligible file.
    """
    candidates = [
        path
        for path in (
            os.path.join(download_directory, name)
            for name in os.listdir(download_directory)
            if not name.endswith('.crdownload')
        )
        if os.path.isfile(path)
    ]
    if not candidates:
        return None

    # Newest file by modification time; no need to sort the whole listing.
    latest_file = max(candidates, key=os.path.getmtime)
    new_file_name = os.path.join(download_directory, f"{sequence_no}.pdb")
    os.rename(latest_file, new_file_name)
    return new_file_name

def process_dna_type(driver, sequences_df, download_directory, dna_type):
    """Generate and save a structure file for every sequence in the table.

    For each row, the sequence is submitted through ``process_sequence``; on
    success the function waits for the download and renames it to
    ``<dna_type>_<S.no>.pdb``. Errors for one row are printed and do not stop
    the remaining rows.

    Args:
        driver: Selenium WebDriver used for all page interactions.
        sequences_df: DataFrame with ``'S.no'`` and ``'Sequence'`` columns.
        download_directory: Directory the driver downloads into.
        dna_type: DNA form label passed to ``process_sequence`` and used as
            the file-name prefix.
    """
    wait = WebDriverWait(driver, 10)
    for _, row in sequences_df.iterrows():
        sequence_no = row['S.no']
        sequence = row['Sequence']

        try:
            # Snapshot the directory first so we can tell whether this row
            # actually produced a new file.
            files_before = set(os.listdir(download_directory))

            success = process_sequence(driver, sequence, dna_type, wait)
            if not success:
                print(f"Failed to process {dna_type}-DNA for sequence number {sequence_no}")
                continue

            wait_for_download(download_directory)

            # If nothing new arrived, renaming "the latest file" would grab a
            # previous sequence's result and mislabel it, so skip the rename.
            if set(os.listdir(download_directory)) <= files_before:
                print(f"No new file was downloaded for {dna_type}-DNA, sequence number {sequence_no}")
                continue

            rename_file(download_directory, f"{dna_type}_{sequence_no}")
        except UnexpectedAlertPresentException as e:
            print(f"Alert encountered for {dna_type}-DNA, sequence number {sequence_no}: {e.alert_text}")
        except Exception as e:
            # Best-effort batch run: report the failure and move on to the next row.
            print(f"Error processing {dna_type}-DNA for sequence number {sequence_no}: {str(e)}")

def main():
    """Download A-DNA and then B-DNA structure files for every sequence.

    Reads the sequence table from the Excel file, makes sure both download
    directories exist, and runs one browser session per DNA type. Each
    browser is shut down even if processing raises, so no Chrome process is
    left behind.
    """
    # Set up paths
    excel_file_path = "/content/drive/MyDrive/weekend_project/random_sequences.xlsx"  # Update this path for your Colab environment
    download_directories = {
        'A': "/content/drive/MyDrive/weekend_project/pdb_folder/A_DNA",  # Update this path for your Colab environment
        'B': "/content/drive/MyDrive/weekend_project/pdb_folder/B_DNA",  # Update this path for your Colab environment
    }

    # Ensure download directories exist
    for directory in download_directories.values():
        os.makedirs(directory, exist_ok=True)

    # Read sequences
    sequences_df = read_sequences(excel_file_path)

    # Process A-DNA first, then B-DNA, each in its own browser session so the
    # download directory preference matches the DNA type.
    for dna_type, directory in download_directories.items():
        driver = initialize_webdriver(setup_chrome_options(directory))
        try:
            process_dna_type(driver, sequences_df, directory, dna_type)
        finally:
            driver.quit()

# Run the pipeline only when this code is executed directly, not when imported.
if __name__ == "__main__":
    main()