In [None]:
# Parallel notebook to addgene replication extraction blast removing features allison told to do so

# to extact from uniprot -> molecular function and biological process for cds of protein sequences obtained after blastx search. 
# In blastx search for the cds we take the top ten search result desciption, for their corresponding protein sequence get the information mentioned above from uniprot


In [2]:
import numpy as np
import pandas as pd
from Bio.Blast import NCBIWWW
from Bio.Blast import NCBIXML
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from IPython.display import clear_output
import pickle
import requests

In [2]:
full_df = pd.read_csv("/scratch/alopatki_lab/Sharma/summer_project/db_wo_blast_uniprot.csv")


In [None]:
# selecting all cds features from full_df
all_cds = {}
for i in range(full_df.shape[0]):
    print(f"Currently processing row: {i+1}/{full_df.shape[0]}")
    curr_row = full_df.iloc[i]
    if curr_row['Feature Type'] == 'CDS':
        if curr_row["Feature Label"] not in all_cds.keys():
            all_cds[curr_row["Feature Label"]] = [curr_row["Feature Sequence"]]
        else:
            all_cds[curr_row["Feature Label"]].append(curr_row["Feature Sequence"])
    clear_output(wait = True)

In [None]:


# Open a file in binary write mode
with open("/scratch/alopatki_lab/Sharma/summer_project/temp_files/cds_dict.pkl", "wb") as file:
    # Pickle the dictionary and write it to the file
    pickle.dump(all_cds, file)


In [None]:
all_keys = list(all_cds.keys())
# printing keys with more than one sequences
counter = 0
for key in all_keys:
    val_len = len(all_cds[key])
    if val_len > 1:
        counter += 1
print(all_cds[all_keys[0]])

# Surprisingly 327 feature label have more than one feature sequences (Need to get through Allison)

In [None]:
print(all_keys)

In [None]:
# Removing all the color keyword cds features I could find
color_keywords = ["Red", "Green", "Blue", "Yellow", "Orange", "Cyan", "Turquoise", "Gold", 
                  "Scarlet", "Crimson", "Cerulean", "Wasabi", "Clover", "Plum", "Citrine", 
                  "FP", "mCherry", "mKate", "mRuby", "mCardinal", "mTagBFP", "mScarlet", 
                  "mEos", "YPet", "Venus", "mNeonGreen", "mTurquoise", "mApple", "pHRed",
                  "mGold", "LSSmOrange", "iRFP", "E2-Crimson", "CyPet", "KillerOrange", 
                  "Dronpa-Green", "mEGFP", "Dendra", "mStrawberry", "SYFP", "Kusabira-Orange",
                  "cp173Venus", "NirFP", "TurboRFP", "dTomato", "mOrange", "GCaMP", 
                  "mTFP", "TagBFP", "mCyRFP", "miRFP", "daGFP", "AmCyan", "Kaede", "EosFP",
                  "mVenus", "QuasAr", "Topaz YFP", "CyRFP", "Peredox", "Azurite BFP", 
                  "CopGFP", "mEos4a", "miRFP709", "mKalama1", "mKikGR1", "msGFP2",
                  "DsRed-Monomer", "miRFP670", "PAmKate", "iRFP702", "superecliptic pHluorin", 
                  "mOrange2", "mCarmine", "HcRed", "hGLuc", "mAmetrine"]

for name in all_keys:
  for keyword in color_keywords:
    if keyword in name:
      print(f"Removing feature label: {name}")  
      all_keys.remove(name)
      break 

In [None]:
# Now we will make a list of all the features Allison suggested to remove and if they are present in all_keys list we will remove them
features_to_remove_df = pd.read_excel("/scratch/alopatki_lab/Sharma/summer_project/db_wo_blast_uniprot_AJLnotes.xlsm", sheet_name = 1)


In [None]:
features_to_remove = []
for i in range(features_to_remove_df.shape[0]):
    curr_row = features_to_remove_df.iloc[i]
    if curr_row["to_remove"] == 'remove':
        features_to_remove.append(curr_row["Feature Label"])

In [None]:
for feature in features_to_remove:
    if feature in all_keys:
        print(f"Feature removed: {feature}")
        all_keys.remove(feature)

In [None]:
# Keeping only the required feature labels in the all_cds dict then, pickling it for future reference
cds_to_keep = {}
for key in all_keys:
    cds_to_keep[key] = all_cds[key]


with open("/scratch/alopatki_lab/Sharma/summer_project/temp_files/cds_dict_cleaned_v1.pkl", "wb") as file:
    # Pickle the dictionary and write it to the file
    pickle.dump(cds_to_keep, file)

# Final cds labels on which we'll run the blast -> only 276 such features are present (v1), 223 of them have more than one sequence in the dataset


In [None]:
# Now we will make a code that will query a gene name on uniprot, we get the go annotations for the entry which contain
# the feaure label as a gene name in the entry. If that is not the case we return an empty result in the hopes that the blast result
# will be more dependable


In [None]:

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.common.keys import Keys
import time as time

def search_uniprot_gene(gene_name):
    """
    Searches for a gene name on UniProt, scrolls through results,
    and extracts the 'Gene:' value from the top 10 results. 
    If the 'Gene:' value contains the search gene_name, clicks on the 
    corresponding link within the card.

    Args:
        gene_name (str): The name of the gene to search for.

    Returns:
        A list of dictionaries, each containing the 'Gene:' value and a boolean
        indicating whether the link was clicked for the top 10 results,
        or None if an error occurs.
    """

    options = Options()
    # options.add_argument("--headless")  # Uncomment for background

    driver = webdriver.Firefox(options=options)

    try:
        driver.get("https://www.uniprot.org/")

        search_box = driver.find_element(By.CSS_SELECTOR, "div.main-search__input-container > input[type='search']")
        search_box.send_keys(gene_name)

        search_button = driver.find_element(By.CSS_SELECTOR, "button.button.primary[type='submit']")
        search_button.click()

        # Handle the view selection prompt
        try:
            view_selection_prompt = WebDriverWait(driver, 5).until(
                EC.presence_of_element_located((By.XPATH, "//div[contains(p, 'Select how you would like to view your results')]"))
            )
            table_radio = view_selection_prompt.find_element(By.XPATH, ".//input[@type='radio']")
            table_radio.click()
            view_results_button = view_selection_prompt.find_element(By.XPATH, ".//button[text()='View results']")
            view_results_button.click()
        except:
            pass

        # Wait for the specific <ul> element containing the results
        result_list = WebDriverWait(driver, 20).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "ul.data-list.no-bullet.hotjar-margin.Anr5j"))
        )

        # Extract gene names and click links if applicable
        # gene_results = []
        cards = result_list.find_elements(By.CSS_SELECTOR, "section.card")
        for card in cards[:10]:  # Process only the top 10 cards
            try:
                gene_parent = card.find_element(By.XPATH, ".//div[@class='card__content']//strong[text()='Gene:']/parent::div")
                gene_text = gene_parent.text
                gene_value = (gene_text.split("Gene:")[1].strip()).split('·')[0]

                if gene_name.lower() in gene_value.lower():
                    link = card.find_element(By.CSS_SELECTOR, "h2.small a")
                    link.click()

                    # Scroll to GO annotations section
                    go_header = WebDriverWait(driver, 20).until(
                        EC.presence_of_element_located((By.XPATH, "//h3[@data-article-id='gene_ontology']"))
                    )
                    driver.execute_script("arguments[0].scrollIntoView();", go_header)

                    # Extract GO annotations table
                    try:
                        table_wrapper = WebDriverWait(driver, 10).until(
                            EC.presence_of_element_located((By.XPATH, "//h3[@data-article-id='gene_ontology']/following-sibling::div[1]"))
                        )
                        table = table_wrapper.find_element(By.TAG_NAME, "table")  # Find table within the wrapper
                        print(table)
                        return extract_table_data(table)  # Return the table dictionary directly
                    except:
                        print("GO annotations table not found or error extracting data.")
                        return None  # Return None if table not found or extraction error

            except Exception as e:
                print(f"Error processing card: {e}")

        return None  # Return None if no matching gene is found

    except Exception as e:
        print(f"An error occurred: {e}")
        return None

    finally:
        driver.quit() 

def extract_table_data(table):
    """
    Extracts data from the GO annotations table element.

    Args:
        table (WebElement): The table element to extract data from.

    Returns:
        A dictionary representing the table data, or None if an error occurs.
    """
    try:
        table_data = {}
        rows = table.find_elements(By.XPATH, ".//tr[not(@class='pd-group-header') and not(@class='pd-group-footer')]")  # Select all data rows

        for row in rows:
            aspect_element = row.find_element(By.XPATH, ".//td[2]")
            aspect = aspect_element.text.strip()

            term_element = row.find_element(By.XPATH, ".//td[3]/a")
            term = term_element.text.strip()

            if aspect in table_data:
                table_data[aspect] += ", " + term
            else:
                table_data[aspect] = term

        return table_data

    except Exception as e:
        print(f"Error extracting table data: {e}")
        return None




# Example usage:
gene_results = search_uniprot_gene("trfA")
print(gene_results)


In [3]:

import requests
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.common.keys import Keys
import time as time

def search_uniprot_gene(gene_name):
    """
    Searches for a gene name on UniProt, scrolls through results,
    and extracts the 'Gene:' value and UniProt ID from the top 10 results.
    If the 'Gene:' value contains the search gene_name, it fetches the 
    GO annotations using the UniProt ID and the QuickGO API. 

    Args:
        gene_name (str): The name of the gene to search for.

    Returns:
        A dictionary containing the UniProt ID and its corresponding GO 
        annotations, or None if an error occurs or the gene is not found.
    """

    options = Options()
    # options.add_argument("--headless")  # Uncomment for background

    driver = webdriver.Firefox(options=options)



    try:
        driver.get("https://www.uniprot.org/")

        search_box = driver.find_element(By.CSS_SELECTOR, "div.main-search__input-container > input[type='search']")
        search_box.send_keys(gene_name)

        search_button = driver.find_element(By.CSS_SELECTOR, "button.button.primary[type='submit']")
        search_button.click()

        # Handle the view selection prompt
        try:
            view_selection_prompt = WebDriverWait(driver, 5).until(
                EC.presence_of_element_located((By.XPATH, "//div[contains(p, 'Select how you would like to view your results')]"))
            )
            table_radio = view_selection_prompt.find_element(By.XPATH, ".//input[@type='radio']")
            table_radio.click()
            view_results_button = view_selection_prompt.find_element(By.XPATH, ".//button[text()='View results']")
            view_results_button.click()
        except:
            pass

        # Wait for the specific <ul> element containing the results
        result_list = WebDriverWait(driver, 20).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "ul.data-list.no-bullet.hotjar-margin.Anr5j"))
        )

        cards = result_list.find_elements(By.CSS_SELECTOR, "section.card")
        for card in cards[:10]:
            try:
                gene_parent = card.find_element(By.XPATH, ".//div[@class='card__content']//strong[text()='Gene:']/parent::div")
                gene_text = gene_parent.text
                gene_value = (gene_text.split("Gene:")[1].strip()).split('·')[0]

                if gene_name.lower() in gene_value.lower():
                    link = card.find_element(By.CSS_SELECTOR, "h2.small a")
                    uniprot_id = link.get_attribute("href").split("/")[-1]  # Extract UniProt ID
                    link.click()
                    # print(uniprot_id)
                    # ... (No need to scroll to GO annotations section) ...

                    go_annotations = get_go_annotations(uniprot_id)  # Fetch GO annotations
                    # driver.back()  # Go back to results page
                    return {uniprot_id: go_annotations}  # Return ID and annotations

            except Exception as e:
                print(f"Error processing card: {e}")
                return None

        return None  # Return None if no matching gene is found

    except Exception as e:
        print(f"An error occurred: {e}")
        return None

    finally:
        driver.quit()


def get_go_annotations(uniprot_id):
    """Fetches GO annotations from QuickGO API and extracts relevant data, 
    including GO term names.
    """

    url = f"https://www.ebi.ac.uk/QuickGO/services/annotation/search?geneProductId={uniprot_id}"
    response = requests.get(url, headers={"Accept": "application/json"})

    if response.ok:
        data = response.json()
        go_terms = []

        for entry in data['results']:
            go_id = entry['goId']
            go_name = get_go_term_name(go_id)  # Get the GO term name
            go_terms.append({
                'goId': go_id,
                'goAspect': entry['goAspect'],
                'goName': go_name
            })

        return go_terms
    else:
        print(f"Error fetching data from QuickGO: {response.status_code}")
        return None


def get_go_term_name(go_id):
    """Fetches the GO term name for a given GO ID."""

    url = f"https://www.ebi.ac.uk/QuickGO/services/ontology/go/terms/{go_id}"
    response = requests.get(url, headers={"Accept": "application/json"})

    if response.ok:
        data = response.json()
        return data['results'][0]['name']  # Extract and return the term name
    else:
        print(f"Error fetching GO term name for {go_id}: {response.status_code}")
        return None

# Example usage:
# gene_results = search_uniprot_gene("GUS")
# print(gene_results)





















In [None]:
# Running this functino on all the filtered cds now

import pickle
from IPython.display import clear_output
# Load the pickle file
with open('/scratch/alopatki_lab/Sharma/summer_project/temp_files/cds_dict_cleaned_v1.pkl', 'rb') as f:
    all_cds = pickle.load(f)

cds_go_annotations = {}

for i in range(len(all_cds)):
    print(f"Currently Processing cds: {i+1}/{len(all_cds)}")
    curr_cds = list(all_cds.keys())[i]
    result = search_uniprot_gene(curr_cds)
    print(result)
    cds_go_annotations[curr_cds] = result
    clear_output(wait = True)  






In [None]:
for i in range(262,len(all_cds)):
    print(f"Currently Processing cds: {i+1}/{len(all_cds)}")
    curr_cds = list(all_cds.keys())[i]
    result = search_uniprot_gene(curr_cds)
    print(result)
    cds_go_annotations[curr_cds] = result
    clear_output(wait = True)

In [None]:
with open("/scratch/alopatki_lab/Sharma/summer_project/temp_files/cds_go_done.pkl", "wb") as file:
    # Pickle the dictionary and write it to the file
    pickle.dump(cds_go_annotations, file)

In [12]:
with open("/scratch/alopatki_lab/Sharma/summer_project/temp_files/cds_go_done.pkl", "rb") as file:
    # Pickle the dictionary and write it to the file
    cds_go_annotations = pickle.load(file)

In [13]:
# Making oriflag column for the CDS features based on go annotations
# First we will have to make a dictionary for the go annotations for each cds we have in cds_go_annotations

cds_features = list(cds_go_annotations.keys())
# cds_go_annotations[cds_features[0]]

cds_go_annotations_dict_cleaned = {}
for i in range(len(cds_features)):
    curr_feature = cds_features[i]
    go_annotations_raw = cds_go_annotations[curr_feature]
    # print(type(go_annotations_raw))
    cds_go_annotations_dict_cleaned[curr_feature] = None

    if go_annotations_raw == None:
        continue
    
    # Extracting all the biological_process and molecular_function

    list_of_dicts = go_annotations_raw[list(go_annotations_raw.keys())[0]]
    feature_dict = {}
    for each_dict in list_of_dicts:
        if each_dict['goAspect'] in feature_dict.keys():
            feature_dict[each_dict['goAspect']].append(each_dict['goName'])
        else:
            feature_dict[each_dict['goAspect']] = [each_dict['goName']]

    for each_feature in feature_dict.keys():
        unique_val = set(feature_dict[each_feature])
        feature_dict[each_feature] = list(unique_val)
    
    cds_go_annotations_dict_cleaned[curr_feature] = feature_dict
# print(feature_types)

In [14]:
none_count = 0
for key in cds_go_annotations_dict_cleaned.keys():
    if cds_go_annotations_dict_cleaned[key] == None:
        none_count += 1
print(none_count)

196


In [16]:
import re

def is_replication_related2(text):
  """
  Checks if the given text is related to replication, ignoring text within brackets.
  Improved for higher accuracy and reduced false positives.

  Args:
    text: The text to check.

  Returns:
    True if the text is related to replication, False otherwise.
  """

  # Remove text within brackets
  text = re.sub(r'\[.*?\]', '', text)

  # Define specific patterns for replication-related proteins
  patterns = [
      r"\brop\b",  # Match "rop" as a whole word
      r"replication\s*(?:protein|factor|initiator)?", 
      r"plasmid\s*(?:replication|copy\s*number|partition|segregation)?",
      r"ori\s*(?:region|sequence)?",
      r"dnaa\s*(?:protein)?",
      r"dnab\s*(?:helicase)?",
      r"dnac\s*(?:protein)?",
      r"dna\s*polymerase\s*(?:i|iii)?", 
      r"primase",
      r"replisome",
      r"topoisomerase",
      r"gyrase",
      r"dna\s*binding\s*protein\s*(?:involved\s*in\s*replication)?",
  ]

  # Check for matches with the specific patterns
  if any(re.search(pattern, text, re.IGNORECASE) for pattern in patterns):
    return True

  # Additional check for terms that might indicate replication but require context
  # These terms are more likely to be false positives on their own
  potential_terms = ["conjugation", "transformation", "transduction"]
  for term in potential_terms:
    if re.search(term, text, re.IGNORECASE):
      # Add more specific checks based on your domain knowledge
      # For example, check if the term is near other replication-related words
      if "plasmid" in text.lower() or "dna" in text.lower():
        return True

  return False

In [23]:
full_df["Go Annotations"] = None
full_df["Ori Flag"] = 0
for i in range(full_df.shape[0]):
    curr_row = full_df.iloc[i]

    if curr_row["Feature Type"] == "rep_origin" or curr_row["Feature Type"] == "oriT":
        full_df.at[i, 'Ori Flag'] = 1
    if curr_row["Feature Type"] == "CDS":
        if curr_row["Feature Label"] in cds_go_annotations_dict_cleaned.keys():
            full_df.at[i, "Go Annotations"] = cds_go_annotations_dict_cleaned[curr_row["Feature Label"]]
            all_go_annotations = []
            if cds_go_annotations_dict_cleaned[curr_row["Feature Label"]] != None:
                for key in cds_go_annotations_dict_cleaned[curr_row["Feature Label"]].keys():
                    all_go_annotations = [x for x in cds_go_annotations_dict_cleaned[curr_row["Feature Label"]][key]]
                    
                
                for things in all_go_annotations:
                    if is_replication_related2(things):
                        full_df.at[i, 'Ori Flag'] = 1
                        break

In [24]:
full_df.to_csv("/scratch/alopatki_lab/Sharma/summer_project/db_uniprot_only_oriflag.csv", index = False)

In [10]:
cds_go_annotations[cds_features[0]]

{'Q07327': [{'goId': 'GO:0000149',
   'goAspect': 'molecular_function',
   'goName': 'SNARE binding'},
  {'goId': 'GO:0019905',
   'goAspect': 'molecular_function',
   'goName': 'syntaxin binding'},
  {'goId': 'GO:0006886',
   'goAspect': 'biological_process',
   'goName': 'intracellular protein transport'},
  {'goId': 'GO:0006887',
   'goAspect': 'biological_process',
   'goName': 'exocytosis'},
  {'goId': 'GO:0006904',
   'goAspect': 'biological_process',
   'goName': 'vesicle docking involved in exocytosis'},
  {'goId': 'GO:0007268',
   'goAspect': 'biological_process',
   'goName': 'chemical synaptic transmission'},
  {'goId': 'GO:0007268',
   'goAspect': 'biological_process',
   'goName': 'chemical synaptic transmission'},
  {'goId': 'GO:0007269',
   'goAspect': 'biological_process',
   'goName': 'neurotransmitter secretion'},
  {'goId': 'GO:0007269',
   'goAspect': 'biological_process',
   'goName': 'neurotransmitter secretion'},
  {'goId': 'GO:0007269',
   'goAspect': 'biologica

In [3]:
# Merging the result from both the dataframes into a single dataframe
full_df = pd.read_csv("/scratch/alopatki_lab/Sharma/summer_project/db_uniprot_only_oriflag.csv")
full_df2 = pd.read_csv("/scratch/alopatki_lab/Sharma/summer_project/db_blast_only_oriflag.csv") 



In [10]:
col1, col2 = full_df2.columns, full_df.columns
# full_df2.columns[:-3]
# common_cols = set(col1).intersection(set(col2))
df_combined = full_df2[full_df2.columns[:-3]]
df_combined["Blast Titles"] = full_df2['Blast titles']
df_combined['Blast Prot Seq'] = full_df2['Blast Prot Seq']
df_combined['Go Annotations'] = full_df['Go Annotations']
df_combined["Involved in Replication"] = 0
# Finally figuring out the ori flag column
for i in range(df_combined.shape[0]):
    flag_1, flag_2 = full_df.at[i, "Ori Flag"], full_df2.at[i, "Ori Flag"]

    if flag_1 == 1 or flag_2 == 1:
        df_combined.at[i, "Involved in Replication"] = 1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_combined["Blast Titles"] = full_df2['Blast titles']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_combined['Blast Prot Seq'] = full_df2['Blast Prot Seq']


In [11]:
df_combined.to_csv("/scratch/alopatki_lab/Sharma/summer_project/db_w_blast_uniprot_oriflag.csv", index = False)

In [15]:
# Data Analysis for ppt (13th June 2024)
with open("/scratch/alopatki_lab/Sharma/summer_project/temp_files/cds_blast_done.pkl", "rb") as file:
    # Pickle the dictionary and write it to the file
    cds_blast_done = pickle.load(file)




In [20]:
print(cds_blast_done["TEV site"])

[[], []]


In [28]:
empty_blast = 0
for key in cds_blast_done.keys():
    if cds_blast_done[key] == [[], []]:
        print(key, end = ", ")

TEV site, SV40 NLS, HA, nucleoplasmin NLS, OVA peptide, VSV-G tag, enterokinase site, WELQut site, TEE, ssrA tag, minicistron, NLS, 9xHis, tetracysteine tag, Tag-100, myr, HQ tag, -lambda N peptide, c-myc NLS, TVMV site, Genenase(TM) I site, NES, Glu-Glu tag, Spot-Tag, 

In [None]:
empty_anno_keys = []
for key in cds_go_annotations.keys():
    if cds_go_annotations[key] == None or len(cds_go_annotations[key]) == 0:
        empty_anno_keys.append(key)

In [None]:
from IPython.display import clear_output

for i in range(len(empty_anno_keys)):
    curr_key = empty_anno_keys[i]
    print(f"Current key: {curr_key}")
    uniprot_id = input("Uniprot ID for curr_key? Enter -1 if no ID found: ")
    if uniprot_id == -1:
        cds_go_annotations[curr_key] == []
        clear_output(wait = True)
    else:
        annotations = get_go_annotations(uniprot_id)
        cds_go_annotations[curr_key] = annotations
        clear_output(wait = True)

In [None]:
# set(cds_go_annotations.keys())

In [None]:
# Now we will psate the code to do blast in here and integrate the uniprot code in it and run it on a small sample dataset
# Defing functions we will use 

def extract_cds(row):
    if row["Feature Type"] == "CDS":
        if row["Feature Label"] in cds_to_keep.keys():
            return row["Feature Sequence"]
        else:
            return -1
    else:
        return -1


def blast_and_get_go_annotations(nucleotide_sequence):
    # Perform BlastX search
    result_handle = NCBIWWW.qblast("blastx", "nr", nucleotide_sequence)
    blast_record = NCBIXML.read(result_handle)

    # Get top 10 results and store description and protein sequence
    blast_results = {}
    for alignment in blast_record.alignments[:10]:
        for hsp in alignment.hsps:
            blast_results[alignment.title] = {
                "description": alignment.title,
                "protein_sequence": hsp.sbjct
            }
    # print(blast_results)
    # Perform GO annotation lookup on UniProt REST API
    go_annotations = {}
    for protein_sequence in blast_results.values():
        # Search UniProt REST API for the protein sequence
        uniprot_entries = search_uniprot(protein_sequence["protein_sequence"])
        if uniprot_entries:
            # Retrieve GO annotations for the UniProt entries
            go_terms = get_go_annotations(uniprot_entries)
            go_annotations[protein_sequence["protein_sequence"]] = go_terms
    # print(go_annotations)
    return [blast_results, go_annotations]



def search_uniprot(protein_sequence):
    """
    Searches the UniProt REST API for the given protein sequence and returns the matching UniProt entries.
    """
    url = "https://rest.uniprot.org/align"
    headers = {
        "Content-Type": "application/json"
    }
    data = {
        "query": protein_sequence,
        "format": "json"
    }
    response = requests.post(url, headers=headers, json=data)
    if response.status_code == 200:
        data = response.json()
        return data.get("results", [])
    else:
        print(f"Error querying UniProt: {response.status_code} - {response.text}")
    return []


# def search_uniprot(protein_sequence):
#     """
#     Searches the UniProt REST API for the given protein sequence and returns the matching UniProt entries.
#     """
#     url = "https://www.uniprot.org/uniprot/?format=json&query=sequence:{}".format(protein_sequence)
#     response = requests.get(url)
#     if response.status_code == 200:
#         data = response.json()
#         return data["results"]
#     return []

def get_go_annotations(uniprot_entries):
    """
    Retrieves the GO annotations for the given UniProt entries.
    """
    molecular_function = []
    biological_process = []
    for entry in uniprot_entries:
        if "goTerms" in entry:
            for go_term in entry["goTerms"]:
                if go_term["category"] == "molecular function":
                    molecular_function.append(go_term["id"])
                elif go_term["category"] == "biological process":
                    biological_process.append(go_term["id"])
    return {
        "molecular_function": molecular_function,
        "biological_process": biological_process
    }





In [None]:
!pip install uniprot

In [None]:
!pip install bioservices

In [None]:
!help bioservices

In [None]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, StaleElementReferenceException
import time

def uniprot_blast_search(protein_sequence):
    """Automates a BLAST search on UniProt and returns the top result."""

    driver = webdriver.Firefox()
    driver.get("https://www.uniprot.org/blast/")

    try:
        # Input protein sequence 
        sequence_input = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CLASS_NAME, "sequence-submission-input"))
        )
        sequence_input.send_keys(protein_sequence)

        # Scroll until the "BLAST" button is visible and clickable
        submit_button = WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable((By.XPATH, "//button[@type='submit' and ./span[text()='BLAST']]"))
        )
        while True:
            try:
                submit_button.location_once_scrolled_into_view  # Scroll to the element
                if submit_button.is_enabled() and submit_button.is_displayed():
                    break  # Exit the loop if the button is clickable
            except StaleElementReferenceException:
                # Handle cases where the element might be detached during scrolling
                submit_button = WebDriverWait(driver, 10).until(
                    EC.element_to_be_clickable((By.XPATH, "//button[@type='submit' and ./span[text()='BLAST']]"))
                )
            driver.execute_script("window.scrollBy(0, 100);")  # Scroll down a bit
            time.sleep(0.2)  # Small delay for smoother scrolling

        # Pause for 4 seconds
        time.sleep(3)

        # Click the BLAST button
        submit_button.click()

        # Wait for "Completed" link and click 
        try:
            completed_link = WebDriverWait(driver, 600).until(
                EC.element_to_be_clickable((By.XPATH, "//div[@class='card__content']//span[@class='dashboard__body__status']/a"))
            )
            completed_link.click()
        except TimeoutException:
            print("Error: BLAST search did not complete within 600 seconds.")
            return None

        # Wait for the target link on the new page (adjust timeout if needed)
        try:
            target_link = WebDriverWait(driver, 30).until(
                EC.element_to_be_clickable((By.XPATH, "//td//span[@class='S2LRp S9Gr5']/a[@class='BqBnJ']")) 
            )
            target_link.click()

            time.sleep(10)  # Adjust as needed

            # Scroll to the keywords section
            keywords_header = WebDriverWait(driver, 30).until(
                EC.presence_of_element_located((By.XPATH, "//h3[@data-article-id='keywords']"))
            )
            driver.execute_script("arguments[0].scrollIntoView();", keywords_header)
            time.sleep(60)
            # Wait for the info-list to be present
            info_list = WebDriverWait(driver, 30).until(
                EC.presence_of_element_located((By.XPATH, "//h3[@data-article-id='keywords']/following-sibling::ul[@class='info-list']"))
            )

            # Extract keywords and values
            keyword_dict = {}
            for item in info_list.find_elements(By.TAG_NAME, "li"):
                keyword = item.find_element(By.CLASS_NAME, "decorated-list-item__title").text
                values = [a.text for a in item.find_elements(By.XPATH, ".//div[@class='decorated-list-item__content']//a")]
                keyword_dict[keyword] = values

            # Print or return the dictionary
            print(keyword_dict)

            time.sleep(60)
            return keyword_dict

        except TimeoutException:
            print("Error: Target link, keywords section, or info-list not found on the results page.")
            return None

    except Exception as e:
        print(f"An error occurred: {e}")

    finally:
        driver.quit()



# Example usage
protein_sequence = "MSIQHFRVALIPFFAAFCLPVFAHPETLVKVKDAEDQLGARVGYIELDLNSGKILESFRPEERFPMMSTFKVLLCGAVLSRIDAGQEQLGRRIHYSQNDLVEYSPVTEKHLTDGMTVRELCSAAISMSDNTAANLLLTTIGGPKELTAFFHNMGDHVTRLDRWEPELNEAIPNDERDTTMPVAMATTLRKLLTGELLTLASRQQLIDWMEADKVAGPLLRSALPAGWFIADKSGAGERGSRGIIAALGPDGKPSRIVVIYTTGSQATMDERNRQIAEIGASLIKHW"
uniprot_blast_search(protein_sequence)

In [None]:
search_uniprot('VTKQEKTALNMARFIRSQTLTLLEKLNELDADEQADICESLHDHADELYRSCLARFGDDGENL')

In [None]:
sample_df = full_df[:1000]

In [None]:
seq = extract_cds(sample_df.iloc[47])
res = blast_and_get_go_annotations(seq)
blast_res, uniprot_res = res[0], res[1]


In [None]:
blast_res

In [None]:
full_df["Blast"] = None
# full["Uniprot"] = None
for i in range(len(full_df)):
    print(f"Currently processing row: {i+1}/{len(full_df)}")
    row = full_df.iloc[i]
    seq = extract_cds(row)
    if seq == -1:
        full_df.iloc[i]["Blast"] = None
        # sample_df.iloc[i]["Uniprot"] = None
    else:
        full_df.iloc[i]["Blast"] = blast_and_get_go_annotations(seq)[0]
        print(full_df.iloc[i])
        time.sleep(1)
    
    clear_output(wait = True)