In [None]:
# Install Required Libraries
!pip install biopython

Collecting biopython
  Downloading biopython-1.85-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Downloading biopython-1.85-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m35.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: biopython
Successfully installed biopython-1.85


In [None]:
import requests
import time
import logging
import re
import pandas as pd
import spacy
import nltk
from Bio import Entrez
from IPython.core.display import display, HTML
from nltk.corpus import wordnet as wn
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# Configure NCBI Entrez credentials.
# Credentials are read from environment variables so that real secrets never
# have to be pasted into (and accidentally shared with) the notebook:
#   ENTREZ_EMAIL  - contact address sent with each Entrez request
#   NCBI_API_KEY  - optional NCBI E-utilities API key
#   HF_TOKEN      - optional Hugging Face access token
import os

Entrez.email = os.environ.get("ENTREZ_EMAIL", "example@example.com")  # Replace with your actual email

_ncbi_api_key = os.environ.get("NCBI_API_KEY")
if _ncbi_api_key:
    # Only set a key when a real one is supplied; leaving a placeholder string
    # here would send a bogus key with every request.
    Entrez.api_key = _ncbi_api_key

from huggingface_hub import login

_hf_token = os.environ.get("HF_TOKEN")
if _hf_token:
    # NOTE(review): login is presumably only required for gated/private
    # models; skip it when no token is configured instead of calling it with
    # a placeholder string.
    login(_hf_token)

# Set up logging (only ERROR and above are shown)
logging.basicConfig(level=logging.ERROR)

# Load NLP Models
# Each loader is wrapped in its own try/except so that one missing model does
# not stop the notebook; a model that fails to load is left as None.

### QC Check for spaCy ###
try:
    nlp = spacy.load("en_core_web_sm")
    logging.info("✅ spaCy model loaded successfully.")
except Exception as e:
    logging.error(f"❌ Failed to load spaCy model: {e}")
    nlp = None  # Fallback to None if loading fails

### QC Check for nltk (WordNet) ###
try:
    nltk.data.find("corpora/wordnet.zip")  # Check if WordNet is available
    logging.info("✅ nltk WordNet found.")
except LookupError:
    logging.warning("🔴 nltk WordNet not found. Downloading now...")
    nltk.download("wordnet")
    print("✅ NLTK WordNet downloaded successfully!")
# Fetch omw-1.4 on both paths; previously it was skipped exactly when WordNet
# itself had to be downloaded.
nltk.download("omw-1.4")

### QC Check for minilm ###
try:
    minilm_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
    print("✅ MiniLM (BERT) model loaded successfully!")
except Exception as e:
    # Logged at ERROR so it is visible under the ERROR-level logging config,
    # and the name is bound to None so later code can test for it.
    logging.error(f"🔴 Failed to load MiniLM model: {e}. Some functions may not work.")
    minilm_model = None

# Ensembl API Server
ENSEMBL_SERVER = "https://rest.ensembl.org"

# Function to get gene symbol from TAIR ID using Ensembl API
def get_gene_symbol(tair_id):
    """Look up a gene symbol for a TAIR ID via the Ensembl ``xrefs/id`` endpoint.

    Args:
        tair_id: Identifier appended to the ``/xrefs/id/`` URL.

    Returns:
        The ``display_id`` of the first cross-reference whose ``dbname`` is
        ``TAIR_SYMBOL`` or ``Uniprot_gn``, or ``None`` when the request fails,
        the response is not valid JSON, or no such cross-reference exists.
    """
    url = f"{ENSEMBL_SERVER}/xrefs/id/{tair_id}?content-type=application/json"
    try:
        # A timeout keeps a stalled connection from hanging the notebook.
        response = requests.get(url, timeout=30)
    except requests.RequestException as e:
        logging.error(f"❌ Error fetching gene symbol for {tair_id}: {e}")
        return None
    if response.status_code != 200:
        logging.error(f"❌ Error fetching gene symbol for {tair_id}: {response.status_code}")
        return None
    try:
        data = response.json()
    except ValueError as e:
        logging.error(f"❌ Error fetching gene symbol for {tair_id}: invalid JSON ({e})")
        return None
    for entry in data:
        # .get() tolerates entries that lack either field.
        symbol = entry.get("display_id")
        if symbol and entry.get("dbname") in ("TAIR_SYMBOL", "Uniprot_gn"):
            return symbol
    return None

# Function to expand keywords using WordNet, spaCy, and MiniLM
def expand_keywords(keyword):
    """Expand one keyword with WordNet synonyms and MiniLM-ranked related terms.

    Steps:
      1. Collect every WordNet lemma name for every synset of ``keyword``.
      2. Run spaCy NER on the keyword (result is only logged at DEBUG level).
      3. Embed the keyword and a fixed placeholder list of related terms with
         MiniLM and add the three terms with the highest cosine similarity.

    Steps 2 and 3 are skipped when the corresponding model failed to load, so
    the function still returns at least the keyword itself.

    Args:
        keyword: The search term to expand.

    Returns:
        A list (unordered, de-duplicated) containing ``keyword`` plus all
        expansion terms.
    """
    expanded_keywords = {keyword}

    # NLTK WordNet Expansion
    # WordNet joins multi-word lemmas with underscores; convert them back to
    # spaces so they are usable as literal search phrases.
    nltk_synonyms = {
        lemma.name().replace("_", " ")
        for syn in wn.synsets(keyword)
        for lemma in syn.lemmas()
    }
    if nltk_synonyms:
        logging.debug("NLTK WordNet expanded %r to: %s", keyword, sorted(nltk_synonyms))
        expanded_keywords.update(nltk_synonyms)

    # spaCy Named Entity Recognition (NER)
    # `nlp` is None when the spaCy model could not be loaded.
    if nlp is not None:
        doc = nlp(keyword)
        spacy_entities = [(ent.text, ent.label_) for ent in doc.ents]
        logging.debug("spaCy NER applied on %r: %s", keyword, spacy_entities)

    # MiniLM Semantic Expansion
    related_words = ["ABA", "ABI1", "abscisic acid", "MAPK", "GA signaling"]  # Placeholder examples
    # Looked up via globals() so a model that is unset (or None) after a failed
    # load is skipped instead of raising.
    model = globals().get("minilm_model")
    if model is not None:
        embeddings = model.encode([keyword] + related_words)
        similarities = cosine_similarity([embeddings[0]], embeddings[1:])
        # argsort is ascending, so the last three indices are the closest terms.
        similar_terms = [related_words[i] for i in similarities.argsort()[0][-3:]]
        logging.debug("MiniLM expanded %r to: %s", keyword, similar_terms)
        expanded_keywords.update(similar_terms)

    return list(expanded_keywords)

# Seed search terms supplied by the author
keywords = [
    "ABA",
    "abscisic acid",
    "ABA signaling pathway",
    "Abscisic acid signaling",
    "ABI1",
    "MAPK cascade",
    "drought stress",
    "salt stress",
    "cold stress",
    "Seed dormancy",
    "leaf senescence",
    "stomata",
    "Guard cell signaling",
    "stomatal closure",
    "stomatal regulation",
    "MAPK",
]

# Start from the seed terms and merge in the expansion of each one
expanded_keywords = set(keywords)
for kw in keywords:
    expanded_keywords |= set(expand_keywords(kw))

print("✅ Final expanded keyword list:", expanded_keywords)

# Function to safely query NCBI with retries (handles 429 errors)
def safe_entrez_request(func, *args, **kwargs):
    """Call ``func(*args, **kwargs)`` with throttling and retry on HTTP 429.

    Every attempt is preceded by a one-second pause. When the call raises an
    error that looks like "429 Too Many Requests" (either an exception with
    ``code == 429`` or one whose message contains ``"429"``), the call is
    retried with exponential back-off (1, 2, 4, 8 seconds) for up to five
    attempts in total. Any other exception is logged and aborts immediately.

    Args:
        func: Callable to invoke (e.g. ``Entrez.esearch``).
        *args: Positional arguments forwarded to ``func``.
        **kwargs: Keyword arguments forwarded to ``func``.

    Returns:
        Whatever ``func`` returns on success, or ``None`` if the call failed
        with a non-429 error or all attempts were rate-limited.
    """
    max_retries = 5
    for attempt in range(max_retries):
        try:
            time.sleep(1)  # Prevent overloading NCBI servers
            return func(*args, **kwargs)
        except Exception as e:
            rate_limited = getattr(e, "code", None) == 429 or "429" in str(e)
            if not rate_limited:
                logging.error(f"Error fetching data: {e}")
                return None
            if attempt == max_retries - 1:
                # No further attempt follows, so backing off would only waste time.
                break
            wait_time = 2 ** attempt
            logging.warning(f"429 Error: Too Many Requests. Retrying in {wait_time} seconds...")
            time.sleep(wait_time)
    logging.error(f"Error fetching data: still rate-limited (429) after {max_retries} attempts")
    return None

# Function to search PubMed
def _build_pubmed_query(identifier, terms):
    """Return ``(identifier) AND ("t1" OR "t2" ...)`` with every term quoted on its own."""
    # Embedded double quotes would break the phrase quoting, so drop them;
    # blank terms are skipped.
    cleaned = [str(term).replace('"', "").strip() for term in terms]
    quoted = " OR ".join(f'"{term}"' for term in cleaned if term)
    if not quoted:
        return f"({identifier})"
    return f"({identifier}) AND ({quoted})"


def search_pubmed(identifier, keywords, expanded_keywords, retmax):
    """Search PubMed for an identifier combined with two keyword lists.

    Two queries are run: one restricted to ``keywords`` and one restricted to
    ``expanded_keywords``. Each keyword is quoted individually and the quoted
    phrases are OR-ed together (the earlier form wrapped the entire OR-list in
    a single pair of quotes, turning it into one long phrase).

    Args:
        identifier: Gene symbol or TAIR ID placed in the first clause.
        keywords: Iterable of original keywords.
        expanded_keywords: Iterable of expanded keywords.
        retmax: Maximum number of IDs requested per query.

    Returns:
        A tuple ``(original_results, expanded_results)`` of sets of PubMed IDs;
        a set is empty when its request failed.
    """
    query = _build_pubmed_query(identifier, keywords)
    expanded_query = _build_pubmed_query(identifier, expanded_keywords)

    print(f"\n🔍 Searching PubMed for: {identifier}")
    logging.debug("Original Query: %s", query)
    logging.debug("Expanded Query: %s", expanded_query)

    def execute_search(query_string):
        handle = safe_entrez_request(Entrez.esearch, db="pubmed", term=query_string, retmax=retmax)
        if not handle:
            return set()
        try:
            record = Entrez.read(handle)
        finally:
            # Close the handle even when parsing the response raises.
            handle.close()
        return set(record.get("IdList", []))

    original_results = execute_search(query)
    expanded_results = execute_search(expanded_query)

    print(f"  ✅ Results for {identifier}: {len(original_results)} (original) | {len(expanded_results)} (expanded)")
    return original_results, expanded_results

# Function to fetch PubMed details
def fetch_pubmed_details(pubmed_id):
    """Fetch title, abstract, DOI and publication year for one PubMed ID.

    Args:
        pubmed_id: The PubMed ID passed to ``Entrez.efetch``.

    Returns:
        A tuple ``(title, abstract, doi, pub_year)``. ``abstract`` is the text
        of all abstract sections joined with spaces (empty string when the
        record has none). ``doi`` and ``pub_year`` are ``"N/A"`` when missing.
        On any failure, or when the response holds no article, the tuple is
        ``(None, None, "N/A", "N/A")``.
    """
    try:
        handle = safe_entrez_request(Entrez.efetch, db="pubmed", id=pubmed_id, rettype="medline", retmode="xml")
        if handle is None:
            # safe_entrez_request already logged the underlying cause.
            logging.error(f"Error fetching PubMed details for {pubmed_id}: no response received")
            return None, None, "N/A", "N/A"
        try:
            records = Entrez.read(handle)
        finally:
            handle.close()

        if "PubmedArticle" in records and records["PubmedArticle"]:
            article = records["PubmedArticle"][0]
            citation = article["MedlineCitation"]
            article_data = citation["Article"]
            title = article_data["ArticleTitle"]

            # Abstracts can be split into several sections; keep all of them
            # so keyword matching sees the full text, not just the first part.
            sections = article_data.get("Abstract", {}).get("AbstractText", [])
            if isinstance(sections, str):
                sections = [sections]
            abstract = " ".join(str(section) for section in sections)

            pub_date = article_data.get("Journal", {}).get("JournalIssue", {}).get("PubDate", {})
            pub_year = pub_date.get("Year")
            if not pub_year:
                # Fall back to the first four-digit run of a free-text
                # MedlineDate field when no explicit Year is present.
                year_match = re.search(r"\d{4}", str(pub_date.get("MedlineDate", "")))
                pub_year = year_match.group(0) if year_match else "N/A"

            doi = next(
                (id_ for id_ in article_data.get("ELocationID", []) if id_.attributes.get("EIdType") == "doi"),
                "N/A",
            )
            return title, abstract, doi, pub_year
    except Exception as e:
        logging.error(f"Error fetching PubMed details for {pubmed_id}: {e}")
    return None, None, "N/A", "N/A"

# Function to find matched keywords in the abstract
def find_matched_keywords(abstract, keywords):
    """Report which keywords occur in the abstract (case-insensitive substring match).

    Args:
        abstract: Text to search.
        keywords: Iterable of candidate keyword strings.

    Returns:
        The matching keywords, in iteration order, joined by ``", "``; the
        string ``"N/A"`` when none match.
    """
    hits = []
    for term in keywords:
        if abstract.lower().find(term.lower()) != -1:
            hits.append(term)
    if not hits:
        return "N/A"
    return ", ".join(hits)

# ✅ Step 1: User Input
tair_input = input("Enter comma-separated TAIR IDs or gene symbols: ").strip()
retmax_input = input("Enter the number of articles to retrieve per search (e.g., 5, 10, 15): ").strip()
try:
    retmax = int(retmax_input)
except ValueError:
    raise ValueError(f"Number of articles must be a whole number, got {retmax_input!r}.") from None
if retmax < 1:
    raise ValueError(f"Number of articles must be at least 1, got {retmax}.")
print(f"Using keywords: {', '.join(keywords)}\n")

# ✅ Step 2: Separate TAIR IDs and Gene Symbols
# Blank entries (empty input, doubled or trailing commas) are dropped so they
# are not searched as empty "gene symbols".
input_tokens = [token.strip() for token in tair_input.split(",")]
input_tokens = [token for token in input_tokens if token]
tair_ids = [token for token in input_tokens if re.match(r"AT\dG\d{5}", token)]
gene_symbols = [token for token in input_tokens if token not in tair_ids]

print("\n✅ Extracted Identifiers:")
print(f"  🔵 TAIR IDs: {tair_ids}")
print(f"  🟢 Gene Symbols: {gene_symbols}")

# ✅ Step 3: Convert TAIR IDs to Gene Symbols (if available)
converted_symbols = {}
for tair in tair_ids:
    gene_symbol = get_gene_symbol(tair)
    if gene_symbol:
        converted_symbols[tair] = gene_symbol
        gene_symbols.append(gene_symbol)  # Add the gene symbol to the search list
    else:
        print(f"  ⚠️ No gene symbol found for {tair}. Using TAIR ID for search.")

print("\n✅ Converted TAIR IDs to Gene Symbols:")
for tair, symbol in converted_symbols.items():
    print(f"  🔄 {tair} → {symbol}")

all_results = []

# ✅ Step 4: Process both TAIR IDs and Gene Symbols
# dict.fromkeys de-duplicates while keeping order, so an identifier entered
# twice (or a symbol that a TAIR ID also resolved to) is searched only once.
identifiers = list(dict.fromkeys(tair_ids + gene_symbols))  # Process them together

# Successfully fetched article details, keyed by PubMed ID, so an article that
# appears in several result sets is downloaded only once.
fetched_details = {}


def _build_result_rows(identifier, pubmed_ids, query_type, match_terms):
    """Return one result-table row per PubMed ID that has both a title and an abstract."""
    rows = []
    for pubmed_id in pubmed_ids:
        details = fetched_details.get(pubmed_id)
        if details is None:
            details = fetch_pubmed_details(pubmed_id)
            if details[0] is not None:
                # Failed fetches are not cached, so a later occurrence retries.
                fetched_details[pubmed_id] = details
        title, abstract, doi, pub_year = details
        if title and abstract:
            rows.append({
                "Identifier": identifier,
                "Query Type": query_type,
                "Matched Keywords": find_matched_keywords(abstract, match_terms),
                "PubMed ID": f'<a href="https://pubmed.ncbi.nlm.nih.gov/{pubmed_id}">{pubmed_id}</a>',
                "Title": title,
                "Year": pub_year,
                "DOI": f'<a href="https://doi.org/{doi}">{doi}</a>' if doi != "N/A" else "N/A"
            })
    return rows


for identifier in identifiers:
    print(f"\n🚀 Processing: {identifier}")

    original_results, expanded_results = search_pubmed(identifier, keywords, expanded_keywords, retmax)

    # ✅ Step 5: Process Original Keyword Results
    all_results.extend(_build_result_rows(identifier, original_results, "Original Keywords", keywords))

    # ✅ Step 6: Process Expanded Keyword Results
    all_results.extend(_build_result_rows(identifier, expanded_results, "Expanded Keywords", expanded_keywords))

# ✅ Step 7: Convert results to DataFrame and remove duplicates
df = pd.DataFrame(all_results)

if df.empty:
    print("\n❌ No abstracts found.")
else:
    # Remove duplicate PubMed IDs (the first occurrence wins, so a row from the
    # original-keyword query is kept over the expanded one for the same article)
    before_deduplication = len(df)
    df = df.drop_duplicates(subset=["PubMed ID"], keep="first")
    duplicates_removed = before_deduplication - len(df)

    # Check if duplicates were removed and print message
    if duplicates_removed > 0:
        print(f"\n⚠️ {duplicates_removed} duplicate PubMed ID(s) removed.")

    # Display final results (escape=False keeps the <a> links clickable)
    display(HTML(df.to_html(escape=False)))

[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


✅ MiniLM (BERT) model loaded successfully!
✅ Final expanded keyword list: {'Seed dormancy', 'drought stress', 'stomate', 'pore', 'stomatal closure', 'MAPK cascade', 'stomatal regulation', 'Guard cell signaling', 'ABA signaling pathway', 'Abscisic acid signaling', 'salt stress', 'aba', 'ABA', 'leaf senescence', 'stomata', 'GA signaling', 'stoma', 'abscisic acid', 'ABI1', 'MAPK', 'cold stress'}
Enter comma-separated TAIR IDs or gene symbols: AT4G26080, mapkkk18
Enter the number of articles to retrieve per search (e.g., 5, 10, 15): 5
Using keywords: ABA, abscisic acid, ABA signaling pathway, Abscisic acid signaling, ABI1, MAPK cascade, drought stress, salt stress, cold stress, Seed dormancy, leaf senescence, stomata, Guard cell signaling, stomatal closure, stomatal regulation, MAPK


✅ Extracted Identifiers:
  🔵 TAIR IDs: ['AT4G26080']
  🟢 Gene Symbols: ['mapkkk18']

✅ Converted TAIR IDs to Gene Symbols:
  🔄 AT4G26080 → ABI1

🚀 Processing: AT4G26080

🔍 Searching PubMed for: AT4G26080
  ✅ 

Unnamed: 0,Identifier,Query Type,Matched Keywords,PubMed ID,Title,Year,DOI
0,mapkkk18,Original Keywords,"ABA, abscisic acid, leaf senescence, MAPK",36841482,The abscisic acid-responsive element binding factors MAPKKK18 module regulates abscisic acid-induced leaf senescence in Arabidopsis.,2023,10.1016/j.jbc.2023.103060
1,mapkkk18,Original Keywords,"ABA, ABA signaling pathway, ABI1, stomata",34281207,Identification of Novel miRNAs and Their Target Genes in the Response to Abscisic Acid in Arabidopsis.,2021,10.3390/ijms22137153
2,mapkkk18,Original Keywords,"ABA, abscisic acid, MAPK cascade, MAPK",38153765,Arabidopsis HECT and RING-type E3 Ligases Promote MAPKKK18 Degradation to Regulate Abscisic Acid Signaling.,2024,10.1093/pcp/pcad165
3,mapkkk18,Original Keywords,MAPK,39561685,Ethyl acetate extract of Artemisia argyi improves the resistance of cotton to Verticillium dahliae by activating the immune response.,2024,10.1016/j.plaphy.2024.109296
4,mapkkk18,Original Keywords,"ABA, abscisic acid, MAPK",38096479,Stem-cell-expressed DEVIL-like small peptides maintain root growth under abiotic stress via abscisic acid signaling.,2024,10.1093/plphys/kiad659
10,ABI1,Original Keywords,"ABA, abscisic acid",39879308,SnRK2 kinases sense molecular crowding and form condensates to disrupt ABI1 inhibition.,2025,10.1126/sciadv.adr8250
11,ABI1,Original Keywords,"ABA, abscisic acid, ABA signaling pathway, ABI1",40057048,Allelopathic inhibitory of thymol on Arabidopsis thaliana primary root growth is mediated by ABA signaling pathway.,2025,10.1016/j.plantsci.2025.112453
12,ABI1,Original Keywords,"ABI1, drought stress",39898271,Effects of exogenous spraying of melatonin on the growth of Platycrater arguta under drought stress.,2024,10.3389/fpls.2024.1516302
13,ABI1,Original Keywords,ABI1,39568082,Programmed cell death-related ABI1 is a critical mediator of abdominal aortic aneurysm.,2024,10.1186/s40001-024-02128-4
14,ABI1,Original Keywords,"ABI1, salt stress",39861571,Functional Characterization of the PoWHY1 Gene from Platycladus orientalis and Its Role in Abiotic Stress Tolerance in Transgenic Arabidopsis thaliana.,2025,10.3390/plants14020218
