In [1]:
!pip install tools

Collecting tools
  Downloading tools-0.1.9.tar.gz (34 kB)
  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting pytils (from tools)
  Downloading pytils-0.4.3.tar.gz (101 kB)
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
Collecting lxml (from tools)
  Downloading lxml-5.3.2-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (3.6 kB)
Downloading lxml-5.3.2-cp311-cp311-manylinux_2_28_x86_64.whl (5.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.0/5.0 MB[0m [31m602.3 kB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hBuilding wheels for collected packages: tools, pytils
  Building wheel for tools (setup.py) ... [?25ldone
[?25h  Created wheel for tools: filename=tools-0.1.9-py3-none-any.whl size=46729 sha256=e9e00a8a2a03cd3a7e51b0cdc30fa5f22ada45efd2ca3d98a78e8465e661c71e
  Stored in directory: /home/bharath-sooryaa-m/.cache/pip/wheels/bc

In [2]:
import os
import json
from pydantic import BaseModel
from crewai.tools.structured_tool import CrewStructuredTool
from crewai import Agent, Task, Crew, LLM
import requests
import urllib.parse
from pydantic import BaseModel, Field
from typing import List, Literal
import tools.alpha_fold_fetch as alpha_fold_fetch
from tools.Query_format import QueryItem, APIQuery

# LLM Setup
# SECURITY: credentials must never be committed inside a notebook. The API keys
# that were previously hard-coded in this cell are exposed and should be revoked
# and rotated. Export GEMINI_API_KEY (and MEM0_API_KEY, if mem0-backed memory is
# used) in the environment before starting the kernel.
if not os.environ.get("GEMINI_API_KEY"):
    raise EnvironmentError(
        "GEMINI_API_KEY is not set. Export it in the environment before running "
        "this notebook; do not hard-code API keys in the source."
    )

# "groq/" is the provider prefix for Groq-hosted models; the earlier "grok/"
# prefix was a typo. NOTE(review): `llm` is not referenced by any agent below.
llm = LLM(model="groq/gemma2-9b-it", temperature=0.7)
# Every agent in this notebook runs on this Gemini model.
gemini_llm = LLM(model="gemini/gemini-2.0-flash", temperature=0.7)

# RF Diffusion Manual
# Read the whole reference text into memory once; it is later supplied to the
# crew under the "RF_Dif_manual" input key.
rf_diff_context_path = "config/RF_diff_context.txt"
with open(rf_diff_context_path, mode="r") as manual_file:
    RF_Dif_manual = manual_file.read()

# Session storage. NOTE(review): `storage_manager` is presumably a project-local
# module; it is not defined in this notebook.
from storage_manager import StorageManager
storage = StorageManager()
# Working directory for this session: the current working directory joined with
# the (stringified) session path reported by the storage manager.
current_path = os.path.join(os.getcwd(), str(storage.get_session_path()))

# `tool` is the CrewAI decorator used for the wrappers defined below; the other
# two imports come from the `tools` package.
from crewai.tools import tool
from tools.uniprot_extended_tool import toolset
from tools.rossetta_tool import PyRosettaWrapper

# Both helpers are constructed with the session directory.
# NOTE(review): presumably they write their artifacts under this path — confirm.
crewtool = toolset(current_path)
# NOTE(review): `Rosettatool` is not referenced anywhere else in the visible cells.
Rosettatool = PyRosettaWrapper(current_path)

# Enhanced Tools
@tool("uniprot_fetch_tool")
def uniprot_fetch_tool(query: str) -> str:
    """Fetches data from UniProtKB using the UniProt REST API. Input must be a string in parentheses (e.g., '(go:0003677)')."""
    # The docstring above is kept verbatim on purpose: the @tool decorator uses
    # it as the description shown to the agent, so it is runtime text.
    #
    # Returns the payload from `crewtool.uniprot_fetch_tool`, or a string that
    # starts with "Error:" when the query is malformed, nothing is found, or the
    # underlying call raises. Errors are returned rather than raised so the
    # calling agent can read them and retry.
    if not isinstance(query, str):
        return "Error: Query must be a string in parentheses (e.g., '(go:0003677)')"
    # Agent-produced arguments frequently carry stray whitespace or a trailing
    # newline; trim it so an otherwise valid query is not rejected.
    query = query.strip()
    if not (query.startswith('(') and query.endswith(')')):
        return "Error: Query must be a string in parentheses (e.g., '(go:0003677)')"
    try:
        result = crewtool.uniprot_fetch_tool(query=query)
        # An empty/falsy result is reported explicitly instead of being passed on.
        return result if result else "Error: No results found for query"
    except Exception as e:
        return f"Error: Failed to fetch data - {str(e)}"

@tool("rf_diff_tool")
def rf_diff_tool(script: str, protein_id: str) -> str:
    """Runs an RF Diffusion script with the given protein ID. Returns execution logs or errors."""
    # Delegates to `crewtool.rfdifcom`, forwarding `protein_id` as its `pdb_id`
    # argument. Failures are reported as "Error: ..." strings, never raised.
    try:
        output = crewtool.rfdifcom(script=script, pdb_id=protein_id)
        if not output:
            return "Error: Script execution returned no output"
        return output
    except Exception as exc:
        return f"Error: Script failed - {exc!s}"

# Agents
# Agent 1: turns the free-text request ({userinput}) into a UniProt query string.
query_generator = Agent(
    role="uniprot_query_generator",
    goal=(
        "Generates a UniProt query from a given protein function: {userinput}. "
        "Map the function to relevant UniProt fields like 'go' for GO terms "
        "(e.g., 'GO:0003677' for DNA binding) or 'keyword' for specific keywords "
        "(e.g., 'DNA-binding'). "
        "Generate multiple query options within parentheses (e.g., '(go:0003677)') "
        "and select the one retrieving the most relevant proteins."
    ),
    backstory=(
        "Designed as a highly specialized bioinformatics assistant "
        "to construct precise UniProt queries."
    ),
    llm=gemini_llm,
    verbose=True,
)

# Agent 2: runs the generated query through uniprot_fetch_tool and checks that
# the retrieved proteins match the requested function.
uniprot_query_assurance_agent = Agent(
    role="query_assurance_agent",
    goal=(
        "Ensures the UniProt query is accurate and relevant to the protein function: {userinput}. "
        "Run the uniprot_fetch_tool with the query (e.g., '(go:0003677)'), "
        "analyze retrieved proteins' annotations for relevance (e.g., DNA binding terms), "
        "and suggest improvements if suboptimal. "
        "Return the protein ID of the best match after validation."
    ),
    backstory=(
        "This agent acts as a quality control specialist, "
        "ensuring queries target the right proteins."
    ),
    llm=gemini_llm,
    tools=[uniprot_fetch_tool],
    verbose=True,
)

# Agent 3: picks the best candidate protein for the requested {function}.
# NOTE(review): `get_protein__function_info` is not defined or imported in the
# visible notebook cells; unless it is provided elsewhere, building this agent
# raises NameError.
protein_selector_agent = Agent(
    role="Protein Selection Expert",
    goal=(
        "Select the best protein matching the function: {function} "
        "from the list retrieved by uniprot_fetch_tool. "
        "Use get_protein__function_info to evaluate annotations."
    ),
    backstory=(
        "Expert in protein biochemistry, "
        "adept at matching proteins to desired functions."
    ),
    llm=gemini_llm,
    tools=[get_protein__function_info],
    verbose=True,
)

# Agent 4: inspects the selected protein's features and decides which regions to
# preserve and which to mask.
# NOTE(review): `get_protein__site_info` is not defined or imported in the
# visible notebook cells; unless it is provided elsewhere, building this agent
# raises NameError.
protein_expert_agent = Agent(
    role="Protein Scaffolding Specialist",
    goal=(
        "Analyze UniProt features for the protein ID from previous tasks. "
        "Identify functional sites (e.g., DNA-binding domains for {function}) "
        "to preserve and regions to mask."
    ),
    backstory=(
        "Expert in computational protein design, "
        "analyzing structural and functional data."
    ),
    tools=[get_protein__site_info],
    llm=gemini_llm,
    verbose=True,
)

# Agent 5: converts the scaffolding analysis into an RF Diffusion command line.
#
# Curly-brace names inside role/goal/backstory are template placeholders that
# CrewAI fills from the `inputs` passed to `kickoff`. The example command used to
# contain a `protein_id` placeholder, which is not one of the kickoff inputs and
# therefore made interpolation fail with KeyError. The example now uses the
# literal sample accession P12345, matching the examples in the task definitions.
RF_Diffusion_Expert = Agent(
    role="RF Diffusion Expert",
    goal=(
        "Translate protein scaffolding requirements into an RF Diffusion script. "
        "Use the RF_Dif_manual to include all necessary parameters: model settings, "
        "diffusion parameters, input PDB (UniProt ID from previous task), output directory, "
        "and task-specific settings (e.g., contigmap for binders). "
        "Example: 'python run_rf_diffusion.py --pdb P12345 --output_dir ./output --contigmap 10-50'."
    ),
    backstory="Expert in diffusion-based protein design, leveraging the RF Diffusion manual.",
    llm=gemini_llm,
    verbose=True,
)

# Agent 6: validates the generated script and executes it through rf_diff_tool,
# repairing it from the logs when a run fails.
RF_Diffusion_Script_Verifier = Agent(
    role="RF Diffusion Script Verifier",
    goal=(
        "Verify and clean the RF Diffusion script. "
        "Check syntax, parameters, and paths. "
        "Run the script with rf_diff_tool using the protein ID from previous tasks. "
        "Fix errors (up to 10 retries) based on logs, then return the validated script "
        "and execution logs or a detailed error report if unsuccessful."
    ),
    backstory=(
        "Expert in script validation and execution, "
        "ensuring RF Diffusion runs smoothly."
    ),
    tools=[rf_diff_tool],
    llm=gemini_llm,
    verbose=True,
)

# Tasks
# Task 1: derive a UniProt query from the user's description ({userinput}).
plan = Task(
    agent=query_generator,
    description="\n".join((
        "1. Extract key biological terms from the protein function description: {userinput}.",
        "2. Map these terms to UniProt search fields (e.g., 'go' for GO terms like 'GO:0003677' for DNA binding).",
        "3. Generate a string UniProt query within parentheses (e.g., '(go:0003677)') optimized for accuracy and recall.",
        "4. Test multiple query options and select the most relevant based on retrieved protein annotations.",
    )),
    expected_output="A precise UniProt query string in parentheses (e.g., '(go:0003677)')",
)

# Task 2: execute and vet the query from `plan`, then report the best protein ID.
query_review = Task(
    agent=uniprot_query_assurance_agent,
    context=[plan],
    description="\n".join((
        "Review the UniProt query generated by the uniprot_query_generator agent.",
        "1. Receive the query (e.g., '(go:0003677)') and original function description: {userinput}.",
        "2. Run uniprot_fetch_tool with the query.",
        "3. Analyze retrieved proteins’ annotations for relevance to the target function (e.g., presence of 'DNA binding').",
        "4. If suboptimal, suggest specific improvements (e.g., add 'organism:human' or refine GO terms).",
        "5. Repeat until a relevant protein list is retrieved, then return the best protein ID.",
    )),
    expected_output="The validated UniProt query and the selected protein ID (e.g., 'P12345')",
)

# Task 3: narrow the reviewed list down to the protein(s) that best perform {function}.
selection_task = Task(
    agent=protein_selector_agent,
    context=[query_review],
    description=(
        "Choose the best protein from the list retrieved in query_review "
        "that performs the target function: {function}. "
        "Limit to 1-3 proteins."
    ),
    expected_output="A JSON object with the selected protein ID and its function details",
)

# Task 4: feature analysis of the selected protein (what to keep, what to mask).
protein_analysis_task = Task(
    agent=protein_expert_agent,
    context=[selection_task],
    description=(
        "Analyze UniProt features for the protein ID from selection_task. "
        "Identify functional sites (e.g., DNA-binding domains for {function}) "
        "to preserve and regions to mask."
    ),
    expected_output="A JSON report with motifs to preserve/mask and scaffolding approach",
)

# Task 5: write the RF Diffusion command from the protein analysis.
#
# The description tells the agent to reference the RF_Dif_manual, and the manual
# text is passed to `kickoff` under the "RF_Dif_manual" input key, but no
# template referenced that key, so the manual never reached the model. It is now
# appended to the description through its placeholder.
# `kickoff` must therefore always be given an "RF_Dif_manual" input.
RF_Diffusion_configuration_task = Task(
    description=(
        "Using the protein ID and analysis from the previous task, generate an RF Diffusion script.\n"
        "1. Reference the RF_Dif_manual for parameter settings.\n"
        "2. Include: model settings, diffusion parameters, input PDB (UniProt ID, e.g., 'P12345'), output directory (e.g., './output'), and task-specific settings (e.g., '--contigmap 10-50' for DNA binding motifs).\n"
        "3. Ensure the script is executable in a terminal.\n"
        "\n"
        "RF_Dif_manual:\n"
        "{RF_Dif_manual}"
    ),
    expected_output="A valid RF Diffusion script (e.g., 'python run_rf_diffusion.py --pdb P12345 --output_dir ./output --contigmap 10-50')",
    agent=RF_Diffusion_Expert,
    context=[protein_analysis_task],
)

# Task 6: check, run and (if needed) repair the script produced by task 5.
RF_Diffusion_script_verification_task = Task(
    agent=RF_Diffusion_Script_Verifier,
    context=[RF_Diffusion_configuration_task],
    description="\n".join((
        "1. Take the RF Diffusion script from the previous task.",
        "2. Check syntax, parameters, and file paths (e.g., valid PDB ID and output directory).",
        "3. Run the script using rf_diff_tool with the protein ID from previous tasks.",
        "4. If errors occur, fix based on logs and retry (max 10 attempts).",
        "5. Return the cleaned script and execution logs if successful, or a detailed error report if unsuccessful.",
    )),
    expected_output="A validated RF Diffusion script and execution logs, or an error report after 10 retries",
)

# Crew Setup
# Agents and tasks are listed in pipeline order: query -> review -> selection ->
# analysis -> script generation -> script verification.
combined_crew = Crew(
    agents=[
        query_generator,
        uniprot_query_assurance_agent,
        protein_selector_agent,
        protein_expert_agent,
        RF_Diffusion_Expert,
        RF_Diffusion_Script_Verifier,
    ],
    tasks=[
        plan,
        query_review,
        selection_task,
        protein_analysis_task,
        RF_Diffusion_configuration_task,
        RF_Diffusion_script_verification_task,
    ],
    memory=True,
    verbose=True,
)

# Inputs
# The same free-text request fills both the {userinput} and {function} placeholders.
_requested_function = "I want a protein that can help in DNA binding"
combined_inputs = {
    "userinput": _requested_function,
    "function": _requested_function,
    "Base protein": "P49593",
    "RF_Dif_manual": RF_Dif_manual,
}

# Execute
# Run the whole pipeline once and show the crew's final output.
result = combined_crew.kickoff(inputs=combined_inputs)
print(result)

ModuleNotFoundError: No module named 'tools.alpha_fold_fetch'