In [1]:
import os
import json
from pydantic import BaseModel
from crewai.tools.structured_tool import CrewStructuredTool
from crewai import Agent, Task, Crew, LLM
import requests
import urllib.parse
from pydantic import BaseModel, Field
from typing import List, Literal
from pathlib import Path
from crewai.tools import tool

In [2]:
from storage_manager import StorageManager
from tools.uniprot_extended_tool import toolset
from tools.rossetta_tool import PyRosettaWrapper
from tools.rf_dif_tool import run_rf_diffusion
from config.rf_diff_script_format import RFDiffusionScriptConfig

In [3]:
os.environ["GEMINI_API_KEY"] = "Paste you gemini key"
gemini_llm = LLM(
    model="gemini/gemini-2.0-flash",
    temperature=0.7
)

In [4]:
rf_diff_context_path = r"config/RF_diff_context.txt"
with open(rf_diff_context_path, 'r') as f:
    RF_Dif_manual = f.read()

In [5]:
storage = StorageManager()
current_path = os.path.join(os.getcwd(), str(storage.get_session_path()))
print(current_path)

/home/bharath-sooryaa-m/Documents/BIO/proj/Protein-Designing-With-Agents/cache/session_20250408-220920_55100258


TOOLS

In [6]:

crewtool = toolset(current_path)
Rosettatool = PyRosettaWrapper(current_path)

@tool("fetchUniProt")
def fetchUniProt(query: str) -> str:
    """This tool provides an acces to fetch data from UniProtKB using the UniProt REST API.
    the input has to be strictly a string """
    return crewtool.uniprot_fetch_tool(query=query)


@tool("getSiteInfo")
def getSiteInfo(protein_id: str) -> str:
    """Fetches the protein site information from cache."""
    return crewtool.get_protein__site_info(protein_id=protein_id)

@tool("getFuncInfo")
def getFuncInfo(protein_id: str) -> str:
    """Fetches the protein function information from cache."""
    return crewtool.get_all_function()

@tool("rfDiffTool")
def rfDiffTool(script:dict,protein_id: str,) -> str:
    """This tool provides an access to fetch data from RF_diff
    the input has to be strictly a RFDiffusionScriptConfig first is the script and the second is the protein_id, the tool will automatically access the pdb file """
    print(RFDiffusionScriptConfig.get_script_with_dict(script,dir=current_path))
    return run_rf_diffusion(script_str=RFDiffusionScriptConfig.get_script_with_dict(script,dir=current_path), pdb_file_path= current_path+"/pdb/"+protein_id+".pdb")

@tool("rossetaTool")
def rossetaTool(pdb_id: str) -> str:
    """This tool provides an access to fetch data from Rosetta
    the input has to be strictly a string first is the script and the second is the protein_id, the tool will automatically access the pdb file """
    return Rosettatool.run(pdb_file=pdb_id)


PyRosettaWrapper initialized with directory: /home/bharath-sooryaa-m/Documents/BIO/proj/Protein-Designing-With-Agents/cache/session_20250408-220920_55100258


AGENTS

In [7]:

queryGen = Agent(
    role="queryGen",
    goal="Generates a UniProt query from a given protein function: {userinput}. Ensure the query retrieves relevant proteins",
    backstory="Designed as a highly specialized bioinformatics assistant to construct precise UniProt queries.",
    
    verbose=True,
    llm=gemini_llm
)

queryValidator = Agent(
    role="queryValidator",
    goal="Ensures the generated UniProt query is accurate and relevant to the protein function: {userinput}. Verify that the query retrieves the correct proteins and aligns with the intended function."
          "RUN the uniprot_fetch_tool, if it doesn't work then the query is suboptimal, provide suggestions to improve it."
          "uniprot_fetch_tool only works with a parathesis query in string format"
          "Query rules:"
          "-the query has to be string with paranthesis and semicolon and characters no other special characters allowed"
          "-avoid metioning \"query\" as it is unnecessary",
    backstory="This agent acts as a quality control specialist for bioinformatics queries, ensuring that the query targets the right proteins.",
    tools=[fetchUniProt],
    verbose=True,
    llm=gemini_llm,
)

proteinPicker = Agent(
    role="Protein Selection Expert",
    goal="Select the best protein that matches the given function requirement from a provided list. Access the list of proteins from the tool.",
    backstory="You are an expert in protein biochemistry and bioinformatics, with a keen ability to match protein functions to desired roles.",
    verbose=True,
    llm=gemini_llm,
    tools=[getFuncInfo]
)

scaffoldPlanner = Agent(
    role="Protein Scaffolding Specialist",
    goal="Assist in creating protein scaffolds by identifying which motifs should be masked or preserved based on UniProt features.",
    backstory="Expert in computational protein design with experience in analyzing protein structural and functional data.",
    llm=gemini_llm,
    tools=[getSiteInfo],
    tools_verbose=True,
    verbose=True
)

rfDiffuser = Agent(
    role="RF Diffusion Expert",
    goal=("Translate the protein scaffolding requirements into specific RF Diffusion implementation strategies, "
          "specifying which motifs to mask versus preserve and providing technical details."),
    backstory="Expert in diffusion-based generative modeling for protein design. Manual context: {RF_Dif_manual}",
    llm=gemini_llm,
    verbose=True
)

rfDiffVerifier = Agent(
    role="RF Diffusion Script Verifier",
    goal=(
        "Verify the generated RF Diffusion configuration script for correctness, "
        "remove any unnecessary or redundant lines, ensure all parameters and file paths "
        "are valid, then execute the script via the RF Diffusion tool. STOP IMMEDIATELY AFTER PDB FILE GENERATION."
    ),
    backstory="Expert in RF Diffusion process scripts, code review, and pipeline execution. Manual context: {RF_Dif_manual}",
    llm=gemini_llm,
    tools=[rfDiffTool],
    verbose=True,
)

TASKS

In [8]:
generateQuery = Task(
    description=(
        "1. Extract key biological terms from the protein function description: {userinput}.\n"
        "2. Map these terms to UniProt search fields and controlled vocabularies.\n"
        "3. Generate a string within paranthesis UniProt query optimized for accuracy and recall.\n"
        "4. Validate and refine the query to ensure relevant search results."
    ),
    expected_output="UniProt query for the given protein function description",
    agent=queryGen
)

validateQuery = Task(
    description="Review the UniProt query generated by the uniprot_query_generator agent.",
    expected_input="A UniProt query string generated from a protein function description, along with the original user input.",
    expected_output=(
                     "If the query is suboptimal, provide suggestions to improve it."
                     "After a successful query return the protein ID"),
    steps=[
        "Receive the generated UniProt query and the original protein function description.",
        "Submit the query to the UniProt database using uniprot_tool.",
        "Analyze the retrieved proteins and compare their functions to the intended protein function.",
        "resolve the query using the uniprot_fetch_tool unitil the problem is fixed",
        "Approve the query if it runs successfully and retrieves relevant proteins.",
    ],
    acceptance_criteria=[
        "The query retrieves proteins that strongly match the intended function.",
        "The query does not produce irrelevant or overly broad results.",
        "Suggestions for improvement are practical and enhance query precision.",
        "The validation report clearly explains the decision."
    ],
    agent=queryValidator
)

pickProtein = Task(
    description="Choose the best protein from the provided list that performs the target function:{function}. try to choose only limited proteins",
    expected_output="A JSON object detailing the selected protein and its function.",
    agent=proteinPicker
)

analyzeMotifs = Task(
    description=("Analyze the provided UniProt features in JSON format, identifying all functional sites (active sites, binding sites, metal-binding sites, etc.). "
                 "Interpret the user's desired protein function described in natural language and determine which motifs should be preserved as anchors and which regions masked for redesign. "
                 "Identify the most appropriate scaffolding approach and provide clear reasoning for your decisions, including motif positions. "
                 "unction to be preserved: {function}. use the protein id from previous task"),
    expected_output=("A detailed JSON report containing an analysis of UniProt features, the identified motifs to be preserved or masked with justification, "
                     "and the recommended scaffolding approach."),
    agent=scaffoldPlanner,
    context=[pickProtein]
    #context=[protein_analysis_task]
)

createDiffConfig = Task(
    description=("Using the analysis from the previous task, generate a configuration script for running the RF Diffusion process. "
                 "Include model settings, diffusion parameters, input file paths, output directory locations, and any optional parameters (such as contigmap)."),
    expected_output="A valid terminal script containing the RF Diffusion configuration with all necessary parameters."
     "You should make sure sure that the pdb file should be the uniprot ID which will be given from the previous task",
    agent=rfDiffuser,
    context=[analyzeMotifs],
    output_pydantic=RFDiffusionScriptConfig,
)

rfDiffVerification = Task(
    description=(
        "1) Take the output configuration script from the previous task.  "
        "2) Check syntax and parameter correctness, remove any unnecessary or redundant lines.  "
        "3) Run the cleaned script using rf_diffusion_tool.run(script_path).  "
        "4) IMMEDIATELY STOP PROCESSING ONCE PDB FILE IS GENERATED IN THE OUTPUT DIRECTORY."
        "5) Do NOT perform any additional verification steps beyond PDB file creation."
    ),
    expected_output=(
        "— A cleaned and validated RF Diffusion configuration script  \n"
        "— Confirmation of PDB file generation in the output directory"
    ),
    agent=rfDiffVerifier,
    context=[createDiffConfig],
    output_pydantic=RFDiffusionScriptConfig
)

CREW

In [9]:
combined_crew = Crew(
    agents=[queryGen, queryValidator, proteinPicker, scaffoldPlanner, rfDiffuser, rfDiffVerifier],
    tasks=[generateQuery, validateQuery, pickProtein, analyzeMotifs, createDiffConfig, rfDiffVerification],
    verbose=True,
)

RUN

In [10]:
combined_inputs = {
    "userinput": "I want a protein that can help in the DNA binding.",
    "function": "I want a protein that can help in the DNA binding.",
    "RF_Dif_manual": RF_Dif_manual,
}

In [11]:
result = combined_crew.kickoff(inputs=combined_inputs)
print(result)

[1m[95m# Agent:[00m [1m[92mqueryGen[00m
[95m## Task:[00m [92m1. Extract key biological terms from the protein function description: I want a protein that can help in the DNA binding..
2. Map these terms to UniProt search fields and controlled vocabularies.
3. Generate a string within paranthesis UniProt query optimized for accuracy and recall.
4. Validate and refine the query to ensure relevant search results.[00m




[1m[95m# Agent:[00m [1m[92mqueryGen[00m
[95m## Final Answer:[00m [92m
`(go:"DNA binding" ) OR (keyword:"DNA binding")`[00m




[1m[95m# Agent:[00m [1m[92mqueryValidator[00m
[95m## Task:[00m [92mReview the UniProt query generated by the uniprot_query_generator agent.[00m


https://rest.uniprot.org/uniprotkb/search?query=%28go%3A%22DNA+binding%22+%29+OR+%28keyword%3A%22DNA+binding%22%29&format=json&size=20
Data saved to /home/bharath-sooryaa-m/Documents/BIO/proj/Protein-Designing-With-Agents/cache/session_20250408-220920_55100258/uniprot/P07199.json
Data saved to /home/bharath-sooryaa-m/Documents/BIO/proj/Protein-Designing-With-Agents/cache/session_20250408-220920_55100258/uniprot/Q96BR9.json
Data saved to /home/bharath-sooryaa-m/Documents/BIO/proj/Protein-Designing-With-Agents/cache/session_20250408-220920_55100258/uniprot/O14628.json
Data saved to /home/bharath-sooryaa-m/Documents/BIO/proj/Protein-Designing-With-Agents/cache/session_20250408-220920_55100258/uniprot/P17022.json
Data saved to /home/bharath-sooryaa-m/Documents/BIO/proj/Protein-Designing-With-Agents/cache/session_20250408-220920_55100258/uniprot/P17040.json
Data saved to /home/bharath-sooryaa-m/Documents/BIO/proj/Protein-Designing-With-Agents/cache/session_20250408-220920_55100258/uniprot/Q



[1m[95m# Agent:[00m [1m[92mqueryValidator[00m
[95m## Thought:[00m [92mThe query aims to find proteins involved in DNA binding. To verify its accuracy and relevance, I need to execute the query using the `fetchUniProt` tool and analyze the results. If the results are satisfactory, meaning they primarily consist of proteins known to be involved in DNA binding, then the query is acceptable. Otherwise, I will suggest improvements.[00m
[95m## Using tool:[00m [92mfetchUniProt[00m
[95m## Tool Input:[00m [92m
"{\"query\": \"(go:\\\"DNA binding\\\" ) OR (keyword:\\\"DNA binding\\\")\"}"[00m
[95m## Tool Output:[00m [92m
No AlphaFoldDB entry found for the given entry ID.,provide a query with take protein with structure[00m




[1m[95m# Agent:[00m [1m[92mqueryValidator[00m
[95m## Final Answer:[00m [92m
The query `(go:"DNA binding" ) OR (keyword:"DNA binding")` is suboptimal. It is too broad and does not guarantee retrieval of proteins with known structures.

Suggestion: Refine the query by adding specific criteria, such as:

1.  Specifying the organism (e.g., `(go:"DNA binding") AND (organism:"Homo sapiens")`).
2.  Focusing on proteins with experimental evidence (e.g., `(go:"DNA binding") AND (evidence:experimental)`).
3.  Including specific protein names or families known for DNA binding (e.g., `(go:"DNA binding") AND (protein:"transcription factor")`).

A more specific query would be: `((go:"DNA binding") AND (organism:"Homo sapiens") AND (evidence:experimental))`[00m




[1m[95m# Agent:[00m [1m[92mProtein Selection Expert[00m
[95m## Task:[00m [92mChoose the best protein from the provided list that performs the target function:I want a protein that can help in the DNA binding.. try to choose only limited proteins[00m




[1m[95m# Agent:[00m [1m[92mProtein Selection Expert[00m
[95m## Thought:[00m [92mOkay, I understand the task. I need to select the best protein for DNA binding from a list (that I will access using the tool) based on the user's criteria. I will start by getting information on the proteins. Since the initial query was too broad, I will use the suggested refined query: `((go:"DNA binding") AND (organism:"Homo sapiens") AND (evidence:experimental))`. However, I don't have a list of proteins to choose from yet. I will start by fetching information for a protein known to be involved in DNA binding, for example, a transcription factor. I will use getFuncInfo to fetch the function information.[00m
[95m## Using tool:[00m [92mgetFuncInfo[00m
[95m## Tool Input:[00m [92m
"{\"protein_id\": \"P0DTC2\"}"[00m
[95m## Tool Output:[00m [92m
{'Q3KNS6': ['May be involved in transcriptional regulation'], 'Q6P2D0': ['May be involved in transcriptional regulation'], 'P17027': ['May be i



[1m[95m# Agent:[00m [1m[92mProtein Selection Expert[00m
[95m## Final Answer:[00m [92m
{"protein_id": "P07199", "function": "Interacts with centromeric heterochromatin in chromosomes and binds to a specific 17 bp subset of alphoid satellite DNA, called the CENP-B box (PubMed:11726497). May organize arrays of centromere satellite DNA into a higher-order structure which then directs centromere formation and kinetochore assembly in mammalian chromosomes (Probable)"}[00m




[1m[95m# Agent:[00m [1m[92mProtein Scaffolding Specialist[00m
[95m## Task:[00m [92mAnalyze the provided UniProt features in JSON format, identifying all functional sites (active sites, binding sites, metal-binding sites, etc.). Interpret the user's desired protein function described in natural language and determine which motifs should be preserved as anchors and which regions masked for redesign. Identify the most appropriate scaffolding approach and provide clear reasoning for your decisions, including motif positions. unction to be preserved: I want a protein that can help in the DNA binding.. use the protein id from previous task[00m


used




[1m[95m# Agent:[00m [1m[92mProtein Scaffolding Specialist[00m
[95m## Thought:[00m [92mI need to analyze the UniProt features of the protein with ID P07199 to identify functional sites relevant to DNA binding. Based on the user's request to preserve DNA binding function, I will identify motifs related to DNA binding and mark them for preservation. Other regions will be considered for masking to allow for redesign while maintaining the core functionality. Finally, I will recommend a scaffolding approach based on these considerations.[00m
[95m## Using tool:[00m [92mgetSiteInfo[00m
[95m## Tool Input:[00m [92m
"{\"protein_id\": \"P07199\"}"[00m
[95m## Tool Output:[00m [92m
[{'type': 'Initiator methionine', 'location': {'start': {'value': 1, 'modifier': 'EXACT'}, 'end': {'value': 1, 'modifier': 'EXACT'}}, 'description': 'Removed', 'evidences': [{'evidenceCode': 'ECO:0000269', 'source': 'PubMed', 'id': '23978223'}]}, {'type': 'Chain', 'location': {'start': {'value': 2, 



[1m[95m# Agent:[00m [1m[92mProtein Scaffolding Specialist[00m
[95m## Final Answer:[00m [92m
```json
{
  "UniProt_features_analysis": {
    "protein_id": "P07199",
    "function": "Interacts with centromeric heterochromatin in chromosomes and binds to a specific 17 bp subset of alphoid satellite DNA, called the CENP-B box. May organize arrays of centromere satellite DNA into a higher-order structure which then directs centromere formation and kinetochore assembly in mammalian chromosomes",
    "motifs_to_preserve": [
      {
        "type": "DNA binding",
        "location": {
          "start": 28,
          "end": 48
        },
        "description": "H-T-H motif",
        "justification": "Essential for DNA binding functionality."
      },
      {
        "type": "DNA binding",
        "location": {
          "start": 97,
          "end": 129
        },
        "description": "H-T-H motif",
        "justification": "Essential for DNA binding functionality."
      },
      

[1m[95m# Agent:[00m [1m[92mRF Diffusion Expert[00m
[95m## Task:[00m [92mUsing the analysis from the previous task, generate a configuration script for running the RF Diffusion process. Include model settings, diffusion parameters, input file paths, output directory locations, and any optional parameters (such as contigmap).[00m




[1m[95m# Agent:[00m [1m[92mRF Diffusion Expert[00m
[95m## Final Answer:[00m [92m
{
  "output_prefix": "outputs/P07199_scaffold",
  "input_pdb": "P07199.pdb",
  "contigmap_contigs": "[10-20/A28-48/5-10/A97-129/10-20/A536-599/5-15]",
  "num_designs": 20,
  "contigmap_length": null,
  "contigmap_inpaint_seq": "[A143-184/A387-475/A495-544]",
  "ckpt_override_path": null
}[00m




[1m[95m# Agent:[00m [1m[92mRF Diffusion Script Verifier[00m
[95m## Task:[00m [92m1) Take the output configuration script from the previous task.  2) Check syntax and parameter correctness, remove any unnecessary or redundant lines.  3) Run the cleaned script using rf_diffusion_tool.run(script_path).  4) IMMEDIATELY STOP PROCESSING ONCE PDB FILE IS GENERATED IN THE OUTPUT DIRECTORY.5) Do NOT perform any additional verification steps beyond PDB file creation.[00m


../scripts/run_inference.py  inference.output_prefix=/home/bharath-sooryaa-m/Documents/BIO/proj/Protein-Designing-With-Agents/cache/session_20250408-220920_55100258/output inference.input_pdb=input_pdbs/P07199.pdb 'contigmap.contigs=[10-20/A28-48/5-10/A97-129/10-20/A536-599/5-15]' inference.num_designs=1 'contigmap.inpaint_seq=[A143-184/A387-475/A495-544]'
Executing command: /home/bharath-sooryaa-m/anaconda3/envs/SE3nv/bin/python ../scripts/run_inference.py  inference.output_prefix=/home/bharath-sooryaa-m/Documents/BIO/proj/Protein-Designing-With-Agents/cache/session_20250408-220920_55100258/output inference.input_pdb=input_pdbs/P07199.pdb 'contigmap.contigs=[10-20/A28-48/5-10/A97-129/10-20/A536-599/5-15]' inference.num_designs=1 'contigmap.inpaint_seq=[A143-184/A387-475/A495-544]'
[2025-04-08 22:12:28,511][__main__][INFO] - Found GPU with device_name NVIDIA GeForce RTX 3050 Ti Laptop GPU. Will run RFdiffusion on NVIDIA GeForce RTX 3050 Ti Laptop GPU
Reading models from /home/bharath-s



[1m[95m# Agent:[00m [1m[92mRF Diffusion Script Verifier[00m
[95m## Thought:[00m [92mI have received the RF Diffusion configuration. My task is to validate the script, clean it by removing unnecessary or redundant lines, and then execute it using the rfDiffTool. After the PDB file generation, I must stop immediately.[00m
[95m## Using tool:[00m [92mrfDiffTool[00m
[95m## Tool Input:[00m [92m
"{\"script\": {\"output_prefix\": \"outputs/P07199_scaffold\", \"input_pdb\": \"P07199.pdb\", \"contigmap_contigs\": \"[10-20/A28-48/5-10/A97-129/10-20/A536-599/5-15]\", \"num_designs\": 20, \"contigmap_length\": null, \"contigmap_inpaint_seq\": \"[A143-184/A387-475/A495-544]\", \"ckpt_override_path\": null}, \"protein_id\": \"P07199\"}"[00m
[95m## Tool Output:[00m [92m
SUCECSS[00m




[1m[95m# Agent:[00m [1m[92mRF Diffusion Script Verifier[00m
[95m## Final Answer:[00m [92m
{
  "output_prefix": "outputs/P07199_scaffold",
  "input_pdb": "P07199.pdb",
  "contigmap_contigs": "[10-20/A28-48/5-10/A97-129/10-20/A536-599/5-15]",
  "num_designs": 20,
  "contigmap_length": null,
  "contigmap_inpaint_seq": "[A143-184/A387-475/A495-544]",
  "ckpt_override_path": null
}[00m




output_prefix='outputs/P07199_scaffold' input_pdb='P07199.pdb' contigmap_contigs='[10-20/A28-48/5-10/A97-129/10-20/A536-599/5-15]' num_designs=20 contigmap_length=None contigmap_inpaint_seq='[A143-184/A387-475/A495-544]' ckpt_override_path=None


In [None]:
from rosseta.rosseta_run import PyRosettaWrapper
ros_obj = PyRosettaWrapper("")
ros_obj.run(pdb_file=r"{current_path}/output_0.pdb".format(current_path=current_path))

PyRosettaWrapper initialized with directory: 
┌──────────────────────────────────────────────────────────────────────────────┐
│                                 PyRosetta-4                                  │
│              Created in JHU by Sergey Lyskov and PyRosetta Team              │
│              (C) Copyright Rosetta Commons Member Institutions               │
│                                                                              │
│ NOTE: USE OF PyRosetta FOR COMMERCIAL PURPOSES REQUIRE PURCHASE OF A LICENSE │
│         See LICENSE.PyRosetta.md or email license@uw.edu for details         │
└──────────────────────────────────────────────────────────────────────────────┘
PyRosetta-4 2025 [Rosetta PyRosetta4.Release.python311.ubuntu 2025.13+release.80dd00bc09d2543ce4b9a50f13c49219ee5cdd3b 2025-03-27T08:32:50] retrieved from: http://www.pyrosetta.org
core.init: Checking for fconfig files in pwd and ./rosetta/flags
core.init: Rosetta version: PyRosetta4.Release.python311.ubu

core.pack.pack_missing_sidechains: packing residue number 19 because of missing atom number 5 atom name  CB
core.pack.pack_missing_sidechains: packing residue number 21 because of missing atom number 5 atom name  CB
core.pack.pack_missing_sidechains: packing residue number 22 because of missing atom number 5 atom name  CB
core.pack.pack_missing_sidechains: packing residue number 23 because of missing atom number 5 atom name  CB
core.pack.pack_missing_sidechains: packing residue number 24 because of missing atom number 5 atom name  CB
core.pack.pack_missing_sidechains: packing residue number 25 because of missing atom number 5 atom name  CB
core.pack.pack_missing_sidechains: packing residue number 26 because of missing atom number 5 atom name  CB
core.pack.pack_missing_sidechains: packing residue number 27 because of missing atom number 5 atom name  CB
core.pack.pack_missing_sidechains: packing residue number 28 because of missing atom number 5 atom name  CB
core.pack.pack_missing_sidec

(38.72030837315116,
 <pyrosetta.rosetta.core.pose.Pose at 0x71d0f379a230>,
 -289.54999356514287)