In [1]:
import os
import json
from pydantic import BaseModel
from crewai.tools.structured_tool import CrewStructuredTool
from crewai import Agent, Task, Crew, LLM
import requests
import urllib.parse
from pydantic import BaseModel, Field
from typing import List, Literal
import tools.alpha_fold_fetch as alpha_fold_fetch
from tools.Query_format import QueryItem, APIQuery


In [2]:


llm = LLM(
    model="groq/gemma2-9b-it",
    temperature=0.7
)
os.environ["GEMINI_API_KEY"] = "AIzaSyDQSOPPaW8BVWXny1ycBfO_tF9jJiYiuag"
os.environ["MEM0_API_KEY"] = "m0-3wFDlHPJEB4GMP6HKSsHpGlm3Kt1s6xmvYQi5IuP"
gemini_llm = LLM(
    model="gemini/gemini-2.0-flash",
    temperature=0.7
)

In [3]:

rf_diff_context_path = r"config/RF_diff_context.txt"
with open(rf_diff_context_path, 'r') as f:
    RF_Dif_manual = f.read()

In [4]:
from storage_manager import StorageManager


storage = StorageManager()

In [5]:
from pathlib import Path

# Get current path
current_path = str(Path.cwd())+"\\"+str(storage.get_session_path())
print(current_path)

d:\PROJECT\PROTEIN_MAKER\Protein-Designing-With-Agents\cache\session_20250406-112148_1d5326ba


TOOLS

In [6]:
from crewai.tools import tool
from tools.uniprot_extended_tool import toolset
import json
import requests

crewtool = toolset(current_path)

@tool("uniprot_fetch_tool")
def uniprot_fetch_tool(query: str) -> str:
    """This tool provides an acces to fetch data from UniProtKB using the UniProt REST API.
    the input has to be strictly a string """
    return crewtool.uniprot_fetch_tool(query=query)


@tool("get_protein__site_info")
def get_protein__site_info(protein_id: str) -> str:
    """Fetches the protein site information from cache."""
    return crewtool.get_protein__site_info(protein_id=protein_id)


AGENTS

In [7]:

query_generator = Agent(
    role="uniprot_query_generator",
    goal="Generates a UniProt query from a given protein function: {userinput}. Ensure the query retrieves relevant proteins",
    backstory="Designed as a highly specialized bioinformatics assistant to construct precise UniProt queries.",
    
    verbose=True,
    llm=gemini_llm
)

uniprot_query_assurance_agent = Agent(
    role="query_assurance_agent",
    goal="Ensures the generated UniProt query is accurate and relevant to the protein function: {userinput}. Verify that the query retrieves the correct proteins and aligns with the intended function."
          "RUN the uniprot_fetch_tool, if it doesn't work then the query is suboptimal, provide suggestions to improve it."
          "uniprot_fetch_tool only works with a parathesis query in string format"
          "Query rules:"
          "-the query has to be string with paranthesis and semicolon and characters no other special characters allowed"
          "-avoid metioning \"query\" as it is unnecessary",
    backstory="This agent acts as a quality control specialist for bioinformatics queries, ensuring that the query targets the right proteins.",
    tools=[uniprot_fetch_tool],
    verbose=True,
    llm=gemini_llm,
)

protein_expert_agent = Agent(
    role="Protein Scaffolding Specialist",
    goal="Assist in creating protein scaffolds by identifying which motifs should be masked or preserved based on UniProt features.",
    backstory="Expert in computational protein design with experience in analyzing protein structural and functional data.",
    llm=gemini_llm,
    tool=[get_protein__site_info],
    tools_verbose=True,
    verbose=True
)
RF_Diffusion_Expert = Agent(
    role="RF Diffusion Expert",
    goal=("Translate the protein scaffolding requirements into specific RF Diffusion implementation strategies, "
          "specifying which motifs to mask versus preserve and providing technical details."),
    backstory="Expert in diffusion-based generative modeling for protein design. Manual context: {RF_Dif_manual}",
    llm=gemini_llm,
    verbose=True
)




TASKS

In [8]:
# Define tasks for the UniProt query workflow
plan = Task(
    description=(
        "1. Extract key biological terms from the protein function description: {userinput}.\n"
        "2. Map these terms to UniProt search fields and controlled vocabularies.\n"
        "3. Generate a string within paranthesis UniProt query optimized for accuracy and recall.\n"
        "4. Validate and refine the query to ensure relevant search results."
    ),
    expected_output="UniProt query for the given protein function description",
    agent=query_generator
)

query_review = Task(
    description="Review the UniProt query generated by the uniprot_query_generator agent.",
    expected_input="A UniProt query string generated from a protein function description, along with the original user input.",
    expected_output=(
                     "If the query is suboptimal, provide suggestions to improve it."
                     "After a successful query return the protein ID"),
    steps=[
        "Receive the generated UniProt query and the original protein function description.",
        "Submit the query to the UniProt database using uniprot_tool.",
        "Analyze the retrieved proteins and compare their functions to the intended protein function.",
        "resolve the query using the uniprot_fetch_tool unitil the problem is fixed",
        "Approve the query if it runs successfully and retrieves relevant proteins.",
    ],
    acceptance_criteria=[
        "The query retrieves proteins that strongly match the intended function.",
        "The query does not produce irrelevant or overly broad results.",
        "Suggestions for improvement are practical and enhance query precision.",
        "The validation report clearly explains the decision."
    ],
    agent=uniprot_query_assurance_agent
)
protein_analysis_task = Task(
    description=("Analyze the provided UniProt features in JSON format, identifying all functional sites (active sites, binding sites, metal-binding sites, etc.). "
                 "Interpret the user's desired protein function described in natural language and determine which motifs should be preserved as anchors and which regions masked for redesign. "
                 "Identify the most appropriate scaffolding approach and provide clear reasoning for your decisions, including motif positions. "
                 "unction to be preserved: {function}. use the protein id from previous task"),
    expected_output=("A detailed JSON report containing an analysis of UniProt features, the identified motifs to be preserved or masked with justification, "
                     "and the recommended scaffolding approach."),
    agent=protein_expert_agent,
    #context=[protein_analysis_task]
)

# Task for generating the RF Diffusion configuration script based on the protein analysis
RF_Diffusion_configuration_task = Task(
    description=("Using the analysis from the previous task, generate a configuration script for running the RF Diffusion process. "
                 "Include model settings, diffusion parameters, input file paths, output directory locations, and any optional parameters (such as contigmap)."),
    expected_output="A valid terminal script containing the RF Diffusion configuration with all necessary parameters.",
    agent=RF_Diffusion_Expert,
    context=[protein_analysis_task]
)


CREW

In [9]:
combined_crew = Crew(
    agents=[query_generator, uniprot_query_assurance_agent, protein_expert_agent, RF_Diffusion_Expert],
    tasks=[plan, query_review, protein_analysis_task, RF_Diffusion_configuration_task],
    verbose=True,
)

RUN

In [10]:
combined_inputs = {
    "userinput": "I want a protein that can help in the DNA binding.",
    "session": "1",
    "function": "I want a protein that can help in the DNA binding.",
    "Base protein": "P49593",
    "RF_Dif_manual":RF_Dif_manual
}

result = combined_crew.kickoff(inputs=combined_inputs)
print(result)

[1m[95m# Agent:[00m [1m[92muniprot_query_generator[00m
[95m## Task:[00m [92m1. Extract key biological terms from the protein function description: I want a protein that can help in the DNA binding..
2. Map these terms to UniProt search fields and controlled vocabularies.
3. Generate a string within paranthesis UniProt query optimized for accuracy and recall.
4. Validate and refine the query to ensure relevant search results.[00m




[1m[95m# Agent:[00m [1m[92muniprot_query_generator[00m
[95m## Final Answer:[00m [92m
(go:"DNA binding" OR keyword:"DNA binding")[00m




[1m[95m# Agent:[00m [1m[92mquery_assurance_agent[00m
[95m## Task:[00m [92mReview the UniProt query generated by the uniprot_query_generator agent.[00m


https://rest.uniprot.org/uniprotkb/search?query=%28go%3A%22DNA+binding%22+OR+keyword%3A%22DNA+binding%22%29&format=json&size=4
Data saved to d:\PROJECT\PROTEIN_MAKER\Protein-Designing-With-Agents\cache\session_20250406-112148_1d5326ba/uniprot/P07199.json
Data saved to d:\PROJECT\PROTEIN_MAKER\Protein-Designing-With-Agents\cache\session_20250406-112148_1d5326ba/uniprot/Q96BR9.json
Data saved to d:\PROJECT\PROTEIN_MAKER\Protein-Designing-With-Agents\cache\session_20250406-112148_1d5326ba/uniprot/O14628.json
Data saved to d:\PROJECT\PROTEIN_MAKER\Protein-Designing-With-Agents\cache\session_20250406-112148_1d5326ba/uniprot/P17022.json
P07199
PDB file downloaded successfully.
Q96BR9
PDB file downloaded successfully.
O14628
PDB file downloaded successfully.
P17022
PDB file downloaded successfully.




[1m[95m# Agent:[00m [1m[92mquery_assurance_agent[00m
[95m## Thought:[00m [92mThe goal is to ensure the UniProt query targets the right proteins, specifically those involved in DNA binding. The provided query "(go:"DNA binding" OR keyword:"DNA binding")" seems reasonable as a starting point, but it's important to test its effectiveness and suggest improvements if necessary. I will use the uniprot_fetch_tool to check if the query returns relevant proteins.[00m
[95m## Using tool:[00m [92muniprot_fetch_tool[00m
[95m## Tool Input:[00m [92m
"{\"query\": \"(go:\\\"DNA binding\\\" OR keyword:\\\"DNA binding\\\")\"}"[00m
[95m## Tool Output:[00m [92m
['P07199', 'Q96BR9', 'O14628', 'P17022'][00m




[1m[95m# Agent:[00m [1m[92mquery_assurance_agent[00m
[95m## Final Answer:[00m [92m
P07199[00m




[1m[95m# Agent:[00m [1m[92mProtein Scaffolding Specialist[00m
[95m## Task:[00m [92mAnalyze the provided UniProt features in JSON format, identifying all functional sites (active sites, binding sites, metal-binding sites, etc.). Interpret the user's desired protein function described in natural language and determine which motifs should be preserved as anchors and which regions masked for redesign. Identify the most appropriate scaffolding approach and provide clear reasoning for your decisions, including motif positions. unction to be preserved: I want a protein that can help in the DNA binding.. use the protein id from previous task[00m




[1m[95m# Agent:[00m [1m[92mProtein Scaffolding Specialist[00m
[95m## Final Answer:[00m [92m
```json
{
  "protein_id": "P07199",
  "protein_name": "Transcription factor AP-1",
  "desired_function": "DNA binding",
  "uniprot_features_analysis": {
    "dna_binding_related_features": [
      {
        "feature_type": "DNA_BIND",
        "feature_description": "DNA binding region",
        "positions": "250-277",
        "justification": "This region is directly involved in DNA binding and is crucial for the desired function. It contains the basic region leucine zipper (bZIP) domain responsible for DNA interaction."
      },
      {
        "feature_type": "DOMAIN",
        "feature_description": "bZIP domain",
        "positions": "250-310",
        "justification": "The bZIP domain is essential for both DNA binding and dimerization. Preserving the entire domain ensures proper folding and function."
      },
        {
        "feature_type": "REGION",
        "feature_descriptio

[1m[95m# Agent:[00m [1m[92mRF Diffusion Expert[00m
[95m## Task:[00m [92mUsing the analysis from the previous task, generate a configuration script for running the RF Diffusion process. Include model settings, diffusion parameters, input file paths, output directory locations, and any optional parameters (such as contigmap).[00m




[1m[95m# Agent:[00m [1m[92mRF Diffusion Expert[00m
[95m## Final Answer:[00m [92m
```bash
#!/bin/bash

# RFdiffusion configuration script

# --- Parameters ---
output_prefix="rfdiffusion_outputs/AP1_design"
num_designs=10
input_pdb="path/to/your/scaffold.pdb"  # Replace with the path to your scaffold PDB file
# If no scaffold is used, comment out the line above and the 'inference.input_pdb' argument below.

# Contig specifications
# Preserving bZIP domain (250-310) and flanking it with variable length regions.
# Assuming the scaffold PDB, if used, has chain ID 'A'. Adjust if necessary.
contigs="[1-249/A250-310]" # N-terminal is variable, bZIP domain is fixed

# Sequence masking
# Masking the N-terminal region (1-249) to allow RFdiffusion to redesign it.
# This assumes the input PDB, if used, has chain ID 'A'.
inpaint_seq="[A1-249]"

# Partial diffusion (optional)
# Use partial diffusion to diversify around the existing scaffold structure.
# Adjust the value of partial_T based

```bash
#!/bin/bash

# RFdiffusion configuration script

# --- Parameters ---
output_prefix="rfdiffusion_outputs/AP1_design"
num_designs=10
input_pdb="path/to/your/scaffold.pdb"  # Replace with the path to your scaffold PDB file
# If no scaffold is used, comment out the line above and the 'inference.input_pdb' argument below.

# Contig specifications
# Preserving bZIP domain (250-310) and flanking it with variable length regions.
# Assuming the scaffold PDB, if used, has chain ID 'A'. Adjust if necessary.
contigs="[1-249/A250-310]" # N-terminal is variable, bZIP domain is fixed

# Sequence masking
# Masking the N-terminal region (1-249) to allow RFdiffusion to redesign it.
# This assumes the input PDB, if used, has chain ID 'A'.
inpaint_seq="[A1-249]"

# Partial diffusion (optional)
# Use partial diffusion to diversify around the existing scaffold structure.
# Adjust the value of partial_T based on the desired level of diversity.
partial_T=20

# --- RFdiffusion command ---
./scripts/ru