In [49]:
import os
from crewai import Agent, Crew, Task, Process
from crewai.project import CrewBase, agent, task, crew, before_kickoff, after_kickoff
import tools.alpha_fold_fetch as alpha_fold_fetch
from crewai.tools import tool
from tools.alpha_fold_fetch import fetch_from_alphafolddb
import json
import requests
from crewai import LLM
from typing import Type
from crewai.tools import BaseTool
from pydantic import BaseModel, Field

In [50]:



dir = r"/home/bharath-sooryaa-m/Documents/BIO/proj/Protein-Designing-With-Agents"

In [51]:
os.environ["CREWAI_DISABLE_TELEMETRY"] = "true"
GROQ_API_KEY = "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"
os.environ["GROQ_API_KEY"] = GROQ_API_KEY

llm = LLM(
    model="groq/gemma2-9b-it",
    temperature=0.7
)
os.environ["GEMINI_API_KEY"] = "AIzaSyDQSOPPaW8BVWXny1ycBfO_tF9jJiYiuag"
gemini_llm = LLM(
    model="gemini/gemini-2.0-flash",
    temperature=0.7
)

In [52]:
# Load RF Diffusion manual context from file
rf_diff_context_path = r"config/RF_diff_context.txt"
with open(rf_diff_context_path, 'r') as f:
    RF_Dif_manual = f.read()

In [53]:
class uniprot_fetch_toolInput(BaseModel):
    """Input schema for uniprot_fetch_tool."""
    argument: str = Field(..., description="Uniprot query")

class ProteinIdInput(BaseModel):
    """Input schema for get_protein__site_info."""
    argument: str = Field(..., description="protein_id")

class uniprot_fetch_tool(BaseTool):
    name: str = "uniprot_fetch_tool"
    description: str = "This tool provides an acces to fetch data from UniProtKB using the UniProt REST API"
    args_schema: Type[BaseModel] = uniprot_fetch_toolInput

    def _run(self, argument: str) -> str:
        url = "https://rest.uniprot.org/uniprotkb/search"
        params = {
            "query": query,
            "format": "json",
            "size": 1 #adjust as needed.
        }

        try:
            response = requests.get(url, params=params)
            response.raise_for_status()  # Raise HTTPError for bad responses (4xx or 5xx)
            data = response.json()
            filename = f"cache/uniprot/{data['results'][0]['primaryAccession'].replace(' ', '_')}.json"
            with open(filename, "w") as f:
                json.dump(data, f, indent=4)
            print(f"Data saved to {filename}")
            try:
                apha_f_id =0
                for i in data['results'][0]['uniProtKBCrossReferences']:
                    if i.get('database') == 'AlphaFoldDB':
                        apha_f_id = i.get('id')
                        break
                print(apha_f_id)
                alpha_fold_fetch.fetch_from_alphafolddb(apha_f_id)
            except: 
                print("No AlphaFoldDB entry found for the given entry ID.")
                return "No AlphaFoldDB entry found for the given entry ID.,provide a query with take protein with structure"
            return data['results'][0]['primaryAccession']

        except requests.exceptions.RequestException as e:
            print(f"Error during UniProt API request: {e}")
            return None
        except ValueError as e: # Catch JSON decoding errors.
            print(f"Error decoding JSON: {e}")
            return None

class get_protein__site_info(BaseTool):
    name: str = "get_protein__site_info"
    description: str = "Fetches the protein site information from cache."
    args_schema: Type[BaseModel] = ProteinIdInput

    def _run(self, argument: str) -> str:
        loc = r"cache/uniprot/{protein_id}.json"
        with open(loc, 'r') as f:
            data = json.load(f)
            print("used")
            return str(data['results'][0]['features'])
        print("No data found")
        return None

In [59]:

class protein_design_crew():
    """Research crew for comprehensive topic analysis and reporting"""

    
    uniprot_fetch_tool = uniprot_fetch_tool()

    def query_generator(self)->Agent:
        return Agent(
            role="uniprot_query_generator",
            goal="Generates a UniProt query from a given protein function: {userinput}. Ensure the query retrieves relevant proteins",
            backstory="Designed as a highly specialized bioinformatics assistant to construct precise UniProt queries.",
            
            verbose=True,
            llm=gemini_llm
        )

    def uniprot_query_assurance_agent(self)->Agent:
        return Agent(
            role="query_assurance_agent",
            goal="Ensures the generated UniProt query is accurate and relevant to the protein function: {userinput}. Verify that the query retrieves the correct proteins and aligns with the intended function."
                "RUN the uniprot_fetch_tool, if it doesn't work then the query is suboptimal, provide suggestions to improve it."
                "uniprot_fetch_tool only works with a parathesis query in string format"
                "Query rules:"
                "-the query has to be string with paranthesis and semicolon and characters no other special characters allowed"
                "-avoid metioning \"query\" as it is unnecessary",
            backstory="This agent acts as a quality control specialist for bioinformatics queries, ensuring that the query targets the right proteins.",
            tools=[uniprot_fetch_tool],
            verbose=True,
            llm=gemini_llm,
        )

    def protein_expert_agent(self)->Agent:
        return Agent(
            role="Protein Scaffolding Specialist",
            goal="Assist in creating protein scaffolds by identifying which motifs should be masked or preserved based on UniProt features.",
            backstory="Expert in computational protein design with experience in analyzing protein structural and functional data.",
            llm=gemini_llm,
            tool=[get_protein__site_info],
            tools_verbose=True,
            verbose=True
        )

    def RF_Diffusion_Expert(self)->Agent:
        return  Agent(
        role="RF Diffusion Expert",
        goal=("Translate the protein scaffolding requirements into specific RF Diffusion implementation strategies, "
            "specifying which motifs to mask versus preserve and providing technical details."),
        backstory="Expert in diffusion-based generative modeling for protein design. Manual context: {RF_Dif_manual}",
        llm=gemini_llm,
        verbose=True
        )

    def plan(self)->Task:
        return Task(
            description=(
                "1. Extract key biological terms from the protein function description: {userinput}.\n"
                "2. Map these terms to UniProt search fields and controlled vocabularies.\n"
                "3. Generate a string within paranthesis UniProt query optimized for accuracy and recall.\n"
                "4. Validate and refine the query to ensure relevant search results."
            ),
            expected_output="UniProt query for the given protein function description",
            agent=self.query_generator()
        )
    

    def query_review(self)->Task:
        return Task(
            description="Review the UniProt query generated by the uniprot_query_generator agent.",
            expected_input="A UniProt query string generated from a protein function description, along with the original user input.",
            expected_output=(
                            "If the query is suboptimal, provide suggestions to improve it."
                            "After a successful query return the protein ID"),
            steps=[
                "Receive the generated UniProt query and the original protein function description.",
                "Submit the query to the UniProt database using uniprot_tool.",
                "Analyze the retrieved proteins and compare their functions to the intended protein function.",
                "resolve the query using the uniprot_fetch_tool unitil the problem is fixed",
                "Approve the query if it runs successfully and retrieves relevant proteins.",
            ],
            acceptance_criteria=[
                "The query retrieves proteins that strongly match the intended function.",
                "The query does not produce irrelevant or overly broad results.",
                "Suggestions for improvement are practical and enhance query precision.",
                "The validation report clearly explains the decision."
            ],
            agent=self.uniprot_query_assurance_agent()
        )
    

    def protein_analysis_task(self)->Task:
        return Task(
            description=("Analyze the provided UniProt features in JSON format, identifying all functional sites (active sites, binding sites, metal-binding sites, etc.). "
                        "Interpret the user's desired protein function described in natural language and determine which motifs should be preserved as anchors and which regions masked for redesign. "
                        "Identify the most appropriate scaffolding approach and provide clear reasoning for your decisions, including motif positions. "
                        "unction to be preserved: {function}. use the protein id from previous task"),
            expected_output=("A detailed JSON report containing an analysis of UniProt features, the identified motifs to be preserved or masked with justification, "
                            "and the recommended scaffolding approach."),
            agent=self.protein_expert_agent(),
            #context=[protein_analysis_task]
        )
    

    def RF_Diffusion_configuration_task(self)->Task:
        return Task(
            description=("Using the analysis from the previous task, generate a configuration script for running the RF Diffusion process. "
                        "Include model settings, diffusion parameters, input file paths, output directory locations, and any optional parameters (such as contigmap)."),
            expected_output="A valid terminal script containing the RF Diffusion configuration with all necessary parameters.",
            agent=self.RF_Diffusion_Expert(),
            context=[self.protein_analysis_task()]
        )
    

    def crew(self)->Crew:
        """A specialized crew for protein design, focusing on scaffolding and RF Diffusion."""
        return Crew(
            agents=[
                self.query_generator(),
                self.uniprot_query_assurance_agent(),
                self.protein_expert_agent(),
                self.RF_Diffusion_Expert()
            ],
            tasks=[
                self.plan(),
                self.query_review(),
                self.protein_analysis_task(),
                self.RF_Diffusion_configuration_task()
            ],
            verbose=True
        )





In [55]:
dir = r"/home/bharath-sooryaa-m/Documents/BIO/proj/Protein-Designing-With-Agents"






In [56]:

'''query_generator = Agent(
    role="uniprot_query_generator",
    goal="Generates a UniProt query from a given protein function: {userinput}. Ensure the query retrieves relevant proteins",
    backstory="Designed as a highly specialized bioinformatics assistant to construct precise UniProt queries.",
    
    verbose=True,
    llm=gemini_llm
)


uniprot_query_assurance_agent = Agent(
    role="query_assurance_agent",
    goal="Ensures the generated UniProt query is accurate and relevant to the protein function: {userinput}. Verify that the query retrieves the correct proteins and aligns with the intended function."
          "RUN the uniprot_fetch_tool, if it doesn't work then the query is suboptimal, provide suggestions to improve it."
          "uniprot_fetch_tool only works with a parathesis query in string format"
          "Query rules:"
          "-the query has to be string with paranthesis and semicolon and characters no other special characters allowed"
          "-avoid metioning \"query\" as it is unnecessary",
    backstory="This agent acts as a quality control specialist for bioinformatics queries, ensuring that the query targets the right proteins.",
    tools=[uniprot_fetch_tool],
    verbose=True,
    llm=gemini_llm,
)
'''

'''plan = Task(
    description=(
        "1. Extract key biological terms from the protein function description: {userinput}.\n"
        "2. Map these terms to UniProt search fields and controlled vocabularies.\n"
        "3. Generate a string within paranthesis UniProt query optimized for accuracy and recall.\n"
        "4. Validate and refine the query to ensure relevant search results."
    ),
    expected_output="UniProt query for the given protein function description",
    agent=query_generator
)'''

'''query_review = Task(
    description="Review the UniProt query generated by the uniprot_query_generator agent.",
    expected_input="A UniProt query string generated from a protein function description, along with the original user input.",
    expected_output=(
                     "If the query is suboptimal, provide suggestions to improve it."
                     "After a successful query return the protein ID"),
    steps=[
        "Receive the generated UniProt query and the original protein function description.",
        "Submit the query to the UniProt database using uniprot_tool.",
        "Analyze the retrieved proteins and compare their functions to the intended protein function.",
        "resolve the query using the uniprot_fetch_tool unitil the problem is fixed",
        "Approve the query if it runs successfully and retrieves relevant proteins.",
    ],
    acceptance_criteria=[
        "The query retrieves proteins that strongly match the intended function.",
        "The query does not produce irrelevant or overly broad results.",
        "Suggestions for improvement are practical and enhance query precision.",
        "The validation report clearly explains the decision."
    ],
    agent=uniprot_query_assurance_agent
)'''

'''protein_expert_agent = Agent(
    role="Protein Scaffolding Specialist",
    goal="Assist in creating protein scaffolds by identifying which motifs should be masked or preserved based on UniProt features.",
    backstory="Expert in computational protein design with experience in analyzing protein structural and functional data.",
    llm=gemini_llm,
    tool=[get_protein__site_info],
    tools_verbose=True,
    verbose=True
)

RF_Diffusion_Expert = Agent(
    role="RF Diffusion Expert",
    goal=("Translate the protein scaffolding requirements into specific RF Diffusion implementation strategies, "
          "specifying which motifs to mask versus preserve and providing technical details."),
    backstory="Expert in diffusion-based generative modeling for protein design. Manual context: {RF_Dif_manual}",
    llm=gemini_llm,
    verbose=True
)'''

'''protein_analysis_task = Task(
    description=("Analyze the provided UniProt features in JSON format, identifying all functional sites (active sites, binding sites, metal-binding sites, etc.). "
                 "Interpret the user's desired protein function described in natural language and determine which motifs should be preserved as anchors and which regions masked for redesign. "
                 "Identify the most appropriate scaffolding approach and provide clear reasoning for your decisions, including motif positions. "
                 "unction to be preserved: {function}. use the protein id from previous task"),
    expected_output=("A detailed JSON report containing an analysis of UniProt features, the identified motifs to be preserved or masked with justification, "
                     "and the recommended scaffolding approach."),
    agent=protein_expert_agent,
    #context=[protein_analysis_task]
)'''

'''RF_Diffusion_configuration_task = Task(
    description=("Using the analysis from the previous task, generate a configuration script for running the RF Diffusion process. "
                 "Include model settings, diffusion parameters, input file paths, output directory locations, and any optional parameters (such as contigmap)."),
    expected_output="A valid terminal script containing the RF Diffusion configuration with all necessary parameters.",
    agent=RF_Diffusion_Expert,
    context=[protein_analysis_task]
)
'''







'RF_Diffusion_configuration_task = Task(\n    description=("Using the analysis from the previous task, generate a configuration script for running the RF Diffusion process. "\n                 "Include model settings, diffusion parameters, input file paths, output directory locations, and any optional parameters (such as contigmap)."),\n    expected_output="A valid terminal script containing the RF Diffusion configuration with all necessary parameters.",\n    agent=RF_Diffusion_Expert,\n    context=[protein_analysis_task]\n)\n'

In [60]:
def run():
    """
    Run the crew.
    """
    combined_inputs = {
        "userinput": "I want a protein that can help in the DNA binding.",
        "session": "1",
        "function": "I want a protein that can help in the DNA binding.",
        "Base protein": "P49593",
        "RF_Dif_manual":RF_Dif_manual
    }

    result = protein_design_crew().crew().kickoff(inputs= combined_inputs)
    print(result)

In [61]:
if __name__ == "__main__":
    run()

ValidationError: 1 validation error for Agent
tools.0
  Input should be a valid dictionary or instance of BaseTool [type=model_type, input_value=<class '__main__.uniprot_fetch_tool'>, input_type=ModelMetaclass]
    For further information visit https://errors.pydantic.dev/2.9/v/model_type

Gemini ONLY