In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for folder, _, files in os.walk('/kaggle/input'):
    # Emit the full path of every file found directly inside this folder
    for path in (os.path.join(folder, name) for name in files):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Gene Variant Knowledge Agent

**Overview:**  
This agent helps students and researchers quickly retrieve and summarize scientific information about gene variants. Users input a variant (e.g., BRCA1 c.5266dupC) and the agent provides:

- Mutation type (frameshift, missense, etc.)
- Affected protein region
- Key research summary


# Problem Statement

Researching gene variants is slow and manual. Students and researchers often spend a lot of time finding scientific information about variants like BRCA1 c.5266dupC or TP53 p.Arg175His. This agent automates the process, summarizing variant details efficiently.


# Solution Approach

The Gene Variant Knowledge Agent uses a **multi-agent system**:

1. **Fetcher Agent**: Retrieves raw variant data from a database (either the mock database or a real one).  
2. **Summarizer Agent**: Converts raw data into readable summaries.  
3. **Memory Agent**: Stores previous queries for session continuity.  

A **custom tool** validates variant inputs, and the five most recent queries are kept in session memory for follow-ups.


For now, the agent uses a mock variant database to demonstrate the workflow end-to-end.
However, the design can easily be extended to use real data sources such as the ClinVar, NCBI Gene, or Ensembl APIs.

In [2]:
# For data handling
import requests
import json

# For memory/session storage
from collections import deque

# Optional: for summaries with LLMs (replace with Kaggle-supported LLM if needed)
# from openai import OpenAI

# For logging
import logging
logging.basicConfig(level=logging.INFO)


In [3]:
def parse_variant(variant):
    """
    Check that a variant string has the shape "<GENE> <c.|p.><change>".

    This is a lightweight shape check, not a full HGVS parser. The gene
    symbol must start with a letter or digit and may contain hyphens
    (e.g. "HLA-A"). The change part accepts letters and digits plus the
    punctuation used by common HGVS-style descriptions: "_" (ranges),
    ">" (substitutions), "+" / "-" (intronic/UTR offsets), "*" (stop),
    "=" (no change), "?" (unknown) and parentheses (predicted changes).

    Args:
        variant: Candidate variant string, e.g. "BRCA1 c.5266dupC".

    Returns:
        True if the entire string matches the expected shape, else False.
        Non-string input is reported as invalid (False) instead of raising.
    """
    import re

    if not isinstance(variant, str):
        return False

    pattern = r"[A-Za-z0-9][A-Za-z0-9-]* [cp]\.[A-Za-z0-9_>+\-*=()?]+"
    # fullmatch, not match + "$": "$" also matches just before a trailing
    # newline, which would let "BRCA1 c.5266dupC\n" through as valid.
    return re.fullmatch(pattern, variant) is not None

# Example usage: two well-formed variants, then one missing the c./p. prefix
# (prints True, True, False)
for example in ("BRCA1 c.5266dupC", "TP53 p.Arg175His", "BRCA1 5266dupC"):
    print(parse_variant(example))


True
True
False


In [4]:
# Rolling session memory: only the 5 most recent query/result pairs are kept;
# appending a sixth entry drops the oldest one.
session_memory = deque(maxlen=5)


def add_to_memory(variant, result):
    """Record a variant and its result as the newest session-memory entry."""
    entry = dict(variant=variant, result=result)
    session_memory.append(entry)


def get_previous_queries():
    """Return the remembered entries, oldest first, as a new list."""
    return [entry for entry in session_memory]


In [5]:
def fetch_variant_info(variant):
    """
    Look up a variant in the built-in mock database (a stand-in for ClinVar / NCBI).

    Returns a dict with the keys "mutation_type", "protein_region" and
    "research_summary". A variant that is not in the mock database yields a
    placeholder record ("Unknown", "Unknown", "No data found").
    """
    field_names = ("mutation_type", "protein_region", "research_summary")

    # Mock records, stored as tuples in the same order as field_names
    mock_rows = {
        "BRCA1 c.5266dupC": (
            "Frameshift",
            "BRCT domain",
            "Found in multiple studies, associated with increased breast cancer risk",
        ),
        "TP53 p.Arg175His": (
            "Missense",
            "DNA-binding domain",
            "Common pathogenic variant affecting tumor suppression",
        ),
    }
    placeholder = ("Unknown", "Unknown", "No data found")

    row = mock_rows.get(variant, placeholder)
    return dict(zip(field_names, row))


In [6]:
def summarize_variant_info(variant_info):
    """
    Render a variant record as a three-line, human-readable summary.

    variant_info must provide the keys "mutation_type", "protein_region" and
    "research_summary"; a missing key raises KeyError. The result has one
    "Label: value" line per field, joined by newlines, with no trailing newline.
    """
    labelled_fields = (
        ("Mutation Type", "mutation_type"),
        ("Affected Protein Region", "protein_region"),
        ("Research Summary", "research_summary"),
    )
    lines = [f"{label}: {variant_info[key]}" for label, key in labelled_fields]
    return "\n".join(lines)


In [7]:
def gene_variant_agent(variant):
    """
    Run the full pipeline for one variant: validate, fetch, summarize, remember.

    Args:
        variant: Variant string in "<GENE> c./p.<change>" form, e.g.
            "BRCA1 c.5266dupC". Leading and trailing whitespace on a string
            input is ignored.

    Returns:
        The formatted summary from summarize_variant_info, or the fixed message
        "Invalid variant format. Use Gene c./p. notation." when parse_variant
        rejects the input. Rejected inputs are neither stored in session
        memory nor logged.
    """
    # fetch_variant_info looks the variant up by exact key, so stray surrounding
    # whitespace (e.g. a trailing newline from pasted input) would otherwise
    # either be rejected or resolve to the "Unknown" placeholder record.
    if isinstance(variant, str):
        variant = variant.strip()

    if not parse_variant(variant):
        return "Invalid variant format. Use Gene c./p. notation."

    info = fetch_variant_info(variant)
    summary = summarize_variant_info(info)

    add_to_memory(variant, summary)

    # Pass the variant as a logging argument so the message is only formatted
    # when the record is actually emitted.
    logging.info("Variant processed: %s", variant)
    return summary


In [8]:
# Run the agent on the two variants that exist in the mock database
for demo_variant in ("BRCA1 c.5266dupC", "TP53 p.Arg175His"):
    print(gene_variant_agent(demo_variant))

# Show what the session memory recorded for the calls above
print("\nPrevious Queries:")
print(get_previous_queries())


INFO:root:Variant processed: BRCA1 c.5266dupC
INFO:root:Variant processed: TP53 p.Arg175His


Mutation Type: Frameshift
Affected Protein Region: BRCT domain
Research Summary: Found in multiple studies, associated with increased breast cancer risk
Mutation Type: Missense
Affected Protein Region: DNA-binding domain
Research Summary: Common pathogenic variant affecting tumor suppression

Previous Queries:
[{'variant': 'BRCA1 c.5266dupC', 'result': 'Mutation Type: Frameshift\nAffected Protein Region: BRCT domain\nResearch Summary: Found in multiple studies, associated with increased breast cancer risk'}, {'variant': 'TP53 p.Arg175His', 'result': 'Mutation Type: Missense\nAffected Protein Region: DNA-binding domain\nResearch Summary: Common pathogenic variant affecting tumor suppression'}]


# Agent Workflow


User Input → Parse Tool → Fetcher Agent → Summarizer Agent → Memory → Output


# Conclusion

The Gene Variant Knowledge Agent demonstrates a **multi-agent system** with tools and session memory.  
It efficiently summarizes gene variant information for students and researchers. 
