# **FraudSleuth Gen AI**

## 1. Setup and Imports

In [3]:
# Environment
import os
from dotenv import load_dotenv
load_dotenv()

# LLM + LangChain
from google import genai
from google.genai import types
from google.api_core import retry
# from langchain.prompts import PromptTemplate
# from langchain.chains import LLMChain
from langchain_core.runnables import Runnable
from langchain_core.prompts import PromptTemplate
#from langchain.agents import initialize_agent, Tool, AgentType
#from langchain.memory import ConversationBufferMemory
from langchain_google_genai.chat_models import ChatGoogleGenerativeAI
from langgraph.graph import StateGraph, END

# ChromaDB
import chromadb
from chromadb import Documents, EmbeddingFunction, Embeddings

# Requests for Fraud API
import requests

import re

# MLflow
import mlflow

## 2. Config

In [4]:
# --- Secrets -----------------------------------------------------------------
# Read from the process environment (a .env file is loaded by load_dotenv() in
# the imports cell). Either value is None when its variable is not set.
GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY")
FRAUD_API_KEY = os.environ.get("FRAUD_API_KEY")

# --- Model and vector-store settings -----------------------------------------
GENAI_MODEL = "gemini-2.0-flash"              # chat model used for the final answer
EMBEDDING_MODEL = "models/embedding-001"      # embedding model used for ChromaDB
CHROMA_COLLECTION_NAME = "fraud_docs"         # default ChromaDB collection name

# --- External fraud-reputation API -------------------------------------------
FRAUD_API_URL = "https://ipqualityscore.com/api/json"


In [5]:
# Authenticate Gemini
client = genai.Client(api_key=GEMINI_API_KEY)


def is_retriable(e):
    """Return True when `e` is a Gemini APIError whose code is 429 or 503."""
    return isinstance(e, genai.errors.APIError) and e.code in {429, 503}


# Monkey-patch Models.generate_content so that retriable API errors are retried.
# The guard keeps this cell idempotent: the Retry decorator wraps the function
# (exposing the original as `__wrapped__`), so without the check every re-run of
# the cell would stack another retry layer on top of the previous one.
# NOTE(review): the notebook's own Gemini calls go through
# `client.models.embed_content` and the LangChain chat model; confirm whether
# either of them actually passes through this patched method.
if not hasattr(genai.models.Models.generate_content, "__wrapped__"):
    genai.models.Models.generate_content = retry.Retry(
        predicate=is_retriable)(genai.models.Models.generate_content)

## 3. Embedding Function

In [6]:
class GeminiEmbeddingFunction(EmbeddingFunction):
    """ChromaDB embedding function backed by the module-level Gemini `client`.

    Attributes:
        document_mode: When True, texts are embedded with the
            "retrieval_document" task type; when False, with "retrieval_query".
    """

    def __init__(self, document_mode=True):
        self.document_mode = document_mode

    def __call__(self, input: Documents) -> Embeddings:
        """Embed `input` with EMBEDDING_MODEL and return one vector per embedding."""
        task_type = "retrieval_document" if self.document_mode else "retrieval_query"
        embed_config = types.EmbedContentConfig(task_type=task_type)

        result = client.models.embed_content(
            model=EMBEDDING_MODEL,
            contents=input,
            config=embed_config,
        )

        # Collect the raw value list of every embedding in the response.
        vectors = []
        for embedding in result.embeddings:
            vectors.append(embedding.values)
        return vectors

## 4. Chroma Vector Search

In [7]:
class ChromaVectorSearch:
    """Persistent ChromaDB collection of knowledge snippets embedded with Gemini.

    On construction the collection is opened (or created) under
    `persist_directory` and the knowledge-base file is loaded into it.

    Args:
        persist_directory: Directory used by the persistent Chroma client.
        collection_name: Name of the collection to open or create.
        knowledge_base_path: Text file with one document per non-empty line.
            Pass None (or an empty string) to skip loading on construction.
    """

    def __init__(self, persist_directory="chroma_db", collection_name = CHROMA_COLLECTION_NAME,
                 knowledge_base_path="../data/knowledge_base.txt"):

        self.persist_directory = persist_directory
        self.collection_name = collection_name
        self.client = chromadb.PersistentClient(path = self.persist_directory)
        self.embed_fn = GeminiEmbeddingFunction(document_mode = True)

        # Create or get collection
        self.collection = self.client.get_or_create_collection(name = self.collection_name,
                                                               embedding_function = self.embed_fn)

        if knowledge_base_path:
            self.load_documents_from_file(knowledge_base_path)

    def load_documents_from_file(self, file_path: str):
        """Load every non-empty line of `file_path` as one document.

        Prints a warning instead of raising when the file does not exist.
        """
        if os.path.exists(file_path):
            # Explicit encoding so the result does not depend on the platform default.
            with open(file_path, "r", encoding="utf-8") as f:
                docs = [line.strip() for line in f if line.strip()]
            self.add_documents(docs)
            print(f"✅ Loaded {len(docs)} documents from {file_path} into ChromaDB.")
        else:
            print(f"⚠️ File not found: {file_path}")

    def add_documents(self, documents: list[str], ids: list[str] = None):
        """
        Adds documents to ChromaDB with embeddings.

        When `ids` is omitted, positional ids "0".."n-1" are used. Documents are
        upserted, so re-running against an existing persistent store refreshes
        the stored text for those ids. Because the default ids are positional,
        callers adding several independent batches should pass explicit ids,
        otherwise a later batch replaces an earlier one.
        """
        if not documents:
            print("⚠️ No documents to add.")
            return
        if ids is None:
            ids = [str(i) for i in range(len(documents))]
        if len(ids) != len(documents):
            raise ValueError(
                f"Got {len(ids)} ids for {len(documents)} documents; lengths must match."
            )
        self.collection.upsert(documents = documents, ids = ids)

    def query(self, text: str, n_results: int = 5):
        """
        Queries the collection for top matching documents.

        The collection was given `self.embed_fn` as its embedding function, so
        the flag is flipped to query mode for the duration of the call (making
        the query text use the "retrieval_query" task type) and then restored
        so later document inserts still use "retrieval_document".
        """
        previous_mode = self.embed_fn.document_mode
        self.embed_fn.document_mode = False
        try:
            return self.collection.query(query_texts=[text], n_results = n_results)
        finally:
            self.embed_fn.document_mode = previous_mode

    def reset_collection(self):
        """Delete the collection and recreate it empty with the same embedding function."""
        self.client.delete_collection(self.collection_name)
        self.collection = self.client.get_or_create_collection(name = self.collection_name,
                                                               embedding_function = self.embed_fn)

## 5. Fraud Detection API Tool

In [8]:
class FraudChecker:
    """Thin client for the fraud-reputation API configured in FRAUD_API_URL.

    Both public methods return a human-readable string (never raise), because
    their output is fed straight into the LLM prompt.
    """

    # Seconds to wait for the API; without a timeout a stalled connection
    # would block the whole graph run indefinitely.
    REQUEST_TIMEOUT = 10

    def __init__(self):
        self.api_key = FRAUD_API_KEY
        self.api_url = FRAUD_API_URL

    def _redact(self, text: str) -> str:
        """Remove the API key from `text` so it cannot leak into prompts or logs."""
        if self.api_key:
            return text.replace(str(self.api_key), "[REDACTED]")
        return text

    def _fetch(self, endpoint: str, value: str):
        """GET `{api_url}/{endpoint}/{api_key}/{value}` and decode the JSON body.

        Returns:
            `(data, None)` on success, or `(None, error_message)` when the
            request fails, the status is not 200, or the payload reports
            `success: false`.
        """
        from urllib.parse import quote  # stdlib; local import keeps the block self-contained

        try:
            # Percent-encode the value: emails contain characters such as "@" and "+"
            # that must not be placed raw into a URL path segment.
            url = f"{self.api_url}/{endpoint}/{self.api_key}/{quote(str(value), safe='')}"
            response = requests.get(url, timeout=self.REQUEST_TIMEOUT)
            if response.status_code != 200:
                return None, f"Error from API: {response.status_code}"
            data = response.json()
            # NOTE(review): assumes the API reports request-level failures (bad
            # key, quota) as HTTP 200 with `success: false` and a `message` —
            # confirm against the provider documentation.
            if data.get("success") is False:
                reason = self._redact(str(data.get("message", "request was not successful")))
                return None, f"Error from API: {reason}"
            return data, None
        except Exception as e:
            # The request URL embeds the API key and exception text can echo
            # that URL, so redact before returning it.
            return None, f"Exception during fraud check: {self._redact(str(e))}"

    def check_ip(self, ip: str) -> str:
        """Look up `ip` and return a one-line summary, or an error description."""
        data, error = self._fetch("ip", ip)
        if error is not None:
            return error
        return f"IP: {ip}," \
               f"Fraud Score: {data.get('fraud_score')}, VPN: {data.get('vpn')},"\
               f" Proxy: {data.get('proxy')}, Tor: {data.get('tor')},"\
               f" Crawler: {data.get('crawler')}, Recent Abuse: {data.get('recent_abuse')},"\
               f" Bot: {data.get('is_bot')}"

    def check_email(self, email: str) -> str:
        """Look up `email` and return a one-line summary, or an error description."""
        data, error = self._fetch("email", email)
        if error is not None:
            return error
        return f"Email: {email}," \
               f"Fraud Score: {data.get('fraud_score', 'N/A')}, Valid: {data.get('valid')},"\
               f" Disposable: {data.get('disposable')}, Recent Abuse: {data.get('recent_abuse')}"

## 6. LangChain Prompt Template and Model Setup

In [9]:
# Prompt that merges the user's question, the retrieved knowledge snippets and
# the external fraud-API result into a single instruction for the chat model.
prompt_template = PromptTemplate(
    template="""
        You are a fraud detection assistant.

        User Query:
        {query}

        Retrieved Knowledge Context:
        {retrieved_docs}

        Fraud Check Result (if applicable):
        {fraud_api_result}

        Based on all of the above, provide a clear and actionable response to the user.
        """,
    input_variables=["query", "retrieved_docs", "fraud_api_result"],
)

# Gemini chat model wrapped by LangChain.
llm = ChatGoogleGenerativeAI(
    model=GENAI_MODEL,
    temperature=0.2,
    google_api_key=GEMINI_API_KEY,
)

# Runnable pipeline: fill the prompt, then send it to the model.
llm_chain: Runnable = prompt_template | llm

In [10]:
# Define state
from typing import TypedDict

class FraudDetectionState(TypedDict, total=False):
    """State dictionary passed between the graph nodes.

    Declared with `total=False` because the graph is entered with only `query`
    set; each node then adds its own key.
    """

    # The user's question; supplied by the caller when the graph is invoked.
    query: str
    # Retrieved knowledge snippets joined with newlines (set by retrieve_docs).
    retrieved_docs: str
    # Summary or error string from the fraud API (set by check_fraud_api).
    fraud_api_result: str
    # Value returned by the LLM chain (set by generate_response).
    # NOTE(review): annotated as str, but downstream cells read `.content` and
    # `.usage_metadata` from it, so it appears to hold a message object — confirm.
    response: str

## 7. Tool Definition and Agent

In [11]:
# Shared tool instances used by the graph nodes below.
# Constructing ChromaVectorSearch opens the persistent collection and loads the
# knowledge-base file, so this cell prints a load (or file-not-found) message.
vector_search, fraud_checker = ChromaVectorSearch(), FraudChecker()

✅ Loaded 5 documents from ../data/knowledge_base.txt into ChromaDB.


In [12]:
def extract_ip_or_email(query: str):
    """Return the first IPv4 address or, failing that, the first email in `query`.

    An IPv4 address takes precedence over an email when both are present.
    Each IPv4 octet must be in the range 0-255, so strings such as
    "999.999.999.999" are not treated as addresses. Email top-level domains of
    any length >= 2 are accepted (e.g. ".online"), not just 2-4 characters.

    Returns:
        The matched substring, or None when neither pattern is found.
    """
    octet = r"(?:25[0-5]|2[0-4][0-9]|[01]?[0-9]?[0-9])"
    ip_match = re.search(rf"\b(?:{octet}\.){{3}}{octet}\b", query)
    if ip_match:
        return ip_match.group(0)

    email_match = re.search(r"\b[\w\.-]+@[\w\.-]+\.\w{2,}\b", query)
    return email_match.group(0) if email_match else None

In [13]:
# Vector Search
def retrieve_docs(state: FraudDetectionState) -> FraudDetectionState:
    """Graph node: look up knowledge snippets for the query.

    Returns a copy of `state` with `retrieved_docs` set to the documents of the
    first (only) query result, joined with newlines.
    """
    print("🔧 Tool: Vector Search called...")
    search_hits = vector_search.query(state["query"])
    top_documents = search_hits["documents"][0]

    updated_state = dict(state)
    updated_state["retrieved_docs"] = "\n".join(top_documents)
    return updated_state

# Fraud API Check
def check_fraud_api(state: FraudDetectionState) -> FraudDetectionState:
    """Graph node: run the fraud API on any IP or email found in the query.

    Returns a copy of `state` with `fraud_api_result` set to the checker's
    summary string, or to a fixed notice when nothing checkable was found.
    """
    print("🔧 Tool: Check Fraud API called...")
    candidate = extract_ip_or_email(state["query"])

    if not candidate:
        outcome = "No IP or email detected in query."
    elif "@" in candidate:
        # An "@" marks the extracted value as an email address.
        outcome = fraud_checker.check_email(candidate)
    else:
        outcome = fraud_checker.check_ip(candidate)

    return {**state, "fraud_api_result": outcome}

# Final LLM synthesis
def generate_response(state: FraudDetectionState) -> FraudDetectionState:
    """Graph node: ask the LLM chain for the final answer.

    Feeds the query, retrieved documents and fraud-API result to `llm_chain`
    and returns a copy of `state` with `response` set to whatever the chain
    returns (stored as-is, not converted to text).
    """
    print("🔧 Tool: Generate Response called...")
    prompt_inputs = {
        key: state[key]
        for key in ("query", "retrieved_docs", "fraud_api_result")
    }
    chain_output = llm_chain.invoke(prompt_inputs)
    return {**state, "response": chain_output}

In [14]:
graph = StateGraph(FraudDetectionState)

# Register the three processing steps as graph nodes.
for node_name, node_fn in (
    ("vector_search", retrieve_docs),
    ("fraud_check", check_fraud_api),
    ("generate_response", generate_response),
):
    graph.add_node(node_name, node_fn)

# Linear flow: vector_search -> fraud_check -> generate_response -> END
graph.set_entry_point("vector_search")
for edge_start, edge_end in (
    ("vector_search", "fraud_check"),
    ("fraud_check", "generate_response"),
    ("generate_response", END),
):
    graph.add_edge(edge_start, edge_end)

# Compile the graph into the runnable app used by the cells below.
fraud_graph = graph.compile()

## 8. Test Query

In [15]:
# Smoke test: run one IP question through the whole graph.
demo_query = "What can you tell me about this IP: 198.51.100.23?"
initial_state = dict(query=demo_query)
final_state = fraud_graph.invoke(initial_state)

# Show the full state for inspection, then just the model's answer text.
agent_message = final_state["response"]
print("Final State:", final_state)
print("✅ Final Response:", agent_message.content)

🔧 Tool: Vector Search called...
🔧 Tool: Check Fraud API called...
🔧 Tool: Generate Response called...
Final State: {'query': 'What can you tell me about this IP: 198.51.100.23?', 'retrieved_docs': '"IP spoofing is a method of fraudulent access where attackers impersonate IP addresses."\n"Transaction from a suspicious IP address flagged by fraud detection API."\n"Phishing is a fraudulent attempt to obtain sensitive information by disguising as a trustworthy entity."\n"Large withdrawals followed by immediate transfer to multiple accounts is a red flag."\n"Credit card fraud involves unauthorized use of a credit or debit card."', 'fraud_api_result': 'IP: 198.51.100.23,Fraud Score: 0, VPN: False, Proxy: False, Tor: False, Crawler: None, Recent Abuse: False, Bot: None', 'response': AIMessage(content="Based on the information available, IP address 198.51.100.23 has a fraud score of 0 and is not associated with a VPN, proxy, Tor network, or known crawler. There's no indication of recent abuse 

## 9. Run experiments using MLflow

In [17]:
# Sample queries and expected outcomes for evaluation
# Each case pairs a query with a substring the agent's answer is expected to
# contain; the evaluation cell compares them case-insensitively.
evaluation_data = [
    {"query": case_query, "expected_contains": case_expected}
    for case_query, case_expected in (
        ("Check if 8.8.8.8 is suspicious", "not suspicious"),
        ("How do I upgrade my account?", "Go to Settings"),
        ("This email looks suspicious: scam@fraudmail.com", "fraud score"),
    )
]

In [25]:
# MLflow limits the length of param values, and LLM answers can be long. Only a
# short preview is stored as a param; the full answer is logged as a text artifact.
RESPONSE_PREVIEW_CHARS = 250

with mlflow.start_run(run_name="fraud_agent_eval_langgraph"):

    # Log basic metadata
    mlflow.log_param("embedding_model", EMBEDDING_MODEL)
    mlflow.log_param("llm_model", GENAI_MODEL)

    scores = []

    for idx, row in enumerate(evaluation_data):
        query_number = idx + 1
        initial_state = {"query": row["query"]}
        result = fraud_graph.invoke(initial_state)

        # LangGraph returns the final state; the answer lives on the message
        # stored under "response". Fall back to an empty string if it is missing
        # so one bad run is scored as a failure instead of aborting the loop.
        response_message = result.get("response")
        response_text = getattr(response_message, "content", "") or ""
        if not isinstance(response_text, str):
            response_text = str(response_text)

        print(f"\n🧪 Query {query_number}: {row['query']}")
        print(f"✅ Agent Response: {response_text}")

        # Simple case-insensitive substring match for evaluation
        success = row["expected_contains"].lower() in response_text.lower()
        scores.append(success)

        # Log each result: short preview as a param, full text as an artifact.
        mlflow.log_param(f"query_{query_number}", row["query"])
        mlflow.log_param(f"result_{query_number}", response_text[:RESPONSE_PREVIEW_CHARS])
        mlflow.log_text(response_text, f"responses/query_{query_number}.txt")

        mlflow.log_metric(f"query_{query_number}_success", int(success))

        # Token usage may be absent; use `step` so every query keeps its own
        # point in the metric history instead of all landing on step 0.
        usage = getattr(response_message, "usage_metadata", None) or {}
        if "output_tokens" in usage:
            mlflow.log_metric("output_token_length", usage["output_tokens"], step=query_number)
        if "input_tokens" in usage:
            mlflow.log_metric("input_token_length", usage["input_tokens"], step=query_number)

    # Aggregate metric (0.0 when there were no evaluation cases)
    accuracy = sum(scores) / len(scores) if scores else 0.0
    mlflow.log_metric("overall_accuracy", accuracy)
    print(f"\n📊 Final Evaluation Accuracy: {accuracy:.2f}")

🔧 Tool: Vector Search called...
🔧 Tool: Check Fraud API called...
🔧 Tool: Generate Response called...

🧪 Query 1: Check if 8.8.8.8 is suspicious
✅ Agent Response: Based on the information provided, 8.8.8.8 does not appear to be suspicious.

*   **Fraud Check Result:** The fraud detection API returned a Fraud Score of 0 for the IP address 8.8.8.8. It is not associated with VPNs, proxies, Tor networks, crawlers, recent abuse, or bots.

*   **Knowledge Context:** While the knowledge context mentions suspicious IP addresses, IP spoofing, and phishing, the specific IP address 8.8.8.8 is not flagged as suspicious by the fraud detection API.

**Recommendation:**

The IP address 8.8.8.8 itself does not raise any immediate red flags based on the available information. However, it's crucial to remember that this is just one data point. Consider the context of the transaction or activity associated with this IP address. If there are other suspicious indicators (e.g., large withdrawals, transfers 