# **FraudSleuth Gen AI**

## 1. Setup and Imports

In [16]:
# Environment
import os
from dotenv import load_dotenv
load_dotenv()

# LLM + LangChain
from google import genai
from google.genai import types
from google.api_core import retry
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
from langchain.agents import initialize_agent, Tool, AgentType
from langchain.memory import ConversationBufferMemory
from langchain_google_genai.chat_models import ChatGoogleGenerativeAI


# ChromaDB
import chromadb
from chromadb import Documents, EmbeddingFunction, Embeddings

# Requests for Fraud API
import requests
import json

import re

## 2. Config

In [2]:
# Example .env usage (you can inline if needed)
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
FRAUD_API_KEY = os.getenv("FRAUD_API_KEY")

GENAI_MODEL="gemini-2.0-flash"
EMBEDDING_MODEL = "models/embedding-001"
CHROMA_COLLECTION_NAME = "fraud_docs"

FRAUD_API_URL = "https://ipqualityscore.com/api/json/ip/"


In [3]:
# Authenticate Gemini
client = genai.Client(api_key = GEMINI_API_KEY)

is_retriable = lambda e: (isinstance(e, genai.errors.APIError) and e.code in {429, 503})

genai.models.Models.generate_content = retry.Retry(
    predicate=is_retriable)(genai.models.Models.generate_content)

## 3. Embedding Function

In [4]:
class GeminiEmbeddingFunction(EmbeddingFunction):
    # Specify whether to generate embeddings for documents, or queries
    def __init__(self, document_mode=True):
        self.document_mode = document_mode

    def __call__(self, input: Documents) -> Embeddings:
        if self.document_mode:
            embedding_task = "retrieval_document"
        else:
            embedding_task = "retrieval_query"

        
        response = client.models.embed_content(
                                    model = EMBEDDING_MODEL,
                                    contents = input,
                                    config = types.EmbedContentConfig(
                                        task_type = embedding_task,
                                    )
                                )
        
        # Return list of embedding vectors
        return [e.values for e in response.embeddings]

## 4. Chroma Vector Search

In [5]:
class ChromaVectorSearch:

    def __init__(self, persist_directory="chroma_db", collection_name = CHROMA_COLLECTION_NAME):
        
        self.persist_directory = persist_directory
        self.collection_name = collection_name
        self.client = chromadb.PersistentClient(path = self.persist_directory)
        self.embed_fn = GeminiEmbeddingFunction(document_mode = True)
    
        # Create or get collection
        self.collection = self.client.get_or_create_collection(name = self.collection_name,
                                                               embedding_function = self.embed_fn)
        
        self.load_documents_from_file("data/knowledge_base.txt")

    def load_documents_from_file(self, file_path: str):
        if os.path.exists(file_path):
            with open(file_path, "r") as f:
                docs = [line.strip() for line in f if line.strip()]
                self.add_documents(docs)
                print(f"✅ Loaded {len(docs)} documents from {file_path} into ChromaDB.")
        else:
            print(f"⚠️ File not found: {file_path}")

    def add_documents(self, documents: list[str], ids: list[str] = None):
        """
        Adds documents to ChromaDB with embeddings.
        """
        if not documents:
            print("⚠️ No documents to add.")
            return
        if ids is None:
            ids = [str(i) for i in range(len(documents))]
        self.collection.add(documents = documents, ids = ids)

    def query(self, text: str, n_results: int = 5):
        """
        Queries the collection for top matching documents.
        """

        results = self.collection.query(query_texts=[text], n_results = n_results)
        return results

    def reset_collection(self):
        self.client.delete_collection(self.collection_name)
        self.collection = self.client.get_or_create_collection(name = self.collection_name, 
                                                               embedding_function = self.embed_fn)

## 5. Fraud Detection API Tool

In [6]:
class FraudChecker:
    def __init__(self):
        self.api_key = FRAUD_API_KEY
        self.api_url = FRAUD_API_URL

    def check_ip(self, ip: str) -> str:
        try:
            url = f"{self.api_url}{self.api_key}/{ip}"
            response = requests.get(url)
            if response.status_code == 200:
                data = response.json()
                return f"Fraud Score: {data.get('fraud_score')}, VPN: {data.get('vpn')},"\
                       f" Proxy: {data.get('proxy')}, Tor: {data.get('tor')},"\
                       f" Crawler: {data.get('crawler')}, Recent Abuse: {data.get('recent_abuse')},"\
                       f" Bot: {data.get('is_bot')}"
            else:
                return f"Error from API: {response.status_code}"
        except Exception as e:
            return f"Exception during fraud check: {str(e)}"

## 6. LangChain Prompt Template and Model Setup

In [7]:
# Prompt template to synthesize context and external API result
prompt_template = PromptTemplate(
    input_variables=["query", "retrieved_docs", "fraud_api_result"],
    template="""
        You are a fraud analysis assistant.

        User Query:
        {query}

        Retrieved Knowledge Context:
        {retrieved_docs}

        Fraud Check Result (if applicable):
        {fraud_api_result}

        Based on all of the above, provide a clear and actionable response to the user.
        """
        )

# Set up LangChain Gemini Chat model
llm = ChatGoogleGenerativeAI(model = GENAI_MODEL, temperature = 0.2, google_api_key = GEMINI_API_KEY)
llm_chain = prompt_template | llm

## 7. Tool Definition and Agent

In [8]:
# Initialize tools
fraud_checker = FraudChecker()
vector_search = ChromaVectorSearch()

⚠️ File not found: data/knowledge_base.txt


In [9]:
def extract_ip_or_email(query: str):
    """Basic regex utility to extract IPs or emails from a query."""
    ip_match = re.search(r'\b(?:\d{1,3}\.){3}\d{1,3}\b', query)
    email_match = re.search(r'\b[\w\.-]+@[\w\.-]+\.\w{2,4}\b', query)
    return ip_match.group(0) if ip_match else email_match.group(0) if email_match else None

In [10]:
def combined_tool(query: str) -> str:
    # 1. Get vector search results
    vector_results = vector_search.query(query)
    retrieved_docs = "\n".join(vector_results["documents"][0])

    # 2. If IP or email found, call fraud API
    suspicious_input = extract_ip_or_email(query)
    fraud_api_result = ""
    if suspicious_input:
        fraud_api_result = fraud_checker.check_ip(suspicious_input)
    else:
        fraud_api_result = "No IP or email detected in query."

    # 3. Run final prompt
    return llm_chain.invoke({
            "query": query,
            "retrieved_docs": retrieved_docs,
            "fraud_api_result": fraud_api_result
        })

In [11]:
# Create unified tool
combined_fraud_tool = Tool(
    name="FraudKnowledgeAndCheckTool",
    func=combined_tool,
    description="Use this to handle any query involving fraud detection, signs of fraud, or suspicious IP/email lookup."
)


In [12]:
# Memory for conversation context
memory = ConversationBufferMemory(memory_key = "chat_history", return_messages = True)

# Initialize the agent
fraud_agent = initialize_agent(
    tools = [combined_fraud_tool],
    llm = llm,
    agent = AgentType.CHAT_CONVERSATIONAL_REACT_DESCRIPTION,
    memory = memory,
    verbose = True
)

  memory = ConversationBufferMemory(memory_key = "chat_history", return_messages = True)
  fraud_agent = initialize_agent(


## 8. Test Query

In [17]:
fraud_agent.run("What can you tell me about this IP: 198.51.100.23? Is it suspicious?")



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m```json
{
    "action": "FraudKnowledgeAndCheckTool",
    "action_input": "198.51.100.23"
}
```[0m
Observation: [36;1m[1;3mcontent="Based on the information provided, the IP address 198.51.100.23 appears to be safe.\n\n**Here's a summary:**\n\n*   **Fraud Score: 0** - Indicates a very low risk of fraudulent activity associated with this IP address.\n*   **VPN: False, Proxy: False, Tor: False** - The IP address is not associated with VPNs, proxies, or the Tor network, which are often used to mask IP addresses.\n*   **Crawler: None, Bot: None** - The IP address is not associated with known web crawlers or bots.\n*   **Recent Abuse: False** - There's no recent history of abuse linked to this IP address.\n\n**Actionable Response:**\n\nYou can likely proceed with interactions involving this IP address with a high degree of confidence. The analysis suggests it's a legitimate user or server." additional_kwargs={} response_metadat

'The IP address 198.51.100.23 appears to be safe. It has a very low risk of fraudulent activity, is not associated with VPNs, proxies, or the Tor network, and has no recent history of abuse. It is likely a legitimate user or server.'