In [2]:
import os
import getpass

from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import DeepLake

# Prompt for the Activeloop token interactively (getpass does not echo the input)
# and store it in this process's environment.
# NOTE(review): presumably the DeepLake vector store reads ACTIVELOOP_TOKEN when it
# opens hub:// datasets -- confirm against the Deep Lake documentation.
os.environ['ACTIVELOOP_TOKEN'] = getpass.getpass('Activeloop Token:')

In [3]:
# Embedding client that is handed to the DeepLake stores created further down.
# OPENAI_API_KEY must already be set in the environment; a missing variable raises
# KeyError on this line.
# NOTE(review): chunk_size=1 presumably limits each embedding request to a single
# text (an Azure OpenAI restriction) -- confirm before changing it.
embeddings = OpenAIEmbeddings(chunk_size=1, openai_api_key=os.environ['OPENAI_API_KEY'])

In [4]:
import os
from langchain.document_loaders import TextLoader

# Root of the local repository checkout to index. This is an absolute, machine-specific
# path: change it to wherever the repository lives on your machine.
root_dir = '/home/sozercan/projects/gatekeeper'

docs = []
# (path, error) pairs for every file that could not be loaded, so skipped files are
# inspectable after the cell runs instead of disappearing silently.
skipped_files = []

for dirpath, dirnames, filenames in os.walk(root_dir):
    for file in filenames:
        file_path = os.path.join(dirpath, file)
        try:
            loader = TextLoader(file_path, encoding='utf-8')
            docs.extend(loader.load_and_split())
        except Exception as e:
            # Loading stays best-effort: one unreadable file (presumably binary or
            # non-UTF-8 content) must not abort the whole walk, but the failure is
            # recorded rather than discarded.
            skipped_files.append((file_path, repr(e)))

# os.walk yields nothing for a missing directory, so a count of 0 here usually means
# root_dir is wrong.
print(f"Loaded {len(docs)} document chunks from {root_dir}; skipped {len(skipped_files)} unreadable files")

In [None]:
from langchain.text_splitter import CharacterTextSplitter

# Re-split the loaded documents before indexing, using chunk_size=1000 and no overlap
# between consecutive chunks.
# NOTE(review): chunk_size is presumably measured in characters and is a target rather
# than a hard cap -- confirm against the CharacterTextSplitter documentation.
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
texts = text_splitter.split_documents(docs)

In [None]:
# Ingest step: point a DeepLake store at hub://<username>/gatekeeper and add every
# chunk in `texts` to it.
# NOTE(review): presumably add_documents embeds each chunk with `embeddings` and
# uploads it, which costs API calls and network time -- confirm before re-running.
username = "sozercan" # replace with your username from app.activeloop.ai
db = DeepLake(dataset_path=f"hub://{username}/gatekeeper", embedding_function=embeddings) #dataset would be publicly available
db.add_documents(texts)

In [None]:
# Re-open the dataset in read-only mode for querying; this rebinds `db`.
# The dataset path is hard-coded to the "sozercan" account rather than built from a
# variable, so edit it here if the data was ingested under a different account.
db = DeepLake(dataset_path="hub://sozercan/gatekeeper", read_only=True, embedding_function=embeddings)

In [10]:
# Wrap the vector store as a retriever and apply all search settings in one update.
# NOTE(review): 'cos' presumably selects cosine distance, and fetch_k / k presumably
# are the candidate-pool size and the number of results returned when maximal
# marginal relevance is enabled -- confirm against the DeepLake vector store docs.
retriever = db.as_retriever()
retriever.search_kwargs.update(
    {
        'distance_metric': 'cos',
        'fetch_k': 100,
        'maximal_marginal_relevance': True,
        'k': 10,
    }
)

In [11]:
def filter(x):
    """Decide whether a stored sample should be kept in the search results.

    Note: the name shadows the builtin ``filter``; it is kept so that the
    ``retriever.search_kwargs['filter'] = filter`` hook keeps working.

    Args:
        x: A mapping-like sample with 'text' and 'metadata' entries. Each entry
            must expose ``.data()`` returning a dict whose 'value' key holds the
            payload: the chunk text for 'text', and a dict with a 'source' path
            for 'metadata'.

    Returns:
        bool: False when the text mentions 'com.google'. Otherwise True only when
        the source path ends with the '.scala' or '.py' extension.
    """
    # filter based on source code
    if 'com.google' in x['text'].data()['value']:
        return False

    # filter based on path extension. A real suffix test is required here: a plain
    # substring check ('py' in path) also accepts unrelated paths such as
    # 'pypkg/copy.go'.
    metadata = x['metadata'].data()['value']
    return metadata['source'].endswith(('.scala', '.py'))

### turn on below for custom filtering
# retriever.search_kwargs['filter'] = filter

In [12]:
from langchain.chat_models import AzureChatOpenAI
from langchain.chains import ConversationalRetrievalChain

# Chat model served through Azure OpenAI. OPENAI_API_BASE and OPENAI_API_KEY must be
# set in the environment (a missing one raises KeyError here); the same
# OPENAI_API_KEY is also used by the embeddings client above.
# NOTE(review): deployment_name presumably has to match a deployment that exists in
# your Azure resource, and the api version is pinned to a preview release -- confirm
# both for your account.
model = AzureChatOpenAI(
    openai_api_base=os.environ["OPENAI_API_BASE"],
    openai_api_version="2023-03-15-preview",
    deployment_name="gpt-4-0314",
    openai_api_key=os.environ["OPENAI_API_KEY"],
    openai_api_type = "azure",
)
# Conversational question-answering chain built from the chat model and the
# configured retriever.
qa = ConversationalRetrievalChain.from_llm(model,retriever=retriever)

In [15]:
# Questions are asked in order within one conversation.
questions = [
    "what is the purpose of gatekeeper?",
    "how do i install gatekeeper?",
    "how does audit work?",
    "how do i write a constraint?",
    "how does external data work?",
    "what does auditFromCache do?",
] 
# Reset on every run of this cell; grows by one (question, answer) tuple per question.
chat_history = []

for question in questions:  
    # Each call receives all earlier (question, answer) pairs as chat history.
    result = qa({"question": question, "chat_history": chat_history})
    chat_history.append((question, result['answer']))
    # Printed with markdown-style emphasis markers.
    print(f"-> **Question**: {question} \n")
    print(f"**Answer**: {result['answer']} \n")

-> **Question**: what is the purpose of gatekeeper? 

**Answer**: The purpose of Gatekeeper is to provide a policy-based control system for Kubernetes clusters. It allows administrators to enforce and manage policies, ensuring that only compliant resources are created or modified within the cluster. Gatekeeper strengthens compliance efforts and prevents bad state from slowing down the organization by detecting and rejecting non-compliant commits to an infrastructure-as-code system's source-of-truth. 

-> **Question**: how do i install gatekeeper? 

**Answer**: To install Gatekeeper on your Kubernetes cluster, follow these steps:

1. Ensure you meet the prerequisites:

   - Minimum Kubernetes version: Gatekeeper requires resources introduced in Kubernetes v1.16.
   - RBAC permissions: Make sure you have cluster admin permissions by running the following command:

     ```sh
     kubectl create clusterrolebinding cluster-admin-binding \
       --clusterrole cluster-admin \
       --user 