In [3]:
import sys
print(sys.executable)

/opt/anaconda3/bin/python


In [None]:
# Setup & Imports
# Load environment variables
from dotenv import load_dotenv
import os

load_dotenv()

# Verify API key is loaded
api_key = os.getenv('OPENAI_API_KEY')
if api_key:
    print(f'API Key loaded: {api_key[:8]}...')
else:
    print('ERROR: No API key found! Check your .env file')

API Key loaded: sk-proj-...


In [None]:
# Import Libraries
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_community.vectorstores import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.documents import Document
from pathlib import Path
import git

print('All imports successful!')

All imports successful!


In [None]:
# Clone a GitHub Repository

# Configuration
REPO_URL = 'https://github.com/tiangolo/fastapi'
LOCAL_PATH = Path('../data/repos/fastapi')

# Only clone the docs and main source (not everything)
# We'll use a smaller example for testing

# For now, let's use an even simpler repo
REPO_URL = 'https://github.com/httpie/cli'  # HTTPie - small CLI tool
LOCAL_PATH = Path('../data/repos/httpie')

# Create data directory
LOCAL_PATH.parent.mkdir(parents=True, exist_ok=True)

# Clone (shallow clone - only latest commit)
if LOCAL_PATH.exists():
    print(f'üìÅ Repo already exists at {LOCAL_PATH}')
else:
    print(f'‚è≥ Cloning {REPO_URL}...')
    git.Repo.clone_from(REPO_URL, LOCAL_PATH, depth=1)
    print('‚úÖ Clone complete!')

‚è≥ Cloning https://github.com/httpie/cli...
‚úÖ Clone complete!


In [None]:
# Load Code Files
def load_code_files(repo_path: Path, extensions: list = ['.py', '.md']) -> list:
    """
    Load code files from a repository.
    
    Args:
        repo_path: Path to the repository
        extensions: File extensions to include
    
    Returns:
        List of Document objects
    """
    documents = []
    
    for ext in extensions:
        for file_path in repo_path.rglob(f'*{ext}'):
            # Skip hidden files and directories
            if any(part.startswith('.') for part in file_path.parts):
                continue
            # Skip common non-essential directories
            if any(part in ['__pycache__', 'node_modules', 'venv', '.venv', 'test', 'tests'] for part in file_path.parts):
                continue
                
            try:
                content = file_path.read_text(encoding='utf-8')
                
                # Skip empty files
                if not content.strip():
                    continue
                    
                relative_path = file_path.relative_to(repo_path)
                
                doc = Document(
                    page_content=content,
                    metadata={
                        'source': str(relative_path),
                        'file_type': ext,
                        'file_name': file_path.name
                    }
                )
                documents.append(doc)
                
            except Exception as e:
                print(f'Error reading {file_path}: {e}')
    
    return documents

# Load documents
documents = load_code_files(LOCAL_PATH)
print(f'Loaded {len(documents)} documents')

# Show what we loaded
print(f'\nFiles loaded:')
for doc in documents[:10]:  # Show first 10
    print(f'   - {doc.metadata["source"]}')
if len(documents) > 10:
    print(f'   ... and {len(documents) - 10} more')

Loaded 0 documents

Files loaded:


In [None]:
# Debug - Check what's in the repo
# Let's see what's actually in the repo
import os

print(f'üìÅ Checking: {LOCAL_PATH}')
print(f'   Exists: {LOCAL_PATH.exists()}')

# List top-level contents
print(f'\nüìÇ Top-level contents:')
for item in LOCAL_PATH.iterdir():
    print(f'   {item.name}')

# Find all Python files
print(f'\nüêç All .py files:')
py_files = list(LOCAL_PATH.rglob('*.py'))
print(f'   Found {len(py_files)} Python files')
for f in py_files[:5]:
    print(f'   - {f.relative_to(LOCAL_PATH)}')

üìÅ Checking: ../data/repos/httpie
   Exists: True

üìÇ Top-level contents:
   CODE_OF_CONDUCT.md
   AUTHORS.md
   pytest.ini
   LICENSE
   CHANGELOG.md
   Makefile
   tests
   MANIFEST.in
   docs
   .editorconfig
   README.md
   setup.py
   .gitignore
   CONTRIBUTING.md
   .packit.yaml
   .github
   setup.cfg
   httpie
   .git
   snapcraft.yaml
   SECURITY.md
   extras

üêç All .py files:
   Found 133 Python files
   - setup.py
   - tests/test_httpie.py
   - tests/test_cookie.py
   - tests/test_cli_ui.py
   - tests/conftest.py


In [None]:
# Fixed Loader
def load_code_files(repo_path: Path, extensions: list = ['.py', '.md']) -> list:
    """
    Load code files from a repository.
    """
    documents = []
    
    # Directories to skip
    skip_dirs = {'__pycache__', 'node_modules', 'venv', '.venv', '.git'}
    
    for ext in extensions:
        for file_path in repo_path.rglob(f'*{ext}'):
            # Skip hidden files
            if file_path.name.startswith('.'):
                continue
            
            # Skip unwanted directories
            if any(part in skip_dirs for part in file_path.parts):
                continue
                
            try:
                content = file_path.read_text(encoding='utf-8')
                
                # Skip empty files
                if not content.strip():
                    continue
                    
                relative_path = file_path.relative_to(repo_path)
                
                doc = Document(
                    page_content=content,
                    metadata={
                        'source': str(relative_path),
                        'file_type': ext,
                        'file_name': file_path.name
                    }
                )
                documents.append(doc)
                
            except Exception as e:
                print(f'‚ö†Ô∏è Error reading {file_path}: {e}')
    
    return documents

# Load documents again
documents = load_code_files(LOCAL_PATH)
print(f'‚úÖ Loaded {len(documents)} documents')

# Show what we loaded
print(f'\nüìÑ Sample files:')
for doc in documents[:10]:
    content_preview = len(doc.page_content)
    print(f'   - {doc.metadata["source"]} ({content_preview} chars)')

‚úÖ Loaded 148 documents

üìÑ Sample files:
   - setup.py (38 chars)
   - tests/test_httpie.py (10381 chars)
   - tests/test_cookie.py (1876 chars)
   - tests/test_cli_ui.py (2025 chars)
   - tests/conftest.py (2558 chars)
   - tests/test_xml.py (2869 chars)
   - tests/test_redirects.py (3537 chars)
   - tests/test_auth.py (5167 chars)
   - tests/test_errors.py (2390 chars)
   - tests/test_offline.py (1979 chars)


In [None]:
# Chunk the Documents

# Create text splitter
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,       # Max characters per chunk
    chunk_overlap=200,     # Overlap between chunks (helps keep context)
    length_function=len,
    separators=[
        '\n\nclass ',      # Split on class definitions
        '\n\ndef ',        # Split on function definitions
        '\n\n',            # Split on double newlines
        '\n',              # Split on single newlines
        ' ',               # Split on spaces
        ''                 # Last resort: split anywhere
    ]
)

# Split documents into chunks
chunks = text_splitter.split_documents(documents)

print(f'‚úÖ Created {len(chunks)} chunks from {len(documents)} documents')
print(f'üìä Average chunk size: {sum(len(c.page_content) for c in chunks) // len(chunks)} characters')

# Show a sample chunk
print(f'\nüìù Sample chunk from: {chunks[0].metadata["source"]}')
print(f'{"="*50}')
print(chunks[0].page_content[:500])
print(f'{"="*50}')

‚úÖ Created 1106 chunks from 148 documents
üìä Average chunk size: 707 characters

üìù Sample chunk from: setup.py
from setuptools import setup

setup()


In [15]:
# Create Embeddings & Vector Store

# Initialize embeddings model
embeddings = OpenAIEmbeddings(
    model='text-embedding-3-small'  # Cheap: $0.02 per 1M tokens
)

# Create vector store directory
persist_dir = Path('../data/chroma_db')
persist_dir.mkdir(parents=True, exist_ok=True)

print('‚è≥ Creating vector store (this may take a minute)...')
print(f'   Embedding {len(chunks)} chunks...')

# Create vector store with all chunks
vectorstore = Chroma.from_documents(
    documents=chunks,
    embedding=embeddings,
    persist_directory=str(persist_dir)
)

print(f'‚úÖ Vector store created!')
print(f'üìä Total vectors stored: {vectorstore._collection.count()}')

‚è≥ Creating vector store (this may take a minute)...
   Embedding 1106 chunks...


Failed to send telemetry event ClientStartEvent: capture() takes 1 positional argument but 3 were given
Failed to send telemetry event ClientCreateCollectionEvent: capture() takes 1 positional argument but 3 were given


‚úÖ Vector store created!
üìä Total vectors stored: 1106


In [16]:
# Create Retriever & Test It

# Create retriever
retriever = vectorstore.as_retriever(
    search_type='similarity',
    search_kwargs={'k': 5}  # Return top 5 most similar chunks
)

# Test retrieval with a question
test_query = "How does HTTPie handle authentication?"
retrieved_docs = retriever.invoke(test_query)

print(f'üîç Query: {test_query}')
print(f'üìö Retrieved {len(retrieved_docs)} chunks:\n')

for i, doc in enumerate(retrieved_docs, 1):
    print(f'--- Chunk {i}: {doc.metadata["source"]} ---')
    print(doc.page_content[:300])
    print()

Failed to send telemetry event CollectionQueryEvent: capture() takes 1 positional argument but 3 were given


üîç Query: How does HTTPie handle authentication?
üìö Retrieved 5 chunks:

--- Chunk 1: docs/README.md ---
### Basic auth

```bash
$ http -a username:password pie.dev/basic-auth/username/password
```

### Digest auth

```bash
$ http -A digest -a username:password pie.dev/digest-auth/httpie/username/password
```

### Bearer auth

```bash
https -A bearer -a token pie.dev/bearer
```

### Password prompt

If

--- Chunk 2: docs/README.md ---
```http
GET / HTTP/1.1
Accept: */*
Accept-Encoding: gzip, deflate
Connection: keep-alive
Cookie: sessionid=foo
Host: pie.dev
User-Agent: HTTPie/0.9.9
```

Send multiple cookies (note: the header is quoted to prevent the shell from interpreting the `;`):

```bash
$ http pie.dev/cookies 'Cookie:sessio

--- Chunk 3: httpie/client.py ---
if httpie_session:
        httpie_session.update_headers(request_kwargs['headers'])
        requests_session.cookies = httpie_session.cookies
        if args.auth_plugin:
            # Save auth from CLI to HTTPie session

In [17]:
# Build the RAG Chain

from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser

# Initialize LLM
llm = ChatOpenAI(
    model='gpt-4o-mini',
    temperature=0  # More deterministic responses
)

# Create prompt template
prompt = ChatPromptTemplate.from_template('''
You are a helpful code assistant. Answer the question based on the provided context from the codebase.

Context:
{context}

Question: {question}

Instructions:
- Answer based ONLY on the provided context
- If the answer is not in the context, say "I don't have enough information to answer this."
- Include relevant file names when referencing code
- Be concise but thorough
''')

# Helper function to format retrieved docs
def format_docs(docs):
    return "\n\n---\n\n".join(
        f"File: {doc.metadata['source']}\n{doc.page_content}" 
        for doc in docs
    )

# Build the RAG chain
rag_chain = (
    {
        "context": retriever | format_docs,
        "question": RunnablePassthrough()
    }
    | prompt
    | llm
    | StrOutputParser()
)

print('‚úÖ RAG chain created!')

‚úÖ RAG chain created!


In [18]:
# Test the RAG system

# Ask a question!
question = "How does HTTPie handle authentication?"
print(f'‚ùì Question: {question}\n')

response = rag_chain.invoke(question)
print(f'ü§ñ Answer:\n{response}')

‚ùì Question: How does HTTPie handle authentication?

ü§ñ Answer:
HTTPie handles authentication through several supported schemes, including Basic, Digest, and Bearer authentication. The authentication can be specified using the `-a` or `--auth` flag, followed by the username and password. For example, Basic authentication can be used as follows:

```bash
$ http -a username:password pie.dev/basic-auth/username/password
```

If the password is omitted, HTTPie will prompt the user to enter it securely:

```bash
$ http -a username pie.dev/basic-auth/username/password
```

For sending an empty password, a trailing colon can be included:

```bash
$ http -a username: pie.dev/headers
```

Additionally, HTTPie can read authentication information from the `~/.netrc` file, which can be disabled with the `--ignore-netrc` option:

```bash
$ http --ignore-netrc pie.dev/basic-auth/httpie/test
```

The code in `httpie/client.py` shows that if an HTTPie session is active, it updates the headers and a

In [19]:
# Let's Test a Few More Questions

# Test multiple questions
questions = [
    "How do I send JSON data with HTTPie?",
    "What file formats can HTTPie output?",
    "How do I use HTTPie with cookies?",
]

for q in questions:
    print(f'‚ùì {q}')
    print(f'ü§ñ {rag_chain.invoke(q)}')
    print('\n' + '='*60 + '\n')

‚ùì How do I send JSON data with HTTPie?
ü§ñ To send JSON data with HTTPie, you can use the `=` or `:=` syntax to specify data fields that will be serialized into a JSON object. By default, HTTPie sets the `Content-Type` header to `application/json` when you include data fields. Here‚Äôs how you can do it:

1. **Basic JSON Data**: Use the `=` syntax for string values.
   ```bash
   $ http PUT pie.dev/put name=John email=john@example.org
   ```

2. **Non-string JSON Data**: Use the `:=` syntax for non-string values (e.g., booleans, numbers).
   ```bash
   $ http POST pie.dev/post user[name]:=John user[age]:=30
   ```

3. **Nested JSON**: Specify a path declaration to create complex JSON objects.
   ```bash
   $ http POST pie.dev/post 'user[name]:=John' 'user[age]:=30'
   ```

4. **Raw JSON**: For very complex JSON structures, you can pass it as a raw request body using `echo` or by redirecting from a file.
   ```bash
   $ echo -n '{"hello": "world"}' | http POST pie.dev/post
   ```
   

In [20]:
# Create a Simple Chat Function

def ask_codebase(question: str) -> str:
    """Ask a question about the codebase."""
    return rag_chain.invoke(question)

# Interactive usage
while True:
    question = input('\n‚ùì Ask a question (or "quit" to exit): ')
    if question.lower() in ['quit', 'exit', 'q']:
        print('üëã Goodbye!')
        break
    
    response = ask_codebase(question)
    print(f'\nü§ñ {response}')


ü§ñ To use HTTPie with cookies, you can send cookies as regular HTTP headers using the `Header:Value` notation. Here are the instructions based on the provided context:

1. **Send a Single Cookie**:
   You can send a single cookie by specifying it in the command like this:
   ```bash
   $ http pie.dev/cookies Cookie:sessionid=foo
   ```
   This will result in the following HTTP request:
   ```http
   GET / HTTP/1.1
   Accept: */*
   Accept-Encoding: gzip, deflate
   Connection: keep-alive
   Cookie: sessionid=foo
   Host: pie.dev
   User-Agent: HTTPie/0.9.9
   ```

2. **Send Multiple Cookies**:
   To send multiple cookies, you need to quote the header to prevent the shell from interpreting the `;` character:
   ```bash
   $ http pie.dev/cookies 'Cookie:sessionid=foo;another-cookie=bar'
   ```
   This will result in the following HTTP request:
   ```http
   GET / HTTP/1.1
   Accept: */*
   Accept-Encoding: gzip, deflate
   Connection: keep-alive
   Cookie: sessionid=foo;another-cookie