In [1]:
# Install dependencies
!pip install -q google-generativeai faiss-cpu

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m30.7/30.7 MB[0m [31m11.1 MB/s[0m eta [36m0:00:00[0m
[?25h

In [3]:
# Imports
import os
import json
import faiss
import numpy as np
import google.generativeai as genai

In [4]:
# Set Gemini API key
GEMINI_API_KEY = ""
genai.configure(api_key=GEMINI_API_KEY)

# Initialize Gemini Model
model = genai.GenerativeModel('gemini-2.0-flash')

## Old

In [19]:
# Few-shot prompt template (embedded in explanation)
FEW_SHOT_EXAMPLES = """
Examples:

Code:
def add(a, b):
    return a + b

Response:
{
  "summary": "This function performs addition of two numbers.",
  "functions": [
    {
      "name": "add",
      "description": "Returns the sum of two input values a and b."
    }
  ],
  "missing_docstrings": "def add(a, b):\\n    '''Adds two numbers and returns the result.'''",
  "potential_improvements": "Add type hints for better clarity."
}
"""

In [20]:
# Generate structured explanation (with few-shot prompting)
def generate_code_explanation(code_snippet: str):
    prompt = f"""
You are a code documentation assistant. Respond only in JSON with:
- summary
- functions (name and description)
- missing docstrings
- potential improvements

{FEW_SHOT_EXAMPLES}

Now analyze this code:

Code:
{code_snippet}
"""
    response = model.generate_content(prompt)
    try:
        return json.loads(response.text)
    except json.JSONDecodeError:
        return {"error": "Model output not valid JSON", "raw_output": response.text}

In [21]:
# Generate embeddings
def get_code_embedding(code_snippet: str):
    response = genai.embed_content(
        model="models/embedding-001",
        content=code_snippet,
        task_type="retrieval_document"
    )
    return response['embedding']

In [22]:
# Store embeddings and implement FAISS index
embedding_index = faiss.IndexFlatL2(768)  # 768 is the size of Gemini embeddings
code_snippets = []

def store_code_snippet(code_snippet: str):
    embedding = np.array(get_code_embedding(code_snippet)).astype("float32")
    embedding_index.add(np.array([embedding]))
    code_snippets.append(code_snippet)

def search_similar_code(query_snippet: str, top_k=1):
    query_embedding = np.array(get_code_embedding(query_snippet)).astype("float32")
    D, I = embedding_index.search(np.array([query_embedding]), top_k)
    return [code_snippets[i] for i in I[0]]

In [23]:
# RAG-style loop (retrieve similar snippet and pass it in context)
def rag_enhanced_explanation(query_code: str):
    similar_snippets = search_similar_code(query_code)
    context = "\n\n".join(similar_snippets)
    combined_prompt = f"""
You are a code explanation assistant. Use the context below to help generate better explanation.

Context:
{context}

New Code:
{query_code}

Return your response in this structured JSON format:
{{
  "summary": "...",
  "functions": [...],
  "missing_docstrings": "...",
  "potential_improvements": "..."
}}
"""
    response = model.generate_content(combined_prompt)
    try:
        return json.loads(response.text)
    except json.JSONDecodeError:
        return {"error": "Model output not valid JSON", "raw_output": response.text}

In [24]:
# Evaluate explanation quality (basic)
def evaluate_explanation_quality(explanation: dict):
    if "summary" in explanation and len(explanation["summary"].split()) > 3:
        return "Good summary"
    return "Summary is too short or missing"

In [25]:
# Sample code to check
sample_code = """
def factorial(n):
    if n == 0:
        return 1
    else:
        return n * factorial(n-1)
"""

In [26]:
# Store code snippet for vector search
store_code_snippet(sample_code)

# Run explanation (basic + RAG-enhanced)
basic_expl = generate_code_explanation(sample_code)
rag_expl = rag_enhanced_explanation(sample_code)

print("\nBasic Explanation:\n", json.dumps(basic_expl, indent=2))
print("\nRAG Explanation:\n", json.dumps(rag_expl, indent=2))
print("\nEvaluation:", evaluate_explanation_quality(rag_expl))

# Embedding preview
embedding = get_code_embedding(sample_code)
print("\n🔢 Code Embedding (first 5 dims):", embedding[:5])



Basic Explanation:
 {
  "error": "Model output not valid JSON",
  "raw_output": "```json\n{\n  \"summary\": \"This function calculates the factorial of a non-negative integer.\",\n  \"functions\": [\n    {\n      \"name\": \"factorial\",\n      \"description\": \"Calculates the factorial of a number n using recursion.\"\n    }\n  ],\n  \"missing_docstrings\": \"def factorial(n):\\n    '''Calculates the factorial of a non-negative integer.\\n\\n    Args:\\n        n (int): The non-negative integer to calculate the factorial of.\\n\\n    Returns:\\n        int: The factorial of n.\\n\\n    Raises:\\n        ValueError: If n is negative.\\n    '''\",\n  \"potential_improvements\": \"Add input validation to check for negative input and raise a ValueError. Add type hints for better clarity.\"\n}\n```"
}

RAG Explanation:
 {
  "error": "Model output not valid JSON",
  "raw_output": "```json\n{\n  \"summary\": \"The code defines a recursive function called `factorial` that calculates the fac

# NEW

In [11]:
import re

# Few-shot prompt template (embedded in explanation)
FEW_SHOT_EXAMPLES = """
Examples:

Code:
def add(a, b):
    return a + b

Response:
{
  "summary": "This function performs addition of two numbers.",
  "functions": [
    {
      "name": "add",
      "description": "Returns the sum of two input values a and b."
    }
  ],
  "missing_docstrings": "def add(a, b):\\n    '''Adds two numbers and returns the result.'''",
  "potential_improvements": "Add type hints for better clarity."
}
"""

# Generate structured explanation (with few-shot prompting)
def generate_code_explanation(code_snippet: str):
    prompt = f"""
You are a code documentation assistant. Respond only in JSON with:
- summary
- functions (name and description)
- missing docstrings
- potential improvements

{FEW_SHOT_EXAMPLES}

Now analyze this code:

Code:
{code_snippet}
"""
    response = model.generate_content(prompt)
    return extract_json_from_response(response.text)

# Helper function to extract JSON from response that might contain markdown code blocks
def extract_json_from_response(text):
    try:
        # First try direct parsing
        return json.loads(text)
    except json.JSONDecodeError:
        # If that fails, try to extract JSON from markdown code blocks
        json_pattern = r'```(?:json)?\s*([\s\S]*?)\s*```'
        match = re.search(json_pattern, text)
        if match:
            try:
                return json.loads(match.group(1))
            except json.JSONDecodeError:
                pass

        # If still no valid JSON, return error with raw output
        return {"error": "Model output not valid JSON", "raw_output": text}

# Generate embeddings
def get_code_embedding(code_snippet: str):
    response = genai.embed_content(
        model="models/embedding-001",
        content=code_snippet,
        task_type="retrieval_document"
    )
    return response['embedding']

# Store embeddings and implement FAISS index
embedding_dimension = 768  # 768 is the size of Gemini embeddings
embedding_index = faiss.IndexFlatL2(embedding_dimension)
code_snippets = []

def store_code_snippet(code_snippet: str):
    embedding = np.array([get_code_embedding(code_snippet)]).astype("float32")
    embedding_index.add(embedding)
    code_snippets.append(code_snippet)

def search_similar_code(query_snippet: str, top_k=1):
    query_embedding = np.array([get_code_embedding(query_snippet)]).astype("float32")
    D, I = embedding_index.search(query_embedding, top_k)
    return [code_snippets[i] for i in I[0]]

# RAG-style loop (retrieve similar snippet and pass it in context)
def rag_enhanced_explanation(query_code: str):
    similar_snippets = search_similar_code(query_code)
    context = "\n\n".join(similar_snippets)
    combined_prompt = f"""
You are a code explanation assistant. Use the context below to help generate better explanation.

Context:
{context}

New Code:
{query_code}

Return your response in this structured JSON format:
{{
  "summary": "...",
  "functions": [...],
  "missing_docstrings": "...",
  "potential_improvements": "..."
}}
"""
    response = model.generate_content(combined_prompt)
    return extract_json_from_response(response.text)

# Evaluate explanation quality (basic)
def evaluate_explanation_quality(explanation: dict):
    if "error" in explanation:
        return f"Error in explanation: {explanation['error']}"
    elif "summary" in explanation and len(explanation["summary"].split()) > 3:
        return "Good summary"
    return "Summary is too short or missing"

def main():
    # Check if model is properly initialized
    if model is None:
        print("Error: Model is not initialized. Please configure your AI model first.")
        return

    # Sample code to check
    sample_code = """
    def factorial(n):
        if n == 0:
            return 1
        else:
            return n * factorial(n-1)
    """

    # Store code snippet for vector search
    store_code_snippet(sample_code)

    # Run explanation (basic + RAG-enhanced)
    basic_expl = generate_code_explanation(sample_code)
    rag_expl = rag_enhanced_explanation(sample_code)

    print("\nBasic Explanation:\n", json.dumps(basic_expl, indent=2))
    print("\nRAG Explanation:\n", json.dumps(rag_expl, indent=2))
    print("\nEvaluation:", evaluate_explanation_quality(rag_expl))

    # Embedding preview
    embedding = get_code_embedding(sample_code)
    print("\n🔢 Code Embedding (first 5 dims):", embedding[:5])

if __name__ == "__main__":
    main()


Basic Explanation:
 {
  "summary": "This code defines a recursive function to calculate the factorial of a non-negative integer.",
  "functions": [
    {
      "name": "factorial",
      "description": "Calculates the factorial of a non-negative integer n using recursion."
    }
  ],
  "missing_docstrings": "def factorial(n):\n    '''Calculates the factorial of a number.'''",
  "potential_improvements": "Add a check for negative input to raise an error, and include type hints for the parameter and return value."
}

RAG Explanation:
 {
  "summary": "The code defines a recursive function to calculate the factorial of a non-negative integer.",
  "functions": [
    {
      "name": "factorial",
      "description": "This function calculates the factorial of a given non-negative integer n.\nIt uses recursion: the factorial of n is n multiplied by the factorial of (n-1).\nThe base case is when n is 0, in which case it returns 1 (since 0! = 1).",
      "parameters": [
        {
          "nam

In [12]:
import re  # For regex pattern matching

# Few-shot prompt template (embedded in explanation)
FEW_SHOT_EXAMPLES = """
Examples:

Code:
def add(a, b):
    return a + b

Response:
{
  "summary": "This function performs addition of two numbers.",
  "functions": [
    {
      "name": "add",
      "description": "Returns the sum of two input values a and b."
    }
  ],
  "missing_docstrings": "def add(a, b):\\n    '''Adds two numbers and returns the result.'''",
  "potential_improvements": "Add type hints for better clarity."
}
"""

# Generate structured explanation (with few-shot prompting)
def generate_code_explanation(code_snippet: str, output_format="json"):
    if output_format.lower() == "json":
        prompt = f"""
You are a code documentation assistant. Respond only in JSON with:
- summary
- functions (name and description)
- missing docstrings
- potential improvements

{FEW_SHOT_EXAMPLES}

Now analyze this code:

Code:
{code_snippet}
"""
    else:  # Story format
        prompt = f"""
You are a code documentation assistant. Analyze the following code and explain it as a story
in a creative, engaging way. Make the explanation accessible while still being technically accurate.
Include information about:
- What the code does
- The functions and their purpose
- Any missing documentation
- Potential improvements

Code:
{code_snippet}
"""

    response = model.generate_content(prompt)

    if output_format.lower() == "json":
        return extract_json_from_response(response.text)
    else:
        return {"story": response.text}

# Helper function to extract JSON from response that might contain markdown code blocks
def extract_json_from_response(text):
    try:
        # First try direct parsing
        return json.loads(text)
    except json.JSONDecodeError:
        # If that fails, try to extract JSON from markdown code blocks
        json_pattern = r'```(?:json)?\s*([\s\S]*?)\s*```'
        match = re.search(json_pattern, text)
        if match:
            try:
                return json.loads(match.group(1))
            except json.JSONDecodeError:
                pass

        # If still no valid JSON, return error with raw output
        return {"error": "Model output not valid JSON", "raw_output": text}

# Generate embeddings
def get_code_embedding(code_snippet: str):
    response = genai.embed_content(
        model="models/embedding-001",
        content=code_snippet,
        task_type="retrieval_document"
    )
    return response['embedding']

# Store embeddings and implement FAISS index
embedding_dimension = 768  # 768 is the size of Gemini embeddings
embedding_index = faiss.IndexFlatL2(embedding_dimension)
code_snippets = []

def store_code_snippet(code_snippet: str):
    embedding = np.array([get_code_embedding(code_snippet)]).astype("float32")
    embedding_index.add(embedding)
    code_snippets.append(code_snippet)

def search_similar_code(query_snippet: str, top_k=1):
    query_embedding = np.array([get_code_embedding(query_snippet)]).astype("float32")
    D, I = embedding_index.search(query_embedding, top_k)
    return [code_snippets[i] for i in I[0]]

# RAG-style loop (retrieve similar snippet and pass it in context)
def rag_enhanced_explanation(query_code: str, output_format="json"):
    similar_snippets = search_similar_code(query_code)
    context = "\n\n".join(similar_snippets)

    if output_format.lower() == "json":
        combined_prompt = f"""
You are a code explanation assistant. Use the context below to help generate better explanation.

Context:
{context}

New Code:
{query_code}

Return your response in this structured JSON format:
{{
  "summary": "...",
  "functions": [...],
  "missing_docstrings": "...",
  "potential_improvements": "..."
}}
"""
    else:  # Story format
        combined_prompt = f"""
You are a code explanation assistant. Use the context below to help generate better explanation.

Context:
{context}

New Code:
{query_code}

Explain this code as an engaging story that a junior developer would find both entertaining and educational.
Include information about what the code does, its functions, any missing documentation, and potential improvements.
"""

    response = model.generate_content(combined_prompt)

    if output_format.lower() == "json":
        return extract_json_from_response(response.text)
    else:
        return {"story": response.text}

# Evaluate explanation quality (basic)
def evaluate_explanation_quality(explanation: dict):
    if "error" in explanation:
        return f"Error in explanation: {explanation['error']}"
    elif "story" in explanation:
        word_count = len(explanation["story"].split())
        if word_count > 100:
            return f"Good story explanation with {word_count} words"
        return f"Story explanation too short: {word_count} words"
    elif "summary" in explanation and len(explanation["summary"].split()) > 3:
        return "Good summary"
    return "Summary is too short or missing"

def print_explanation(explanation, output_format):
    if output_format.lower() == "json":
        print(json.dumps(explanation, indent=2))
    else:  # Story format
        if "story" in explanation:
            print("\n--- CODE STORY ---\n")
            print(explanation["story"])
            print("\n-----------------\n")
        else:
            print("Error generating story format")
            print(explanation)

def main():
    # Check if model is properly initialized
    if model is None:
        print("Error: Model is not initialized. Please configure your AI model first.")
        return

    while True:
        print("\n=== CODE DOCUMENTATION ASSISTANT ===")
        print("1. Analyze code")
        print("2. Exit")
        choice = input("Enter your choice (1-2): ")

        if choice == "2":
            print("Goodbye!")
            break

        if choice == "1":
            # Get code input
            print("\nEnter or paste your code (type 'DONE' on a new line when finished):")
            code_lines = []
            while True:
                line = input()
                if line == "DONE":
                    break
                code_lines.append(line)

            user_code = "\n".join(code_lines)

            if not user_code.strip():
                print("No code provided. Please try again.")
                continue

            # Get format preference
            format_choice = input("\nChoose output format (json/story): ").lower()
            output_format = "json" if format_choice == "json" else "story"

            # Determine analysis method
            analysis_method = input("\nUse RAG enhancement? (y/n): ").lower()

            print("\nAnalyzing code...")

            # Store for future RAG comparisons
            try:
                store_code_snippet(user_code)
                print("Code stored in vector database.")
            except Exception as e:
                print(f"Warning: Could not store code in vector database. Error: {e}")

            # Generate explanation
            try:
                if analysis_method == "y":
                    explanation = rag_enhanced_explanation(user_code, output_format)
                    print("\n=== RAG-ENHANCED EXPLANATION ===")
                else:
                    explanation = generate_code_explanation(user_code, output_format)
                    print("\n=== BASIC EXPLANATION ===")

                print_explanation(explanation, output_format)
                print("\nEvaluation:", evaluate_explanation_quality(explanation))

            except Exception as e:
                print(f"Error generating explanation: {e}")
        else:
            print("Invalid choice. Please try again.")

if __name__ == "__main__":
    main()


=== CODE DOCUMENTATION ASSISTANT ===
1. Analyze code
2. Exit
Enter your choice (1-2): 1

Enter or paste your code (type 'DONE' on a new line when finished):
.long-word-to-break {     overflow-wrap: break-word;       word-wrap: break-word;      word-break: break-word;          /* Adds a hyphen where the word breaks */       hyphens: auto; }
DONE

Choose output format (json/story): story

Use RAG enhancement? (y/n): y

Analyzing code...
Code stored in vector database.

=== RAG-ENHANCED EXPLANATION ===

--- CODE STORY ---

Alright, junior dev, gather 'round the coding campfire! Let me tell you the tale of `.long-word-to-break`, a CSS class designed to tame those unruly, super-long words that threaten to bust out of their containers and wreak havoc on your carefully crafted layouts.

Imagine you're building a beautiful webpage, all pixel-perfect and responsive. Suddenly, a wild, untamed string of characters appears – maybe it's a ridiculously long URL, a generated code, or some scientific