In [1]:
import pandas as pd
from pathlib import Path
import re
from typing import List, Optional
import ast
import os
import re
from pathlib import Path
from typing import List, Dict
import pandas as pd
import numpy as np
from datetime import datetime
from openai import OpenAI
from directory_tree import DisplayTree

# Shared OpenAI client used by get_embedding() and process_question().
# The key is read from the OPENAI_API_KEY environment variable rather than being
# pasted into the notebook, so it is never saved or shared along with the file.
# Set it before starting the kernel, e.g. `export OPENAI_API_KEY=...`.
# NOTE(review): when the variable is unset this passes None, which the OpenAI
# client is expected to reject at construction time — confirm against the
# installed `openai` version.
client = OpenAI(max_retries=5, api_key=os.environ.get("OPENAI_API_KEY"))

In [2]:
# Constants shared by the line-based helper functions below.
# Separator used when re-joining the lines of an extracted code block.
NEWLINE = '\n'
# Line prefixes that mark a (sync or async) function definition.
DEF_PREFIXES = ('def ', 'async def ')
# Line prefix that marks a class definition.
CLASS_PREFIX = 'class '
# The four regexes below are not referenced anywhere in the visible part of
# this notebook; they are kept for line-based classification of source lines.
# Matches a line that begins (no leading whitespace) with `from X import Y` or `import X`.
IMPORT_PATTERN = re.compile(r'^(?:from\s+\S+\s+import\s+\S+|import\s+\S+)')
# Matches a line whose first non-whitespace character is '#'.
COMMENT_PATTERN = re.compile(r'^\s*#')
# Matches a line whose first non-whitespace token is a triple quote (''' or """).
DOCSTRING_PATTERN = re.compile(r'^\s*(\'\'\'|\"\"\")')
# Matches `name =` at the start of a line; note it also matches the start of
# a comparison such as `x == 1`, because only the first '=' is checked.
ASSIGNMENT_PATTERN = re.compile(r'^\s*\w+\s*=')

def get_embedding(text: str, model="text-embedding-3-small", **kwargs) -> list:
    """
    Return the embedding vector of ``text`` produced by the embeddings endpoint.

    Every newline in ``text`` is swapped for a single space before the request
    is sent. Extra keyword arguments are forwarded unchanged to
    ``client.embeddings.create``.

    Args:
        text: The string to embed.
        model: Name of the embedding model to request.
        **kwargs: Additional options passed through to the API call.

    Returns:
        The ``embedding`` attribute of the first item in the response data.

    Raises:
        ValueError: If ``text`` is not a ``str``.
    """
    if isinstance(text, str):
        # Flatten to one line (same result as replacing each "\n" with " ");
        # the original author notes that newlines can hurt embedding quality.
        single_line = " ".join(text.split("\n"))
        result = client.embeddings.create(input=single_line, model=model, **kwargs)
        return result.data[0].embedding

    raise ValueError("Input text must be a string.")

def extract_function_name(line: str) -> str:
    """
    Extract the function name from a line starting with 'def' or 'async def'.

    The line must begin exactly with one of ``DEF_PREFIXES`` (no leading
    indentation). The name is the text between the prefix and the first '('.

    Args:
        line: A single source line.

    Returns:
        The function name, or "" when the line is not a function definition
        or has no opening parenthesis (an incomplete/malformed ``def`` line).
    """
    for prefix in DEF_PREFIXES:
        if line.startswith(prefix):
            # partition() never raises; `paren` is empty when '(' is absent,
            # where str.index() used to raise ValueError.
            name, paren, _ = line[len(prefix):].partition('(')
            return name.strip() if paren else ""
    return ""

def extract_class_name(line: str) -> str:
    """
    Extract the class name from a line starting with 'class'.

    The line must begin exactly with ``CLASS_PREFIX`` (no leading
    indentation). The name ends at the base-class list ('(') or, for a class
    without bases, at the ':' that terminates the header.

    Args:
        line: A single source line.

    Returns:
        The class name, or "" when the line is not a class definition.
    """
    if not line.startswith(CLASS_PREFIX):
        return ""

    name = line[len(CLASS_PREFIX):]
    # Cut at '(' first, then at ':' so that `class Foo:` yields "Foo" rather
    # than "Foo:" (the colon used to be kept when no base list was present).
    for delimiter in ('(', ':'):
        name = name.split(delimiter, 1)[0]
    return name.strip()

def extract_code_block(lines: List[str], start_index: int) -> str:
    """
    Extract a block of code (function or class) starting from the given index.

    The block is the header line at ``start_index`` plus every following line
    that is either blank or indented deeper than the header. Scanning stops at
    the first non-blank line indented at or below the header's level.

    Args:
        lines: Source lines without trailing newline characters.
        start_index: Index of the block's header line in ``lines``.

    Returns:
        The block's lines joined with ``NEWLINE``. Blank lines inside the
        block are kept; blank lines trailing the block are dropped, since
        they separate it from whatever follows rather than belonging to it.

    Raises:
        IndexError: If ``start_index`` is out of range for ``lines``.
    """
    header = lines[start_index]
    base_indent = len(header) - len(header.lstrip())

    block = [header]
    for line in lines[start_index + 1:]:
        # A non-blank line at the header's indentation (or shallower) ends the block.
        if line.strip() and len(line) - len(line.lstrip()) <= base_indent:
            break
        block.append(line)

    # Trim blank separator lines picked up after the block's last statement.
    while len(block) > 1 and not block[-1].strip():
        block.pop()

    return NEWLINE.join(block)

def get_file_metadata(filepath: Path) -> Dict:
    """
    Collect size, timestamps and permission digits for ``filepath``.

    Args:
        filepath: Path of the file to inspect.

    Returns:
        A dict with the keys ``file_size`` (``st_size``), ``creation_time``
        and ``modification_time`` (``datetime`` objects built from
        ``st_ctime`` and ``st_mtime``), and ``permissions`` (the last three
        characters of the octal representation of ``st_mode``).
    """
    info = filepath.stat()
    octal_mode = oct(info.st_mode)

    metadata = {}
    metadata['file_size'] = info.st_size
    # NOTE(review): st_ctime is platform dependent and may not be the true
    # creation time on every OS — confirm before relying on this value.
    metadata['creation_time'] = datetime.fromtimestamp(info.st_ctime)
    metadata['modification_time'] = datetime.fromtimestamp(info.st_mtime)
    metadata['permissions'] = octal_mode[-3:]
    return metadata

def analyze_python_file(filepath: Path, code_root: Path) -> Dict:
    """
    Analyze a Python file to extract its structural elements and metadata.

    The file is parsed with ``ast`` and walked once. Collected elements are
    gathered at any nesting depth, not only at module level:

    - ``imports``: source text of every ``import`` / ``from ... import``.
    - ``functions``: ``{'name', 'code'}`` for every ``def`` and ``async def``;
      a ``'class'`` key holds the name of the enclosing class when there is one.
    - ``classes``: ``{'name', 'code'}`` for every class.
    - ``assignments``: source text of every plain assignment (``ast.Assign``).
    - ``top_level_code``: source text of every expression statement that is
      not a bare string literal (so docstrings are skipped). Despite the
      name, expression statements inside functions/classes are included too.

    Args:
        filepath: Path of the Python file to analyze.
        code_root: Repository root; ``filepath`` is reported relative to it.

    Returns:
        A dict with ``filepath`` (relative to ``code_root``), ``content``, the
        lists described above and the keys from ``get_file_metadata``.
        Returns ``{}`` (after printing a message) when the file cannot be parsed.

    Raises:
        ValueError: If ``filepath`` is not located under ``code_root``.
    """
    with open(filepath, 'r', encoding='utf-8', errors='replace') as file:
        content = file.read()

    try:
        tree = ast.parse(content, filename=str(filepath))
    except (SyntaxError, ValueError):
        # ValueError covers interpreters that reject null bytes in source
        # this way instead of raising SyntaxError.
        print(f"Syntax error in file: {filepath}")
        return {}

    # Containers filled by the visitor below (closed over, not attributes).
    functions, classes, imports, assignments, top_level_code = [], [], [], [], []

    class CodeVisitor(ast.NodeVisitor):
        """Single-pass collector for imports, defs, classes and statements."""

        def __init__(self):
            # Name of the class currently being traversed, or None outside any class.
            self.current_class = None

        def visit_Import(self, node):
            imports.append(ast.unparse(node).strip())
            self.generic_visit(node)

        # `from x import y` statements are recorded exactly like `import x`.
        visit_ImportFrom = visit_Import

        def visit_FunctionDef(self, node):
            func_info = {
                'name': node.name,
                'code': ast.unparse(node).strip()
            }
            if self.current_class:
                func_info['class'] = self.current_class
            functions.append(func_info)
            self.generic_visit(node)

        # `async def` has its own node type; without this alias such
        # functions were silently left out of `functions`.
        visit_AsyncFunctionDef = visit_FunctionDef

        def visit_ClassDef(self, node):
            classes.append({
                'name': node.name,
                'code': ast.unparse(node).strip()
            })
            # Restore the enclosing class afterwards (instead of resetting to
            # None) so methods defined after a nested class keep their tag.
            enclosing_class = self.current_class
            self.current_class = node.name
            self.generic_visit(node)
            self.current_class = enclosing_class

        def visit_Assign(self, node):
            assignments.append(ast.unparse(node).strip())
            self.generic_visit(node)

        def visit_Expr(self, node):
            # Bare string expressions (docstrings) are skipped. ast.Constant
            # replaces the deprecated ast.Str node class.
            value = node.value
            is_bare_string = isinstance(value, ast.Constant) and isinstance(value.value, str)
            if not is_bare_string:
                top_level_code.append(ast.unparse(node).strip())
            self.generic_visit(node)

    CodeVisitor().visit(tree)

    return {
        'filepath': filepath.relative_to(code_root),
        'content': content,
        'imports': imports,
        'functions': functions,
        'classes': classes,
        'assignments': assignments,
        'top_level_code': top_level_code,
        # Reuse the shared helper rather than duplicating the stat() logic.
        **get_file_metadata(filepath)
    }

def extract_repository_details(code_root: str) -> List[Dict]:
    """
    Extract details from all Python files in the specified code repository.

    Every ``*.py`` file below ``code_root`` (searched recursively) is passed to
    ``analyze_python_file``. Files are processed in sorted path order so the
    result is deterministic across runs and file systems.

    Args:
        code_root: Path of the repository root directory.

    Returns:
        One analysis dict per successfully analyzed file. Files for which
        ``analyze_python_file`` returns an empty dict (e.g. unparseable
        source) are left out, so every returned entry is fully populated.
        Returns ``[]`` (after printing a message) when no Python file exists.
    """
    code_root_path = Path(code_root).resolve()
    # is_file() guards against directories whose name happens to end in ".py".
    python_files = sorted(p for p in code_root_path.rglob('*.py') if p.is_file())

    if not python_files:
        print('No Python files found in the specified directory.')
        return []

    analyses = (analyze_python_file(file, code_root_path) for file in python_files)
    # Drop the empty results of failed parses; downstream code indexes keys
    # such as 'filepath' directly and would raise KeyError on them.
    return [file_data for file_data in analyses if file_data]

def _build_entry(file_data: Dict, code: str, entry_type: str, name) -> Dict:
    """Build one DataFrame row for ``code``, embedding it and attaching the file-level context."""
    return {
        'filepath': file_data['filepath'],
        'code': code,
        'type': entry_type,
        'name': name,
        'embedding': get_embedding(code),
        'imports': file_data.get('imports', []),
        'comments': file_data.get('comments', []),
        'assignments': file_data.get('assignments', []),
        'top_level_code': file_data.get('top_level_code', [])
    }


def process_files_data(all_files_data: List[Dict], code_root: str) -> pd.DataFrame:
    """
    Process the extracted file data into a DataFrame and generate embeddings.

    For every analyzed file this produces one ``'file'`` row holding the full
    content, plus one ``'function'`` row per function and one ``'class'`` row
    per class. Each row gets its own embedding via ``get_embedding`` (one API
    request per row) and carries the file's imports, comments, assignments
    and top-level code.

    Args:
        all_files_data: Analysis dicts as produced by ``analyze_python_file``.
            Empty dicts (failed parses) are skipped.
        code_root: Repository root. Currently unused; kept for interface
            compatibility.

    Returns:
        A DataFrame with the columns ``filepath``, ``code``, ``type``,
        ``name``, ``embedding``, ``imports``, ``comments``, ``assignments``
        and ``top_level_code``. The columns are present even when no row
        was produced.
    """
    columns = [
        'filepath', 'code', 'type', 'name', 'embedding',
        'imports', 'comments', 'assignments', 'top_level_code'
    ]
    entries = []

    for file_data in all_files_data:
        # analyze_python_file returns {} for files it could not parse.
        if not file_data:
            continue

        content = file_data['content']
        # Blank files (e.g. an empty __init__.py) have nothing to embed, and
        # the embeddings endpoint is expected to reject empty input.
        if not content.strip():
            continue

        # Full file content entry
        entries.append(_build_entry(file_data, content, 'file', None))

        # Function-level entries
        for func in file_data.get('functions', []):
            entries.append(_build_entry(file_data, func['code'], 'function', func['name']))

        # Class-level entries
        for cls in file_data.get('classes', []):
            entries.append(_build_entry(file_data, cls['code'], 'class', cls['name']))

    # Explicit columns keep the schema stable for an empty result.
    return pd.DataFrame(entries, columns=columns)

def cosine_similarity(vec1: List[float], vec2: List[float]) -> float:
    """
    Compute the cosine similarity between two vectors.

    Args:
        vec1: First vector.
        vec2: Second vector, of the same length as ``vec1``.

    Returns:
        The dot product divided by the product of the Euclidean norms, as a
        plain ``float``. Returns 0.0 when either vector has zero norm, where
        the ratio is undefined (it used to be NaN with a runtime warning).

    Raises:
        ValueError: If the vectors have incompatible lengths (from ``np.dot``).
    """
    a = np.asarray(vec1, dtype=float)
    b = np.asarray(vec2, dtype=float)

    denominator = np.linalg.norm(a) * np.linalg.norm(b)
    if denominator == 0:
        return 0.0
    return float(np.dot(a, b) / denominator)
 

def search_functions(df, code_query, n=3):
    """
    Search for Python files in the DataFrame that are most similar to the code_query.

    Only rows whose ``type`` is ``'file'`` are scored and returned; function-
    and class-level rows are ignored. The input DataFrame is not modified.

    Args:
        df (pd.DataFrame): DataFrame containing code data with embeddings
            (columns ``type``, ``embedding``, ``filepath`` and ``code``).
        code_query (str): The code snippet to search for.
        n (int): Number of top similar files to return.

    Returns:
        List[Dict]: List of dictionaries containing file paths and code content of
        the most similar Python files, best match first. Empty when ``df`` has no rows.
    """
    # Nothing to rank; also avoids a pointless embedding request.
    if df.empty:
        return []

    # Generate embedding for the query
    query_embedding = get_embedding(code_query)

    # Filter first so similarity is only computed for rows that can be returned.
    file_rows = df[df['type'] == 'file']
    scores = file_rows['embedding'].apply(lambda emb: cosine_similarity(emb, query_embedding))

    # assign() returns a new frame, so the caller's DataFrame gains no column.
    top_files = (
        file_rows.assign(similarity=scores)
        .sort_values(by='similarity', ascending=False)
        .head(n)
    )

    # Extract file paths and code content
    return top_files[['filepath', 'code']].to_dict(orient='records')

def process_question(df, question, tree_structure, n=5):
    """
    Answer a question about the codebase using the most relevant files.

    The ``n`` best-matching files are looked up with ``search_functions`` and
    placed, together with the question and the folder structure, into a
    single user message that is sent to the chat completions endpoint.

    Parameters:
    - df: DataFrame containing the data to search.
    - question: The user's question.
    - tree_structure: The folder structure to include in the prompt.
    - n: Number of search results to retrieve (default is 5).

    Returns:
    - The text content of the first choice in the model's response.
    """
    # Retrieve the files most similar to the question.
    matches = search_functions(df, question, n)

    # Assemble the prompt piece by piece; the concatenated text is sent as-is.
    prompt = (
        f"You are a Python expert. Use code {matches} and answer {question}. "
        f"Also mention what file are used. "
        f"Use this folder structure as well: {tree_structure}"
    )

    reply = client.chat.completions.create(
        model="gpt-4o",
        messages=[{"role": "user", "content": prompt}]
    )

    first_choice = reply.choices[0]
    return first_choice.message.content

In [None]:
# Path to the repository to index (relative to the notebook's working directory)
path = "demo_sample_app/"

# Analyze every Python file in the repository (imports, functions, classes,
# assignments, metadata), then build the DataFrame of file/function/class rows.
# NOTE: process_files_data requests one embedding per row, so this cell makes
# API calls and can be slow on large repositories.
all_funcs = extract_repository_details(path)
df = process_files_data(all_funcs, path)

# Repo directory tree as a string; it is passed to process_question as
# folder-structure context for the prompt.
tree_structure = DisplayTree(dirPath=path, stringRep=True)

In [None]:
# Template cell: replace the empty string with a question about your codebase before running.
result = process_question(df, "", tree_structure) # ask a question about your codebase

In [18]:
# Example usage: ask which data the indexed repository works with
response = process_question(df, "What data is being used in the code?", tree_structure)
print(response)


The code is utilizing three distinct sets of data, each designed for different visualizations:

1. **Table Data**:
   - **Variables**: "Name", "Age", and "City".
   - **Dataset**: 
     ```python
     table_data = pd.DataFrame({
         "Name": ["Alice", "Bob", "Charlie", "David"],
         "Age": [25, 30, 35, 40],
         "City": ["New York", "San Francisco", "Los Angeles", "Chicago"]
     })
     ```
   - **Usage**: This data is imported and used in the `pages/page1.py` file to render a table displaying names, ages, and cities.

2. **Chart Data**:
   - **Variables**: "Category" and "Values".
   - **Dataset**:
     ```python
     chart_data = pd.DataFrame({
         "Category": ["A", "B", "C", "D"],
         "Values": [10, 20, 15, 25]
     })
     ```
   - **Usage**: This data is imported and used in the `pages/page2.py` file to create a bar chart illustrating values for different categories.

3. **Bubble Chart Data**:
   - **Variables**: "X", "Y", "Size", "Category", and "Label".
 

In [19]:
# Example usage: ask for an explanation of a single page's layout
response = process_question(df, "expain the layout of page 1", tree_structure)
print(response)


To explain the layout of Page 1, we will focus on the content of the `page1.py` file within the given file structure. This file contains the logic for the "Table Page", which is part of the multi-page Dash application.

### Layout of Page 1 (`page1.py`)

#### Purpose:
Page 1 is designed to display a data table using the Dash DataTable component. It is registered as a page with the path "/page-1".

#### Code Explanation:
1. **Imports**:
   - `html` and `dash_table`: These are components from the Dash framework to create HTML elements and tables.
   - `register_page`: This function is used to register the page as part of the multi-page application, assigning it the path "/page-1".
   - `table_data`: This is the data imported from a module `data`. The `table_data` is assumed to be a pandas DataFrame containing the data to be displayed on this page.

2. **Page Registration**:
   - `register_page(__name__, path="/page-1")`: This line registers the current module (`page1.py`) as a page with 

In [23]:
# Example usage: request a code change (a new page with a bubble chart)
response = process_question(df, "to add new page named 'bubble chart' with other already existing pages and in it add bubble chart with new dummy data.", tree_structure)
print(response)



To add a new page named 'Bubble Chart' to your existing multi-page Dash application, you will need to create a new Python file for this page (i.e., `page3.py`), update the `data.py` file with new dummy data for the bubble chart, and ensure that the new page is integrated with the existing application structure.

Here is a step-by-step guide on how to do this:

### 1. Update `data.py` to include Bubble Chart Data

Add new dummy data for the bubble chart in your `data.py` file. Here's how you can do it:

```python
import pandas as pd

# Dummy data for the table
table_data = pd.DataFrame({
    "Name": ["Alice", "Bob", "Charlie", "David"],
    "Age": [25, 30, 35, 40],
    "City": ["New York", "San Francisco", "Los Angeles", "Chicago"]
})

# Dummy data for the bar chart
chart_data = pd.DataFrame({
    "Category": ["A", "B", "C", "D"],
    "Values": [10, 20, 15, 25]
})

# Dummy data for the bubble chart
bubble_chart_data = pd.DataFrame({
    "Category": ["X", "Y", "Z"],
    "Value1": [10, 40,