In [6]:
import re
from dataclasses import dataclass
from typing import List, Optional
from pathlib import Path
import os
from pathlib import Path
from openai import AzureOpenAI
from dotenv import load_dotenv
import json
from dataclasses import asdict
# Load .env
load_dotenv()

True

In [3]:


@dataclass
class SASCodeChunk:
    """A single parsed unit of SAS code together with its source metadata."""
    chunk_type: str  # 'proc', 'macro', 'data_step', 'other'
    code: str  # chunk source text (comment lines are not included)
    comments: str  # newline-joined comment lines associated with the chunk
    name: Optional[str] = None  # PROC name or macro name; None for DATA steps
    line_start: int = 0  # 0-based index of the chunk's first line in the file
    line_end: int = 0  # 0-based index of the chunk's last line in the file
    filepath: str = ""  # source file path exactly as passed to the parser
    filename: str = ""  # final path component of filepath
    explanation: Optional[str] = None  # not set by the parser; filled in afterwards


def parse_sas_file(filepath: str) -> List[SASCodeChunk]:
    """
    Parse a single SAS file and split it into PROC, DATA step and macro chunks.

    A ``%macro ... %mend`` definition is returned as one chunk that includes
    every PROC/DATA step inside it. Outside macros, a chunk starts at a
    ``proc``/``data`` line and ends at a line whose last statement is ``run;``
    or ``quit;`` (or when the next chunk starts, or at end of file).

    Comment handling:
      * Lines starting with ``*`` or ``/*`` are comments. A ``/* ... */`` block
        that spans several lines is treated as a comment in full, so its inner
        lines can never be mistaken for a ``data``/``proc`` statement.
      * Comment lines are kept out of ``code`` and stored in ``comments``.
      * Comments that directly precede a chunk (only blank lines in between)
        are attached to that chunk. Comments seen while a chunk is still open
        stay with the open chunk.

    Statements that are outside any PROC, DATA step or macro (for example
    ``libname`` or ``%let``) are not captured.

    Args:
        filepath: Path of the SAS file to read (decoded as UTF-8).

    Returns:
        The chunks in file order. ``line_start``/``line_end`` are 0-based
        line indexes into the file.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # Get filename for metadata
    filename = Path(filepath).name

    # Regex patterns
    proc_pattern = re.compile(r'^\s*proc\s+(\w+)', re.IGNORECASE)
    macro_pattern = re.compile(r'^\s*%macro\s+(\w+)', re.IGNORECASE)
    macro_end_pattern = re.compile(r'^\s*%mend', re.IGNORECASE)
    data_step_pattern = re.compile(r'^\s*data\s+', re.IGNORECASE)
    # Matches a line whose final statement is run; / quit; -- either alone on
    # the line or after earlier statements ("proc sort data=a; by x; run;").
    step_end_pattern = re.compile(r'(?:^|;)\s*(?:run|quit)\s*;\s*$', re.IGNORECASE)

    with open(filepath, 'r', encoding='utf-8') as f:
        lines = f.readlines()

    chunks: List[SASCodeChunk] = []
    current_chunk: List[str] = []
    current_comments: List[str] = []
    chunk_type = None
    chunk_name = None
    start_line = 0
    inside_macro = False
    in_block_comment = False

    def flush(end_line: int) -> None:
        """Emit the open chunk (if any) ending at end_line and reset chunk state."""
        nonlocal current_chunk, current_comments, chunk_type, chunk_name
        if current_chunk:
            chunks.append(SASCodeChunk(
                chunk_type=chunk_type or 'other',
                code=''.join(current_chunk),
                comments='\n'.join(current_comments),
                name=chunk_name,
                line_start=start_line,
                line_end=end_line,
                filepath=filepath,
                filename=filename
            ))
            # Comments are consumed only when a chunk is actually emitted, so
            # comments collected while no chunk is open survive for the next one.
            current_comments = []
        current_chunk = []
        chunk_type = None
        chunk_name = None

    def begin(kind: str, name: Optional[str], first_line: str, index: int) -> None:
        """Close any unterminated chunk, then open a new one at line index."""
        nonlocal current_chunk, chunk_type, chunk_name, start_line
        flush(index - 1)
        chunk_type = kind
        chunk_name = name
        current_chunk = [first_line]
        start_line = index

    for i, line in enumerate(lines):
        stripped = line.strip()

        # Continuation of a multi-line /* ... */ comment
        if in_block_comment:
            current_comments.append(stripped)
            if '*/' in stripped:
                in_block_comment = False
            continue

        # Collect comments
        if stripped.startswith('*') or stripped.startswith('/*'):
            current_comments.append(stripped)
            # A block comment that is not closed on its own line keeps going.
            if stripped.startswith('/*') and '*/' not in stripped[2:]:
                in_block_comment = True
            continue

        # Detect MACRO start
        macro_match = macro_pattern.match(line)
        if macro_match:
            begin('macro', macro_match.group(1), line, i)
            inside_macro = True
            continue

        # Detect MACRO end (a stray %mend outside a macro is ignored)
        if macro_end_pattern.match(line):
            if inside_macro:
                current_chunk.append(line)
                flush(i)
                inside_macro = False
            continue

        # If inside macro, just add lines
        if inside_macro:
            current_chunk.append(line)
            continue

        # Outside macro - detect PROC / DATA step, or extend the open chunk
        proc_match = proc_pattern.match(line)
        if proc_match:
            begin('proc', proc_match.group(1), line, i)
        elif data_step_pattern.match(line):
            begin('data_step', None, line, i)
        elif chunk_type:
            current_chunk.append(line)
        elif stripped:
            # An uncaptured top-level statement: any pending comments described
            # it, not whichever chunk happens to come later.
            current_comments = []

        # Detect chunk end for PROC/DATA
        if current_chunk and step_end_pattern.search(line):
            flush(i)

    # Handle remaining chunk (unterminated step or macro at end of file)
    flush(len(lines) - 1)

    return chunks

In [None]:


# Initialize Azure OpenAI client
# The API key, endpoint and deployment name are read from the environment via
# os.environ[...], which raises KeyError when a variable is missing. Only the
# API version has a fallback default.
client = AzureOpenAI(
    api_key=os.environ["AZURE_OPENAI_API_KEY"],
    api_version=os.environ.get("AZURE_OPENAI_API_VERSION", "2024-02-15-preview"),
    azure_endpoint=os.environ["AZURE_OPENAI_ENDPOINT"]
)
# Passed as the `model` argument of every chat completion request below.
deployment_name = os.environ["AZURE_OPENAI_DEPLOYMENT"]

def get_chunk_explanation(chunk: SASCodeChunk, full_file_content: str) -> str:
    """Ask the Azure OpenAI deployment to explain one SAS chunk.

    The prompt contains the whole source file as context followed by the
    chunk's type, name, line range and code. The request goes through the
    module-level ``client`` using ``deployment_name`` as the model.

    Args:
        chunk: The parsed chunk to explain.
        full_file_content: Complete text of the file the chunk came from.

    Returns:
        The message content of the first choice in the completion response.
    """
    # Assemble the prompt line by line; joined with newlines below.
    prompt_lines = [
        "You are a SAS programming expert. Analyze this SAS code chunk and provide a clear explanation.",
        "",
        "FULL FILE CONTEXT:",
        "```sas",
        f"{full_file_content}",
        "```",
        "",
        "SPECIFIC CHUNK TO EXPLAIN:",
        f"Type: {chunk.chunk_type}",
        f"Name: {chunk.name or 'N/A'}",
        f"Lines: {chunk.line_start}-{chunk.line_end}",
        "",
        "Code:",
        "```sas",
        f"{chunk.code}",
        "```",
        "",
        "Provide a concise explanation covering:",
        "1. What this chunk does",
        "2. Key operations or transformations",
        "3. How it fits in the overall context",
    ]
    prompt = "\n".join(prompt_lines)

    system_message = {
        "role": "system",
        "content": "You are a SAS programming expert who explains code clearly and concisely.",
    }
    user_message = {"role": "user", "content": prompt}

    completion = client.chat.completions.create(
        model=deployment_name,
        messages=[system_message, user_message],
        temperature=0.3,
        max_tokens=500,
    )

    first_choice = completion.choices[0]
    return first_choice.message.content

# Process all SAS files in folder
folder_path = 'small_repo'
all_chunks = []

# rglob yields paths in no guaranteed order; sort them so the processing order
# (and therefore the order of chunks in the saved JSON) is the same on every run.
sas_files = sorted(Path(folder_path).rglob('*.sas'))
print(f"Found {len(sas_files)} SAS files\n")

for file_idx, sas_file in enumerate(sas_files):
    print(f"\n{'#'*60}")
    print(f"FILE {file_idx+1}/{len(sas_files)}: {sas_file.name}")
    print(f"{'#'*60}")

    # Read full file content (sent to the model as context for every chunk)
    with open(sas_file, 'r', encoding='utf-8') as f:
        full_file_content = f.read()

    # Parse chunks
    chunks = parse_sas_file(str(sas_file))
    print(f"Found {len(chunks)} chunks in this file")

    # Process each chunk
    for i, chunk in enumerate(chunks):
        print(f"\n  Chunk {i+1}/{len(chunks)}: {chunk.chunk_type} - {chunk.name}")

        # Best effort: a failed request is reported and the chunk is kept with
        # no explanation, so one bad call does not abort the whole run.
        try:
            chunk.explanation = get_chunk_explanation(chunk, full_file_content)
            print("  ✓ Explanation added")
        except Exception as e:
            print(f"  ✗ Error: {e}")
            chunk.explanation = None

    all_chunks.extend(chunks)



Found 4 SAS files


############################################################
FILE 1/4: d0_dm.sas
############################################################
Found 20 chunks in this file

  Chunk 1/20: data_step - None
  ✓ Explanation added

  Chunk 2/20: proc - sort
  ✓ Explanation added

  Chunk 3/20: proc - sort
  ✓ Explanation added

  Chunk 4/20: data_step - None
  ✓ Explanation added

  Chunk 5/20: data_step - None
  ✓ Explanation added

  Chunk 6/20: proc - sort
  ✓ Explanation added

  Chunk 7/20: data_step - None
  ✓ Explanation added

  Chunk 8/20: proc - sort
  ✓ Explanation added

  Chunk 9/20: data_step - None
  ✓ Explanation added

  Chunk 10/20: data_step - None
  ✓ Explanation added

  Chunk 11/20: proc - sort
  ✓ Explanation added

  Chunk 12/20: data_step - None
  ✓ Explanation added

  Chunk 13/20: proc - sort
  ✓ Explanation added

  Chunk 14/20: data_step - None
  ✓ Explanation added

  Chunk 15/20: data_step - None
  ✓ Explanation added

  Chunk 16/20: proc - 

NameError: name 'chunks_to_json' is not defined

In [7]:

def chunks_to_json(chunks: List[SASCodeChunk], output_file: str = None):
    """
    Serialize SAS code chunks as pretty-printed JSON.

    Args:
        chunks: List of SASCodeChunk objects
        output_file: Optional filepath to write the JSON to. When omitted
            (or empty) nothing is written and the JSON text is returned.

    Returns:
        The JSON string when no output_file is given; None after writing
        to output_file.
    """
    # Each dataclass instance becomes a plain dict
    records = list(map(asdict, chunks))

    # No destination: hand the JSON text back to the caller
    if not output_file:
        return json.dumps(records, indent=2, ensure_ascii=False)

    with open(output_file, 'w', encoding='utf-8') as handle:
        json.dump(records, handle, indent=2, ensure_ascii=False)
    print(f"Saved {len(chunks)} chunks to {output_file}")

In [8]:
# Save all chunks to JSON
# Requires all_chunks and sas_files from the processing cell and chunks_to_json
# from the cell above, so those cells must have been run first.
chunks_to_json(all_chunks, 'chunk_explanation_12052025.json')
print(f"\n\nTotal: Processed {len(all_chunks)} chunks from {len(sas_files)} files")

Saved 46 chunks to chunk_explanation_12052025.json


Total: Processed 46 chunks from 4 files
