In [8]:
# Cell: Load TheAlgorithms/Python Dataset
import os
import subprocess
from pathlib import Path
import ast

def clone_and_extract_algorithms(
    repo_url='https://github.com/TheAlgorithms/Python.git',
    repo_dir='Python',
    min_lines=10,
):
    """Clone TheAlgorithms/Python and extract complex functions.

    The repository is cloned into ``repo_dir`` only when that path does not
    already exist. Every ``*.py`` file below it is parsed with ``ast`` and
    each function (sync or async, including methods and nested functions)
    that has a docstring and spans more than ``min_lines`` source lines is
    collected.

    Args:
        repo_url: Git URL to clone when ``repo_dir`` is missing.
        repo_dir: Local directory holding (or receiving) the checkout.
        min_lines: A function is kept only if it spans MORE than this many
            lines, counted from its ``def`` line to its last line.

    Returns:
        A list of dicts with keys ``source`` (the function's source lines,
        starting at the ``def`` line, so decorators are not included),
        ``name``, ``file`` (path as a string) and ``category`` (name of the
        file's parent directory). Files are visited in sorted path order so
        the result is reproducible across machines.

    Raises:
        subprocess.CalledProcessError: if ``git clone`` exits non-zero.
        FileNotFoundError: if the ``git`` executable is not available.
    """

    # Clone repository if not exists
    if not os.path.exists(repo_dir):
        # check=True: a failed clone must surface as an error instead of
        # silently yielding an empty dataset reported as a success.
        subprocess.run(['git', 'clone', repo_url, repo_dir], check=True)

    complex_functions = []
    skipped_files = []

    # Sorted so the dataset order does not depend on filesystem enumeration.
    for file_path in sorted(Path(repo_dir).rglob('*.py')):
        try:
            content = file_path.read_text(encoding='utf-8')
            tree = ast.parse(content)
        except (OSError, SyntaxError, ValueError, RecursionError) as exc:
            # Best effort: one unreadable/unparsable file must not abort the
            # run, but it is recorded so the omission is visible afterwards.
            # (UnicodeDecodeError is a ValueError subclass.)
            skipped_files.append((str(file_path), exc))
            continue

        # Split once per file; ast line numbers index into this list.
        lines = content.split('\n')

        for node in ast.walk(tree):
            if not isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)):
                continue
            if ast.get_docstring(node) is None:
                continue

            start_line = node.lineno - 1
            # end_lineno is missing on very old Pythons; fall back to a
            # fixed 10-line window as the original code did.
            end_line = getattr(node, 'end_lineno', None) or start_line + 10
            func_lines = lines[start_line:end_line]

            # Filter for complexity (> min_lines lines; docstring checked above)
            if len(func_lines) > min_lines:
                complex_functions.append({
                    'source': '\n'.join(func_lines),
                    'name': node.name,
                    'file': str(file_path),
                    'category': file_path.parent.name
                })

    print(f"✅ Extracted {len(complex_functions)} complex functions")
    if skipped_files:
        print(f"⚠️ Skipped {len(skipped_files)} file(s) that could not be read or parsed")
        for skipped_path, exc in skipped_files[:5]:
            print(f"  {skipped_path}: {type(exc).__name__}")
    return complex_functions

# Build the dataset once at cell level (this clones the repository on the
# first run, when no local 'Python' directory exists yet). The resulting list
# is kept in `algorithms_dataset` for the cells that follow.
algorithms_dataset = clone_and_extract_algorithms()


Cloning into 'Python'...


✅ Extracted 2203 complex functions


In [9]:
# Cell: Save Functions to JSON File
import json
from datetime import datetime

def save_algorithms_to_json(algorithms_dataset, filename="python_algorithms_dataset.json"):
    """Save the extracted algorithms dataset to a JSON file.

    The file contains a ``metadata`` object (extraction timestamp, function
    count, source, description, sorted list of categories) and a
    ``functions`` array holding ``algorithms_dataset`` unchanged.

    Args:
        algorithms_dataset: List of dicts; each must have a ``category`` key
            and be JSON-serializable.
        filename: Destination path of the JSON file.

    Returns:
        True when the file was written; False when serialization or the
        write failed (the error is printed). On a serialization failure the
        destination file is left untouched.

    Raises:
        KeyError: if an entry has no ``category`` key.
    """

    # Count functions per category once; used for both metadata and the report.
    category_counts = {}
    for func in algorithms_dataset:
        cat = func['category']
        category_counts[cat] = category_counts.get(cat, 0) + 1

    # Add metadata to the dataset
    dataset_with_metadata = {
        "metadata": {
            "extraction_date": datetime.now().isoformat(),
            "total_functions": len(algorithms_dataset),
            "source": "TheAlgorithms/Python GitHub Repository",
            "description": "Complex Python functions with docstrings (>10 lines)",
            # Sorted so the file content does not vary with set/hash ordering.
            "categories": sorted(category_counts, key=str)
        },
        "functions": algorithms_dataset
    }

    try:
        # Serialize fully BEFORE opening the file: opening with 'w' truncates
        # it, so a mid-dump error would otherwise leave a corrupt file and
        # destroy any previous good one.
        payload = json.dumps(dataset_with_metadata,
                             indent=2,            # Pretty formatting
                             ensure_ascii=False)  # Handle unicode characters
        with open(filename, 'w', encoding='utf-8') as f:
            f.write(payload)
        size_mb = os.path.getsize(filename) / (1024*1024)
    except (OSError, TypeError, ValueError) as e:
        # TypeError/ValueError: non-serializable or circular data;
        # OSError: the path could not be written or inspected.
        print(f"❌ Error saving to JSON: {e}")
        return False

    print(f"✅ Successfully saved {len(algorithms_dataset)} functions to '{filename}'")
    print(f"📁 File size: {size_mb:.2f} MB")

    # Display some statistics: the ten largest categories
    print(f"\n📊 Functions by category:")
    for cat, count in sorted(category_counts.items(), key=lambda x: x[1], reverse=True)[:10]:
        print(f"  {cat}: {count} functions")

    return True

# Persist the extracted dataset to the default JSON file. As the last
# expression of the cell, the boolean success flag is shown as the cell output.
save_algorithms_to_json(algorithms_dataset)


✅ Successfully saved 2203 functions to 'python_algorithms_dataset.json'
📁 File size: 2.50 MB

📊 Functions by category:
  maths: 248 functions
  binary_tree: 136 functions
  machine_learning: 114 functions
  graphs: 114 functions
  ciphers: 95 functions
  strings: 80 functions
  dynamic_programming: 73 functions
  sorts: 70 functions
  physics: 64 functions
  linked_list: 59 functions


True