In [5]:
import json
import os
import pandas as pd
import re

def natural_sort_key(filename):
    """Build a numeric sort key from the digit runs in *filename*.

    Every maximal run of digits is converted to an int, in order of
    appearance, so that e.g. ``..._10`` sorts after ``..._2``. A name
    without any digits yields ``[0]``.
    """
    key = list(map(int, re.findall(r'\d+', filename)))
    if not key:
        return [0]
    return key

def _load_submissions(directory, files):
    """Return every submission from *files*, each tagged with its source file name."""
    submissions = []
    for filename in files:
        # JSON is UTF-8 by specification; do not depend on the locale default.
        with open(os.path.join(directory, filename), encoding="utf-8") as json_file:
            data = json.load(json_file)
        for submission in data['submissions_dump']:
            submission['source_file'] = filename  # Add source file info
            submissions.append(submission)
    return submissions


def _parse_memory_mb(memory):
    """Return the leading number of a string such as "19.1 MB", or None if unusable."""
    if memory == "N/A":
        return None
    try:
        return float(memory.split()[0])
    except (AttributeError, ValueError, IndexError):
        # AttributeError: memory is not a string (e.g. JSON null).
        # ValueError / IndexError: non-numeric or empty string.
        return None


def _build_rows(question_data, question_order, languages):
    """Return one table row per question: predefined order first, then the rest."""
    known = set(question_order)
    ordered = [q for q in question_order if q in question_data]
    # Questions present in the data but absent from the predefined order
    # are appended in the order they were first seen.
    ordered.extend(q for q in question_data if q not in known)

    rows = []
    for question in ordered:
        lang_data = question_data[question]
        row = {'Question': question}
        for lang in languages:
            row[lang] = lang_data.get(lang)  # None marks a missing value
        rows.append(row)
    return rows


def main():
    """Summarise memory usage per question and language from LeetCode dumps.

    Reads every ``.json`` file in ``../output/leetcode/copilot`` (relative to
    the current working directory) in natural numeric order, collects the
    entries under each file's ``submissions_dump`` key, and builds a table of
    memory values with one row per question and one column per language.
    When a question/language pair occurs more than once, the value from the
    submission processed last is kept.

    The table is printed, written to ``leetcode_memory_analysis.csv`` and, if
    an Excel writer is installed, to ``leetcode_memory_analysis.xlsx``. A
    per-file submission count and a per-file question listing are printed too.

    Raises:
        FileNotFoundError: if the input directory does not exist.
        KeyError: if a file lacks ``submissions_dump`` or a submission lacks
            ``title``, ``lang_name`` or ``memory``.
    """
    # Define the specific question order
    question_order = [
        "Two Sum",
        "Palindrome Number",
        "Valid Parentheses",
        "Merge Two Sorted Lists",
        "Best Time to Buy and Sell Stock",
        "Valid Anagram",
        "Contains Duplicate",
        "Maximum Subarray",
        "Majority Element",
        "Invert Binary Tree",
        "Climbing Stairs",
        "Symmetric Tree",
        "Binary Search",
        "Intersection of Two Arrays",
        "Missing Number",
        "Add Two Numbers",
        "Longest Substring Without Repeating Characters",
        "Group Anagrams",
        "Top K Frequent Elements",
        "Product of Array Except Self",
        "Sort Colors",
        "Kth Largest Element in an Array",
        "Find the Duplicate Number",
        "Set Matrix Zeroes",
        "Search in Rotated Sorted Array",
        "Word Break",
        "Minimum Path Sum",
        "Number of Islands",
        "Clone Graph",
        "LRU Cache",
        "Binary Tree Level Order Traversal",
        "Letter Combinations of a Phone Number",
        "Subsets",
        "Spiral Matrix",
        "Find All Anagrams in a String",
        "Merge K Sorted Lists",
        "Longest Valid Parentheses",
        "Trapping Rain Water",
        "Word Ladder",
        "Median of Two Sorted Arrays",
        "Regular Expression Matching",
        "N-Queens",
        "Minimum Window Substring",
        "Count Subtrees With Max Distance Between Cities",
        "Dice Roll Simulation",
        "Serialize and Deserialize Binary Tree",
        "Shortest Path in a Grid with Obstacles Elimination",
        "Binary Tree Maximum Path Sum",
        "Burst Balloons",
        "LFU Cache",
    ]

    # Define the order of languages for the output
    languages = ['Java', 'JavaScript', 'PHP', 'Python']

    directory = os.path.join("..", "output", "leetcode", "copilot")

    # Only regular *.json files: stray files (e.g. .DS_Store) would otherwise
    # make json.load fail and abort the whole run.
    files = [
        f for f in os.listdir(directory)
        if f.lower().endswith(".json") and os.path.isfile(os.path.join(directory, f))
    ]
    # Natural order: leetcode_submissions_1, leetcode_submissions_2, ...
    files.sort(key=natural_sort_key)

    print(f"Processing files in order: {files}")
    print("=" * 50)

    submissions = _load_submissions(directory, files)

    # Single pass over the submissions: memory per question/language plus the
    # per-file bookkeeping used by the summaries printed further down.
    question_data = {}
    file_counts = {}
    file_question_map = {}
    for submission in submissions:
        question_title = submission['title']
        language = submission['lang_name']
        memory = submission['memory']
        source_file = submission['source_file']

        file_counts[source_file] = file_counts.get(source_file, 0) + 1
        file_question_map.setdefault(source_file, set()).add(question_title)

        memory_value = _parse_memory_mb(memory)
        if memory_value is None:
            continue
        # A later submission for the same question/language overwrites an earlier one.
        question_data.setdefault(question_title, {})[language] = memory_value

    table_data = _build_rows(question_data, question_order, languages)

    # Passing columns= fixes the column order and keeps the frame well-formed
    # (Question + languages) even when there are no rows at all.
    columns = ['Question'] + languages
    df = pd.DataFrame(table_data, columns=columns)

    # Display the table
    print("Memory Usage by Question and Language (in MB):")
    print("=" * 60)
    print(df.to_string(index=False, na_rep='N/A', float_format='%.1f'))

    # Display file processing summary
    print("\n" + "=" * 50)
    print("FILE PROCESSING SUMMARY:")
    print("=" * 50)
    for i, filename in enumerate(files, 1):
        print(f"{i}. {filename}: {file_counts.get(filename, 0)} submissions")

    # Save to CSV for Excel
    output_file = "leetcode_memory_analysis.csv"
    df.to_csv(output_file, index=False, na_rep='N/A', float_format='%.1f')
    print(f"\nData saved to {output_file}")

    # Also save as Excel file (optional: needs an Excel writer such as openpyxl)
    excel_file = "leetcode_memory_analysis.xlsx"
    try:
        df.to_excel(excel_file, index=False, na_rep='N/A', float_format='%.1f')
    except ImportError:
        print("Note: To save as Excel file, install openpyxl: pip install openpyxl")
    else:
        print(f"Data also saved to {excel_file}")

    # Additional analysis: Show which questions came from which files
    print("\n" + "=" * 50)
    print("QUESTION DISTRIBUTION ACROSS FILES:")
    print("=" * 50)

    # files is already in natural order, so no re-sort is needed here.
    for filename in files:
        print(f"\n{filename}:")
        for question in sorted(file_question_map.get(filename, set())):
            print(f"  - {question}")

# Run the analysis only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()

Processing files in order: ['leetcode_submissions_1.json', 'leetcode_submissions_2.json', 'leetcode_submissions_3.json', 'leetcode_submissions_4.json', 'leetcode_submissions_5.json', 'leetcode_submissions_6.json', 'leetcode_submissions_7.json', 'leetcode_submissions_8.json', 'leetcode_submissions_9.json', 'leetcode_submissions_10.json']
Memory Usage by Question and Language (in MB):
                                          Question  Java  JavaScript   PHP  Python
                                           Two Sum  45.3        56.4  21.0    12.9
                                 Palindrome Number  44.0        63.6  20.4    12.5
                                 Valid Parentheses  41.8        56.1  21.4    12.6
                            Merge Two Sorted Lists  42.6        58.1  20.7    12.5
                   Best Time to Buy and Sell Stock  61.3        66.2  27.0    19.1
                                     Valid Anagram  43.2        54.9  22.6    12.4
                                C