In [None]:
import pandas as pd
import random
import ast
import re
from typing import List, Tuple

class CodeAugmenter:
    """Rule-based augmenter for (buggy code, fixed code, commit message) samples.

    Every ``augment_*`` code method takes the buggy and the fixed snippet of
    one sample and returns a ``(new_buggy, new_fixed)`` tuple.  When a rule
    does not apply, the inputs are returned unchanged, which
    ``augment_dataset`` uses to skip no-op variants.

    All rewrites are textual (``str.replace`` / regular expressions), not
    AST based, so text inside string literals and comments is rewritten as
    well.
    """

    # Comment lines that ``augment_comment_addition`` may insert.
    _COMMENT_STYLES = (
        "# TODO: Fix this\n",
        "# FIXME: Bug here\n",
        "# Note: This needs attention\n",
        "# Issue: Syntax error\n",
        "# Bug: Missing element\n",
    )

    # Replacement names for a variable called ``x``.
    _SCALAR_NAMES = ('data', 'items', 'values')

    # (old text, new text) swaps that do not introduce or rename any name.
    _EXPRESSION_SWAPS = (
        ('if n == 1', 'if 1 == n'),
        ('if x == 5', 'if 5 == x'),
        ('if (a > b)', 'if (b < a)'),
    )

    # (trigger text, old identifier, new identifier): when the trigger is
    # present, the identifier is renamed everywhere so header and body agree.
    _HEADER_RENAMES = (
        ('for i in range(10)', 'i', 'index'),
        ('def factorial(n)', 'n', 'number'),
    )

    # Known commit messages and their interchangeable paraphrases.
    _PARAPHRASES = {
        "Improved readability with proper indentation": [
            "Enhanced code formatting with correct indentation",
            "Fixed indentation for better readability",
            "Adjusted code formatting standards"
        ],
        "Corrected conditional operator mistake": [
            "Fixed conditional statement error",
            "Resolved issue with conditional operator",
            "Corrected if condition syntax"
        ],
        "Resolved off-by-one error in loop": [
            "Fixed loop boundary condition",
            "Corrected loop iteration error",
            "Resolved indexing issue in loop"
        ],
        "Added missing parentheses for print function": [
            "Fixed print function syntax",
            "Added required parentheses for print",
            "Corrected print statement formatting"
        ],
        "Fixed bug in recursive function call": [
            "Resolved recursion issue",
            "Fixed recursive function error",
            "Corrected recursion implementation"
        ],
        "Refactored variable naming for clarity": [
            "Improved variable names for readability",
            "Renamed variables for better understanding",
            "Enhanced code clarity through naming"
        ],
        "Fixed syntax error due to missing colon": [
            "Resolved missing colon syntax error",
            "Added required colon for syntax",
            "Fixed colon omission in code"
        ],
        "Updated function definition to include colon": [
            "Corrected function definition syntax",
            "Added missing colon in function def",
            "Fixed function header formatting"
        ],
        "Minor syntax corrections": [
            "Small syntax fixes",
            "Minor code formatting adjustments",
            "Syntax error corrections"
        ],
        "Indentation fixed for proper code block": [
            "Corrected code block indentation",
            "Fixed indentation for code structure",
            "Adjusted spacing for proper blocks"
        ]
    }

    # Column order of the frame produced by ``augment_dataset``.
    _OUTPUT_COLUMNS = ('id', 'augmentation_type', 'buggy_code', 'fixed_code',
                       'commit_message', 'commit_url', 'date')

    def __init__(self):
        # Candidate replacement names used by ``augment_variable_renaming``.
        self.variable_names = ['data', 'items', 'numbers', 'values', 'elements',
                              'lst', 'arr', 'collection', 'seq', 'array']
        self.function_names = ['process', 'calculate', 'compute', 'execute', 'run',
                              'handle', 'perform', 'do_work', 'main', 'worker']

    def load_data(self, file_path):
        """Load the original dataset from the CSV file at ``file_path``."""
        return pd.read_csv(file_path)

    # ------------------------------------------------------------------
    # Identifier helpers
    # ------------------------------------------------------------------
    @staticmethod
    def _mentions(code, name):
        """Return True if ``name`` occurs in ``code`` as a whole word."""
        return re.search(rf'\b{re.escape(name)}\b', code) is not None

    @staticmethod
    def _assigns(code, name):
        """Return True if ``code`` contains ``name =`` (but not ``name ==``)."""
        pattern = rf'(?<![\w.\\]){re.escape(name)}\s*=(?!=)'
        return re.search(pattern, code) is not None

    @staticmethod
    def _rename_identifier(code, old_name, new_name):
        """Rename whole-word occurrences of ``old_name`` to ``new_name``.

        Occurrences that are part of a longer name, follow a ``.`` (attribute
        access) or a backslash (escape sequence such as ``\\n``), or are
        directly followed by ``(`` (a call) are left alone.
        """
        pattern = rf'(?<![\w.\\]){re.escape(old_name)}(?!\w|\s*\()'
        return re.sub(pattern, new_name, code)

    def _rename_assigned(self, code, targets, new_name):
        """Rename each name in ``targets`` that ``code`` itself assigns."""
        for target in targets:
            if self._assigns(code, target):
                code = self._rename_identifier(code, target, new_name)
        return code

    # ------------------------------------------------------------------
    # Code augmentations
    # ------------------------------------------------------------------
    def augment_variable_renaming(self, buggy_code, fixed_code):
        """Rename a variable consistently in both snippets.

        Two rules are tried in order, triggered by an assignment in
        ``buggy_code``:

        * a variable called ``list`` or ``lst`` is renamed to a random entry
          of ``self.variable_names``;
        * a variable called ``x`` is renamed to ``data``, ``items`` or
          ``values``.

        Matching is done on whole identifiers, so names such as ``max`` or
        ``mylist`` are not touched, and every use of the variable is renamed
        together with its assignment.  Replacement names that already occur
        in either snippet are not used, to avoid merging two variables.

        Returns:
            ``(new_buggy, new_fixed)``; the inputs unchanged if no rule
            applies or no collision-free replacement name is available.
        """
        rules = (
            (('list', 'lst'), self.variable_names),
            (('x',), self._SCALAR_NAMES),
        )
        for targets, pool in rules:
            if not any(self._assigns(buggy_code, target) for target in targets):
                continue
            candidates = [
                name for name in pool
                if not self._mentions(buggy_code, name)
                and not self._mentions(fixed_code, name)
            ]
            if not candidates:
                return buggy_code, fixed_code
            new_var = random.choice(candidates)
            return (self._rename_assigned(buggy_code, targets, new_var),
                    self._rename_assigned(fixed_code, targets, new_var))
        return buggy_code, fixed_code

    def augment_comment_addition(self, buggy_code, fixed_code):
        """Insert a random comment line before each ``# Sample ID:`` marker.

        The buggy snippet receives the comment as is; the fixed snippet
        receives the same comment prefixed with ``# Fixed``.  Snippets
        without the marker are returned unchanged.
        """
        comment_style = random.choice(self._COMMENT_STYLES)

        new_buggy = buggy_code.replace('# Sample ID:', f'{comment_style}# Sample ID:')
        new_fixed = fixed_code.replace('# Sample ID:', f'# Fixed{comment_style[1:]}# Sample ID:')
        return new_buggy, new_fixed

    @staticmethod
    def _compact_operators(code):
        """Remove the spaces around ``=`` and ``==`` where they are single."""
        return code.replace(' = ', '=').replace(' == ', '==')

    @staticmethod
    def _space_operators(code):
        """Put exactly one space on each side of ``=`` and ``==``.

        The lookbehind keeps compound operators (``!=``, ``<=``, ``+=``,
        ``:=`` ...) intact, and ``==`` is always treated as one token.
        """
        return re.sub(r'[ \t]*(?<![=!<>+\-*/%&|^@:])(==|=)(?!=)[ \t]*', r' \1 ', code)

    @staticmethod
    def _collapse_inner_spaces(code):
        """Collapse runs of spaces inside a line, leaving indentation alone."""
        return re.sub(r'(?<=\S) {2,}', ' ', code)

    @staticmethod
    def _widen_indentation(code):
        """Double the leading whitespace of every indented, non-blank line."""
        return re.sub(r'(?m)^[ \t]+(?=\S)', lambda m: m.group(0) * 2, code)

    def augment_whitespace_variation(self, buggy_code, fixed_code):
        """Apply one randomly chosen whitespace variation to both snippets.

        The variations are: compact operators, space out operators, collapse
        repeated inner spaces, and double the indentation width.  None of
        them splits an operator or changes the relative indentation of
        lines, so block structure is kept.
        """
        variations = [
            self._compact_operators,
            self._space_operators,
            self._collapse_inner_spaces,
            self._widen_indentation,
        ]

        variation = random.choice(variations)
        return variation(buggy_code), variation(fixed_code)

    def augment_equivalent_expression(self, buggy_code, fixed_code):
        """Rewrite the first matching pattern into an equivalent form.

        Comparison patterns are swapped textually.  The loop-variable and
        parameter patterns rename the identifier throughout both snippets so
        that the body keeps referring to the renamed name; such a rule is
        skipped when the new name already occurs in either snippet.

        A rule fires when its pattern occurs in ``buggy_code``; the same
        rewrite is then applied to ``fixed_code``.
        """
        for old, new in self._EXPRESSION_SWAPS:
            if old in buggy_code:
                return buggy_code.replace(old, new), fixed_code.replace(old, new)

        for trigger, old_name, new_name in self._HEADER_RENAMES:
            if trigger not in buggy_code:
                continue
            if self._mentions(buggy_code, new_name) or self._mentions(fixed_code, new_name):
                continue
            return (self._rename_identifier(buggy_code, old_name, new_name),
                    self._rename_identifier(fixed_code, old_name, new_name))
        return buggy_code, fixed_code

    def augment_commit_message(self, original_message):
        """Return a random paraphrase of a known commit message.

        The first known message contained in ``original_message`` decides
        which paraphrases are eligible; unknown messages are returned as is.
        """
        for key, variants in self._PARAPHRASES.items():
            if key in original_message:
                return random.choice(variants)
        return original_message

    def augment_dataset(self, df, augmentations_per_sample=3):
        """Build the augmented dataset from ``df``.

        Args:
            df: frame with the columns ``id``, ``buggy_code``, ``fixed_code``,
                ``commit_message``, ``commit_url`` and ``date``.
            augmentations_per_sample: maximum number of augmented variants
                kept per input row (``None`` for no limit).  Augmentations
                are tried in a fixed order and ones that leave the code
                unchanged do not count.

        Returns:
            A new ``DataFrame``.  Each input row yields its original with id
            ``"<id>_0"`` followed by its variants ``"<id>_1"``, ``"<id>_2"``,
            ... each tagged with its ``augmentation_type``.
        """
        augmentation_methods = (
            ('variable_renaming', self.augment_variable_renaming),
            ('comment_addition', self.augment_comment_addition),
            ('whitespace_variation', self.augment_whitespace_variation),
            ('equivalent_expression', self.augment_equivalent_expression),
        )
        augmented_data = []

        for _, row in df.iterrows():
            original_id = row['id']
            buggy_code = row['buggy_code']
            fixed_code = row['fixed_code']

            # Add original sample
            augmented_data.append({
                'id': f"{original_id}_0",
                'augmentation_type': 'original',
                'buggy_code': buggy_code,
                'fixed_code': fixed_code,
                'commit_message': row['commit_message'],
                'commit_url': row['commit_url'],
                'date': row['date']
            })

            variant_count = 0
            for aug_type, aug_method in augmentation_methods:
                if (augmentations_per_sample is not None
                        and variant_count >= augmentations_per_sample):
                    break
                try:
                    new_buggy, new_fixed = aug_method(buggy_code, fixed_code)
                    new_message = self.augment_commit_message(row['commit_message'])
                except Exception as e:
                    # Best effort: one failing rule must not abort the run.
                    print(f"Error in {aug_type} for sample {original_id}: {e}")
                    continue

                # Only add if code was actually modified
                if new_buggy == buggy_code and new_fixed == fixed_code:
                    continue

                variant_count += 1
                augmented_data.append({
                    'id': f"{original_id}_{variant_count}",
                    'augmentation_type': aug_type,
                    'buggy_code': new_buggy,
                    'fixed_code': new_fixed,
                    'commit_message': new_message,
                    'commit_url': row['commit_url'],
                    'date': row['date']
                })

        # Explicit columns keep the schema stable even for an empty input.
        return pd.DataFrame(augmented_data, columns=list(self._OUTPUT_COLUMNS))

def main(input_path='code_bug_fix_pairs.csv',
         output_path='code_bug_fix_pairs_augmented.csv',
         sample_path='augmented_sample.csv',
         augmentations_per_sample=3):
    """Run the augmentation pipeline: load, augment, report, save.

    Args:
        input_path: CSV file holding the original bug-fix pairs.
        output_path: CSV file that receives the full augmented dataset.
        sample_path: CSV file that receives the first 50 augmented rows for
            manual inspection.
        augmentations_per_sample: forwarded to
            ``CodeAugmenter.augment_dataset``.

    The defaults reproduce the file names that were previously hard-coded.
    """
    # Initialize augmenter
    augmenter = CodeAugmenter()

    # Load original data
    print("Loading original dataset...")
    original_df = augmenter.load_data(input_path)

    # Apply augmentations
    print("Applying augmentations...")
    augmented_df = augmenter.augment_dataset(
        original_df, augmentations_per_sample=augmentations_per_sample
    )

    # Report sizes
    original_count = len(original_df)
    augmented_count = len(augmented_df)
    print(f"Original samples: {original_count}")
    print(f"Augmented samples: {augmented_count}")
    if original_count:
        print(f"Augmentation factor: {augmented_count/original_count:.2f}x")
    else:
        # An empty input would otherwise raise ZeroDivisionError here.
        print("Augmentation factor: n/a (empty input)")

    # Save augmented dataset
    augmented_df.to_csv(output_path, index=False)
    print(f"Augmented dataset saved to '{output_path}'")

    # Save a sample for inspection
    sample_df = augmented_df.head(50)
    sample_df.to_csv(sample_path, index=False)
    print(f"Sample saved to '{sample_path}'")

# Run the augmentation pipeline only when this code is executed directly
# (script or notebook cell), not when it is imported as a module.
if __name__ == "__main__":
    main()

Loading original dataset...
Applying augmentations...
Original samples: 1000
Augmented samples: 3258
Augmentation factor: 3.26x
Augmented dataset saved to 'code_bug_fix_pairs_augmented.csv'
Sample saved to 'augmented_sample.csv'


In [None]:
# Advanced augmentation techniques
class AdvancedCodeAugmenter(CodeAugmenter):
    def augment_loop_transformation(self, buggy_code, fixed_code):
        """Rewrite ``for i in range(N):`` loops as equivalent ``while`` loops.

        A loop header standing on its own line is replaced by ``i = 0`` and
        ``while i < N:`` at the header's indentation, and ``i += 1`` is
        appended to the loop body so the generated loop terminates.

        A loop is left as it is when no indented body follows the header or
        when the body contains ``continue`` (which would skip the appended
        increment).  Each snippet is converted using its own loop bounds.

        Returns:
            ``(new_buggy, new_fixed)`` when at least one loop was converted
            in *both* snippets; otherwise the inputs unchanged, so the pair
            never ends up half-transformed.
        """
        header_re = re.compile(r'^([ \t]*)for i in range\((\d+)\):([ \t]*(?:#.*)?)$')

        def is_body_line(line, indent):
            # Blank lines may sit inside a body; other lines must be indented
            # deeper than the loop header.
            if not line.strip():
                return True
            return line.startswith(indent) and line[len(indent):][:1] in (' ', '\t')

        def convert(code):
            lines = code.split('\n')
            converted = []
            changed = False
            pos = 0
            while pos < len(lines):
                header = header_re.match(lines[pos])
                if header is None:
                    converted.append(lines[pos])
                    pos += 1
                    continue

                indent, bound, tail = header.groups()
                stop = pos + 1
                while stop < len(lines) and is_body_line(lines[stop], indent):
                    stop += 1
                # Trailing blank lines belong to the code after the loop.
                while stop > pos + 1 and not lines[stop - 1].strip():
                    stop -= 1
                body = lines[pos + 1:stop]

                if not body or any(re.search(r'\bcontinue\b', line) for line in body):
                    converted.append(lines[pos])
                    pos += 1
                    continue

                first_statement = next(line for line in body if line.strip())
                body_indent = re.match(r'[ \t]*', first_statement).group(0)
                converted.append(f'{indent}i = 0')
                converted.append(f'{indent}while i < {bound}:{tail}')
                converted.extend(body)
                converted.append(f'{body_indent}i += 1')
                pos = stop
                changed = True
            return '\n'.join(converted), changed

        new_buggy, buggy_changed = convert(buggy_code)
        new_fixed, fixed_changed = convert(fixed_code)
        if buggy_changed and fixed_changed:
            return new_buggy, new_fixed
        return buggy_code, fixed_code

    def augment_function_extraction(self, buggy_code, fixed_code):
        """Route ``print(...)`` calls through a ``print_value`` helper.

        Applies when ``buggy_code`` calls ``print(`` and contains a ``def``.
        Only calls to the bare name ``print`` are rewritten; names that merely
        end in ``print`` (``pprint(``) and attribute calls (``obj.print(``)
        are left alone.

        The helper is placed *before* the snippet so that module-level calls
        find it already defined, and it forwards all positional and keyword
        arguments so calls such as ``print(a, b, sep='')`` keep working.

        Returns:
            ``(new_buggy, new_fixed)``; the inputs unchanged when the rule
            does not apply or ``print_value`` is already used by a snippet.
        """
        call_pattern = re.compile(r'(?<![\w.])print\(')
        if not (call_pattern.search(buggy_code) and 'def ' in buggy_code):
            return buggy_code, fixed_code
        if 'print_value' in buggy_code or 'print_value' in fixed_code:
            # Do not clash with an existing name of the same spelling.
            return buggy_code, fixed_code

        helper_func = "def print_value(*args, **kwargs):\n    print(*args, **kwargs)\n\n"
        new_buggy = helper_func + call_pattern.sub('print_value(', buggy_code)
        new_fixed = helper_func + call_pattern.sub('print_value(', fixed_code)
        return new_buggy, new_fixed

    def augment_error_pattern_introduction(self, buggy_code, fixed_code):
        """Inject one additional common error into the buggy snippet.

        The patterns are tried in order; each one that matches is applied
        with probability 0.3, and the first one applied ends the search.
        Only the first occurrence is changed and ``fixed_code`` is never
        modified.

        Patterns:
            * `` = `` becomes `` == `` (comparison where assignment was meant)
            * ``==`` becomes ``=`` (assignment where comparison was meant)
            * a colon ending a non-comment line is removed, so that comment
              lines such as ``# Sample ID: ...`` are not picked instead of a
              real block header
            * four spaces of indentation at the start of a line are removed

        Returns:
            ``(new_buggy, fixed_code)``, or the inputs unchanged when no
            pattern was applied.
        """
        error_patterns = [
            (re.compile(re.escape(' = ')), ' == '),   # Assignment in condition
            (re.compile(re.escape('==')), '='),       # Comparison as assignment
            # Remove a line-ending colon; skip lines that are only a comment.
            (re.compile(r'^(?![ \t]*#)(.*?):(?=[ \t]*$)', re.MULTILINE), r'\1'),
            # Remove one level of leading indentation.
            (re.compile(r'^ {4}', re.MULTILINE), ''),
        ]

        for pattern, replacement in error_patterns:
            if pattern.search(buggy_code) and random.random() < 0.3:
                new_buggy = pattern.sub(replacement, buggy_code, count=1)
                return new_buggy, fixed_code
        return buggy_code, fixed_code