In [14]:
import pandas as pd
from collections import Counter

In [15]:
def analyze_frequencies(total_df: pd.DataFrame, refactoring_df: pd.DataFrame):
    """
    Analyze frequencies of refactorings and performance changes.

    Counts distinct methods, commits and projects in both frames and relates
    the counts of ``refactoring_df`` to those of ``total_df``.

    Args:
        total_df: Frame with the columns ``method_id``, ``commit_id`` and
            ``project_id``.
        refactoring_df: Frame with the same three columns plus
            ``effect_size``. Presumably a subset of ``total_df`` (the overlap
            ratios only read as shares in that case); this is not checked.

    Returns:
        dict with three entries:
        - ``"total_dataset"``: distinct method/commit/project counts of
          ``total_df`` plus ``methods_per_commit`` and ``commits_per_project``.
        - ``"refactoring_dataset"``: distinct method/commit/project counts of
          ``refactoring_df`` and the number of distinct methods whose
          ``effect_size`` is present and non-zero.
        - ``"overlap_analysis"``: for each of ``method_level``,
          ``commit_level`` and ``project_level`` the refactoring count divided
          by the total count, as ``frequency`` and as ``percentage``.
        Every ratio is 0 when its denominator is 0.
    """

    def _ratio(numerator, denominator):
        # Single place for the "0 when there is nothing to divide by" rule.
        return numerator / denominator if denominator > 0 else 0

    # 1. Total Dataset Frequencies
    total_methods = total_df["method_id"].nunique()
    total_commits = total_df["commit_id"].nunique()
    total_projects = total_df["project_id"].nunique()

    # 2. Refactoring Dataset Frequencies
    refactoring_methods = refactoring_df["method_id"].nunique()
    refactoring_commits = refactoring_df["commit_id"].nunique()
    refactoring_projects = refactoring_df["project_id"].nunique()

    # Count methods with performance changes.
    # NaN != 0 evaluates to True, so a bare `!= 0` would count rows with a
    # missing effect size as changes; require an actual non-zero value.
    effect_size = refactoring_df["effect_size"]
    perf_changes = refactoring_df[effect_size.notna() & (effect_size != 0)]
    methods_with_perf_changes = perf_changes["method_id"].nunique()

    # 3. Overlap Frequencies (refactoring counts relative to total counts)
    level_counts = {
        "method_level": (refactoring_methods, total_methods),
        "commit_level": (refactoring_commits, total_commits),
        "project_level": (refactoring_projects, total_projects),
    }
    overlap_analysis = {
        level: {
            "percentage": _ratio(part, whole) * 100,
            "frequency": _ratio(part, whole),
        }
        for level, (part, whole) in level_counts.items()
    }

    return {
        "total_dataset": {
            "total_methods": total_methods,
            "total_commits": total_commits,
            "total_projects": total_projects,
            "methods_per_commit": _ratio(total_methods, total_commits),
            "commits_per_project": _ratio(total_commits, total_projects),
        },
        "refactoring_dataset": {
            "methods_with_refactorings": refactoring_methods,
            "methods_with_perf_changes": methods_with_perf_changes,
            "commits_with_refactorings": refactoring_commits,
            "projects_with_refactorings": refactoring_projects,
        },
        "overlap_analysis": overlap_analysis,
    }

In [16]:
def generate_frequency_report(frequencies: dict):
    """
    Render a frequency dictionary as a plain-text report.

    Args:
        frequencies: dict with the entries ``"total_dataset"``,
            ``"refactoring_dataset"`` and ``"overlap_analysis"``. Only the
            method/commit/project counts of the first two and the
            ``"percentage"`` value of every overlap level are read.

    Returns:
        The report as one newline-joined string: a title, the base counts,
        the refactoring counts, and one percentage line per overlap level.
    """
    base = frequencies["total_dataset"]
    with_refactorings = frequencies["refactoring_dataset"]

    lines = [
        "Frequency Analysis of Refactorings and Performance Changes",
        "=" * 60,
        # Section 1: counts over the base dataset
        "\n1. Base Performance Dataset Frequencies:",
        f"Total Methods: {base['total_methods']}",
        f"Total Commits: {base['total_commits']}",
        f"Total Projects: {base['total_projects']}",
        # Section 2: counts over the dataset restricted to refactorings
        "\n2. Performance Dataset With Refactorings Frequencies:",
        f"Total Methods: {with_refactorings['methods_with_refactorings']}",
        f"Total Commits: {with_refactorings['commits_with_refactorings']}",
        f"Total Projects: {with_refactorings['projects_with_refactorings']}",
        # Section 3: one sub-heading and percentage per overlap level
        "\n3. Overlap Analysis:",
    ]

    for level_key, level_stats in frequencies["overlap_analysis"].items():
        # "method_level" -> "Method Level"
        heading = level_key.replace("_", " ").title()
        lines.extend(
            [
                f"\n{heading}:",
                f"Percentage: {level_stats['percentage']:.2f}%",
            ]
        )

    return "\n".join(lines)

In [None]:
# Load data
total_performance_path = "../data/performance_dataset.csv"
refactoring_performance_path = "../data/dataset_with_refactorings.csv"

total_df = pd.read_csv(total_performance_path)
refactoring_df = pd.read_csv(refactoring_performance_path)


def build_method_id(df: pd.DataFrame) -> pd.Series:
    """
    Build the "<project_id>/<commit_id>/<method_name>" identifier per row.

    The commit is part of the identifier, so the same method name in two
    commits yields two different ids. The columns are joined with ``+``, so a
    row with a missing part gets a missing id.
    """
    return df["project_id"] + "/" + df["commit_id"] + "/" + df["method_name"]


# Create unique method identifiers; one helper keeps the rule identical for
# both frames, which analyze_frequencies relies on when comparing them.
total_df["method_id"] = build_method_id(total_df)
refactoring_df["method_id"] = build_method_id(refactoring_df)

# Perform frequency analysis
frequencies = analyze_frequencies(total_df, refactoring_df)

# Generate and print report
report = generate_frequency_report(frequencies)
print(report)

In [None]:
df = pd.read_csv(refactoring_performance_path)

# Get rows with refactorings
df_refactoring = df[df["refactorings"].notna()]

# One cell can hold several refactorings joined by "+"; split them, strip the
# surrounding whitespace and count every occurrence of each type. str() guards
# against cell values that are not strings. Wrapping in dict() keeps
# `refactoring_counts` a plain dict in first-seen order.
refactoring_counts = dict(
    Counter(
        ref_type.strip()
        for refactorings in df_refactoring["refactorings"]
        for ref_type in str(refactorings).split("+")
    )
)

# Convert to DataFrame and sort by count
results = pd.DataFrame(
    {
        "Refactoring_Type": list(refactoring_counts.keys()),
        "Count": list(refactoring_counts.values()),
    }
)
# NOTE: the variable name (including its spelling) is kept as-is because
# other cells may refer to it.
performace_refactoring_type_counts = results.sort_values("Count", ascending=True)

# Print the sorted table; printing `results` would show first-seen order and
# leave the sort above without any visible effect.
print(performace_refactoring_type_counts)

In [None]:
# Load the CSV file into a Pandas DataFrame
df = pd.read_csv("../data/all-refactorings.csv")

# Get the refactoring types and project IDs from the provided DataFrame
# (df_refactoring comes from the cell that filters rows with refactorings)
refactoring_types = df_refactoring["refactorings"]
projects = df_refactoring["project_id"]

# Convert projects to a set for efficient filtering
projects_set = set(projects)

# A `refactorings` cell can hold several types joined by "+". Matching the raw
# cell text against `refactoringType` would silently drop every type that only
# occurs in such a combination, so split and strip each cell first.
refactoring_types_set = {
    ref_type.strip()
    for combined in refactoring_types
    for ref_type in str(combined).split("+")
}

# Filter the DataFrame based on projects and refactoring types
filtered_df = df[
    df["project"].isin(projects_set) & df["refactoringType"].isin(refactoring_types_set)
]

# Count occurrences of each refactoring type
refactoring_type_counts = Counter(filtered_df["refactoringType"])

# Print the counts
for refactoring_type, count in refactoring_type_counts.items():
    print(f"{refactoring_type}: {count}")

In [11]:
def calculate_odds_ratio(performance_counts, refactoring_counts):
    """
    Relate each refactoring type's share in two sets of counts.

    For every type in ``performance_counts`` the value is

        (perf_count / total_perf) / (refactor_count / total_refactor)

    rounded to two decimals, where the totals are the sums of all counts in
    the respective mapping. Note that this is a ratio of shares
    (proportions), not the textbook odds ratio built from p / (1 - p).

    Args:
        performance_counts: mapping of refactoring type -> count.
        refactoring_counts: mapping of refactoring type -> count; types
            missing here are treated as a count of 0.

    Returns:
        dict mapping every key of ``performance_counts`` to its ratio, or to
        ``None`` when the ratio is undefined: the type has no positive count
        in ``refactoring_counts``, or ``performance_counts`` sums to 0.
    """
    total_perf = sum(performance_counts.values())
    total_refactor = sum(refactoring_counts.values())

    odds_ratios = {}
    for refactoring_type, perf_count in performance_counts.items():
        refactor_count = refactoring_counts.get(refactoring_type, 0)
        # Both shares need a non-zero denominator: without the total_perf
        # check an all-zero performance_counts raised ZeroDivisionError.
        if refactor_count > 0 and total_perf != 0:
            perf_share = perf_count / total_perf
            refactor_share = refactor_count / total_refactor
            odds_ratios[refactoring_type] = round(perf_share / refactor_share, 2)
        else:
            odds_ratios[refactoring_type] = None  # Undefined odds ratio
    return odds_ratios

In [21]:
# Data input: hard-coded (refactoring type, count) pairs. dict() keeps the
# listed order, and both tables list the same types in the same order.
# NOTE(review): presumably transcribed from the counting cells' output — verify.
performance_counts = dict(
    [
        ("Extract Method", 61),
        ("Extract Variable", 49),
        ("Change Variable Type", 39),
        ("Inline Variable", 13),
        ("Add Variable Modifier", 7),
        ("Add Parameter", 6),
        ("Change Method Access Modifier", 6),
        ("Remove Method Annotation", 6),
        ("Add Method Annotation", 10),
        ("Inline Method", 9),
        ("Change Parameter Type", 4),
        ("Remove Method Modifier", 2),
    ]
)

refactoring_counts = dict(
    [
        ("Extract Method", 4223),
        ("Extract Variable", 3356),
        ("Change Variable Type", 8332),
        ("Inline Variable", 1380),
        ("Add Variable Modifier", 1553),
        ("Add Parameter", 6398),
        ("Change Method Access Modifier", 6387),
        ("Remove Method Annotation", 3743),
        ("Add Method Annotation", 10898),
        ("Inline Method", 669),
        ("Change Parameter Type", 8115),
        ("Remove Method Modifier", 1151),
    ]
)

In [None]:
# Calculate odds ratios
odds_ratios = calculate_odds_ratio(performance_counts, refactoring_counts)

# Create a DataFrame for display: one row per type in performance_counts,
# ordered by ratio (highest first). Sorting is chained rather than done in
# place, so re-running the cell always starts from a freshly built frame.
df = pd.DataFrame(
    {
        "Refactoring Type": list(performance_counts.keys()),
        "Count in Performance Dataset": list(performance_counts.values()),
        # .get(..., 0) mirrors calculate_odds_ratio, which treats a type that
        # is missing from refactoring_counts as a zero count instead of
        # failing with KeyError.
        "Count in Refactoring Dataset": [
            refactoring_counts.get(ref_type, 0) for ref_type in performance_counts
        ],
        "Odds Ratio": [odds_ratios[ref_type] for ref_type in performance_counts],
    }
).sort_values(by="Odds Ratio", ascending=False)

# Display the result
print(df)

# Save to CSV (optional)
df.to_csv("../results/odds_ratios.csv", index=False)