In [7]:
import pandas as pd
import numpy as np
from scipy import stats
from statsmodels.stats.proportion import proportions_ztest
from IPython.display import display
from statsmodels.stats.contingency_tables import Table2x2
import warnings

In [None]:
# =============================================================================
# Block 1: Helper Functions
# =============================================================================

def compare_acceptance_rates(series_group1, series_group2, alpha=0.05, label_1='group1', label_2='group2'):
    """
    Compares two proportions (acceptance rates) using a Two-Proportion Z-Test.

    Missing values are dropped from both series before counting, so they
    contribute neither to the number of acceptances nor to the number of
    observations.

    Parameters:
    - series_group1 (pd.Series): Boolean or 0/1 series for group 1 (e.g., Agents).
    - series_group2 (pd.Series): Boolean or 0/1 series for group 2 (e.g., Humans).
    - alpha (float): Significance level.
    - label_1 (str): Label for group 1 (used in column names and interpretation).
    - label_2 (str): Label for group 2 (used in column names).

    Returns:
    - (pd.DataFrame | None): One-row DataFrame with the two acceptance rates,
      `z_stat`, `p_value`, `significant`, `interpretation` and
      `absolute_difference` (group 1 rate minus group 2 rate). Returns None,
      after printing a warning, when a group has no usable observations or
      when both groups sit at 0% or both at 100%.
    """
    # NaN would be skipped by sum() but counted by len(), biasing the rate.
    series_group1 = series_group1.dropna()
    series_group2 = series_group2.dropna()

    # Handle cases where a group might be empty
    if series_group1.empty or series_group2.empty:
        print(f"Warning: Group {label_1} or {label_2} is empty. Z-Test not performed.")
        return None

    count = [series_group1.sum(), series_group2.sum()]
    nobs = [len(series_group1), len(series_group2)]

    # With both groups at 0% (or both at 100%) the pooled variance is zero,
    # so the z statistic is undefined.
    both_zero = count[0] == 0 and count[1] == 0
    both_full = count[0] == nobs[0] and count[1] == nobs[1]
    if both_zero or both_full:
        print("Warning: Both groups have the same rate (0% or 100%). Z-Test not performed.")
        return None

    stat, pval = proportions_ztest(count, nobs)

    rate_group1 = count[0] / nobs[0]
    rate_group2 = count[1] / nobs[1]

    significant = bool(pval < alpha)

    # A negative statistic means group 1's rate is below group 2's.
    if not significant:
        interpretation = "No significant difference"
    elif stat < 0:
        interpretation = label_1 + " significantly worse"
    else:
        interpretation = label_1 + " significantly better"

    result = {
        f"{label_1}_accept_rate": rate_group1,
        f"{label_2}_accept_rate": rate_group2,
        "z_stat": stat,
        "p_value": pval,
        "significant": significant,
        "interpretation": interpretation,
        "absolute_difference": rate_group1 - rate_group2,
    }

    return pd.DataFrame([result])

def cliffs_delta(x, y):
    """
    Calculates Cliff's Delta (d). Returns a value between -1 and 1.

    d = (#{(i, j): x_i > y_j} - #{(i, j): x_i < y_j}) / (len(x) * len(y))

    The pair counts are obtained by sorting `y` once and locating every `x`
    value with a binary search, so memory use stays linear in the input size
    instead of materialising the full len(x) x len(y) comparison matrix.

    Parameters:
    - x, y (array-like): Numeric samples (lists, arrays or pd.Series).

    Returns:
    - (float): Cliff's Delta, or NaN when either sample is empty or contains
      a NaN (such values cannot be ordered).
    """
    x = np.asarray(x, dtype=float).ravel()
    y = np.asarray(y, dtype=float).ravel()

    n_x, n_y = x.size, y.size
    if n_x == 0 or n_y == 0:
        return np.nan

    # A NaN cannot be ranked against anything, so the statistic is undefined.
    if np.isnan(x).any() or np.isnan(y).any():
        return np.nan

    y_sorted = np.sort(y)
    # For each x_i: how many y are strictly smaller / smaller-or-equal.
    strictly_below = np.searchsorted(y_sorted, x, side='left')
    at_or_below = np.searchsorted(y_sorted, x, side='right')

    wins = int(strictly_below.sum())                 # pairs with x_i > y_j
    losses = n_x * n_y - int(at_or_below.sum())      # pairs with x_i < y_j

    return np.float64(wins - losses) / (n_x * n_y)

def compare_continuous_non_parametric(group1_series, group2_series, group1_name="Group 1", group2_name="Group 2"):
    """
    Compares two independent samples using the Mann-Whitney U test
    and calculates effect size using Cliff's Delta.

    Missing values are removed from both groups first; otherwise a single NaN
    would turn the effect size into NaN, which the threshold chain below
    would label as 'Large'.

    Parameters:
    - group1_series, group2_series (pd.Series or array-like): Values to compare.
    - group1_name, group2_name (str): Labels used in the printed summary.

    Returns:
    - (dict | None): Keys `mw_stat`, `mw_p_value`, `cliffs_delta` and
      `effect_size_magnitude`; None when either group has no usable values.
    """
    group1_series = pd.Series(group1_series).dropna()
    group2_series = pd.Series(group2_series).dropna()

    if group1_series.empty or group2_series.empty:
        print("Not enough data in one of the groups for Mann-Whitney U comparison.")
        return None

    # 1. Mann-Whitney U Test (two-sided: any difference between the groups)
    stat, p_value = stats.mannwhitneyu(group1_series, group2_series, alternative="two-sided")
    print(f"Mann-Whitney U P-value: {p_value}")

    # 2. Effect Size (Cliff's Delta)
    delta = cliffs_delta(group1_series, group2_series)

    # Magnitude Interpretation: first upper bound that |delta| stays below wins
    abs_delta = abs(delta)
    magnitude = "Large"
    for upper_bound, label in ((0.147, "Negligible"), (0.33, "Small"), (0.474, "Medium")):
        if abs_delta < upper_bound:
            magnitude = label
            break

    print("\n--- Effect Size (Cliff's Delta) ---")
    print(f"Cliff's Delta: {delta:.4f}")
    print(f"Interpretation: The difference size is considered '{magnitude}'.")

    # Direction Interpretation
    if delta < 0:
        print(f"(The first group, '{group1_name}', tends to have LOWER values than the second, '{group2_name}')")
    elif delta > 0:
        print(f"(The first group, '{group1_name}', tends to have HIGHER values than the second, '{group2_name}')")
    else:
        print("(The groups are perfectly overlapping)")

    return {
        "mw_stat": stat,
        "mw_p_value": p_value,
        "cliffs_delta": delta,
        "effect_size_magnitude": magnitude
    }

def check_normality(data_series, alpha=0.05, max_samples=5000, random_state=0):
    """
    Checks the normality of a data series using the Shapiro-Wilk test.

    Parameters:
    - data_series (pd.Series): Values to test; missing values are dropped.
    - alpha (float): Significance level for the normal / not-normal decision.
    - max_samples (int): Series longer than this are randomly down-sampled
      to this size before testing.
    - random_state: Seed passed to `Series.sample` so the down-sampling (and
      therefore the reported p-value) is reproducible between runs.

    Returns:
    - (tuple): `(p_value, label)` where label is "Normal" or "Not Normal",
      or `(None, "Insufficient")` when fewer than 3 values remain.
    """
    data = data_series.dropna()

    if len(data) > max_samples:
        print(f"Warning: Over {max_samples} samples. Shapiro-Wilk p-value might not be accurate (using sample of {max_samples}).")
        data = data.sample(max_samples, random_state=random_state)

    if len(data) < 3:
        print("Not enough data for normality test.")
        return None, "Insufficient"

    _, p_value = stats.shapiro(data)

    print(f"Shapiro-Wilk Test P-value: {p_value}")
    if p_value > alpha:
        print("Data appears to be normally distributed.")
        return p_value, "Normal"

    print("Data is NOT normally distributed.")
    return p_value, "Not Normal"

def analyze_odds_ratio(grouped_df, agent_label='Group1', human_label='Group2'):
    """
    Calculates Odds Ratio from a grouped DataFrame.

    The 2x2 table is read as
        a = accepted(agent_label)   b = rejected(agent_label)
        c = accepted(human_label)   d = rejected(human_label)
    and the odds ratio is (a * d) / (b * c).

    Parameters:
    - grouped_df (pd.DataFrame): Indexed by group label, with columns
      'accepted_count' and 'rejected_count'.
    - agent_label (str): Index label of the group of interest.
    - human_label (str): Index label of the reference group.

    Returns:
    - (dict | None): `odds_ratio` (agent vs human) and `inverse_odds_ratio`
      (human vs agent). A zero denominator gives inf (or NaN for 0/0), a zero
      numerator gives 0 with an infinite inverse. None when a label or
      column is missing from `grouped_df`.
    """
    try:
        a = grouped_df.loc[agent_label, 'accepted_count'] # Group 1 Accepted
        b = grouped_df.loc[agent_label, 'rejected_count'] # Group 1 Rejected
        c = grouped_df.loc[human_label, 'accepted_count'] # Group 2 Accepted
        d = grouped_df.loc[human_label, 'rejected_count'] # Group 2 Rejected
    except KeyError as e:
        print(f"Error: Label not found in grouped DataFrame: {e}")
        return None

    numerator = float(a) * float(d)
    denominator = float(b) * float(c)

    if denominator == 0:
        print(f"Error: Imminent division by zero (count 0 in '{agent_label}_rejected' or '{human_label}_accepted').")
        if numerator == 0:
            # 0 / 0: the ratio is undefined in both directions.
            odds_ratio = inverse_or = np.nan
        else:
            odds_ratio, inverse_or = np.inf, 0.0
    elif numerator == 0:
        # Finite ratio of exactly zero; its inverse is unbounded.
        odds_ratio, inverse_or = 0.0, np.inf
    else:
        odds_ratio = numerator / denominator
        inverse_or = denominator / numerator

    print(f"Odds Ratio ({agent_label} vs {human_label}): {odds_ratio:.4f}")
    print(f"Inverse Odds Ratio ({human_label} vs {agent_label}): {inverse_or:.4f}")
    print(f"\nInterpretation: The odds of '{human_label}' being accepted are {inverse_or:.2f} times higher than '{agent_label}'.")

    return {
        "odds_ratio": odds_ratio,
        "inverse_odds_ratio": inverse_or
    }

In [None]:
# =============================================================================
# Block 2: THE NEW MASTER FUNCTION
# =============================================================================

def run_full_comparison(df, group1_mask, group2_mask, 
                        group1_label, group2_label, 
                        analysis_title="Comparative Analysis",
                        acceptance_col='accepted',
                        state_col='state',
                        closed_value='closed',
                        created_at_col='created_at',
                        closed_at_col='closed_at'):
    """
    Executes a full comparative analysis (Acceptance Rate, Odds Ratio, Duration)
    between two groups defined by boolean masks.

    Group 2 is always passed first to the helper functions, so every reported
    direction reads as "group 2 relative to group 1".

    Parameters:
    - df (pd.DataFrame): The input DataFrame.
    - group1_mask (pd.Series): Boolean mask for Group 1 (e.g., df['user_type'] == 'Human').
    - group2_mask (pd.Series): Boolean mask for Group 2 (e.g., df['user_type'] != 'Human').
    - group1_label (str): Label for Group 1 (e.g., 'Human').
    - group2_label (str): Label for Group 2 (e.g., 'Agent').
    - analysis_title (str): Title for the analysis output.
    - acceptance_col (str): Column holding the 0/1 (or boolean) acceptance flag.
    - state_col (str): Column holding the PR state.
    - closed_value: Value of `state_col` that marks a PR as closed.
    - created_at_col (str): Column with the creation timestamp.
    - closed_at_col (str): Column with the closing timestamp.

    Returns:
    - (dict): Keys 'acceptance' (one-row DataFrame or None), 'odds_ratio'
      (dict or None) and 'duration' (dict or None).
    """

    print("=========================================================")
    print(f"STARTING: {analysis_title}")
    print("=========================================================\n")

    all_results = {}

    # --- 1. Acceptance Rate Analysis (Z-Test) ---
    print(f"--- 1. Acceptance Rate Analysis ({group2_label} vs {group1_label}) ---")

    # Ensure we are using only data from the original DataFrame
    accepted_g1 = df.loc[group1_mask, acceptance_col]
    accepted_g2 = df.loc[group2_mask, acceptance_col]

    # Pass g2 (e.g., Agent) as label_1 and g1 (e.g., Human) as label_2
    acceptance_results_df = compare_acceptance_rates(
        accepted_g2, accepted_g1, 
        label_1=group2_label, label_2=group1_label
    )

    if acceptance_results_df is not None:
        style_format = {
            f"{group2_label}_accept_rate": "{:.2%}",
            f"{group1_label}_accept_rate": "{:.2%}",
            "z_stat": "{:.2f}",
            "p_value": "{:.2e}",
            "absolute_difference": "{:.2%}",
        }
        display(acceptance_results_df.style.format(style_format))
        all_results['acceptance'] = acceptance_results_df
    else:
        print("Could not calculate acceptance rate.\n")
        all_results['acceptance'] = None

    print("\n")

    # --- 2. Odds Ratio Analysis ---
    print(f"--- 2. Odds Ratio Analysis ({group2_label} vs {group1_label}) ---")
    g1_accepted = accepted_g1.sum()
    g1_rejected = len(accepted_g1) - g1_accepted

    g2_accepted = accepted_g2.sum()
    g2_rejected = len(accepted_g2) - g2_accepted

    # Row order mirrors the comparison direction: group 2 first, group 1 second.
    data = {
        'accepted_count': [g2_accepted, g1_accepted],
        'rejected_count': [g2_rejected, g1_rejected]
    }
    grouped_df = pd.DataFrame(data, index=[group2_label, group1_label])

    print("Grouped Data:")
    display(grouped_df)

    odds_results = analyze_odds_ratio(
        grouped_df, 
        agent_label=group2_label, 
        human_label=group1_label
    )
    all_results['odds_ratio'] = odds_results
    print("\n")

    # --- 3. Duration Analysis (Mann-Whitney U) ---
    print(f"--- 3. Closed PR Duration Analysis ({group2_label} vs {group1_label}) ---")

    # Filter for closed PRs
    closed_df = df[df[state_col] == closed_value].copy()

    if closed_df.empty:
        print("No closed PRs available to analyze duration.\n")
        all_results['duration'] = None
    else:
        # Calculate duration in hours; unparseable timestamps become NaT
        # and therefore yield a NaN duration that is dropped below.
        closed_df['created_at_dt'] = pd.to_datetime(closed_df[created_at_col], errors='coerce')
        closed_df['closed_at_dt'] = pd.to_datetime(closed_df[closed_at_col], errors='coerce')
        closed_df['pr_duration(h)'] = (closed_df['closed_at_dt'] - closed_df['created_at_dt']).dt.total_seconds() / 3600

        # Check normality (on the total set of closed durations);
        # check_normality drops missing values itself.
        print("Normality Check (Total Duration of Closed PRs):")
        check_normality(closed_df['pr_duration(h)'])
        print("\n")

        # Apply masks to the filtered dataframe
        # Use .index.intersection to ensure we only get IDs present in closed_df
        g1_indices = closed_df.index.intersection(group1_mask[group1_mask].index)
        g2_indices = closed_df.index.intersection(group2_mask[group2_mask].index)

        duration_g1 = closed_df.loc[g1_indices, 'pr_duration(h)'].dropna()
        duration_g2 = closed_df.loc[g2_indices, 'pr_duration(h)'].dropna()

        if duration_g1.empty or duration_g2.empty:
            print("Not enough duration data for one or both groups.\n")
            all_results['duration'] = None
        else:
            print(f"Comparing Duration: {group2_label} ({len(duration_g2)} PRs) vs {group1_label} ({len(duration_g1)} PRs)")
            duration_results = compare_continuous_non_parametric(
                duration_g2,  # g2 (e.g., Agent) first
                duration_g1,  # g1 (e.g., Human) second
                group2_label, 
                group1_label
            )
            all_results['duration'] = duration_results

    print(f"\n--- END OF ANALYSIS: {analysis_title} ---\n")
    return all_results

In [None]:
# =============================================================================
# Block 3: Usage Example
# =============================================================================

# 1. Load and prepare your data
#    !!!! IMPORTANT: Replace the file path with the correct one. !!!!
#    Forward slashes are used so the relative path resolves on Windows and
#    POSIX alike (a backslash is an ordinary filename character on POSIX).
path_to_file = 'output_files/fix_prs_with_issues_and_files_and_tests.parquet'
fixes_with_issues = pd.read_parquet(path_to_file)

# Basic Preparation
# 'agent' is exposed as 'user_type'; a PR counts as accepted when it has a
# non-null 'merged_at' value.
fixes_with_issues = fixes_with_issues.rename(columns={'agent': 'user_type'})
fixes_with_issues['accepted'] = fixes_with_issues['merged_at'].notnull().astype(int)

# Every PR whose state is not 'open' is kept for the analyses below.
closed_prs = fixes_with_issues[fixes_with_issues['state'] != 'open']

print(f"DataFrame loaded successfully: {len(closed_prs)} rows.")

# -------------------------------------------------------------------------
# Scenario 1: Agents vs Humans (in all 'fix' PRs)
# -------------------------------------------------------------------------

# Define masks for groups: everything that is not 'Human' counts as an agent
mask_human = closed_prs['user_type'] == 'Human'
mask_agent = closed_prs['user_type'] != 'Human'

# Call master function
results_agent_vs_human = run_full_comparison(
    df=closed_prs, 
    group1_mask=mask_human, 
    group2_mask=mask_agent, 
    group1_label='Human', 
    group2_label='Agents', 
    analysis_title="Analysis 1: Agents vs Humans (All 'fix' PRs)"
)

In [None]:
# -------------------------------------------------------------------------
# Scenario 2: PRs that modify tests vs PRs that do not (all 'fix' PRs)
# -------------------------------------------------------------------------

# Masks are derived from the 'has_modified_test' flag column; each one is
# compared against an explicit boolean value.
mask_with_test = closed_prs['has_modified_test'].eq(True)
mask_without_test = closed_prs['has_modified_test'].eq(False)

# Group 1 (the reference side) is 'PRs without tests'; group 2 (the side of
# interest) is 'PRs with tests'.
results_test_vs_notest = run_full_comparison(
    closed_prs,
    mask_without_test,
    mask_with_test,
    'PRs without tests',
    'PRs with tests',
    analysis_title="Analysis 2: With Test vs Without Test (All 'fix' PRs)",
)

In [None]:
# -------------------------------------------------------------------------
# Scenario 3: Agents vs Humans (ONLY in PRs WITH TESTS)
# -------------------------------------------------------------------------

# 1. Create filtered DataFrame first (copied so it is independent of closed_prs)
df_with_tests_only = closed_prs[closed_prs['has_modified_test'] == True].copy()
print(f"\nFiltering for PRs with tests: {len(df_with_tests_only)} rows.")

# 2. Define masks on THIS NEW DataFrame so their index matches the data passed in
mask_human_wt = df_with_tests_only['user_type'] == 'Human'
mask_agent_wt = df_with_tests_only['user_type'] != 'Human'

# 3. Call master function
results_agent_vs_human_wt = run_full_comparison(
    df=df_with_tests_only,
    group1_mask=mask_human_wt,
    group2_mask=mask_agent_wt,
    group1_label='Human',
    group2_label='Agents',
    analysis_title="Analysis 3: Agents vs Humans (Only 'fix' PRs WITH TESTS)"
)

In [None]:
# -------------------------------------------------------------------------
# Scenario 4: Agents vs Humans, restricted to 'fix' PRs with linked issues
# -------------------------------------------------------------------------

# Keep only the PRs whose 'has_issues' flag is True
fixes_with_linked_issues = closed_prs.loc[closed_prs['has_issues'].eq(True)]

# Masks are built on the filtered frame: 'Human' vs every other user type
mask_Human = fixes_with_linked_issues['user_type'].eq('Human')
mask_Agents = fixes_with_linked_issues['user_type'].ne('Human')

# Run the shared comparison with humans as group 1 and agents as group 2
results_issues = run_full_comparison(
    fixes_with_linked_issues,
    mask_Human,
    mask_Agents,
    'Human PRs',
    'Agent PRs',
    analysis_title="Analysis 4: With Linked Issues - Agents vs Humans (All 'fix' PRs)",
)

In [None]:

print("\n=========================================================")
print("STARTING: Analysis 5: Acceptance Rate by Agent vs. Humans")
print("=========================================================\n")

# 1. Get Human acceptance series
#    The mask is rebuilt here so this cell only depends on `closed_prs`.
mask_human = closed_prs['user_type'] == 'Human'
human_acceptance = closed_prs.loc[mask_human, 'accepted']

# 2. Get unique agent types list
agent_types = closed_prs[~mask_human]['user_type'].unique()

print(f"Comparing {len(agent_types)} agent types against 'Human' ({len(human_acceptance)} PRs)...")

results_by_agent = []

# 3. Iterate through each agent type
for agent_name in agent_types:
    mask_current_agent = closed_prs['user_type'] == agent_name
    agent_acceptance = closed_prs.loc[mask_current_agent, 'accepted']

    print(f"\n--- Comparing {agent_name} ({len(agent_acceptance)} PRs) vs Human ---")

    # 4. Call ORIGINAL function (from Block 1)
    result_df = compare_acceptance_rates(
        agent_acceptance,
        human_acceptance,
        label_1=agent_name,
        label_2='Human'
    )

    if result_df is not None:
        # The helper names the rate column after the agent; give it one shared
        # name here so the per-agent rows stack into a single column. This
        # avoids identifying agent columns by substring afterwards, which
        # would misfire for an agent type whose name contains 'Human'.
        result_df = result_df.rename(columns={f"{agent_name}_accept_rate": 'agent_accept_rate'})
        result_df['agent_type'] = agent_name
        result_df['agent_PRs'] = len(agent_acceptance)
        result_df['agent_accepted_count'] = agent_acceptance.sum()
        results_by_agent.append(result_df)

# 5. Concatenate and process results
if results_by_agent:
    final_results_df = pd.concat(results_by_agent, ignore_index=True, sort=False)
    final_results_df['agent_rejected_count'] = final_results_df['agent_PRs'] - final_results_df['agent_accepted_count']

    # 6. Define final column order
    cols_order = [
        'agent_type', 
        'agent_PRs', 
        'agent_accepted_count',
        'agent_rejected_count',
        'agent_accept_rate',
        'Human_accept_rate',
        'z_stat', 
        'p_value', 
        'significant', 
        'interpretation', 
        'absolute_difference'
    ]

    # Filter for columns that actually exist
    final_cols = [col for col in cols_order if col in final_results_df.columns]

    style_format = {
        'agent_accept_rate': "{:.2%}",
        'Human_accept_rate': "{:.2%}",
        "z_stat": "{:.2f}",
        "p_value": "{:.2e}",
        "absolute_difference": "{:.2%}",
    }

    print("\n\n--- Consolidated Result: Acceptance by Agent vs. Humans ---")

    display(final_results_df[final_cols].sort_values('z_stat').style.format(style_format))
else:
    print("\nNo agent comparison could be completed.")

In [None]:
# =============================================================================
# Block 8: 6x6 Comparison Matrix (Odds Ratio + Confidence Interval)
# =============================================================================

print("\n=========================================================")
print("STARTING: Analysis 6: 6x6 Comparison Matrices (with 95% CI)")
print("=========================================================\n")

# 1. Get full list of user types (Human first, agents in sorted order)
human_mask = closed_prs['user_type'] == 'Human'
agent_types = sorted(closed_prs[~human_mask]['user_type'].unique())
user_types_list = ['Human'] + agent_types

print(f"Comparing the following groups: {user_types_list}")

# 2. Pre-calculate counts
try:
    grouped_counts = closed_prs.groupby('user_type')['accepted'].agg(
        accepted_count='sum',
        total_count='count'
    )
    grouped_counts['rejected_count'] = grouped_counts['total_count'] - grouped_counts['accepted_count']

    # 3. Initialize matrices (rows = group of interest, columns = reference group)
    df_odds_ratios = pd.DataFrame(np.nan, index=user_types_list, columns=user_types_list, dtype=float)
    df_p_values = pd.DataFrame(np.nan, index=user_types_list, columns=user_types_list, dtype=float)

    # Matrices for the confidence interval bounds and their display form
    df_ci_lower = pd.DataFrame(np.nan, index=user_types_list, columns=user_types_list, dtype=float)
    df_ci_upper = pd.DataFrame(np.nan, index=user_types_list, columns=user_types_list, dtype=float)
    df_ci_formatted = pd.DataFrame("", index=user_types_list, columns=user_types_list, dtype=object)

    # 4. Iterate through each pair (warnings raised inside the loop are silenced)
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")

        for user_X in user_types_list: # Interest Group
            for user_Y in user_types_list: # Reference Group

                # Diagonal: a group compared with itself is filled with the
                # neutral values OR = 1 and p = 1.
                if user_X == user_Y:
                    df_odds_ratios.loc[user_X, user_Y] = 1.0
                    df_p_values.loc[user_X, user_Y] = 1.0
                    df_ci_lower.loc[user_X, user_Y] = 1.0
                    df_ci_upper.loc[user_X, user_Y] = 1.0
                    df_ci_formatted.loc[user_X, user_Y] = "[1.00, 1.00]"
                    continue

                try:
                    # Build 2x2 table
                    a = grouped_counts.loc[user_X, 'accepted_count']
                    b = grouped_counts.loc[user_X, 'rejected_count']
                    c = grouped_counts.loc[user_Y, 'accepted_count']
                    d = grouped_counts.loc[user_Y, 'rejected_count']

                    contingency_table = [[a, b], [c, d]]
                    ct = Table2x2(contingency_table)

                    # Fill OR and P-Value
                    df_odds_ratios.loc[user_X, user_Y] = ct.oddsratio
                    df_p_values.loc[user_X, user_Y] = ct.oddsratio_pvalue()

                    # Confidence Interval (95%)
                    ci_low, ci_upp = ct.oddsratio_confint(alpha=0.05)

                    df_ci_lower.loc[user_X, user_Y] = ci_low
                    df_ci_upper.loc[user_X, user_Y] = ci_upp

                    # Formatting for friendly display: "[Low, High]"
                    df_ci_formatted.loc[user_X, user_Y] = f"[{ci_low:.2f}, {ci_upp:.2f}]"

                except Exception as e:
                    # A failed pair is reported and left as NaN / "" in the matrices.
                    print(f"Error comparing {user_X} vs {user_Y}: {e}")

    # 5. Display results
    print("\n--- Odds Ratios Matrix ---")
    display(df_odds_ratios.style.format("{:.3f}").background_gradient(cmap='coolwarm', vmin=0, vmax=2))

    print("\n--- P-Values Matrix (Bold if < 0.05) ---")
    display(df_p_values.style.format("{:.2e}").map(lambda x: 'font-weight: bold' if x < 0.05 else ''))

    print("\n--- 95% Confidence Intervals Matrix ---")
    print("Interpretation: If the interval crosses 1.0 (e.g., [0.8, 1.2]), the association is not statistically significant.")
    display(df_ci_formatted)

except Exception as e:
    print(f"A general error occurred: {e}")

In [None]:
# Compact view of the per-agent results: identification, both rates and the
# test outcome, ordered by z statistic and formatted with `style_format`.
display(
    final_results_df
    .loc[:, ['agent_type', 'agent_accept_rate', 'Human_accept_rate', 'z_stat',
             'p_value', 'significant', 'interpretation']]
    .sort_values('z_stat')
    .style
    .format(style_format)
)