**Extract co-changes across the commit history**

- co-changes of concepts
- co-changes of config files
- co-changes of config options

In [None]:
from analysis import analyze_repository
import json

# Location of the repository to analyze and of the JSON dump to produce.
repo_path = "/home/ssimon/projects/test-config-repo"
# The path literal used to appear twice on this line; Python's implicit string
# concatenation turned that into one bogus, doubled output path.
output_file = "../data/analyzed_projects/test-config-repo.json"
commit_data = analyze_repository(repo_path=repo_path, project_name="test-config-repo", get_diff=True)

# Store commit data into the output file
with open(output_file, "w", encoding="utf-8") as dest:
    json.dump(commit_data, dest, indent=4)

**Compute Co-Changed Concepts (Internally)**

In [None]:
import json
import pandas as pd
from itertools import combinations
from collections import Counter
import glob

def extract_concept_cochanges(file_path: str, repo_name: str):
    """
    Count how often pairs of concepts change together and store the result as CSV.

    Reads the JSON file at ``file_path``, walks its ``config_commit_data`` list and,
    for every commit that has ``network_data``, counts each unordered pair of the
    commit's ``concepts``. The result is written to
    ``../data/concept_cochanges/{repo_name}_concept_cochanges.csv``.

    :param file_path: path of the analyzed project JSON file
    :param repo_name: repository name used in the log line and the output file name
    :return: None, the result is written to disk
    """
    print("Extract concept co-changes from", repo_name)

    # The analysis output is written as UTF-8, so read it back the same way
    with open(file_path, encoding="utf-8") as file:
        data = json.load(file)

    # A missing or null commit list is treated as "no commits"
    commits = data.get('config_commit_data') or []
    total_commits = len(commits)

    # Counter to store co-changes
    concept_counts = Counter()

    for commit in commits:
        network_data = commit.get('network_data')

        # Skip if no network data
        if not network_data:
            continue

        concepts = network_data.get('concepts', [])

        # Sort each pair so that (a, b) and (b, a) are counted as the same co-change
        concept_counts.update(tuple(sorted(pair)) for pair in combinations(concepts, 2))

    # Rows only exist if at least one commit was processed, so total_commits > 0 here
    concept_rows = [
        {
            "Co-Changed Concepts": (concept1, concept2),
            "Concept1": concept1,
            "Concept2": concept2,
            "Changed Internally": count,
            "Percentage Internally": round(count / total_commits, 2),
        }
        for (concept1, concept2), count in concept_counts.items()
    ]

    # Explicit columns keep the header even when there are no rows; a CSV without
    # a header cannot be loaded again with pd.read_csv (EmptyDataError).
    concept_df = pd.DataFrame(
        concept_rows,
        columns=[
            "Co-Changed Concepts",
            "Concept1",
            "Concept2",
            "Changed Internally",
            "Percentage Internally",
        ],
    )

    concept_df.to_csv(f"../data/concept_cochanges/{repo_name}_concept_cochanges.csv", index=False)


# Run the concept extraction once for every entry in the analyzed-projects folder.
for file_path in glob.glob("../data/analyzed_projects/**"):
    # Repository name = last path component, cut at its first dot
    file_name = file_path.rsplit("/", 1)[-1].partition(".")[0]
    extract_concept_cochanges(file_path, file_name)

**Compute Co-Changed Concepts (Globally)**

Definitions of columns
- `Across Projects` is an integer, indicating number of projects where the co-changed concepts changed as well at least once
- `Percentage globally` is a float, indicating the percentage of projects where the co-changed concepts changed as well
- `Changed globally` is an integer, indicating the total number of times the co-changed concepts changed in other projects

In [None]:
import os

def get_cochanged_concepts(target_df, other_dfs):
    """
    Analyze co-changed concepts in a target file against all other projects to compute global stats.

    Three columns are added to (or overwritten in) ``target_df``:

    - ``Across Projects``: number of other projects that contain the same co-changed concepts
    - ``Percentage globally``: ``Across Projects`` divided by the number of other projects,
      rounded to two decimals (0.0 if there are no other projects)
    - ``Changed globally``: total number of times the co-changed concepts changed in the
      other projects, i.e. the sum of their ``Changed Internally`` values

    :param target_df: dataframe of target project
    :param other_dfs: dataframes of all other projects
    :return target_df: updated dataframe of target project
    """
    key_column = 'Co-Changed Concepts'
    count_column = 'Changed Internally'

    across_projects = []
    changed_globally = []

    for concepts in target_df[key_column]:
        project_hits = 0
        total_changes = 0

        for other_df in other_dfs:
            # A project without any co-changes may lack the key column entirely
            if key_column not in other_df.columns:
                continue

            # Find all rows in other_df where the co-changed concepts match
            matching_rows = other_df[other_df[key_column] == concepts]
            if matching_rows.empty:
                continue

            # Project-level count: each project counts once
            project_hits += 1

            # Add how often the concepts changed in that project; counting the
            # matching rows alone would only repeat the project-level count.
            if count_column in matching_rows.columns:
                total_changes += int(matching_rows[count_column].sum())
            else:
                total_changes += len(matching_rows)

        across_projects.append(project_hits)
        changed_globally.append(total_changes)

    project_count = len(other_dfs)

    # Assigned in this order to keep the original column layout
    target_df['Across Projects'] = across_projects
    if project_count:
        target_df['Percentage globally'] = (target_df['Across Projects'] / project_count).round(2)
    else:
        # No comparison projects: avoid a division by zero (NaN/inf percentages)
        target_df['Percentage globally'] = 0.0
    target_df['Changed globally'] = changed_globally

    return target_df


data_dir = "../data/concept_cochanges"

# Load all CSV files from the directory into a dictionary of DataFrames
repository_files = [file for file in os.listdir(data_dir) if file.endswith('.csv')]
repository_dataframes = {file: pd.read_csv(os.path.join(data_dir, file)) for file in repository_files}

target_repo = "piggymetrics"
target_file_name = f'{target_repo}_concept_cochanges.csv'
target_df = repository_dataframes[target_file_name]

# Use all other files as comparison
other_dfs = [df for name, df in repository_dataframes.items() if name != target_file_name]

# Perform the analysis on a copy so the loaded dataframe stays untouched
updated_target_df = get_cochanged_concepts(target_df.copy(), other_dfs)

updated_target_df.head(50)

# index=False matches how the file was first written; without it every re-run
# adds another "Unnamed: 0" column, because this file is read again above.
updated_target_df.to_csv(f"../data/concept_cochanges/{target_repo}_concept_cochanges.csv", index=False)

updated_target_df

**Compute co-changed config files**

In [None]:
import json
from itertools import combinations
from collections import Counter
import glob


def extract_file_cochanges(file_path: str, repo_name: str):
    """
    Count how often pairs of config files change together and store the result as CSV.

    Reads the JSON file at ``file_path``, walks its ``config_commit_data`` list and,
    for every commit that has ``network_data``, counts each unordered pair of the
    commit's ``config_files``. The result is written to
    ``../data/file_cochanges/{repo_name}_file_cochanges.csv``.

    :param file_path: path of the analyzed project JSON file
    :param repo_name: repository name used in the log line and the output file name
    :return: None, the result is written to disk
    """
    print("Extract file co-changes from", repo_name)

    # The analysis output is written as UTF-8, so read it back the same way
    with open(file_path, encoding="utf-8") as file:
        data = json.load(file)

    # A missing or null commit list is treated as "no commits"
    commits = data.get('config_commit_data') or []
    total_commits = len(commits)

    # Counter to store co-changes
    file_pair_counts = Counter()

    for commit in commits:
        network_data = commit.get('network_data')

        # Skip if no network data
        if not network_data:
            continue

        config_files = network_data.get('config_files', [])

        # Sort each pair so that (a, b) and (b, a) are counted as the same co-change
        file_pair_counts.update(tuple(sorted(pair)) for pair in combinations(config_files, 2))

    # Rows only exist if at least one commit was processed, so total_commits > 0 here
    file_rows = [
        {
            "Co-Changed Artifacts": (file1, file2),
            "Artifact1": file1,
            "Artifact2": file2,
            "Changed Internally": count,
            "Percentage Internally": round(count / total_commits, 2),
        }
        for (file1, file2), count in file_pair_counts.items()
    ]

    # Explicit columns keep the header even when there are no rows; a CSV without
    # a header cannot be loaded again with pd.read_csv (EmptyDataError).
    file_df = pd.DataFrame(
        file_rows,
        columns=[
            "Co-Changed Artifacts",
            "Artifact1",
            "Artifact2",
            "Changed Internally",
            "Percentage Internally",
        ],
    )

    file_df.to_csv(f"../data/file_cochanges/{repo_name}_file_cochanges.csv", index=False)

# Run the config-file extraction once for every entry in the analyzed-projects folder.
for file_path in glob.glob("../data/analyzed_projects/**"):
    # Repository name = last path component, cut at its first dot
    file_name = file_path.rsplit("/", 1)[-1].partition(".")[0]
    extract_file_cochanges(file_path, file_name)


**Compute Co-Changed Config Files (Globally)**

Definitions of columns
- `Across Projects` is an integer, indicating number of projects where the co-changed artifacts changed as well at least once
- `Percentage globally` is a float, indicating the percentage of projects where the co-changed artifacts changed as well
- `Changed globally` is an integer, indicating the total number of times the co-changed artifacts changed in other projects

In [None]:
import os

def get_cochanged_artifacts(target_df, other_dfs):
    """
    Analyze co-changed artifacts in a target file against all other projects to compute global stats.

    Three columns are added to (or overwritten in) ``target_df``:

    - ``Across Projects``: number of other projects that contain the same co-changed artifacts
    - ``Changed globally``: total number of times the co-changed artifacts changed in the
      other projects, i.e. the sum of their ``Changed Internally`` values
    - ``Percentage globally``: ``Across Projects`` divided by the number of other projects,
      rounded to two decimals (0.0 if there are no other projects)

    :param target_df: dataframe of target project
    :param other_dfs: dataframes of all other projects
    :return target_df: updated dataframe of target project
    """
    key_column = 'Co-Changed Artifacts'
    count_column = 'Changed Internally'

    across_projects = []
    changed_globally = []

    for artifacts in target_df[key_column]:
        project_hits = 0
        total_changes = 0

        for other_df in other_dfs:
            # A project without any co-changes may lack the key column entirely
            if key_column not in other_df.columns:
                continue

            # Find all rows in other_df where the co-changed artifacts match
            matching_rows = other_df[other_df[key_column] == artifacts]
            if matching_rows.empty:
                continue

            # Project-level count: each project counts once
            project_hits += 1

            # Add how often the artifacts changed in that project; counting the
            # matching rows alone would only repeat the project-level count.
            if count_column in matching_rows.columns:
                total_changes += int(matching_rows[count_column].sum())
            else:
                total_changes += len(matching_rows)

        across_projects.append(project_hits)
        changed_globally.append(total_changes)

    project_count = len(other_dfs)

    # Assigned in this order to keep the original column layout
    target_df['Across Projects'] = across_projects
    target_df['Changed globally'] = changed_globally
    if project_count:
        target_df['Percentage globally'] = (target_df['Across Projects'] / project_count).round(2)
    else:
        # No comparison projects: avoid a division by zero (NaN/inf percentages)
        target_df['Percentage globally'] = 0.0

    return target_df


data_dir = "../data/file_cochanges"

# Load all CSV files from the directory into a dictionary of DataFrames
repository_files = [file for file in os.listdir(data_dir) if file.endswith('.csv')]
repository_dataframes = {file: pd.read_csv(os.path.join(data_dir, file)) for file in repository_files}

target_repo = "test-config-repo"
target_file_name = f'{target_repo}_file_cochanges.csv'
target_df = repository_dataframes[target_file_name]

# Use all other files as comparison
other_dfs = [df for name, df in repository_dataframes.items() if name != target_file_name]

# Perform the analysis on a copy so the loaded dataframe stays untouched
updated_target_df = get_cochanged_artifacts(target_df.copy(), other_dfs)

updated_target_df.head(50)

# index=False matches how the file was first written; without it every re-run
# adds another "Unnamed: 0" column, because this file is read again above.
updated_target_df.to_csv(f"../data/file_cochanges/{target_repo}_file_cochanges.csv", index=False)

updated_target_df

**Compute co-changed config options**

In [None]:
import json
import pandas as pd
from itertools import combinations
from collections import Counter
import glob

def extract_option_cochanges(file_path: str, repo_name: str):
    """
    Count how often pairs of config options change together and store the result as CSV.

    Reads the JSON file at ``file_path`` and walks its ``config_commit_data`` list.
    For every commit with ``network_data`` the ``modified_pairs`` of all entries in
    ``config_files_data`` are collected by option name (a later entry with the same
    option name replaces an earlier one). Each unordered pair of option names is then
    counted, keyed by the full details of both options, so the same two options with
    different values or artifacts end up in separate rows. The result is written to
    ``../data/option_cochanges/{repo_name}_option_cochanges.csv``.

    :param file_path: path of the analyzed project JSON file
    :param repo_name: repository name used in the log line and the output file name
    :return: None, the result is written to disk
    """
    print("Extract option co-changes from", repo_name)

    # The analysis output is written as UTF-8, so read it back the same way
    with open(file_path, encoding="utf-8") as file:
        data = json.load(file)

    # A missing or null commit list is treated as "no commits"
    commits = data.get('config_commit_data') or []
    total_commits = len(commits)

    # Counter to store co-changes
    option_pair_counts = Counter()

    for commit in commits:
        network_data = commit.get('network_data')

        # Skip if no network data
        if not network_data:
            continue

        # Collect options from all files in this commit, keyed by option name
        modified_options = {}
        for file_data in network_data.get('config_files_data', []):
            for pair in file_data.get('modified_pairs', []):
                modified_options[pair['option']] = pair

        # Generate unique pairs of modified options (by option name)
        for option1, option2 in combinations(sorted(modified_options), 2):
            # Dicts are not hashable, so freeze both option records into sorted item tuples
            pair_details = (
                tuple(sorted(modified_options[option1].items())),
                tuple(sorted(modified_options[option2].items()))
            )
            option_pair_counts[pair_details] += 1

    # Rows only exist if at least one commit was processed, so total_commits > 0 here
    rows = []
    for (option1_details, option2_details), count in option_pair_counts.items():
        # Turn the frozen item tuples back into dicts for Option1 and Option2
        opt1 = dict(option1_details)
        opt2 = dict(option2_details)
        rows.append({
            "Co-Changed Options": (opt1.get('option'), opt2.get('option')),
            "Option1": opt1.get("option"),
            "Values1": (opt1.get("prev_value"), opt1.get("curr_value")),
            "Artifact1": opt1.get("artifact"),
            "Option2": opt2.get("option"),
            "Values2": (opt2.get("prev_value"), opt2.get("curr_value")),
            "Artifact2": opt2.get("artifact"),
            "Changed Internally": count,
            "Percentage Internally": count / total_commits
        })

    # Explicit columns keep the header even when there are no rows; a CSV without
    # a header cannot be loaded again with pd.read_csv (EmptyDataError).
    option_df = pd.DataFrame(
        rows,
        columns=[
            "Co-Changed Options",
            "Option1",
            "Values1",
            "Artifact1",
            "Option2",
            "Values2",
            "Artifact2",
            "Changed Internally",
            "Percentage Internally",
        ],
    )

    option_df.to_csv(f"../data/option_cochanges/{repo_name}_option_cochanges.csv", index=False)


# Run the option extraction once for every entry in the analyzed-projects folder.
for file_path in glob.glob("../data/analyzed_projects/**"):
    # Repository name = last path component, cut at its first dot
    file_name = file_path.rsplit("/", 1)[-1].partition(".")[0]
    extract_option_cochanges(file_path, file_name)

**Compute Co-Changed Options (Globally)**

Definitions of columns
- `Across Projects` is an integer, indicating number of projects where the co-changed options changed as well at least once
- `Percentage globally` is a float, indicating the percentage of projects where the co-changed options changed as well
- `Changed globally` is an integer, indicating the total number of times the co-changed options changed in other projects

In [None]:
import os
import pandas as pd

def get_cochanged_options(target_df, other_dfs):
    """
    Analyze co-changed options in a target file against all other projects to compute global stats.

    Three columns are added to (or overwritten in) ``target_df``:

    - ``Across Projects``: number of other projects that contain the same co-changed options
    - ``Changed globally``: total number of times the co-changed options changed in the
      other projects, i.e. the sum of their ``Changed Internally`` values
    - ``Percentage globally``: ``Across Projects`` divided by the number of other projects,
      rounded to two decimals (0.0 if there are no other projects)

    :param target_df: dataframe of target project
    :param other_dfs: dataframes of all other projects
    :return target_df: updated dataframe of target project
    """
    key_column = 'Co-Changed Options'
    count_column = 'Changed Internally'

    across_projects = []
    changed_globally = []

    for options in target_df[key_column]:
        project_hits = 0
        total_changes = 0

        for other_df in other_dfs:
            # A project without any co-changes may lack the key column entirely
            if key_column not in other_df.columns:
                continue

            # Find all rows in other_df where the co-changed options match
            matching_rows = other_df[other_df[key_column] == options]
            if matching_rows.empty:
                continue

            # Project-level count: each project counts once
            project_hits += 1

            # Add how often the options changed in that project. Several rows can
            # match here (same option names, different values), so the change
            # counts are summed instead of counting rows.
            if count_column in matching_rows.columns:
                total_changes += int(matching_rows[count_column].sum())
            else:
                total_changes += len(matching_rows)

        across_projects.append(project_hits)
        changed_globally.append(total_changes)

    project_count = len(other_dfs)

    # Assigned in this order to keep the original column layout
    target_df['Across Projects'] = across_projects
    target_df['Changed globally'] = changed_globally
    if project_count:
        target_df['Percentage globally'] = (target_df['Across Projects'] / project_count).round(2)
    else:
        # No comparison projects: avoid a division by zero (NaN/inf percentages)
        target_df['Percentage globally'] = 0.0

    return target_df


data_dir = "../data/option_cochanges"

# Load all CSV files from the directory into a dictionary of DataFrames
repository_files = [file for file in os.listdir(data_dir) if file.endswith('.csv')]
repository_dataframes = {file: pd.read_csv(os.path.join(data_dir, file)) for file in repository_files}

target_repo = "mall"
target_file_name = f'{target_repo}_option_cochanges.csv'
target_df = repository_dataframes[target_file_name]

# Use all other files as comparison
other_dfs = [df for name, df in repository_dataframes.items() if name != target_file_name]

# Perform the analysis on a copy so the loaded dataframe stays untouched
updated_target_df = get_cochanged_options(target_df.copy(), other_dfs)

updated_target_df.head(50)

# index=False matches how the file was first written; without it every re-run
# adds another "Unnamed: 0" column, because this file is read again above.
updated_target_df.to_csv(f"../data/option_cochanges/{target_repo}_option_cochanges.csv", index=False)

updated_target_df