The **change frequency** of an option indicates how often the option has been changed across the commit history. It also allows us to identify the value ranges of options. For options that do not change at all, we need to consider whether these options are truly necessary or whether they can be removed to reduce complexity. For options that change frequently, we need to consider that these options can be sources of configuration drift, technical debt, or configuration errors.

In [2]:
import json
import pandas as pd
import ast
import glob
import os
from typing import Dict, List
from itertools import combinations
from collections import Counter, defaultdict

In [12]:
def extract_options(data: Dict) -> pd.DataFrame:
    """
    Extract all options and their observed values from the commit history of a software project.

    Only commits flagged as ``is_config_related`` are scanned. Within a single
    config file of a single commit, an option name that occurs more than once
    is skipped for that file, because its value would be ambiguous.

    :param data: project data holding a "commit_data" list; each commit provides
        "is_config_related", "is_latest_commit" and "network_data" -> "config_file_data"
    :return: dataframe with one row per (file path, option, concept) and the columns
        "File Path", "Option", "Concept", "Values" (sorted distinct values),
        "Internal Changes" (number of distinct values minus one) and
        "In Latest Commit"; an empty dataframe with these columns if no option was found
    """
    result_columns = [
        "File Path", "Option", "Concept", "Values", "Internal Changes", "In Latest Commit"
    ]
    commits = data["commit_data"]
    latest_commit = next((commit for commit in commits if commit["is_latest_commit"]), None)

    config_data = []
    for commit in commits:
        if not commit["is_config_related"]:
            continue
        for config_file in commit["network_data"]["config_file_data"]:
            pairs = config_file["pairs"]
            # Count how often each option name occurs in this file of this commit
            occurrences = Counter(pair["option"] for pair in pairs)
            for pair in pairs:
                # Add only options that appear exactly once in the file
                if occurrences[pair["option"]] != 1:
                    continue
                config_data.append({
                    "file_path": config_file["file_path"],
                    "option": pair["option"],
                    "value": pair["value"],
                    "type": pair["type"],
                    "concept": config_file["concept"]
                })

    # Without any row the frame has no columns and the groupby below would
    # raise a KeyError, so return an empty result with the expected schema.
    if not config_data:
        return pd.DataFrame(columns=result_columns)

    df = pd.DataFrame(config_data)

    # Collect all (file, option) combinations present in the latest commit
    latest_options = set()
    if latest_commit:
        for config_file in latest_commit["network_data"]["config_file_data"]:
            for pair in config_file["pairs"]:
                latest_options.add((config_file["file_path"], pair["option"]))

    # Aggregate the distinct values of every option over the whole history
    aggregated_df = (
        df.groupby(['file_path', 'option', 'concept'])['value']
        .apply(lambda values: sorted(set(values)))
        .reset_index()
    )
    aggregated_df.columns = ['File Path', 'Option', 'Concept', 'Values']
    # Every group holds at least one value, so this is never negative
    aggregated_df['Internal Changes'] = aggregated_df['Values'].apply(lambda values: len(values) - 1)

    # Add presence in latest commit
    aggregated_df["In Latest Commit"] = [
        (file_path, option) in latest_options
        for file_path, option in zip(aggregated_df["File Path"], aggregated_df["Option"])
    ]

    return aggregated_df


# Demo cell: load the recorded data of one project and list its options.
data_file = "../data/test_projects/piggymetrics.json"

with open(data_file, "r", encoding="utf-8") as src:
    # The JSON document is passed as-is to extract_options below
    data = json.load(src)

# `data` and `df_options` are notebook-level names that other cells rebind as well
df_options = extract_options(data=data)
# Last expression of the cell: shows up to the first 50 rows
df_options.head(50)

Unnamed: 0,File Path,Option,Concept,Values,Internal Changes,In Latest Commit
0,.travis.yml,after_success,travis,"[bash <(curl -s https://codecov.io/bash), code...",1,True
1,.travis.yml,before_install,travis,"[pip install codecov, sudo pip install codecov]",1,False
2,.travis.yml,dist,travis,[trusty],0,True
3,.travis.yml,env.global,travis,"[COMMIT=${TRAVIS_COMMIT::7}, COMMIT=${TRAVIS_C...",1,True
4,.travis.yml,env.global.secure/GeONVsTD48Y88CKoqupo/FC1Gy0e...,travis,[GeONVsTD48Y88CKoqupo/FC1Gy0eCrT1UNylvMzz5VYcL...,0,True
5,.travis.yml,env.global.secure/Gl6a03cI88dKHV4rjP1IkYqCdVe7...,travis,[Gl6a03cI88dKHV4rjP1IkYqCdVe7IM0XNcEzFDCxmvXHW...,0,True
6,.travis.yml,env.global.secure/Kt8hRbEIkYW3CYJgvqEK9zCyS/l5...,travis,[Kt8hRbEIkYW3CYJgvqEK9zCyS/l54XOI90799xEuXWgFQ...,0,False
7,.travis.yml,env.global.secure/VRlJyPOz7fUmtFdpTdO51BVcjKUG...,travis,[VRlJyPOz7fUmtFdpTdO51BVcjKUGP5t7KF5bG7TSJPXsa...,0,True
8,.travis.yml,jdk,travis,[oraclejdk8],0,True
9,.travis.yml,language,travis,[java],0,True


In [4]:
def analyze_options_across_projects(target_df: pd.DataFrame, other_dfs: List) -> pd.DataFrame:
    """
    Compare every option of a target project against the options of all other projects.

    Three counter columns are written into ``target_df`` itself (the frame is
    modified in place and also returned):

    * 'Gobally Set': number of other projects containing the option
      (the column name is spelled this way on purpose; it is a runtime key)
    * 'Global Occurrences': number of matching rows over all other projects
    * 'Global Changes': number of matching rows whose 'Values' hold more than
      one distinct value

    :param target_df: dataframe of target project, needs an 'Option' column
    :param other_dfs: dataframes of all other projects with 'Option' and 'Values' columns
    :return target_df: updated dataframe of target project
    """
    def _has_several_values(raw) -> bool:
        # 'Values' may be a real collection or its string form (e.g. after a CSV round trip)
        parsed = raw
        if isinstance(raw, str):
            try:
                parsed = ast.literal_eval(raw)
            except (ValueError, SyntaxError):
                # Not a Python literal: treat the raw string as one single value
                parsed = [raw]
        if not isinstance(parsed, (list, set, tuple)):
            parsed = [parsed]
        return len(set(parsed)) > 1

    # Start every counter at zero
    for counter_column in ('Gobally Set', 'Global Changes', 'Global Occurrences'):
        target_df[counter_column] = 0

    for idx, target_row in target_df.iterrows():
        option_name = target_row['Option']

        for project_df in other_dfs:
            hits = project_df[project_df['Option'] == option_name]
            hit_total = len(hits)
            if not hit_total:
                continue

            # One more project that sets this option
            target_df.loc[idx, 'Gobally Set'] += 1
            # Every matching row counts as an occurrence
            target_df.loc[idx, 'Global Occurrences'] += hit_total
            # Matching rows that carry more than one distinct value
            target_df.loc[idx, 'Global Changes'] += sum(
                1 for raw_values in hits['Values'] if _has_several_values(raw_values)
            )

    return target_df


# option_dir = "../data/options"

# for file_name in glob.glob(f"{option_dir}/*_options_internal.csv"):
#     target_repo = file_name.split("/")[-1].split("_")[0]

#     repository_files = [file for file in os.listdir(option_dir) if file.endswith('_options_internal.csv')]
#     repository_dataframes = {file: pd.read_csv(os.path.join(option_dir, file)) for file in repository_files}

#     target_file_name = f'{target_repo}_options_internal.csv'
#     target_df = repository_dataframes[target_file_name]

#     # check if target df has specific columns
#     if any(column_name in target_df.columns for column_name in ['Global Occurrences', 'Global changes', 'Gobally Set']):
#         print(f"Skipping {target_repo} as it has already been processed.")
#         continue 

#     print(f"Extract global option stats for {target_repo}")

#     # Use all other files as comparison
#     other_dfs = [df for name, df in repository_dataframes.items() if name != target_file_name]

#     # Perform the analysis
#     updated_target_df = analyze_options_across_projects(target_df.copy(), other_dfs)

#     updated_target_df.head(50)

#     updated_target_df.to_csv(f"../data/options/{target_repo}_options_global.csv")

*How often do configuration options change across the commit history?*

In [5]:
def show_change_frequency(df_options: pd.DataFrame) -> pd.DataFrame:
    """
    Summarize how many options fall into each "Internal Changes" level.

    :param df_options: dataframe with an "Internal Changes" column
    :return: dataframe with the columns "Internal Changes", "Number of Options"
        and "Percentage" (share of all options in percent, rounded to two
        decimals), ordered by ascending change level
    """
    options_per_level = df_options["Internal Changes"].value_counts().sort_index()
    counts = options_per_level.values
    shares = (counts / options_per_level.sum() * 100).round(2)

    return pd.DataFrame({
        "Internal Changes": options_per_level.index,
        "Number of Options": counts,
        "Percentage": shares,
    })

# Demo cell: change-frequency summary for a single project.
data_file = "../data/test_projects/test-config-repo.json"
with open(data_file, "r", encoding="utf-8") as src:
    data = json.load(src)

# Rebinds the notebook-level `data` and `df_options` to this project
df_options = extract_options(data=data)
df_frequency = show_change_frequency(df_options=df_options)
# Last expression of the cell: rendered as the cell output
df_frequency

Unnamed: 0,Internal Changes,Number of Options,Percentage
0,0,69,97.18
1,1,1,1.41
2,2,1,1.41


In [6]:
def show_aggregated_change_frequency(df_list: List[pd.DataFrame]) -> pd.DataFrame:
    """
    Average the per-project change-frequency summaries of several df_options DataFrames.

    Each frame is summarized with ``show_change_frequency``. For every
    "Internal Changes" level the option counts and percentages are averaged
    over those frames in which the level occurs; frames without that level do
    not contribute a zero.

    :param df_list: one options dataframe per project
    :return: dataframe with the columns "Internal Changes", "Avg. Number of Options"
        and "Avg. Percentage" (both rounded to two decimals), ordered by change level
    """
    counts_by_level = defaultdict(list)
    shares_by_level = defaultdict(list)

    for project_options in df_list:
        for _, entry in show_change_frequency(project_options).iterrows():
            level = entry["Internal Changes"]
            counts_by_level[level].append(entry["Number of Options"])
            shares_by_level[level].append(entry["Percentage"])

    def _mean(samples):
        # Arithmetic mean, rounded to two decimals
        return round(sum(samples) / len(samples), 2)

    levels = sorted(counts_by_level.keys() | shares_by_level.keys())

    return pd.DataFrame({
        "Internal Changes": [int(level) for level in levels],
        "Avg. Number of Options": [_mean(counts_by_level[level]) for level in levels],
        "Avg. Percentage": [_mean(shares_by_level[level]) for level in levels],
    })

# Demo cell: aggregate the change frequency over every project file in the folder.
project_files = glob.glob("../data/test_projects/*.json")
df_list = []
for project_file in project_files:
    with open(project_file, "r", encoding="utf-8") as src:
        data = json.load(src)
    # After the loop, `data` and `df_options` refer to the last file that was processed
    df_options = extract_options(data=data)
    df_list.append(df_options)

df_aggregated_frequency = show_aggregated_change_frequency(df_list=df_list)
# Last expression of the cell: rendered as the cell output
df_aggregated_frequency

Unnamed: 0,Internal Changes,Avg. Number of Options,Avg. Percentage
0,0,733.5,95.06
1,1,46.5,3.76
2,2,3.5,0.9
3,3,3.0,0.2
4,4,2.0,0.13
5,5,3.0,0.2


*Which options change the most and what is their value range?*

In [7]:
def show_most_changed_options(df_options: pd.DataFrame) -> pd.DataFrame:
    """
    Order the options by their number of internal changes, most changed first.

    :param df_options: dataframe with an "Internal Changes" column
    :return: sorted copy of the dataframe; the input is left untouched
    """
    return df_options.sort_values("Internal Changes", ascending=False)

# Rebinds the notebook-level `df_options` to its sorted version; the input is
# whichever frame a previously executed cell assigned to that name.
df_options = show_most_changed_options(df_options=df_options)
# Last expression of the cell: shows the ten most changed options
df_options.head(10)

Unnamed: 0,File Path,Option,Concept,Values,Internal Changes
1,src/Dockerfile,EXPOSE,docker,"[10000123330, 8000, 8080]",2
3,src/application.properties,server.port,spring,"[8000, 8080]",1
45,src/pom.xml,ExecutableName,maven,[target/spring-boot-app-0.0.1-SNAPSHOT.jar],0
51,src/pom.xml,project.dependencies.dependency.com.h2database...,maven,[com.h2database],0
50,src/pom.xml,project.dependencies.dependency.com.h2database...,maven,[h2],0
49,src/pom.xml,project.build.plugins.plugin.org.springframewo...,maven,[org.springframework.boot],0
48,src/pom.xml,project.build.plugins.plugin.org.springframewo...,maven,[spring-boot-maven-plugin],0
47,src/pom.xml,project.artifactId,maven,[spring-boot-app],0
46,src/pom.xml,ExecutableNameNoVersion,maven,[target/spring-boot-app.jar],0
44,src/docker-compose.yml,version,docker-compose,[3.8],0


In [8]:
def show_aggregated_most_changed_options(df_list: List[pd.DataFrame]) -> pd.DataFrame:
    """
    Placeholder for aggregating the most changed options of multiple df_options DataFrames.

    Not implemented yet: the body is a bare ``pass``, so a call returns None
    instead of the annotated DataFrame.

    :param df_list: options dataframes, one per project (currently unused)
    :return: None until the aggregation is implemented
    """
    # TODO: implement the aggregation
    pass

Across the commit history, configuration options sometimes change together in the same commit or in subsequent commits; such changes are called **co-evolutionary changes**. They may indicate relationships between configuration options, implying that these options need to be changed together to keep them consistent. An analysis of co-evolutionary changes may therefore help to identify related options.

*Which options frequently change together?*

In [9]:
def summarize_commits(data, window_size):
    """
    Group commits into consecutive windows and collect the options modified in each window.

    The modified options of a window are keyed by option name only, so if the
    same name is modified several times within one window (in another commit
    or another file), the pair seen last replaces the earlier one.

    :param data: list of commit data
    :param window_size: number of consecutive commits per window
    :return: one ``{"modified_options": {option name: pair}}`` entry per window
    """
    def _modified_pairs(commits):
        # Yield every modified pair of every config file, in commit order
        for commit in commits:
            network = commit.get('network_data')
            if not network:
                # Commit carries no network data
                continue
            for config_file in network.get('config_file_data', []):
                yield from config_file.get('modified_pairs', [])

    windows = (
        data[start:start + window_size]
        for start in range(0, len(data), window_size)
    )
    return [
        {"modified_options": {pair['option']: pair for pair in _modified_pairs(window)}}
        for window in windows
    ]


def extract_option_cochanges(data: Dict, commit_window=1):
    """
    Extract co-changes of options in the commit history of a software project.

    The commits are summarized into windows of ``commit_window`` commits via
    ``summarize_commits``; every unordered pair of options modified within the
    same window counts as one co-change. Pairs are counted per full pair
    details (all items of both modified pairs, including their values), so the
    same two options changing with different values are counted separately.

    :param data: configuration data of a project; the commits are read from its "commit_data" key
    :param commit_window: size of the commit window
    :return: dataframe containing co-changes of options, sorted by "Changed Internally"
        in descending order; an empty dataframe with the same columns if no
        options were changed together
    """
    result_columns = [
        "Co-Changed Options", "Commit Window",
        "Option1", "Values1", "Artifact1",
        "Option2", "Values2", "Artifact2",
        "Changed Internally", "Percentage Internally",
    ]

    # Summarize commits in windows of the specified size
    summarized_commits = summarize_commits(data.get('commit_data', []), commit_window)
    window_total = len(summarized_commits)

    # Counter to store co-changes
    option_pair_counts = Counter()

    for summarized_commit in summarized_commits:
        modified_options = summarized_commit.get("modified_options", {})

        # Unique pairs of modified options; names are sorted inside each pair
        # so that (a, b) and (b, a) end up in the same bucket
        for pair in combinations(modified_options, 2):
            option1, option2 = sorted(pair)
            # Sorted item tuples make the pair details hashable
            pair_details = (
                tuple(sorted(modified_options[option1].items())),
                tuple(sorted(modified_options[option2].items()))
            )
            option_pair_counts[pair_details] += 1

    # Prepare DataFrame data
    rows = []
    for (option1_details, option2_details), count in option_pair_counts.items():
        opt1 = dict(option1_details)
        opt2 = dict(option2_details)
        rows.append({
            "Co-Changed Options": (opt1.get('option'), opt2.get('option')),
            "Commit Window": commit_window,
            "Option1": opt1.get("option"),
            "Values1": (opt1.get("prev_value"), opt1.get("curr_value")),
            "Artifact1": opt1.get("artifact"),
            "Option2": opt2.get("option"),
            "Values2": (opt2.get("prev_value"), opt2.get("curr_value")),
            "Artifact2": opt2.get("artifact"),
            "Changed Internally": count,
            "Percentage Internally": count / window_total
        })

    # A frame built from no rows has no columns, so sorting it by
    # "Changed Internally" would raise a KeyError.
    if not rows:
        return pd.DataFrame(columns=result_columns)

    return pd.DataFrame(rows, columns=result_columns).sort_values(by="Changed Internally", ascending=False)


# Demo cell: co-changes of one project, looking at every commit on its own.
data_file = "../data/test_projects/piggymetrics.json"
# Window of a single commit: only options modified in the same commit are paired
commit_window = 1

with open(data_file, "r", encoding="utf-8") as src:
    data = json.load(src)

df_cochanges = extract_option_cochanges(data=data, commit_window=commit_window)
# Last expression of the cell: rendered as the cell output
df_cochanges

Unnamed: 0,Co-Changed Options,Commit Window,Option1,Values1,Artifact1,Option2,Values2,Artifact2,Changed Internally,Percentage Internally
144,"(project.parent.artifactId, project.parent.ver...",1,project.parent.artifactId,"(spring-boot-starter-parent, piggymetrics)",statistics-service/pom.xml,project.parent.version,"(2.0.3.RELEASE, 1.0-SNAPSHOT)",statistics-service/pom.xml,2,0.006897
143,"(project.parent.groupId, project.parent.version)",1,project.parent.groupId,"(org.springframework.boot, com.piggymetrics)",statistics-service/pom.xml,project.parent.version,"(2.0.3.RELEASE, 1.0-SNAPSHOT)",statistics-service/pom.xml,2,0.006897
142,"(project.parent.artifactId, project.parent.gro...",1,project.parent.artifactId,"(spring-boot-starter-parent, piggymetrics)",statistics-service/pom.xml,project.parent.groupId,"(org.springframework.boot, com.piggymetrics)",statistics-service/pom.xml,2,0.006897
0,"(project.name, project.packaging)",1,project.name,"(PiggyMetrics, piggymetrics)",pom.xml,project.packaging,"(war, pom)",pom.xml,1,0.003448
101,"(services.account-service.ports.host, services...",1,services.account-service.ports.host,"(7000, 6000)",docker-compose.dev.yml,services.statistics-service.ports.host,"(8000, 7000)",docker-compose.dev.yml,1,0.003448
...,...,...,...,...,...,...,...,...,...,...
50,(services.account-service.logging.options.max-...,1,services.account-service.logging.options.max-size,"(30m, 10m)",docker-compose.yml,services.gateway.logging.options.max-size,"(30m, 10m)",docker-compose.yml,1,0.003448
51,(services.account-mongodb.logging.options.max-...,1,services.account-mongodb.logging.options.max-size,"(30m, 10m)",docker-compose.yml,services.gateway.logging.options.max-size,"(30m, 10m)",docker-compose.yml,1,0.003448
52,"(services.gateway.logging.options.max-size, sp...",1,services.gateway.logging.options.max-size,"(30m, 10m)",docker-compose.yml,spring.data.mongodb.host,"(account-mongodb, statistics-mongodb)",statistics-service/src/main/resources/applicat...,1,0.003448
53,"(services.gateway.logging.options.max-size, sp...",1,services.gateway.logging.options.max-size,"(30m, 10m)",docker-compose.yml,spring.data.mongodb.username,"(service, user)",statistics-service/src/main/resources/applicat...,1,0.003448


To visualize the whole configuration space of a software project, we developed a network of subgraphs, where each subgraph corresponds to a specific technology in the ecosystem of the project. Each subgraph consists of three types of nodes: Technology Node, Config File Node, and Option Node. Links between nodes of different types indicate membership (a config file belongs to a technology, an option belongs to a config file). Links between option nodes indicate co-evolutionary changes.

In [13]:
def create_graph_data(df_options: pd.DataFrame, df_cochanges: pd.DataFrame) -> dict:
    """
    Create graph data (nodes and links) from the options and co-changes of a project.

    Nodes: one per concept, one per config file ('artifact') and one per option;
    option nodes are identified by "<file path>:<option>".
    Links: concept -> artifact (one per distinct combination), artifact -> option,
    and option -> option for every co-change row.

    Counts and flags are converted to built-in ``int``/``bool``/``float`` so the
    result can be passed to ``json.dump`` regardless of the numeric types pandas
    hands out.

    :param df_options: dataframe containing all options from the commit history, with the
        columns 'Concept', 'File Path', 'Option', 'Values', 'Internal Changes', 'In Latest Commit'
    :param df_cochanges: dataframe of co-changes with the columns 'Option1', 'Artifact1',
        'Option2', 'Artifact2', 'Commit Window', 'Changed Internally', 'Percentage Internally'
    :return: dictionary with the keys 'nodes' and 'links'
    """
    # Nodes: concepts, artifacts, options
    nodes = [{'id': concept, 'type': 'concept'} for concept in df_options['Concept'].unique()]
    nodes += [{'id': artifact, 'type': 'artifact'} for artifact in df_options['File Path'].unique()]

    option_columns = ['File Path', 'Option', 'Values', 'Internal Changes', 'In Latest Commit']
    nodes += [
        {
            'id': f"{file_path}:{option}",
            'values': str(values),
            'changed_internally': int(changes),
            'latest_commit': bool(in_latest),
            'type': 'option',
        }
        for file_path, option, values, changes, in_latest
        in df_options[option_columns].itertuples(index=False, name=None)
    ]

    # Link concepts to artifacts: one link per distinct combination instead of
    # one per option row, which would repeat the same link for every option
    concept_artifact_pairs = df_options[['Concept', 'File Path']].drop_duplicates()
    links = [
        {'source': concept, 'target': file_path, 'type': 'concept-artifact'}
        for concept, file_path in concept_artifact_pairs.itertuples(index=False, name=None)
    ]

    # Link artifacts to options
    links += [
        {'source': file_path, 'target': f"{file_path}:{option}", 'type': 'artifact-option'}
        for file_path, option
        in df_options[['File Path', 'Option']].itertuples(index=False, name=None)
    ]

    # Link options to options (co-changes)
    for _, row in df_cochanges.iterrows():
        links.append({
            'source': f"{row['Artifact1']}:{row['Option1']}",
            'target': f"{row['Artifact2']}:{row['Option2']}",
            # Keep only the part after the last ':' of the option name
            'source_option': row['Option1'].split(":")[-1],
            'target_option': row['Option2'].split(":")[-1],
            'type': 'option-option',
            'commit_window': int(row['Commit Window']),
            'internal_weight': float(row['Percentage Internally']),
            'internal_count': int(row['Changed Internally']),
        })

    return {'nodes': nodes, 'links': links}


# Demo cell: build the graph data of one project and store it as JSON.
data_file = "../data/test_projects/piggymetrics.json"
# Name of the folder containing the data file ("test_projects" for the path above);
# NOTE(review): not used in the visible part of the notebook
project_dir = data_file.split("/")[-2].split(".")[0]

with open(data_file, "r", encoding="utf-8") as src:
    data = json.load(src)

df_options = extract_options(data=data)
# Uses the default commit window of extract_option_cochanges
df_cochanges = extract_option_cochanges(data=data)

graph_data = create_graph_data(df_options, df_cochanges)

# The output file is named after the "project_name" entry of the loaded data
output_path = f'../data/graph_data/{data["project_name"]}_graph_data.json'
with open(output_path, 'w') as f:
    json.dump(graph_data, f)

