The **change frequency** of options indicates how often an options has been changed across the commit history. It also allows us to identify the value ranges of options. For options that do not change at all, we need to consider whether these options are truly necessary or if they can be removed to reduce complexity. For options that change frequently, we need to consider that these options can be sources of configuration drift and technical debt, or configuration errors.

In [38]:
import json
import pandas as pd
import ast
import glob
import os
from typing import List


data_file = "../data/microservice_projects/piggymetrics.json"

with open(data_file, "r", encoding="utf-8") as src:
    data = json.load(src)

In [39]:
def extract_options(data: List) -> pd.DataFrame:
    """
    Extract all options and all of their values from the commit history of a software projects.

    :param data: list of configuration data from commit history
    :return: dataframe contain all options and the values
    """
    config_data = []
    commit_data = [commit for commit in data["commit_data"] if commit["is_config_related"]]

    for commit in commit_data:
        for config_file in commit["network_data"]["config_files_data"]:
            # Dictionary to track option occurrences in the current file
            option_tracker = {}
            for pair in config_file["pairs"]:
                key = (config_file["file_path"], pair["option"])
                
                if key not in option_tracker:
                    option_tracker[key] = []
                option_tracker[key].append(pair)
        

            # Add only options that appear once in the file
            for key, occurrences in option_tracker.items():
                if len(occurrences) == 1:  # Include only unique options
                    pair = occurrences[0]
                    config_data.append({
                        "file_path": config_file["file_path"],
                        "option": pair["option"],
                        "value": pair["value"],
                        "type": pair["type"],
                        "concept": config_file["concept"]
                    })

    df = pd.DataFrame(config_data)

    # Group by option, type, and file_path, and aggregate unique values
    aggregated_df = (
        df.groupby(['file_path', 'option', 'concept'])['value']
        .apply(lambda x: sorted(list(set(x))))
        .reset_index()
    )

    aggregated_df.columns = ['File Path', 'Option', 'Concept', 'Values']
    aggregated_df['Internal Changes'] = aggregated_df['Values'].apply(lambda x: len(x) - 1 if len(x) > 1 else 0)

    return aggregated_df

df_options = extract_options(data=data)
df_options.to_csv(f"../data/options/{data['project_name']}_options_internal.csv", index=False)


In [40]:
def analyze_options(target_df: pd.DataFrame, other_dfs: List) -> pd.DataFrame:
    """
    Extract global statistics for options in a target project against all other project.

    :param target_df: dataframe of target project
    :param other_dfs: dataframes of all other projects
    :return target_df: updated dataframe of target project
    """
    # Initialize columns
    target_df['Gobally Set'] = 0
    target_df['Global Changes'] = 0
    target_df['Global Occurrences'] = 0


    for index, row in target_df.iterrows():
        option = row['Option']

        for other_df in other_dfs:
            # Find all rows in other_df where the option matches
            matching_rows = other_df[other_df['Option'] == option]
            match_count = len(matching_rows)

            if match_count > 0:
                # Increment "Set in other projects" by 1 (project-level count)
                target_df.loc[index, 'Gobally Set'] += 1

                # Increment "Total occurrences" by the total count of matches
                target_df.loc[index, 'Global Occurrences'] += match_count

                # Check each match for changes in values
                for _, match_row in matching_rows.iterrows():
                    # Parse the 'Values' column (convert from string to list if necessary)
                    raw_values = match_row['Values']
                    try:
                        values = ast.literal_eval(raw_values) if isinstance(raw_values, str) else raw_values
                    except (ValueError, SyntaxError):
                        values = [raw_values]  # Fall back to treating as a single value

                    # Ensure `values` is iterable
                    if not isinstance(values, (list, set, tuple)):
                        values = [values]

                    unique_values = set(values)
                    if len(unique_values) > 1:
                        # Increment "Changed globally" for each such occurrence
                        target_df.loc[index, 'Global Changes'] += 1

    return target_df


option_dir = "../data/options"

for file_name in glob.glob(f"{option_dir}/*_options_internal.csv"):
    target_repo = file_name.split("/")[-1].split("_")[0]

    repository_files = [file for file in os.listdir(option_dir) if file.endswith('_options_internal.csv')]
    repository_dataframes = {file: pd.read_csv(os.path.join(option_dir, file)) for file in repository_files}

    target_file_name = f'{target_repo}_options_internal.csv'
    target_df = repository_dataframes[target_file_name]

    # check if target df has specific columns
    if any(column_name in target_df.columns for column_name in ['Global Occurrences', 'Global changes', 'Gobally Set']):
        print(f"Skipping {target_repo} as it has already been processed.")
        continue 

    print(f"Extract global option stats for {target_repo}")

    # Use all other files as comparison
    other_dfs = [df for name, df in repository_dataframes.items() if name != target_file_name]

    # Perform the analysis
    updated_target_df = analyze_options(target_df.copy(), other_dfs)

    updated_target_df.head(50)

    updated_target_df.to_csv(f"../data/options/{target_repo}_options_global.csv")

Extract global option stats for piggymetrics
Extract global option stats for test-config-repo


*How often are options changed across the commit history (internally)?*

In [41]:
def show_internal_change_frequency(project_name: str):
    data_file = f"../data/options/{project_name}_options_internal.csv"
    df = pd.read_csv(data_file)

    length_distribution = df["Internal Changes"].value_counts().sort_index()
    total_options = length_distribution.sum()

    summary_df = pd.DataFrame({
        "Internal Changes": length_distribution.index,
        "Number of Options": length_distribution.values,
        "Percentage": (length_distribution.values / total_options * 100).round(2)
    })

    return summary_df


project_name = "piggymetrics"
df_frequency = show_internal_change_frequency(project_name=project_name)
df_frequency

Unnamed: 0,Internal Changes,Number of Options,Percentage
0,0,1398,92.95
1,1,92,6.12
2,2,6,0.4
3,3,3,0.2
4,4,2,0.13
5,5,3,0.2


*How often are options changed across software projects (globally)?*

In [42]:
def show_global_change_frequency(project_name: str):
    data_file = f"../data/options/{project_name}_options_global.csv"
    df = pd.read_csv(data_file)

    length_distribution = df["Global Changes"].value_counts().sort_index()
    total_options = length_distribution.sum()

    summary_df = pd.DataFrame({
        "Global Changes": length_distribution.index,
        "Number of Options": length_distribution.values,
    })

    return summary_df


project_name = "piggymetrics"
df_frequency = show_global_change_frequency(project_name=project_name)
df_frequency

Unnamed: 0,Global Changes,Number of Options
0,0,1481
1,1,23


*What options are changed most frequently (internally)?*

In [43]:
def show_most_internally_changed_options(project_name: str):
    data_file = f"../data/options/{project_name}_options_internal.csv"
    df = pd.read_csv(data_file)
    df = df.sort_values(by="Internal Changes", ascending=False)
    return df

project_name = "piggymetrics"
df_options = show_most_internally_changed_options(project_name=project_name)
df_options.head(10)

Unnamed: 0,File Path,Option,Concept,Values,Internal Changes
246,auth-service/pom.xml,project.parent.version,maven,"['1.0-SNAPSHOT', '1.3.1.RELEASE', '1.3.2.RELEA...",5
644,gateway/pom.xml,project.parent.version,maven,"['1.0-SNAPSHOT', '1.3.1.RELEASE', '1.3.2.RELEA...",5
1437,statistics-service/pom.xml,project.parent.version,maven,"['1.0-SNAPSHOT', '1.3.1.RELEASE', '1.3.2.RELEA...",5
1030,registry/pom.xml,project.parent.version,maven,"['1.0-SNAPSHOT', '1.3.2.RELEASE', '1.3.3.RELEA...",4
130,account-service/pom.xml,project.parent.version,maven,"['1.0-SNAPSHOT', '1.3.1.RELEASE', '1.3.2.RELEA...",4
731,monitoring/pom.xml,project.parent.version,maven,"['1.0-SNAPSHOT', '1.3.3.RELEASE', '1.3.5.RELEA...",3
307,config/pom.xml,project.parent.version,maven,"['1.0-SNAPSHOT', '1.3.3.RELEASE', '1.3.5.RELEA...",3
847,notification-service/pom.xml,project.parent.version,maven,"['1.0-SNAPSHOT', '1.3.3.RELEASE', '1.3.5.RELEA...",3
684,mongodb/Dockerfile,RUN,docker,['chmod +x /init.sh && apt-get update && apt-...,2
413,docker-compose.dev.yml,services.account-mongodb.ports,docker-compose,"['26000:27017', '27001:27017', '27017:27017']",2


*What options are changed most frequently (globally)?*

In [45]:
def show_most_globally_changed_options(project_name: str):
    data_file = f"../data/options/{project_name}_options_global.csv"
    df = pd.read_csv(data_file)
    df = df.sort_values(by="Global Changes", ascending=False)
    return df

project_name = "piggymetrics"
df_options = show_most_globally_changed_options(project_name=project_name)
df_options.head(10)

Unnamed: 0.1,Unnamed: 0,File Path,Option,Concept,Values,Internal Changes,Gobally Set,Global Changes,Global Occurrences
657,657,gateway/src/main/resources/application.yml,server.port,spring,"['4000', '7777']",1,1,1,2
987,987,registry/Dockerfile,EXPOSE,docker,['8761'],0,1,1,1
313,313,config/src/main/resources/application.yml,server.port,spring,['8888'],0,1,1,2
1472,1472,turbine-stream-service/Dockerfile,EXPOSE,docker,['8989'],0,1,1,1
1341,1341,statistics-service/Dockerfile,EXPOSE,docker,['7000'],0,1,1,1
149,149,account-service/src/main/resources/application...,server.port,spring,"['7000', '9999']",1,1,1,2
591,591,gateway/Dockerfile,EXPOSE,docker,"['4000', '8080']",1,1,1,1
165,165,auth-service/Dockerfile,EXPOSE,docker,['5000'],0,1,1,1
364,364,config/src/main/resources/shared/monitoring.yml,server.port,yaml,['9000'],0,1,1,2
348,348,config/src/main/resources/shared/gateway.yml,server.port,yaml,['4000'],0,1,1,2


*What is the value range of option (internally)?`*

In [61]:
def show_internal_value_range(project_name: str):
    project_file = f"../data/options/{project_name}_options_internal.csv"
    df = pd.read_csv(project_file)
    df_options = df.groupby('Option').agg({
        'Concept': 'first',  
        'Values': lambda x: list(set(val for sublist in x for val in eval(sublist))),
    }).reset_index()
    df_options['Num_Values'] = df_options['Values'].apply(len)

    return df_options

project_name = "piggymetrics"
df_value_range = show_internal_value_range(project_name=project_name)
df_value_range.head(20)

Unnamed: 0,Option,Concept,Values,Num_Values
0,/day,configparser,"[/ Day, / День]",2
1,/hour,configparser,"[/ Hour, / Час]",2
2,/month,configparser,"[/ Месяц, / Month]",2
3,/year,configparser,"[/ Year, / Год]",2
4,ADD,docker,"[./target/registry.jar /app/, ./target/config....",10
5,CMD,docker,[java -Xmx200m -jar /app/notification-service....,17
6,ENTRYPOINT,docker,"[/init.sh, /initx.sh]",2
7,EXPOSE,docker,"[6000, 8080, 8888, 8000, 7000, 5000, 8761, 400...",9
8,ExecutableName,maven,"[target/monitoring-0.0.1-SNAPSHOT.jar, target/...",12
9,ExecutableNameNoVersion,maven,"[target/config.jar, target/registry.jar, targe...",12


*What is the value range of options globally?*

In [60]:
def show_global_value_range():
    options_files = glob.glob("../data/options/*_options_internal.csv")
    result_dfs = []

    for file in options_files:
        df = pd.read_csv(file)
        df_options = df.groupby('Option').agg({
            'Concept': 'first',  
            'Values': lambda x: list(set(val for sublist in x for val in eval(sublist))),
        }).reset_index()
        df_options['Num_Values'] = df_options['Values'].apply(len)

        result_dfs.append(df_options)

    # Combine all DataFrames
    combined_df = pd.concat(result_dfs)

    df_result = combined_df.groupby('Option').agg({
        'Concept': 'first',
        'Values': lambda x: list(set(val for sublist in x for val in sublist)),
        'Num_Values': 'sum',
    }).reset_index()

    return df_result

df_value_range = show_global_value_range()
df_value_range.head(20)

Unnamed: 0,Option,Concept,Values,Num_Values
0,/day,configparser,"[/ Day, / День]",2
1,/hour,configparser,"[/ Hour, / Час]",2
2,/month,configparser,"[/ Месяц, / Month]",2
3,/year,configparser,"[/ Year, / Год]",2
4,ADD,docker,"[./target/registry.jar /app/, ./target/config....",10
5,CMD,docker,"[java -jar /app/gateway.jar, java -jar /app/co...",17
6,ENTRYPOINT,docker,"[/init.sh, java -jar app.jar, /initx.sh]",3
7,EXPOSE,docker,"[6000, 8080, 10000123330, 8888, 8000, 7000, 50...",12
8,ExecutableName,maven,"[target/monitoring-0.0.1-SNAPSHOT.jar, target/...",13
9,ExecutableNameNoVersion,maven,"[target/spring-boot-app.jar, target/registry.j...",13
