**Extract all concepts and collect their internal change frequency**

Definition of columns
- `Change_count` is an integer, indicating how often a concept was changed in the project (counted once per config-related commit that lists the concept)

In [1]:
import pandas as pd
from typing import List, Tuple, Dict


def extract_concept_data(data: Dict) -> pd.DataFrame:
    """
    Count, for every concept, the config-related commits of a project that list it.

    :param data: configuration data from commit history; must provide
        "project_name" and "config_commit_data" (a list of commits, each with
        "is_config_related" and, for config-related commits,
        "network_data" -> "concepts")
    :return: dataframe with the columns "Concept" and "Change_count", one row
        per concept in order of first appearance; the dataframe is empty but
        still carries both columns if no concept was found
    """

    project_name = data["project_name"]
    print(f"Extract all concepts from {project_name}.")

    # dicts preserve insertion order, so concepts stay in order of first appearance
    change_counts = {}

    for commit in data["config_commit_data"]:
        if not commit["is_config_related"]:
            continue
        for concept_name in commit["network_data"]["concepts"]:
            change_counts[concept_name] = change_counts.get(concept_name, 0) + 1

    # Naming the columns explicitly keeps the CSV header even for an empty result
    return pd.DataFrame(
        list(change_counts.items()), columns=["Concept", "Change_count"]
    )

In [2]:
import glob
import json
import os 

analyzed_project_dir = "../data/analyzed_projects"

# Produce one "<project>_concepts.csv" per analyzed project file, unless that
# output file has already been written by an earlier run.
for project_path in glob.glob(f"{analyzed_project_dir}/**"):
    with open(project_path, "r", encoding="utf-8") as src:
        data = json.load(src)

    project_name = data["project_name"]
    output_file = f"../data/extracted_config_data/{project_name}_concepts.csv"

    if not os.path.exists(output_file):
        df_result = extract_concept_data(data=data)
        df_result.to_csv(output_file, index=False)
    else:
        print(f"Skipping {project_name} as it already exists.")



Extract all config files from poli.
Skipping pig as it already exists.
Skipping music-website as it already exists.
Skipping apollo as it already exists.
Skipping kkFileView as it already exists.
Skipping mall-swarm as it already exists.
Skipping piggymetrics as it already exists.
Skipping litemall as it already exists.
Skipping Spring-Cloud-Platform as it already exists.
Skipping mall as it already exists.


**Extract all config files and collect their internal change frequency**

Definition of columns
- `Change_count` is an integer, indicating how often a config file was changed in the project (counted once per config-related commit that lists the file)

In [3]:
import pandas as pd
from typing import List, Tuple, Dict


def extract_config_file_data(data: Dict) -> pd.DataFrame:
    """
    Extract all config files from the commit history of a software project
    and count how often each one shows up in config-related commits.

    :param data: configuration data from commit history; must provide
        "project_name" and "config_commit_data" (a list of commits, each with
        "is_config_related" and, for config-related commits,
        "network_data" -> "config_files_data" entries holding "file_path",
        "concept" and "pairs")
    :return: dataframe with the columns "File_path", "Name", "Concept" and
        "Change_count", one row per file in order of first registration; the
        dataframe is empty but still carries all columns if no file was found
    """

    project_name = data["project_name"]
    print(f"Extract all config files from {project_name}.")

    config_data = {}

    for commit in data["config_commit_data"]:
        if not commit["is_config_related"]:
            continue

        for file_data in commit["network_data"]["config_files_data"]:
            file_path = file_data["file_path"]

            if file_path not in config_data:
                # A file is registered the first time it appears with at least
                # one option/value pair; earlier pair-less appearances are ignored.
                if not file_data["pairs"]:
                    continue
                config_data[file_path] = {
                    "File_path": file_path,
                    "Name": file_path.split("/")[-1],
                    "Concept": file_data["concept"],
                    "Change_count": 0,
                }

            # Once registered, every appearance counts, even one without pairs
            config_data[file_path]["Change_count"] += 1

    # Naming the columns explicitly keeps the CSV header even for an empty result
    return pd.DataFrame(
        list(config_data.values()),
        columns=["File_path", "Name", "Concept", "Change_count"],
    )

In [4]:
import glob
import json
import os  # needed for os.path.exists below; keeps this cell runnable on its own

analyzed_project_dir = "../data/analyzed_projects"

# Write one "<project>_files.csv" per analyzed project file, skipping projects
# whose output file already exists from an earlier run.
for project_path in glob.glob(analyzed_project_dir + "/**"):
    with open(project_path, "r", encoding="utf-8") as src:
        data = json.load(src)

    project_name = data["project_name"]
    output_file = f"../data/extracted_config_data/{project_name}_files.csv"

    if os.path.exists(output_file):
        print(f"Skipping {project_name} as it already exists.")
        continue

    df_result = extract_config_file_data(data=data)
    df_result.to_csv(output_file, index=False)

Extract all config files from poli.
Skipping pig as it already exists.
Skipping music-website as it already exists.
Skipping apollo as it already exists.
Skipping kkFileView as it already exists.
Skipping mall-swarm as it already exists.
Skipping piggymetrics as it already exists.
Skipping litemall as it already exists.
Skipping Spring-Cloud-Platform as it already exists.
Skipping mall as it already exists.


**Extract all options and collect all their values across the commit history**

Problems (TODO)
- options in config files that appear multiple times, such as COPY/ADD/RUN/FROM in a Dockerfile
- there is no way to reliably track each such option separately
- therefore we currently exclude such options

Definition of columns
- `Changed internally` is an integer, indicating how often the value of an option was changed in the project (number of distinct values minus one)
- `Removed` is a boolean, indicating whether the option is no longer present in the last config-related commit

In [5]:
import pandas as pd
from typing import List, Tuple


def extract_options(data: dict) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Extract all options and all of their values from the commit history of a software project.

    An option that occurs more than once within one file of a commit (e.g.
    repeated Dockerfile instructions) cannot be tracked individually. Such an
    occurrence contributes no value to the result and is reported in the
    dataframe of excluded options instead.

    :param data: configuration data from commit history; must provide
        "project_name" and "config_commit_data" (a list of commits, each with
        "is_config_related" and, for config-related commits, "commit_hash" and
        "network_data" -> "config_files_data" entries holding "file_path",
        "concept" and "pairs" of "option"/"value"/"type")
    :return: tuple of dataframes ``(options, excluded)``. ``options`` has the
        columns "File Path", "Option", "Concept", "Values" (sorted distinct
        values), "Changed internally" (number of distinct values minus one)
        and "Removed" (True if the option is absent from the last
        config-related commit). ``excluded`` has the columns "File", "Option"
        and "Concept". Both keep their columns when they are empty.
    """

    project_name = data["project_name"]
    print(f"Extract all options and their values from {project_name}.")

    config_data = []
    excluded_pairs = set()
    # (file_path, option) -> hash of the latest config-related commit containing it
    last_seen = {}
    # Stays None when the history holds no config-related commit at all
    last_commit_hash = None

    for commit in data["config_commit_data"]:
        if not commit["is_config_related"]:
            continue

        last_commit_hash = commit["commit_hash"]

        for file_data in commit["network_data"]["config_files_data"]:
            file_path = file_data["file_path"]
            concept = file_data["concept"]

            # Group the pairs of this file by option to detect repeated options
            pairs_by_option = {}
            for pair in file_data["pairs"]:
                pairs_by_option.setdefault(pair["option"], []).append(pair)
                # Presence is tracked for every pair, repeated options included
                last_seen[(file_path, pair["option"])] = last_commit_hash

            for option, occurrences in pairs_by_option.items():
                if len(occurrences) == 1:
                    pair = occurrences[0]
                    config_data.append({
                        "file_path": file_path,
                        "option": option,
                        "value": pair["value"],
                        "type": pair["type"],
                        "concept": concept,
                    })
                else:
                    excluded_pairs.add((file_path, option, concept))

    # Sorting on the string form makes the excluded table reproducible across
    # runs (set iteration order is not); explicit columns keep the CSV header
    # even when nothing was excluded.
    df_excluded = pd.DataFrame(
        sorted(excluded_pairs, key=lambda item: tuple(str(part) for part in item)),
        columns=["File", "Option", "Concept"],
    )

    result_columns = [
        'File Path', 'Option', 'Concept', 'Values', 'Changed internally', 'Removed'
    ]

    # Without any extracted pair there is nothing to group; return an empty
    # table with the regular schema instead of failing on the missing columns.
    if not config_data:
        return pd.DataFrame(columns=result_columns), df_excluded

    df = pd.DataFrame(config_data)

    # Group by file_path, option and concept, and aggregate the distinct values
    aggregated_df = (
        df.groupby(['file_path', 'option', 'concept'])['value']
        .apply(lambda values: sorted(set(values)))
        .reset_index()
    )

    # Rename columns for clarity
    aggregated_df.columns = ['File Path', 'Option', 'Concept', 'Values']

    # n distinct values mean the value changed n - 1 times
    aggregated_df['Changed internally'] = aggregated_df['Values'].apply(
        lambda values: max(len(values) - 1, 0)
    )

    # An option counts as removed if it was not seen in the last config-related commit
    removed_keys = {
        key for key, seen_in in last_seen.items() if seen_in != last_commit_hash
    }
    aggregated_df['Removed'] = [
        (file_path, option) in removed_keys
        for file_path, option in zip(aggregated_df['File Path'], aggregated_df['Option'])
    ]

    return aggregated_df, df_excluded

In [2]:
import glob
import json
import os  # needed for os.path.exists below; keeps this cell runnable on its own

analyzed_project_dir = "../data/analyzed_projects"

# Write "<project>_options.csv" and "<project>_excluded.csv" per analyzed
# project file, skipping projects whose options file already exists.
for project_path in glob.glob(analyzed_project_dir + "/**"):
    with open(project_path, "r", encoding="utf-8") as src:
        data = json.load(src)

    project_name = data["project_name"]
    output_file = f"../data/extracted_config_data/{project_name}_options.csv"

    if os.path.exists(output_file):
        print(f"Skipping {project_name} as it already exists.")
        continue

    df_result, df_excluded = extract_options(data=data)
    df_excluded.to_csv(f"../data/excluded_options/{project_name}_excluded.csv", index=False)
    # Reuse output_file so the existence check and the write cannot drift apart
    df_result.to_csv(output_file, index=False)

Skipping poli as it already exists.
Skipping pig as it already exists.
Skipping music-website as it already exists.
Skipping apollo as it already exists.
Skipping kkFileView as it already exists.
Skipping mall-swarm as it already exists.
Skipping piggymetrics as it already exists.
Skipping litemall as it already exists.
Skipping Spring-Cloud-Platform as it already exists.
Skipping mall as it already exists.


**Extract whether an option was set or changed in other projects**

Definitions of columns
- `Changed globally` is an integer, indicating how many occurrences of the option in other projects have more than one value (i.e., were changed)
- `Set globally` is an integer, indicating the number of other projects in which the option exists
- `Occurrences globally` is an integer, indicating how often the option occurs across all other projects

In [2]:
import os
import ast
import pandas as pd
import glob


def _has_multiple_values(raw_values) -> bool:
    """Tell whether a 'Values' cell holds more than one distinct value."""
    values = raw_values
    if isinstance(raw_values, str):
        # Tables read back from CSV carry the list as its string representation
        try:
            values = ast.literal_eval(raw_values)
        except (ValueError, SyntaxError):
            values = [raw_values]  # Fall back to treating as a single value

    # Ensure `values` is iterable
    if not isinstance(values, (list, set, tuple)):
        values = [values]

    return len(set(values)) > 1


def analyze_options(target_df, other_dfs) -> pd.DataFrame:
    """
    Analyze options in a target file against all other files to compute global stats.

    Every dataframe in ``other_dfs`` is scanned once and matched to the target
    by option name. Three integer columns are added to ``target_df`` (which is
    modified in place and also returned):

    - 'Set globally': number of other dataframes containing the option
    - 'Changed globally': number of matching rows in other dataframes whose
      'Values' hold more than one distinct value
    - 'Occurrences globally': number of matching rows in other dataframes

    :param target_df: dataframe of target project (needs an 'Option' column)
    :param other_dfs: dataframes of all other projects (need 'Option' and
        'Values' columns)
    :return target_df: updated dataframe of target project
    """
    target_options = target_df['Option']

    # Only options of the target matter; missing (NaN) options never match anything
    wanted = {option for option in target_options if not pd.isna(option)}

    set_globally = {}
    changed_globally = {}
    occurrences_globally = {}

    for other_df in other_dfs:
        found_in_project = set()

        for option, raw_values in zip(other_df['Option'], other_df['Values']):
            if pd.isna(option) or option not in wanted:
                continue

            found_in_project.add(option)
            occurrences_globally[option] = occurrences_globally.get(option, 0) + 1

            if _has_multiple_values(raw_values):
                changed_globally[option] = changed_globally.get(option, 0) + 1

        # Project-level count: each project contributes at most one per option
        for option in found_in_project:
            set_globally[option] = set_globally.get(option, 0) + 1

    stats = (
        ('Set globally', set_globally),
        ('Changed globally', changed_globally),
        ('Occurrences globally', occurrences_globally),
    )
    for column, counts in stats:
        target_df[column] = pd.Series(
            [counts.get(option, 0) for option in target_options],
            index=target_df.index,
            dtype="int64",
        )

    return target_df


data_dir = "../data/extracted_config_data"
options_suffix = "_options.csv"

# Load every project's option table once, up front; the comparison only needs
# the 'Option' and 'Values' columns, which this loop never modifies.
repository_files = sorted(
    os.path.basename(path)
    for path in glob.glob(os.path.join(data_dir, "*" + options_suffix))
)
repository_dataframes = {file: pd.read_csv(os.path.join(data_dir, file)) for file in repository_files}

# Stays None when every project is skipped, so the preview below cannot fail
updated_target_df = None

for target_file_name in repository_files:
    # Strip only the suffix so project names containing "_" stay intact
    target_repo = target_file_name[: -len(options_suffix)]
    target_df = repository_dataframes[target_file_name]

    # check if target df has specific columns
    if 'Set globally' in target_df.columns:
        print(f"Skipping {target_repo} as it has already been processed.")
        continue

    print(f"Extract global option stats for {target_repo}")

    # Use all other files as comparison
    other_dfs = [df for name, df in repository_dataframes.items() if name != target_file_name]

    # Perform the analysis
    updated_target_df = analyze_options(target_df.copy(), other_dfs)

    # index=False avoids writing the row index as an extra "Unnamed: 0" column
    updated_target_df.to_csv(os.path.join(data_dir, target_file_name), index=False)

# Notebook display: preview the most recently updated table, if any was processed
updated_target_df.head(50) if updated_target_df is not None else None


Skipping mall as it has already been processed.
Skipping litemall as it has already been processed.
Skipping music-website as it has already been processed.
Skipping poli as it has already been processed.
Skipping pig as it has already been processed.
Skipping test-config-repo as it has already been processed.
Extract global option stats for kkFileView
Skipping piggymetrics as it has already been processed.
Skipping Spring-Cloud-Platform as it has already been processed.
Skipping apollo as it has already been processed.
Skipping mall-swarm as it has already been processed.


Unnamed: 0,File Path,Option,Concept,Values,Changed internally,Removed,Set globally,Changed globally,Occurrences globally
0,.github/workflows/maven.yml,jobs.build.runs-on,github-action,"['ubuntu-18.04', 'ubuntu-22.04']",1,False,2,0,6
1,.github/workflows/maven.yml,jobs.build.steps.run,github-action,['mvn -B package --file pom.xml'],0,False,0,0,0
2,.github/workflows/maven.yml,jobs.build.steps.with.cache,github-action,['maven'],0,False,0,0,0
3,.github/workflows/maven.yml,jobs.build.steps.with.distribution,github-action,['adopt'],0,False,1,1,2
4,.github/workflows/maven.yml,jobs.build.steps.with.java-version,github-action,['8'],0,False,2,1,3
5,.github/workflows/maven.yml,name,github-action,['Java CI with Maven'],0,False,5,12,41
6,.github/workflows/maven.yml,on.pull_request.branches,github-action,['master'],0,False,2,0,5
7,.github/workflows/maven.yml,on.push.branches,github-action,['master'],0,False,2,1,5
8,.workflow/BranchPipeline.yml,displayName,yaml,['BranchPipeline'],0,True,1,0,3
9,.workflow/BranchPipeline.yml,name,yaml,['branch-pipeline'],0,True,5,12,41
