In [1]:
from typing import Dict
from cfgnet.network.network_configuration import NetworkConfiguration
from cfgnet.network.nodes import ArtifactNode
from cfgnet.network.network import Network
from pprint import pprint
from tqdm import tqdm
from typing import List
import git
import json
import subprocess
import traceback
import glob
import time

# File-name suffixes used to decide whether a changed file counts as a
# configuration file (passed as a tuple to ``str.endswith``).
config_file_endings = (
    ".xml",
    ".yml",
    ".yaml",
    "Dockerfile",
    ".ini",
    ".properties",
    ".conf",
    ".json",
    ".toml",
    ".cfg",
    "settings.py",
    ".cnf",
)

def checkout_latest_commit(repo, current_branch, latest_commit):
    """Restore the checkout that was active before the history walk.

    :param repo: repository object exposing ``repo.git.checkout``
    :param current_branch: name of the branch to return to; falsy if HEAD
        was detached
    :param latest_commit: commit hash to check out when there is no branch
    """
    if not current_branch:
        # Detached HEAD: there is no branch to go back to, so pin the commit.
        repo.git.checkout(latest_commit)
        print(f"Returned to the latest commit: {latest_commit}")
        return

    repo.git.checkout(current_branch)
    print(f"Returned to original branch: {current_branch}")


def analyze_config_network(repo_path: str):
    """Build the configuration network of the working tree and summarize it.

    :param repo_path: absolute path of the project root to analyze
    :return: dict with the number of links (``links``), the distinct
        configuration file paths (``config_files``) and one entry per
        artifact node (``config_files_data``)
    """
    cfg = NetworkConfiguration(
        project_root_abs=repo_path,
        enable_static_blacklist=False,
        enable_internal_links=True,
        enable_all_conflicts=True,
        enable_file_type_plugins=True,
        system_level=False
    )
    net = Network.init_network(cfg=cfg)
    artifact_nodes = net.get_nodes(node_type=ArtifactNode)

    per_file = []
    for node in artifact_nodes:
        # Drop the "file" pseudo-option from the stored pairs.
        kept_pairs = [entry for entry in node.get_pairs() if entry["option"] != "file"]
        per_file.append({
            "file_path": node.rel_file_path,
            "concept": node.concept_name,
            # Counts every pair of the artifact, including "file" options.
            "options": len(node.get_pairs()),
            "pairs": kept_pairs
        })

    distinct_paths = {node.rel_file_path for node in artifact_nodes}

    return {
        "links": len(net.links),
        "config_files": list(distinct_paths),
        "config_files_data": per_file
    }


def get_file_diff(repo_path: str, commit, file_path: str):
    """Return the ``git diff`` of one file between a commit and its parent.

    :param repo_path: directory in which the ``git`` command is executed
    :param commit: commit object providing ``parents`` and ``hexsha``
    :param file_path: path of the file to diff
    :return: diff text, or ``None`` if the commit has no parents or the
        ``git diff`` call fails
    """
    if not commit.parents:
        # Nothing to compare against.
        return None

    command = ['git', 'diff', f"{commit.hexsha}^", commit.hexsha, '--', file_path]
    try:
        # Capture the line-by-line changes as text.
        return subprocess.check_output(command, cwd=repo_path, text=True)
    except (subprocess.CalledProcessError, git.exc.GitCommandError) as e:
        print(f"Error running git diff for commit {commit.hexsha}: {e}")
        return None


def _add_file_change_stats(network_data, changed_files, repo_path, commit, get_diff):
    """Annotate each analyzed config file touched by ``commit`` with its change stats."""
    # First entry per path wins, matching a first-match lookup.
    entries_by_path = {}
    for entry in network_data["config_files_data"]:
        entries_by_path.setdefault(entry["file_path"], entry)

    for file_path, file_stats in changed_files.items():
        file_data = entries_by_path.get(file_path)
        if file_data is None:
            # Changed file is not part of the configuration network.
            continue

        file_data["insertions"] = file_stats['insertions']
        file_data["deletions"] = file_stats['deletions']
        file_data["total_changes"] = file_stats['insertions'] + file_stats['deletions']

        if get_diff:
            file_data["diff"] = get_file_diff(
                repo_path=repo_path,
                commit=commit,
                file_path=file_path
            )


def analyze_repository(repo_path: str, get_diff: bool = False) -> Dict:
    """Analyze the commit history of a repository and collect configuration stats.

    All commits reachable from ``HEAD`` are visited from oldest to newest.
    A commit is config-related if one of its changed files ends with one of
    ``config_file_endings``; only those commits are checked out and analyzed
    with ``analyze_config_network``. The original checkout is restored when
    the walk ends, including when it ends early because of an error.

    :param repo_path: path to the git repository
    :param get_diff: if True, store the diff of every changed config file
    :return: dict with ``project_name``, ``analysis_time``, ``len_commits``
        and ``config_commit_data``; ``None`` if analyzing a commit failed
    """
    start_time = time.time()
    # Tolerate a trailing slash so the project name is never empty.
    project_name = repo_path.rstrip("/").split("/")[-1]
    repo = git.Repo(repo_path)

    # Remember where we started so the checkout can be restored afterwards.
    current_branch = repo.active_branch.name if not repo.head.is_detached else None
    latest_commit = repo.head.commit.hexsha

    # Get all commits in the repository from oldest to newest
    commits = list(repo.iter_commits("HEAD"))[::-1]

    print(f"Number of commits: {len(commits)}")

    config_commit_data = []

    try:
        for commit in tqdm(commits, desc="Processing", total=len(commits)):
            # Read the stats once instead of once per use.
            commit_stats = commit.stats
            totals = commit_stats.total
            changed_files = commit_stats.files

            is_config_related = any(
                file_path.endswith(config_file_endings) for file_path in changed_files
            )

            network_data = None
            if is_config_related:
                # The working tree only matters for the network analysis, so
                # other commits are not checked out.
                repo.git.checkout(commit.hexsha)

                try:
                    network_data = analyze_config_network(repo_path=repo_path)
                except Exception:
                    print(f"Error occurred in commit {commit.hexsha}")
                    traceback.print_exc()
                    return None

                _add_file_change_stats(
                    network_data=network_data,
                    changed_files=changed_files,
                    repo_path=repo_path,
                    commit=commit,
                    get_diff=get_diff
                )

            config_commit_data.append(
                {
                    "commit_hash": str(commit.hexsha),
                    # First parent of the commit; None for a root commit.
                    "parent_commit": commit.parents[0].hexsha if commit.parents else None,
                    "is_config_related": is_config_related,
                    "author": f"{commit.author.name} <{commit.author.email}>",
                    "commit_mgs": str(commit.message),
                    "files_changed": totals['files'],
                    "insertions": totals['insertions'],
                    "deletions": totals['deletions'],
                    "network_data": network_data
                }
            )
    finally:
        # Always return to the original checkout, even on the error path.
        checkout_latest_commit(
            repo=repo,
            current_branch=current_branch,
            latest_commit=latest_commit
        )

    ratio = round(len(config_commit_data) / len(commits), 2) if commits else 0.0
    print(f"Len commit data: {len(config_commit_data)}, {ratio}")

    end_time = time.time()
    elapsed_time = end_time - start_time
    print(f"Elapsed time: {elapsed_time:.6f} seconds")

    return {
        "project_name": project_name,
        "analysis_time": elapsed_time,
        "len_commits": len(commits),
        "config_commit_data": config_commit_data
    }

In [2]:
# Analyze a single repository, storing the diff of every changed config file.
project_path = "/home/simisimon/GitHub/projects/test_project_history"

commit_data = analyze_repository(project_path, get_diff=True)

Number of commits: 9


Processing: 100%|██████████| 9/9 [00:00<00:00, 14.03it/s]

Returned to the latest commit: 0fcb960078420522114b22657edae9e491619aea
Len commit data: 9, 1.0
Elapsed time: 0.688508 seconds





In [3]:
project_dir = "/home/simisimon/GitHub/cfgnet_evaluation"

# Analyze every project below project_dir and store one JSON file per project.
for project_path in glob.glob(project_dir + "/**"):
    # Tolerate a trailing slash so the project name is never empty.
    project_name = project_path.rstrip("/").split("/")[-1]

    commit_data = analyze_repository(repo_path=project_path, get_diff=True)

    # analyze_repository returns None when a commit could not be analyzed;
    # skip the project instead of writing a JSON file that contains "null".
    if commit_data is None:
        print(f"Skip {project_name}: analysis failed")
        continue

    output_file = f"../data/analyzed_projects/{project_name}.json"

    print(f"Write commit data into file {output_file}")
    with open(output_file, "w", encoding="utf-8") as dest:
        json.dump(commit_data, dest, indent=2)

Number of commits: 72


Processing: 100%|██████████| 72/72 [00:05<00:00, 14.38it/s]


Returned to the latest commit: 6fa821354eaa95a8b0b52bdcda1bbf89dbe29d69
Len commit data: 72, 1.0
Elapsed time: 5.093046 seconds
Write commit data into file ../data/analyzed_projects/mentorship-platform.json
Number of commits: 121


  in "/home/simisimon/GitHub/cfgnet_evaluation/netflix-oss-example/spring-cloud-dashboard/src/main/resources/application.yml", line 1, column 1
but found another document
  in "/home/simisimon/GitHub/cfgnet_evaluation/netflix-oss-example/spring-cloud-dashboard/src/main/resources/application.yml", line 9, column 1"
  in "/home/simisimon/GitHub/cfgnet_evaluation/netflix-oss-example/spring-cloud-dashboard/src/main/resources/application.yml", line 1, column 1
but found another document
  in "/home/simisimon/GitHub/cfgnet_evaluation/netflix-oss-example/spring-cloud-dashboard/src/main/resources/application.yml", line 9, column 1"
Processing: 100%|██████████| 121/121 [00:12<00:00,  9.65it/s]


Returned to original branch: master
Len commit data: 121, 1.0
Elapsed time: 12.612880 seconds
Write commit data into file ../data/analyzed_projects/netflix-oss-example.json
Number of commits: 288


Processing: 100%|██████████| 288/288 [00:44<00:00,  6.46it/s]


Returned to original branch: master
Len commit data: 288, 1.0
Elapsed time: 44.659809 seconds
Write commit data into file ../data/analyzed_projects/piggymetrics.json
Number of commits: 34


Processing: 100%|██████████| 34/34 [00:03<00:00,  9.50it/s]


Returned to original branch: master
Len commit data: 34, 1.0
Elapsed time: 3.680300 seconds
Write commit data into file ../data/analyzed_projects/spring-boot-blog.json
Number of commits: 88


Processing: 100%|██████████| 88/88 [00:07<00:00, 11.46it/s]


Returned to original branch: development
Len commit data: 88, 1.0
Elapsed time: 7.765865 seconds
Write commit data into file ../data/analyzed_projects/taskManagement.json
Number of commits: 87


Processing: 100%|██████████| 87/87 [00:05<00:00, 14.57it/s]

Returned to the latest commit: e819b3f4b18079d1f6bd751959d0f9faa801a080
Len commit data: 87, 1.0
Elapsed time: 6.055671 seconds
Write commit data into file ../data/analyzed_projects/Ward.json





**Extract all options and collect all their values across the commit history**

Problems (TODO)
- options in config files that appear multiple times, such as COPY/ADD/RUN/FROM in a Dockerfile
- there is no way to reliably track each occurrence of such an option separately
- therefore we currently exclude such options

Definition of columns
- `Changed internally` is an integer, indicating how often the value of an option was changed in the project
- `Removed` is a boolean, indicating if an option has been removed at some point 

In [4]:
import pandas as pd
from typing import List, Tuple


def extract_options(data: dict) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Extract all options and all of their values from the commit history of a software project.

    Only config-related commits are considered. Within one file of one commit,
    an option is collected only if it occurs exactly once; options occurring
    several times are reported in the excluded frame instead.

    :param data: analysis result of one project with the keys ``project_name``
        and ``config_commit_data``
    :return: tuple ``(aggregated_df, df_excluded)``; ``aggregated_df`` has the
        columns File Path, Option, Concept, Values, Changed internally and
        Removed, ``df_excluded`` has the columns File, Option and Concept.
        Both frames keep their columns when they are empty.
    """
    project_name = data["project_name"]
    print(f"Extract all options and their values from {project_name}.")

    config_data = []
    excluded_pairs = set()
    # (file_path, option) -> hash of the last config-related commit containing it
    last_seen = {}
    last_config_commit = None

    for commit in data["config_commit_data"]:
        if not commit["is_config_related"]:
            continue

        last_config_commit = commit["commit_hash"]
        for file_data in commit["network_data"]["config_files_data"]:
            file_path = file_data["file_path"]
            concept = file_data["concept"]

            # Group the pairs of this file by option to detect repeated options.
            occurrences = {}
            for pair in file_data["pairs"]:
                occurrences.setdefault(pair["option"], []).append(pair)
                last_seen[(file_path, pair["option"])] = last_config_commit

            for option, pairs in occurrences.items():
                if len(pairs) == 1:
                    pair = pairs[0]
                    config_data.append({
                        "file_path": file_path,
                        "option": pair["option"],
                        "value": pair["value"],
                        "type": pair["type"],
                        "concept": concept
                    })
                else:
                    # Repeated options cannot be tracked individually.
                    excluded_pairs.add((file_path, option, concept))

    # Sort for a reproducible row order (set iteration order is arbitrary).
    df_excluded = pd.DataFrame(
        sorted(excluded_pairs, key=lambda item: tuple(map(str, item))),
        columns=["File", "Option", "Concept"]
    )

    result_columns = ['File Path', 'Option', 'Concept', 'Values', 'Changed internally', 'Removed']
    if not config_data:
        # Nothing was collected; grouping an empty frame would raise a KeyError.
        return pd.DataFrame(columns=result_columns), df_excluded

    # Group by file, option and concept, and aggregate the distinct values
    aggregated_df = (
        pd.DataFrame(config_data)
        .groupby(['file_path', 'option', 'concept'])['value']
        .apply(lambda values: sorted(set(values)))
        .reset_index()
    )
    aggregated_df.columns = ['File Path', 'Option', 'Concept', 'Values']

    # Number of value changes = number of distinct values minus one
    aggregated_df['Changed internally'] = aggregated_df['Values'].apply(
        lambda values: max(len(values) - 1, 0)
    )

    # An option counts as removed if it is missing from the last
    # config-related commit.
    aggregated_df['Removed'] = [
        (file_path, option) in last_seen
        and last_seen[(file_path, option)] != last_config_commit
        for file_path, option in zip(aggregated_df['File Path'], aggregated_df['Option'])
    ]

    return aggregated_df, df_excluded



  machar = _get_machar(dtype)


In [5]:
import glob
import json

analyzed_project_dir = "../data/analyzed_projects"

# Turn every stored project analysis into two CSV files: the extracted
# options and the options that were excluded.
for project_path in glob.glob(analyzed_project_dir + "/**"):
    with open(project_path, "r", encoding="utf-8") as src:
        data = json.load(src)
        project_name = data["project_name"]

        df_result, df_excluded = extract_options(data=data)

        excluded_csv = f"../data/excluded_options/{project_name}_excluded.csv"
        options_csv = f"../data/extracted_options/{project_name}_options.csv"

        df_excluded.to_csv(excluded_csv, index=False)
        df_result.to_csv(options_csv, index=False)

Extract all options and their values from mentorship-platform.
Extract all options and their values from netflix-oss-example.
Extract all options and their values from piggymetrics.
Extract all options and their values from spring-boot-blog.
Extract all options and their values from taskManagement.
Extract all options and their values from test_project_history.
Extract all options and their values from Ward.


**Determine whether the options of a project are also set in other projects and whether they were changed there**

Definitions of columns
- `Changed globally` is an integer, indicating how many occurrences of the option in other projects had more than one value, i.e., were changed
- `Set globally` is an integer, indicating the number of other projects in which the option exists
- `Occurrences globally` is an integer, indicating how often the option occurs across all other projects

In [7]:
import os
import ast


def _has_multiple_values(raw_values) -> bool:
    """Return True if a ``Values`` cell holds more than one distinct value."""
    values = raw_values
    if isinstance(raw_values, str):
        # Cells read from CSV hold the list as its string representation.
        try:
            values = ast.literal_eval(raw_values)
        except (ValueError, SyntaxError):
            values = [raw_values]  # Fall back to treating as a single value

    # Ensure `values` is a collection
    if not isinstance(values, (list, set, tuple)):
        values = [values]

    return len(set(values)) > 1


def analyze_options(target_df, other_dfs) -> pd.DataFrame:
    """
    Analyze options in a target file against all other files to compute global stats.

    Three integer columns are added to ``target_df`` (which is modified in
    place and returned):

    - ``Set globally``: number of other dataframes that contain the option
    - ``Changed globally``: number of matching rows in the other dataframes
      whose ``Values`` hold more than one distinct value
    - ``Occurrences globally``: number of matching rows in the other dataframes

    :param target_df: dataframe of target project
    :param other_dfs: dataframes of all other projects
    :return target_df: updated dataframe of target project
    """
    wanted = set(target_df['Option'].dropna())

    set_globally = {}
    changed_globally = {}
    occurrences_globally = {}

    # Scan every other project once instead of once per target row.
    for other_df in other_dfs:
        matching_rows = other_df[other_df['Option'].isin(wanted)]
        if matching_rows.empty:
            continue

        seen_in_project = set()
        for option, raw_values in zip(matching_rows['Option'], matching_rows['Values']):
            seen_in_project.add(option)
            occurrences_globally[option] = occurrences_globally.get(option, 0) + 1
            if _has_multiple_values(raw_values):
                changed_globally[option] = changed_globally.get(option, 0) + 1

        # Project-level count: each project contributes at most one.
        for option in seen_in_project:
            set_globally[option] = set_globally.get(option, 0) + 1

    # Assign by position so that duplicate index labels in target_df cannot
    # cause one row's counts to be added to another row.
    options = list(target_df['Option'])
    for column, counts in (
        ('Set globally', set_globally),
        ('Changed globally', changed_globally),
        ('Occurrences globally', occurrences_globally),
    ):
        target_df[column] = pd.Series(
            [counts.get(option, 0) for option in options], dtype="int64"
        ).to_numpy()

    return target_df


data_dir = "../data/extracted_options"

# Read every extracted-options CSV into a dataframe, keyed by file name
repository_files = [name for name in os.listdir(data_dir) if name.endswith('.csv')]
repository_dataframes = {
    name: pd.read_csv(os.path.join(data_dir, name))
    for name in repository_files
}

# Project whose options are compared against all other projects
target_file_name = 'spring-boot-blog_options.csv'
target_df = repository_dataframes[target_file_name]
other_dfs = [
    frame
    for file_name, frame in repository_dataframes.items()
    if file_name != target_file_name
]

# Work on a copy so the loaded dataframe stays untouched
updated_target_df = analyze_options(target_df.copy(), other_dfs)

updated_target_df.head(50)


Unnamed: 0,File Path,Option,Concept,Values,Changed internally,Removed,Set globally,Changed globally,Occurrences globally
0,.mvn/wrapper/maven-wrapper.properties,distributionurl,configparser,['https://repo1.maven.org/maven2/org/apache/ma...,0,False,3,0,7
1,docker/Dockerfile,COPY,docker,['target/blog-demo-0.0.1-SNAPSHOT.jar $APP_HOM...,0,False,1,0,1
2,docker/Dockerfile,ENTRYPOINT,docker,['exec java -jar app.jar'],0,False,2,1,2
3,docker/Dockerfile,EXPOSE,docker,['8090'],0,False,3,4,20
4,docker/Dockerfile,FROM,docker,['openjdk:8u151-jdk-alpine3.7'],0,False,3,1,23
5,docker/Dockerfile,WORKDIR,docker,['$APP_HOME'],0,False,2,0,12
6,pom.xml,ExecutableName,maven,['target/blog-demo-0.0.1-SNAPSHOT.jar'],0,False,5,4,25
7,pom.xml,ExecutableNameNoVersion,maven,['target/blog-demo.jar'],0,False,5,4,25
8,pom.xml,project.artifactId,maven,['blog-demo'],0,False,5,2,25
9,pom.xml,project.build.plugins.plugin.org.springframewo...,maven,['spring-boot-maven-plugin'],0,False,5,0,21


**Test functions to extract and analyze options**

In [None]:
import pandas as pd
import ipytest
import json

# Apply ipytest's default configuration before the tests are defined.
ipytest.autoconfig()


def test_extract_options():
    """Run extract_options on the stored projectB history (no assertions yet)."""
    fixture_path = "../data/test_data/projectB_data.json"
    with open(fixture_path, "r", encoding="utf-8") as src:
        data = json.load(src)

    # Currently only checks that the extraction runs without raising.
    df_results, df_excluded = extract_options(data=data)

    # TODO


def test_analyze_options():
    """Check the global stats of projectA's options against projects B and C."""
    target_df = pd.read_csv("../data/test_data/projectA_options.csv")
    other_dfs = [
        pd.read_csv("../data/test_data/projectB_options.csv"),
        pd.read_csv("../data/test_data/projectC_options.csv"),
    ]

    # Analyze configurations
    result_df = analyze_options(target_df, other_dfs)

    print(result_df.head())

    # option -> (Set globally, Occurrences globally, Changed globally)
    expected = {
        'EXPOSE': (2, 11, 3),
        'project.version': (2, 11, 0),
        'server.port': (2, 14, 7),
    }
    for option, (set_count, occurrence_count, changed_count) in expected.items():
        first_match = result_df.loc[result_df['Option'] == option].iloc[0]
        assert first_match['Set globally'] == set_count
        assert first_match['Occurrences globally'] == occurrence_count
        assert first_match['Changed globally'] == changed_count

# Run the tests defined above; "-vv" is passed through as a verbosity flag.
ipytest.run("-vv")

platform linux -- Python 3.11.10, pytest-8.3.4, pluggy-1.5.0 -- /home/simisimon/GitHub/config-space/env/bin/python
cachedir: .pytest_cache
rootdir: /home/simisimon/GitHub/config-space/src
[1mcollecting ... [0mcollected 2 items

t_53588c30a6d546e48f6fc8efbec6a355.py::test_extract_options [32mPASSED[0m[32m                           [ 50%][0m
t_53588c30a6d546e48f6fc8efbec6a355.py::test_analyze_options [32mPASSED[0m[32m                           [100%][0m



<ExitCode.OK: 0>