In [None]:
import glob
import json
import subprocess
import time
import traceback
from pprint import pprint
from typing import Dict, Optional

import git
from cfgnet.network.network import Network
from cfgnet.network.network_configuration import NetworkConfiguration
from cfgnet.network.nodes import ArtifactNode
from tqdm import tqdm

# File-name endings that mark a changed path as a configuration file.
# Kept as a tuple so it can be handed directly to ``str.endswith``.
config_file_endings = (
    ".xml",
    ".yml",
    ".yaml",
    "Dockerfile",
    ".ini",
    ".properties",
    ".conf",
    ".json",
    ".toml",
    ".cfg",
    "settings.py",
    ".cnf",
)

def checkout_latest_commit(repo, current_branch, latest_commit):
    """Move the repository back to where it was before the history walk.

    Checks out ``current_branch`` when it is truthy. Otherwise (the caller was
    in a detached HEAD state) checks out ``latest_commit`` directly.
    """
    if not current_branch:
        # No branch to go back to: restore the commit itself.
        repo.git.checkout(latest_commit)
        print(f"Returned to the latest commit: {latest_commit}")
        return

    repo.git.checkout(current_branch)
    print(f"Returned to original branch: {current_branch}")


def analyze_config_network(repo_path: str):
    """Build the configuration network for ``repo_path`` and summarise it.

    The network is initialised from the files currently present under
    ``repo_path`` and every ``ArtifactNode`` in it is turned into one summary
    entry.

    Args:
        repo_path: Absolute path of the project root passed to the network
            configuration.

    Returns:
        A dict with the keys ``links`` (number of links in the network),
        ``config_files`` (list of distinct relative artifact paths) and
        ``config_files_data`` (one dict per artifact with ``file_path``,
        ``concept``, ``options`` and ``pairs``).
    """
    network_config = NetworkConfiguration(
        project_root_abs=repo_path,
        enable_static_blacklist=False,
        enable_internal_links=True,
        enable_all_conflicts=True,
        enable_file_type_plugins=True,
        system_level=False
    )

    network = Network.init_network(cfg=network_config)

    artifacts = network.get_nodes(node_type=ArtifactNode)

    # Collect everything in a single pass: the pairs of an artifact are
    # fetched once, and the artifact collection is not iterated a second time.
    config_files_data = []
    config_files = set()
    for artifact in artifacts:
        pairs = artifact.get_pairs()
        config_files.add(artifact.rel_file_path)
        config_files_data.append({
            "file_path": artifact.rel_file_path,
            "concept": artifact.concept_name,
            "options": len(pairs),
            "pairs": pairs
        })

    return {
        "links": len(network.links),
        "config_files": list(config_files),
        "config_files_data": config_files_data
    }


def get_file_diff(repo_path: str, commit, file_path: str):
    """Return the ``git diff`` output for ``file_path`` between ``commit`` and its parent.

    The diff is produced by running ``git diff <commit>^ <commit> -- <file_path>``
    inside ``repo_path``.

    Returns:
        The diff text, or ``None`` when the commit has no parents or when the
        git command fails (the error is printed).
    """
    if not commit.parents:
        # Nothing to compare against.
        return None

    diff_command = ['git', 'diff', f"{commit.hexsha}^", commit.hexsha, '--', file_path]

    try:
        return subprocess.check_output(diff_command, cwd=repo_path, text=True)
    except (subprocess.CalledProcessError, git.exc.GitCommandError) as e:
        print(f"Error running git diff for commit {commit.hexsha}: {e}")
        return None


def analyze_repository(repo_path: str, get_diff: bool = False) -> Optional[Dict]:
    """Walk the commit history of a repository and collect configuration stats.

    All commits reachable from ``HEAD`` are visited from oldest to newest. A
    commit is config-related when at least one changed path ends with one of
    ``config_file_endings``; such commits are checked out and analysed with
    ``analyze_config_network``. Every commit produces one record in the result.

    Args:
        repo_path: Path to the git repository.
        get_diff: When true, the diff of every changed config file is stored
            under the ``diff`` key of its file entry.

    Returns:
        A dict with ``project_name``, ``analysis_time`` (seconds),
        ``len_commits`` and ``config_commit_data``, or ``None`` if the network
        analysis raised for some commit. In both cases the repository is
        returned to the branch or commit it was on before the call.
    """
    start_time = time.time()
    # Strip a trailing slash so the project name is never the empty string.
    project_name = repo_path.rstrip("/").split("/")[-1]
    repo = git.Repo(repo_path)

    # Remember where HEAD was so it can be restored afterwards.
    current_branch = None if repo.head.is_detached else repo.active_branch.name
    latest_commit = repo.head.commit.hexsha

    # Reverse the commit listing so the walk goes from oldest to newest.
    commits = list(repo.iter_commits("HEAD"))[::-1]

    print(f"Number of commits: {len(commits)}")

    config_commit_data = []

    try:
        for commit in tqdm(commits, desc="Processing", total=len(commits)):
            # Read the stats once; GitPython computes them on access.
            commit_stats = commit.stats
            totals = commit_stats.total
            changed_files = commit_stats.files

            is_config_related = any(
                file_path.endswith(config_file_endings) for file_path in changed_files
            )
            network_data = None

            if is_config_related:
                # Only the network analysis reads the working tree, so a
                # checkout is needed for config-related commits only.
                repo.git.checkout(commit.hexsha)

                try:
                    network_data = analyze_config_network(repo_path=repo_path)
                except Exception:
                    print(f"Error occurred in commit {commit.hexsha}")
                    traceback.print_exc()
                    return None

                _attach_change_stats(
                    network_data=network_data,
                    changed_files=changed_files,
                    repo_path=repo_path,
                    commit=commit,
                    get_diff=get_diff,
                )

            config_commit_data.append(
                {
                    "commit_hash": str(commit.hexsha),
                    # First parent of the commit; None for a root commit.
                    "parent_commit": commit.parents[0].hexsha if commit.parents else None,
                    "is_config_related": is_config_related,
                    "author": f"{commit.author.name} <{commit.author.email}>",
                    "commit_mgs": str(commit.message),
                    "files_changed": totals['files'],
                    "insertions": totals['insertions'],
                    "deletions": totals['deletions'],
                    "network_data": network_data
                }
            )
    finally:
        # Runs on success, on the early return above and on unexpected
        # errors, so the repository is never left on a historic commit.
        checkout_latest_commit(
            repo=repo,
            current_branch=current_branch,
            latest_commit=latest_commit
        )

    ratio = round(len(config_commit_data) / len(commits), 2) if commits else 0
    print(f"Len commit data: {len(config_commit_data)}, {ratio}")

    end_time = time.time()
    elapsed_time = end_time - start_time
    print(f"Elapsed time: {elapsed_time:.6f} seconds")

    return {
        "project_name": project_name,
        "analysis_time": elapsed_time,
        "len_commits": len(commits),
        "config_commit_data": config_commit_data
    }


def _attach_change_stats(network_data, changed_files, repo_path, commit, get_diff):
    """Add per-commit change stats (and optionally the diff) to changed config files."""
    # First entry per path, matching a first-match lookup over the entry list.
    entries_by_path = {}
    for entry in network_data["config_files_data"]:
        entries_by_path.setdefault(entry["file_path"], entry)

    for file_path, file_stats in changed_files.items():
        file_data = entries_by_path.get(file_path)
        if file_data is None:
            # Changed file is not part of the configuration network.
            continue

        file_data["insertions"] = file_stats['insertions']
        file_data["deletions"] = file_stats['deletions']
        file_data["total_changes"] = file_stats['insertions'] + file_stats['deletions']

        if get_diff:
            file_data["diff"] = get_file_diff(
                repo_path=repo_path,
                commit=commit,
                file_path=file_path
            )

In [None]:
project_dir = "/home/simisimon/GitHub/cfgnet_evaluation"

# Without recursive=True, "**" behaves like "*": one entry per direct child
# of project_dir.
for project_path in glob.glob(project_dir + "/**"):
    project_name = project_path.rstrip("/").split("/")[-1]

    commit_data = analyze_repository(repo_path=project_path, get_diff=True)

    # analyze_repository returns None when the analysis failed. Dumping that
    # would create a file containing only "null", so skip the project instead.
    if commit_data is None:
        print(f"Skip {project_name}: analysis failed")
        continue

    output_file = f"../data/analyzed_projects/{project_name}.json"

    print(f"Write commit data into file {output_file}")
    with open(output_file, "w", encoding="utf-8") as dest:
        json.dump(commit_data, dest, indent=2)

**Extract all options and collect all their values across the commit history**

Problems (TODO)
- Some options appear multiple times in a config file, such as COPY/ADD/RUN/FROM in a Dockerfile
- There is no way to reliably track each of these occurrences separately
- Therefore, we currently exclude such options

Definition of columns
- `Changed internally` is an integer, indicating how often the value of an option was changed in the project
- `Removed` is a boolean, indicating if an option has been removed at some point 

In [44]:
import pandas as pd
from typing import List

def extract_options(data: dict) -> None:
    """Extract every option of a project and the values it took over time.

    Reads the per-commit analysis result of one project and writes two CSV
    files:

    * ``../data/extracted_options/<project>_options.csv`` with one row per
      (file, option, concept) and the columns ``File Path``, ``Option``,
      ``Concept``, ``Values``, ``Changed internally`` and ``Removed``.
    * ``../data/excluded_options/<project>_excluded.csv`` listing options that
      occurred more than once in a file within one commit (only written when
      there is at least one such option).

    Args:
        data: Analysis result with the keys ``project_name`` and
            ``config_commit_data``.
    """
    project_name = data["project_name"]
    print(f"Extract all options and their values from {project_name}.")

    config_data = []
    excluded_pairs = set()
    last_seen = {}  # (file_path, option) -> hash of the last commit containing it
    last_config_commit = None

    for commit in data["config_commit_data"]:
        if not commit["is_config_related"]:
            continue

        last_config_commit = commit["commit_hash"]

        for file_data in commit["network_data"]["config_files_data"]:
            file_path = file_data["file_path"]
            concept = file_data["concept"]

            # Group the pairs of this file by option name.
            occurrences_by_option = {}
            for pair in file_data["pairs"]:
                occurrences_by_option.setdefault(pair["option"], []).append(pair)
                last_seen[(file_path, pair["option"])] = last_config_commit

            for option, occurrences in occurrences_by_option.items():
                if len(occurrences) > 1:
                    # Repeated options cannot be tracked individually.
                    excluded_pairs.add((file_path, option, concept))
                    continue

                pair = occurrences[0]
                config_data.append({
                    "file_path": file_path,
                    "option": pair["option"],
                    "value": pair["value"],
                    "type": pair["type"],
                    "concept": concept
                })

    # Store excluded options only if there are any; sorted for a stable file.
    if excluded_pairs:
        df_excluded = pd.DataFrame(
            sorted(excluded_pairs, key=str),
            columns=["File", "Option", "Concept"],
        )
        df_excluded.to_csv(f"../data/excluded_options/{project_name}_excluded.csv", index=False)

    options_file = f"../data/extracted_options/{project_name}_options.csv"

    if not config_data:
        # Nothing to aggregate: grouping an empty frame would fail on the
        # missing columns, so write a header-only file instead.
        empty_df = pd.DataFrame(
            columns=['File Path', 'Option', 'Concept', 'Values', 'Changed internally', 'Removed']
        )
        empty_df.to_csv(options_file, index=False)
        return

    df = pd.DataFrame(config_data)

    # Group by file path, option and concept, and aggregate the unique values.
    aggregated_df = (
        df.groupby(['file_path', 'option', 'concept'])['value']
        .apply(lambda values: sorted(set(values)))
        .reset_index()
    )

    # Rename columns for clarity
    aggregated_df.columns = ['File Path', 'Option', 'Concept', 'Values']

    # Number of value changes: one less than the number of distinct values.
    aggregated_df['Changed internally'] = aggregated_df['Values'].apply(
        lambda values: max(len(values) - 1, 0)
    )

    # An option counts as removed when it is absent from the last
    # config-related commit.
    aggregated_df['Removed'] = [
        last_seen.get((file_path, option), last_config_commit) != last_config_commit
        for file_path, option in zip(aggregated_df['File Path'], aggregated_df['Option'])
    ]

    aggregated_df.to_csv(options_file, index=False)



In [45]:
data_file = "../data/analyzed_projects/spring-boot-blog.json"

# Load the stored analysis of one project and extract its options.
with open(data_file, mode="r", encoding="utf-8") as src:
    data = json.load(src)

extract_options(data)

Extract all options and their values from spring-boot-blog.


In [46]:
import glob
import json

analyzed_project_dir = "../data/analyzed_projects"

# Analysis results are stored as <project>.json; other entries in the
# directory are ignored.
for project_path in glob.glob(analyzed_project_dir + "/*.json"):
    with open(project_path, "r", encoding="utf-8") as src:
        data = json.load(src)

    # A file without usable content (e.g. "null") cannot be processed.
    if not data:
        print(f"Skip {project_path}: no analysis data")
        continue

    extract_options(data=data)

Extract all options and their values from mentorship-platform.
Extract all options and their values from netflix-oss-example.
Extract all options and their values from piggymetrics.
Extract all options and their values from spring-boot-blog.
Extract all options and their values from taskManagement.
Extract all options and their values from test_project_history.
Extract all options and their values from Ward.


**Check whether the options of a project are also set in other projects, and how their values vary there**

Definitions of columns
- `Changed globally` is an integer, indicating how many distinct values were recorded for the option in other projects
- `Set globally` is an integer, indicating the number of projects in which the option exists
- `Occurrences globally` is an integer, indicating how often the option occurs across all projects

In [49]:
import os

data_dir = "../data/extracted_options"

# Read every CSV file of the directory into a DataFrame, keyed by file name.
repository_files = [entry for entry in os.listdir(data_dir) if entry.endswith('.csv')]
repository_dataframes = {
    csv_name: pd.read_csv(os.path.join(data_dir, csv_name))
    for csv_name in repository_files
}

# Function to analyze options in a target file against all other files
def analyze_options_in_target(target_df, other_dfs):
    """Compare the options of one project with the options of other projects.

    Adds three integer columns to ``target_df`` (in place) and returns it:

    * ``Set globally``: number of frames in ``other_dfs`` containing the option.
    * ``Occurrences globally``: number of rows with that option over all
      frames in ``other_dfs``.
    * ``Changed globally``: number of distinct values recorded for the option
      over all frames in ``other_dfs``.

    Args:
        target_df: DataFrame with at least an ``Option`` column.
        other_dfs: Iterable of DataFrames with ``Option`` and ``Values``
            columns. ``Values`` cells may be real lists or the string form of
            a list, as produced by a CSV round trip.

    Returns:
        ``target_df`` with the three columns added.
    """
    import ast  # local import: keeps the function independent of other cells

    def parse_values(cell):
        """Return the individual values stored in one ``Values`` cell."""
        if isinstance(cell, (list, tuple, set)):
            return list(cell)
        if isinstance(cell, str):
            # After reading a CSV the list is a string such as "['a', 'b']";
            # without parsing, the whole string would count as one value.
            try:
                parsed = ast.literal_eval(cell)
            except (ValueError, SyntaxError):
                return [cell]
            if isinstance(parsed, (list, tuple, set)):
                return list(parsed)
        return [cell]

    # Summarise every comparison frame once instead of once per target row.
    frame_summaries = []
    for other_df in other_dfs:
        counts = other_df['Option'].value_counts().to_dict()
        values_by_option = {
            option: {value for cell in cells for value in parse_values(cell)}
            for option, cells in other_df.groupby('Option')['Values']
        }
        frame_summaries.append((counts, values_by_option))

    set_globally = []
    occurrences_globally = []
    changed_globally = []

    for option in target_df['Option']:
        projects_with_option = 0
        occurrences = 0
        distinct_values = set()

        for counts, values_by_option in frame_summaries:
            option_count = counts.get(option, 0)
            if option_count > 0:
                projects_with_option += 1
            occurrences += option_count
            distinct_values.update(values_by_option.get(option, ()))

        set_globally.append(projects_with_option)
        occurrences_globally.append(occurrences)
        changed_globally.append(len(distinct_values))

    target_df['Set globally'] = set_globally
    target_df['Changed globally'] = changed_globally
    target_df['Occurrences globally'] = occurrences_globally

    return target_df

# Project whose options are compared against all others (adjust as needed)
target_file_name = 'spring-boot-blog_options.csv'
target_df = repository_dataframes[target_file_name]

# Every remaining project serves as comparison data
other_dfs = [
    frame
    for file_name, frame in repository_dataframes.items()
    if file_name != target_file_name
]

# Run the analysis on a copy so the loaded frame stays unchanged
updated_target_df = analyze_options_in_target(target_df.copy(), other_dfs)

updated_target_df


Unnamed: 0,File Path,Option,Concept,Values,Changed internally,Removed,Set globally,Changed globally,Occurrences globally
0,.mvn/wrapper/maven-wrapper.properties,distributionurl,configparser,['https://repo1.maven.org/maven2/org/apache/ma...,0,False,3,2,7
1,.mvn/wrapper/maven-wrapper.properties,file,configparser,['.mvn/wrapper/maven-wrapper.properties'],0,False,6,129,136
2,docker/Dockerfile,COPY.dest,docker,['$APP_HOME/app.jar'],0,False,1,1,1
3,docker/Dockerfile,COPY.src,docker,['target/blog-demo-0.0.1-SNAPSHOT.jar'],0,False,1,1,1
4,docker/Dockerfile,ENTRYPOINT.exec_command,docker,['exec java -jar app.jar'],0,False,3,3,3
...,...,...,...,...,...,...,...,...,...
69,src/main/resources/application.properties,spring.h2.console.path,spring,['/h2-console'],0,False,0,0,0
70,src/main/resources/application.properties,spring.queries.roles-query,spring,"['select u.username, r.role from user u inner ...",0,False,0,0,0
71,src/main/resources/application.properties,spring.queries.users-query,spring,"['select username, password, active from user ...",0,False,0,0,0
72,src/main/resources/application.properties,spring.thymeleaf.cache,spring,['false'],0,False,0,0,0
