In [None]:
from typing import Dict
from cfgnet.network.network_configuration import NetworkConfiguration
from cfgnet.network.nodes import ArtifactNode
from cfgnet.network.network import Network
from pprint import pprint
from tqdm import tqdm
from typing import List
import git
import json
import subprocess
import traceback
import glob
import time

# File-name suffixes that mark a changed file as a configuration file.
# Kept as a tuple so it can be passed directly to ``str.endswith``.
config_file_endings = (
    ".xml",
    ".yml",
    ".yaml",
    "Dockerfile",
    ".ini",
    ".properties",
    ".conf",
    ".json",
    ".toml",
    ".cfg",
    "settings.py",
    ".cnf",
)

def checkout_latest_commit(repo, current_branch, latest_commit):
    """Check out the state the repository was in before the history walk.

    Args:
        repo: Repository object exposing ``repo.git.checkout(ref)``.
        current_branch: Name of the branch that was active, or a falsy value
            if HEAD was detached.
        latest_commit: Hash of the commit HEAD pointed to originally; only
            used when no branch name is available.
    """
    if not current_branch:
        # Detached HEAD: there is no branch to go back to, so check out the
        # recorded commit directly.
        repo.git.checkout(latest_commit)
        print(f"Returned to the latest commit: {latest_commit}")
        return

    # A branch was active, so return to it by name.
    repo.git.checkout(current_branch)
    print(f"Returned to original branch: {current_branch}")


def analyze_config_network(repo_path: str):
    """Build the configuration network for a project and summarize its artifacts.

    Args:
        repo_path: Path passed to ``NetworkConfiguration`` as the absolute
            project root.

    Returns:
        A dict with three keys:
        ``"links"`` (number of links in the network),
        ``"config_files"`` (list of the distinct relative artifact paths) and
        ``"config_files_data"`` (one summary dict per artifact node).
    """
    cfg = NetworkConfiguration(
        project_root_abs=repo_path,
        enable_static_blacklist=False,
        enable_internal_links=True,
        enable_all_conflicts=True,
        enable_file_type_plugins=True,
        system_level=False,
    )
    network = Network.init_network(cfg=cfg)
    artifact_nodes = network.get_nodes(node_type=ArtifactNode)

    def summarize(node):
        # Entries whose option is "file" are dropped from the reported pairs.
        kept_pairs = [entry for entry in node.get_pairs() if entry["option"] != "file"]
        return {
            "file_path": node.rel_file_path,
            "concept": node.concept_name,
            # Counted on the unfiltered pairs, so "file" entries are included.
            "options": len(node.get_pairs()),
            "pairs": kept_pairs,
        }

    per_file_summaries = [summarize(node) for node in artifact_nodes]
    distinct_paths = {node.rel_file_path for node in artifact_nodes}

    return {
        "links": len(network.links),
        "config_files": list(distinct_paths),
        "config_files_data": per_file_summaries,
    }


def get_file_diff(repo_path: str, commit, file_path: str):
    """Return the diff of one file between a commit and its first parent.

    Runs ``git diff <commit>^ <commit> -- <file_path>`` inside ``repo_path``.

    Args:
        repo_path: Directory in which the ``git`` command is executed.
        commit: Commit object providing ``parents`` and ``hexsha``.
        file_path: Path of the file to diff, relative to the repository.

    Returns:
        The diff output as a string, or ``None`` if the commit has no parents
        or the ``git diff`` command exits with a non-zero status.
    """
    if not commit.parents:
        # A root commit has no parent revision to diff against.
        return None

    parent_commit = f"{commit.hexsha}^"

    try:
        # Decode explicitly as UTF-8 and replace undecodable bytes: a strict
        # locale-dependent decode would raise UnicodeDecodeError on files with
        # other encodings and abort the caller's whole history walk.
        return subprocess.check_output(
            ["git", "diff", parent_commit, commit.hexsha, "--", file_path],
            cwd=repo_path,
            encoding="utf-8",
            errors="replace",
        )
    except subprocess.CalledProcessError as e:
        print(f"Error running git diff for commit {commit.hexsha}: {e}")
        return None


def analyze_repository(repo_path: str, get_diff: bool = False) -> Dict:
    """Analyze the commit history of a repository and collect configuration stats.

    Walks all commits reachable from ``HEAD`` from oldest to newest. For every
    commit that touches a file whose name ends with one of
    ``config_file_endings``, the commit is checked out and the configuration
    network is computed via ``analyze_config_network``. The checkout that was
    active before the call is restored afterwards, including when the walk
    stops early or raises.

    Args:
        repo_path: Path to the local git repository.
        get_diff: If true, attach the textual diff of every changed
            configuration file to its entry (key ``"diff"``).

    Returns:
        A dict with the keys ``"project_name"``, ``"analysis_time"`` (seconds),
        ``"len_commits"`` and ``"config_commit_data"`` (one record per commit).
        Returns ``None`` if the network analysis fails for any commit.
    """
    start_time = time.time()
    # Strip a trailing slash so "/path/to/repo/" still yields "repo".
    project_name = repo_path.rstrip("/").split("/")[-1]
    repo = git.Repo(repo_path)

    # Remember where HEAD is so it can be restored after the walk.
    current_branch = repo.active_branch.name if not repo.head.is_detached else None
    latest_commit = repo.head.commit.hexsha

    # Get all commits in the repository from oldest to newest
    commits = list(repo.iter_commits("HEAD"))[::-1]

    print(f"Number of commits: {len(commits)}")

    config_commit_data = []

    try:
        for commit in tqdm(commits, desc="Processing", total=len(commits)):
            # Read the stats once per commit instead of on every use.
            commit_stats = commit.stats
            totals = commit_stats.total
            changed_files = commit_stats.files

            is_config_related = any(
                file_path.endswith(config_file_endings) for file_path in changed_files
            )
            network_data = None

            if is_config_related:
                # The working tree is only read by the network analysis, so
                # only config-related commits need to be checked out.
                repo.git.checkout(commit.hexsha)

                try:
                    network_data = analyze_config_network(repo_path=repo_path)
                except Exception:
                    print(f"Error occurred in commit {commit.hexsha}")
                    traceback.print_exc()
                    return None

                # Index the per-file entries by path; the first entry for a
                # path wins.
                file_data_by_path = {}
                for entry in network_data["config_files_data"]:
                    file_data_by_path.setdefault(entry["file_path"], entry)

                # Attach change stats (and optionally the diff) to every
                # changed file that is part of the configuration network.
                for file_path, file_stats in changed_files.items():
                    file_data = file_data_by_path.get(file_path)
                    if file_data is None:
                        continue

                    insertions = file_stats["insertions"]
                    deletions = file_stats["deletions"]
                    file_data["insertions"] = insertions
                    file_data["deletions"] = deletions
                    file_data["total_changes"] = insertions + deletions

                    if get_diff:
                        file_data["diff"] = get_file_diff(
                            repo_path=repo_path,
                            commit=commit,
                            file_path=file_path,
                        )

            # The first parent is the revision the per-file diff is taken
            # against; root commits have none.
            parent_commit = commit.parents[0].hexsha if commit.parents else None

            config_commit_data.append(
                {
                    "commit_hash": str(commit.hexsha),
                    "parent_commit": parent_commit,
                    "is_config_related": is_config_related,
                    "author": f"{commit.author.name} <{commit.author.email}>",
                    "commit_mgs": str(commit.message),
                    "files_changed": totals["files"],
                    "insertions": totals["insertions"],
                    "deletions": totals["deletions"],
                    "network_data": network_data,
                }
            )
    finally:
        # Always return to the original checkout, even on early exit.
        checkout_latest_commit(
            repo=repo,
            current_branch=current_branch,
            latest_commit=latest_commit,
        )

    print(f"Len commit data: {len(config_commit_data)}, {round(len(config_commit_data)/len(commits), 2)}")

    end_time = time.time()
    elapsed_time = end_time - start_time
    print(f"Elapsed time: {elapsed_time:.6f} seconds")

    return {
        "project_name": project_name,
        "analysis_time": elapsed_time,
        "len_commits": len(commits),
        "config_commit_data": config_commit_data,
    }

In [None]:
# Analyze a single local repository (with per-file diffs) and store the
# result as JSON.
project_path = "/home/simisimon/GitHub/projects/flutter"

commit_data = analyze_repository(repo_path=project_path, get_diff=True)

output_file = "../data/test_data/analyzed_projects/test_project.json"

# NOTE(review): analyze_repository returns None when the network analysis
# fails; json.dump would then write "null" to the output file.
print(f"Write commit data into file {output_file}")
with open(output_file, "w", encoding="utf-8") as dest:
    json.dump(commit_data, dest, indent=2)


In [None]:
# Analyze every entry below project_dir and write one JSON file per project.
project_dir = "/home/simisimon/GitHub/cfgnet_evaluation"

# Without recursive=True, "**" behaves like "*": only the direct children of
# project_dir are matched.
# NOTE(review): plain files match as well; presumably every child is a git
# repository — verify, otherwise analyze_repository will fail on it.
for project_path in glob.glob(project_dir + "/**"):
    # The last path component is used as the output file name.
    project_name = project_path.split("/")[-1]
    
    commit_data = analyze_repository(repo_path=project_path, get_diff=True)

    output_file = f"../data/analyzed_projects/{project_name}.json"

    # NOTE(review): commit_data is None when the analysis failed; json.dump
    # would then write "null" for that project.
    print(f"Write commit data into file {output_file}")
    with open(output_file, "w", encoding="utf-8") as dest:
        json.dump(commit_data, dest, indent=2)