# Software Information Entropy

In [2]:
import math
import pathlib
import shutil
import tempfile
from concurrent.futures import ProcessPoolExecutor, as_completed
from typing import Dict, List

import git
import pandas as pd
import pygit2

# read example data which includes pubmed github links detected from article abstracts
pd.read_parquet("../../../tests/data/examples/pubmed/pubmed_github_links.parquet")

ModuleNotFoundError: No module named 'git'

### Lines of Code Changed

In [None]:
def calculate_loc_changes(
    repo_path: pathlib.Path, source: str, target: str, file_names: List[str]
) -> Dict[str, int]:
    """
    Finds the total number of code lines changed for each specified file between two commits.

    Args:
        repo_path (pathlib.Path): The path to the git repository.
        source (str): The source commit hash.
        target (str): The target commit hash.
        file_names (List[str]): List of file names to calculate changes for.

    Returns:
        Dict[str, int]: A dictionary where the key is the filename, and the value is the lines changed (added and removed).
    """
    repo = pygit2.Repository(str(repo_path))
    source_commit = repo.revparse_single(source)
    target_commit = repo.revparse_single(target)

    changes = {}
    diff = repo.diff(source_commit, target_commit)

    for patch in diff:
        if patch.delta.new_file.path in file_names:
            additions = 0
            deletions = 0
            for hunk in patch.hunks:
                for line in hunk.lines:
                    if line.origin == "+":
                        additions += 1
                    elif line.origin == "-":
                        deletions += 1
            lines_changed = additions + deletions
            changes[patch.delta.new_file.path] = lines_changed

    return changes

### Normalized Entropy Calculation 

In [None]:
def calculate_normalized_entropy(
    repo_path: pathlib.Path,
    source_commit: str,
    target_commit: str,
    file_names: list[str],
) -> dict[str, float]:
    """
    Calculates the entropy of changes in specified files between two commits,
    inspired by Shannon's entropy formula. Normalized relative to the total lines
    of code changes across specified files.

    Args:
        repo_path (str): The file path to the git repository.
        source_commit (str): The git hash of the source commit.
        target_commit (str): The git hash of the target commit.
        file_names (list[str]): List of file names to calculate entropy for.

    Returns:
        dict[str, float]: A dictionary mapping file names to their calculated entropy.

    Application of Entropy Calculation:
        Entropy measures the uncertainty in a given system. Calculating the entropy
        of lines of code (LoC) changed reveals the variability and complexity of
        modifications in each file. Higher entropy values indicate more unpredictable
        changes, helping identify potentially unstable code areas.

    """
    loc_changes = calculate_loc_changes(
        repo_path, source_commit, target_commit, file_names
    )

    # Calculate total lines of code changes across all specified files
    total_changes = sum(loc_changes.values())

    # Calculate the entropy for each file, relative to total changes
    entropy_calculation = {
        file_name: (
            -(
                (loc_changes[file_name] / total_changes)
                * math.log2(
                    loc_changes[file_name] / total_changes
                )  # Entropy Calculation
            )
            if loc_changes[file_name] != 0
            and total_changes
            != 0  # Avoid division by zero and ensure valid entropy calculation
            else 0.0
        )
        for file_name in loc_changes  # Iterate over each file in loc_changes dictionary
    }
    # Calculate total entropy
    total_entropy = sum(entropy_calculation.values())

    # Normalize total entropy to range [0, 1]
    max_entropy = len(loc_changes)
    normalized_total_entropy = total_entropy / max_entropy

    return normalized_total_entropy

### Proccess Repository

In [None]:
def process_repository(repo_url: str) -> (float, str, str, int):
    temp_dir = tempfile.mkdtemp()

    try:
        # Clone the repository into the temporary directory
        repo = git.Repo.clone_from(repo_url, temp_dir)
        repo_path = pathlib.Path(temp_dir)

        # Get the main branch name
        default_branch = repo.active_branch.name

        # Get the list of commits on the main branch
        commits = list(repo.iter_commits(default_branch))

        # Get the first and most recent commits
        first_commit = commits[-1]
        most_recent_commit = commits[0]

        # Calculate the total existence time of the repository in days
        time_of_existence = (
            most_recent_commit.committed_datetime - first_commit.committed_datetime
        ).days

        # Find the dates of the first and most recent commits
        first_commit_date = first_commit.committed_datetime.date().isoformat()
        most_recent_commit_date = (
            most_recent_commit.committed_datetime.date().isoformat()
        )

        # Find all files that have been edited in the repository
        file_names = set()
        for commit in commits:
            # Diff each commit against its parent to find edited files
            for diff in commit.diff(None):
                if diff.a_path:
                    file_names.add(diff.a_path)
                if diff.b_path:
                    file_names.add(diff.b_path)
        file_names = list(file_names)

        # Calculate the total normalized entropy for the repository
        normalized_total_entropy = calculate_normalized_entropy(
            repo_path, first_commit.hexsha, most_recent_commit.hexsha, file_names
        )
        return (
            normalized_total_entropy,
            first_commit_date,
            most_recent_commit_date,
            time_of_existence,
        )

    finally:
        shutil.rmtree(temp_dir)

### Respository Analysis

In [None]:
def repository_analysis():
    df = pd.read_parquet(
        "../../../tests/data/examples/pubmed/pubmed_github_links.parquet"
    )

    # Slice dataframe to allow for a testing subset
    repo_urls = df["github_link"].iloc[0:3000].tolist()

    # Using ProcessPoolExecutor for parallel processing with up to 16 workers
    with ProcessPoolExecutor(max_workers=16) as executor:
        results = []
        repo_count = 0

        futures = {
            executor.submit(process_repository, repo_url): repo_url
            for repo_url in repo_urls
        }
        for future in as_completed(futures):
            repo_count += 1
            repo_url = futures[future]
            try:
                (
                    normalized_total_entropy,
                    first_commit_date,
                    most_recent_commit_date,
                    time_of_existence,
                ) = future.result()
                print(f"Repository {repo_count} {repo_url}")
                results.append(
                    [
                        repo_url,
                        normalized_total_entropy,
                        first_commit_date,
                        most_recent_commit_date,
                        time_of_existence,
                    ]
                )
            except Exception:
                results.append([repo_url, None, None, None, None])
    # Create DataFrame
    df_results = pd.DataFrame(
        results,
        columns=[
            "Repository URL",
            "Normalized Total Entropy",
            "Date of First Commit",
            "Date of Last Commit",
            "Time of Existence (days)",
        ],
    )
    # COnvert DF to parquet file
    df_results.to_parquet("repository_analysis_results.parquet")

    return df_results


df_results = repository_analysis()

Repository 1 https://github.com/Fibonaccirabbit/cVAN
Repository 2 https://github.com/gangwug/MetaCycle


Pumping 'stdout' of cmd(['git', 'diff', '81a830fc164a2bda7d543bbdd4f859483ddde0cb', '--abbrev=40', '--full-index', '-M', '--raw', '-z', '--no-color']) failed due to: UnicodeDecodeError('utf-8', b':000000 100644 0000000000000000000000000000000000000000 624dc821a4f13e806803e0d162eaa6ca7db9a893 A\x00.github/workflows/maven.yml\x00:100644 100644 f526915f4867d5081b402ac95882bd9e0fde176f 6248468fcd561400ddbabc390e1e1c0b849d9968 M\x00.gitignore\x00:000000 100644 0000000000000000000000000000000000000000 a845421ad850430100b32ef34cd8981838ecc322 A\x00.gitlab-ci.yml\x00:000000 100644 0000000000000000000000000000000000000000 e64d586635030749ec27a02762177c431a2f1f71 A\x00.m2/settings.xml\x00:100644 100644 e64572b8c8791ef575652dbc844a9ec03f568139 0cb69c923fab0f531af9cdce2e795aa477c6e56c M\x00README.md\x00:100644 000000 a9b5afe4a5b063e7a6eb96879783debf7e3923c8 0000000000000000000000000000000000000000 D\x00ols-client.iml\x00:100644 100644 1771d1d11d329ecda82e8d0b8b5b3da47d96c282 70bdd3d60bfb9f1dfc83fc

Repository 3 https://github.com/PRIDE-Utilities/ols-client
Repository 4 https://github.com/xz-stjude/idcov
Repository 5 https://github.com/PRIDE-Toolsuite/ols-dialog
Repository 6 https://github.com/yang-lina/seqminer
Repository 7 https://github.com/tripal/tripal_elasticsearch
Repository 8 https://github.com/zhanxw/seqminer
Repository 9 https://github.com/hakyimlab/MetaXcan
Repository 10 https://github.com/tripal/tripal_analysis_expression
