# Software Information Entropy

In [9]:
import math
import pathlib
import shutil
import tempfile
from concurrent.futures import ProcessPoolExecutor, as_completed

import git
import pandas as pd
import plotly.express as px

# Read the example data: PubMed articles paired with the GitHub links detected
# in their abstracts. The call is left as the cell's last expression so the
# notebook renders the resulting frame as the cell output.
pd.read_parquet("../../../tests/data/examples/pubmed/pubmed_github_links.parquet")

Unnamed: 0,PMID,article_date,title,authors,github_link
0,29409532,2018-02-06,SCANPY: large-scale single-cell gene expressio...,"Wolf, Angerer, Theis",https://github.com/theislab/Scanpy
1,29409532,2018-02-06,SCANPY: large-scale single-cell gene expressio...,"Wolf, Angerer, Theis",https://github.com/theislab/anndata
2,35814290,2022-03-28,WormBase single-cell tools.,"da Veiga Beltrame, Arnaboldi, Sternberg",https://github.com/WormBase/scdefg
3,35814290,2022-03-28,WormBase single-cell tools.,"da Veiga Beltrame, Arnaboldi, Sternberg",https://github.com/WormBase/wormcells-viz
4,37096789,,Beginner's Guide on the Use of PAML to Detect ...,"Álvarez-Carretero, Kapli, Yang",https://github.com/abacus-gene/paml
...,...,...,...,...,...
10237,37397020,2023-06-26,Multi-Object Tracking in Heterogeneous environ...,"Rathore, Sharma, Shah, Sharma, Torney, Guttal",https://github.com/tee-lab/MOTHe-GUI
10238,37387130,,SpatialSort: a Bayesian model for clustering a...,"Lee, Chern, Nissen, Wang, , Huang, Gandhi, Bou...",https://github.com/Roth-Lab/SpatialSort
10239,37382572,,Robust joint clustering of multi-omics single-...,"Jiang, Zhan, Ching, Chen",https://github.com/jianghruc/scHoML
10240,37369035,,Hierarchical graph transformer with contrastiv...,"Gu, Luo, Chen, Deng, Lai",https://github.com/ZhonghuiGu/HEAL


### Lines of Code Changed

In [10]:
def calculate_loc_changes(
    repo_path: pathlib.Path, source: str, target: str, file_names: list[str]
) -> dict[str, int]:
    """
    Finds the total number of code lines changed for each specified file between two commits.

    Args:
        repo_path (pathlib.Path): The path to the git repository.
        source (str): The source commit hash.
        target (str): The target commit hash.
        file_names (list[str]): List of file names to calculate changes for.

    Returns:
        dict[str, int]: A dictionary where the key is the filename, and the value is the
            lines changed (added plus removed). A file with no numeric `--numstat` row
            maps to 0.
    """
    repo = git.Repo(repo_path)
    changes = {}

    for file_name in file_names:
        # Get the diff output for the file between the two commits
        diff_output = repo.git.diff(source, target, "--numstat", "--", file_name)
        lines_changed = 0
        for line in diff_output.splitlines():
            # Split into at most three fields (added, removed, file_path). Capping the
            # split keeps a path that contains whitespace in one piece; an uncapped
            # split would yield extra fields and the row would be silently dropped.
            diff_line = line.split(maxsplit=2)
            # Only rows whose first two fields are numeric are counted; any other
            # row (for example one with non-numeric placeholders) is skipped.
            if (
                len(diff_line) == 3
                and diff_line[0].isdigit()
                and diff_line[1].isdigit()
            ):
                added, removed, _ = diff_line
                lines_changed += int(added) + int(removed)

        changes[file_name] = lines_changed

    return changes

### Normalized Entropy Calculation 

In [11]:
def calculate_normalized_entropy(
    repo_path: pathlib.Path,
    source_commit: str,
    target_commit: str,
    file_names: list[str],
) -> float:
    """
    Calculates the entropy of changes in specified files between two commits,
    inspired by Shannon's entropy formula. Normalized relative to the total lines
    of code changes across specified files.

    Args:
        repo_path (pathlib.Path): The file path to the git repository.
        source_commit (str): The git hash of the source commit.
        target_commit (str): The git hash of the target commit.
        file_names (list[str]): List of file names to calculate entropy for.

    Returns:
        float: The summed per-file entropy divided by the number of files.
            Returns 0.0 when no files are given or when none of them changed.

    Application of Entropy Calculation:
        Entropy measures the uncertainty in a given system. Calculating the entropy
        of lines of code (LoC) changed reveals the variability and complexity of
        modifications in each file. Higher entropy values indicate more unpredictable
        changes, helping identify potentially unstable code areas.

    """
    loc_changes = calculate_loc_changes(
        repo_path, source_commit, target_commit, file_names
    )

    # Calculate total lines of code changes across all specified files
    total_changes = sum(loc_changes.values())

    # Nothing to measure: either no files were supplied (which would otherwise
    # divide by zero during normalization) or none of the files changed.
    if not loc_changes or total_changes == 0:
        return 0.0

    # Sum each file's contribution -p * log2(p), where p is the file's share of
    # all changed lines. Unchanged files are skipped because log2(0) is undefined.
    total_entropy = 0.0
    for lines_changed in loc_changes.values():
        if lines_changed == 0:
            continue
        share = lines_changed / total_changes
        total_entropy -= share * math.log2(share)

    # Normalize by the number of files considered.
    # NOTE(review): this divides by the file count rather than log2(file count);
    # kept as-is so existing results are unchanged — confirm the intended scaling.
    max_entropy = len(loc_changes)
    normalized_total_entropy = total_entropy / max_entropy

    return normalized_total_entropy

### Process Repository

In [12]:
def process_repository(repo_url: str) -> tuple[float, int]:
    """
    Clones a repository into a temporary directory and computes its normalized
    total entropy and its time of existence.

    Args:
        repo_url (str): The URL of the git repository to clone.

    Returns:
        tuple[float, int]: The normalized total entropy between the first and most
            recent commits on the active branch, and the number of whole days
            between those two commits.

    The temporary clone is always deleted before returning, including when an
    exception is raised; exceptions are propagated to the caller.
    """
    temp_dir = tempfile.mkdtemp()
    repo = None

    try:
        # Clone the repository
        repo = git.Repo.clone_from(repo_url, temp_dir)
        repo_path = pathlib.Path(temp_dir)

        # Get the default branch (the branch checked out by the clone)
        default_branch = repo.active_branch.name

        # Get the first and most recent commits; the last commit yielded by the
        # iteration is treated as the first commit, and the first one yielded as
        # the most recent.
        commits = list(repo.iter_commits(default_branch))
        first_commit = commits[-1]
        most_recent_commit = commits[0]

        # Calculate the total existence time (in days)
        time_of_existence = (
            most_recent_commit.committed_datetime - first_commit.committed_datetime
        ).days

        # Find all files that have been edited
        # NOTE(review): diff(None) is called once per commit; presumably this
        # compares each commit against the cloned working tree — confirm this is
        # the intended definition of "edited".
        file_names = set()
        for commit in commits:
            for diff in commit.diff(None):
                if diff.a_path:
                    file_names.add(diff.a_path)
                if diff.b_path:
                    file_names.add(diff.b_path)
        file_names = list(file_names)

        # Calculate the total normalized entropy
        normalized_total_entropy = calculate_normalized_entropy(
            repo_path, first_commit.hexsha, most_recent_commit.hexsha, file_names
        )
        return normalized_total_entropy, time_of_existence

    finally:
        try:
            # Release the repository handle before removing its directory so no
            # resources tied to the clone outlive it.
            if repo is not None:
                repo.close()
        finally:
            # Delete the cloned repository
            shutil.rmtree(temp_dir)

### Repository Analysis and Visualization

In [13]:
# from src.almanack.entropy import calculate_normalized_entropy
# from src.almanack.git_parser import calculate_loc_changes


def repository_analysis() -> tuple[list[int], list[float]]:
    """
    Processes a small test subset of the repositories in the example PubMed
    GitHub links dataset in parallel and collects their entropy and lifetime.

    Returns:
        tuple[list[int], list[float]]: `times` (time of existence in days) and
            `entropies` (normalized total entropy), index-aligned with each other.
            Results are appended in completion order, and repositories that
            failed to process are omitted from both lists.
    """
    df = pd.read_parquet(
        "../../../tests/data/examples/pubmed/pubmed_github_links.parquet"
    )

    # Slice dataframe to allow for a testing subset
    repo_urls = df["github_link"].iloc[300:304].tolist()

    # Using ProcessPoolExecutor for parallel processing with up to 16 workers
    with ProcessPoolExecutor(max_workers=16) as executor:
        entropies = []
        times = []

        # Submit tasks to executor and collect futures for result retrieval
        futures = {
            executor.submit(process_repository, repo_url): repo_url
            for repo_url in repo_urls
        }

        # Iterate through completed futures as they become available
        for future in as_completed(futures):
            repo_url = futures[future]
            try:
                # The future is already finished when as_completed yields it, so
                # result() returns (or re-raises the worker's exception) right
                # away; a timeout here could never trigger and is not used.
                normalized_total_entropy, time_of_existence = future.result()
                entropies.append(normalized_total_entropy)
                times.append(time_of_existence)
                print(
                    f"Repository: {repo_url}, Normalized Total Entropy: {normalized_total_entropy:.4f}, Time of Existence: {time_of_existence} days"
                )
            except Exception as e:
                # Report the failure and keep going so one bad repository does
                # not abort the whole batch.
                print(f"Failed to process repository {repo_url}: {e}")

    return times, entropies


# Execute the function to get data
times, entropies = repository_analysis()

# Using Plotly to visualize results.
# The x values are each repository's time of existence in days (most recent
# commit minus first commit), so the axis is labelled accordingly.
fig = px.scatter(
    x=times,
    y=entropies,
    labels={
        "x": "Time of Existence (days)",
        "y": "Normalized Total Entropy",
    },
    title="PubMed Repository Entropy vs. Total Duration",
)

# Single marker configuration: size and opacity carry the values that previously
# came from a second, overriding update_traces call.
fig.update_traces(marker=dict(size=5, opacity=0.6, line=dict(width=1, color="Black")))
fig.update_layout(
    title_font_size=25,
    xaxis_title_font_size=18,
    yaxis_title_font_size=18,
    title_x=0.5,  # Center title
    margin=dict(l=20, r=20, t=50, b=20),
    height=600,
    width=1000,
)

fig.update_xaxes(showgrid=True, gridwidth=1, gridcolor="LightGray")
fig.update_yaxes(showgrid=True, gridwidth=1, gridcolor="LightGray")

fig.show()

Repository: https://github.com/jtchavisIII/CU-MSDSp, Normalized Total Entropy: 0.1576, Time of Existence: 245 days
Repository: https://github.com/DominikBuchner/apscale, Normalized Total Entropy: 0.0831, Time of Existence: 918 days
Repository: https://github.com/SunXQlab/EnDecon, Normalized Total Entropy: 0.0583, Time of Existence: 107 days
Repository: https://github.com/ollisa/BELMM, Normalized Total Entropy: 0.0536, Time of Existence: 134 days
