In [1]:
import os
import re
import json
from os.path import exists
import pandas as pd
import subprocess
import shutil
import pydriller
from collections import defaultdict

from tqdm.notebook import tqdm
import logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s | %(levelname)s | %(message)s')

#### Setup

In [2]:
react_repo = "https://github.com/facebook/react"
clone_dir = os.path.join(os.getcwd(), "react")

# Clone only once: an already existing target directory is taken to mean that
# the repository is available locally.
if exists(clone_dir):
    logging.warning("repo already cloned")
else:
    clone_command = ['git', 'clone', '--progress', react_repo, clone_dir]

    with tqdm(total=100, desc="Cloning React repo", unit="chunk") as progress_bar:
        # Progress lines are read from git's stderr; stdout is discarded.
        process = subprocess.Popen(
            clone_command,
            stderr=subprocess.PIPE,
            stdout=subprocess.DEVNULL,
            text=True,
        )
        for line in process.stderr:
            if "Receiving objects" in line:
                # The last token before the first "%" is used as the percentage.
                before_percent = line.partition("%")[0]
                percentage = int(before_percent.split()[-1])
                progress_bar.n = percentage
                progress_bar.refresh()
            elif "Resolving deltas" in line:
                progress_bar.set_description("Resolving deltas")
                progress_bar.refresh()
        process.wait()

    # The exit status of git decides which message is logged.
    if process.returncode == 0:
        logging.info("cloning completed successfully")
    else:
        logging.error("error during cloning")



#### Task 2

In [3]:
# Mining Commit History
# Builds `commits_data`: one dict per commit yielded by PyDriller, holding the
# committer timestamp ("commit_time") and the names of the modified files ("files").
# The root logger is raised to CRITICAL for the duration of the traversal and
# set back to INFO afterwards.
logging.getLogger().setLevel(logging.CRITICAL)
try:
    commits_data = []
    for commit in pydriller.Repository(clone_dir).traverse_commits():
        commit_time = commit.committer_date
        # NOTE(review): `filename` is presumably the base name without its directory,
        # so same-named files in different directories would be indistinguishable
        # in the coupling analysis below - confirm against the PyDriller docs.
        file_names = [mod.filename for mod in commit.modified_files]
        
        commits_data.append({
            "commit_time": commit_time,
            "files": file_names
        })

finally:
    # Runs even if the traversal raises, so the log level is always reset to INFO.
    logging.getLogger().setLevel(logging.INFO)

##### Temporal Coupling

In [4]:
# Analysis of Temporal Coupling
def divide_into_year_intervals(commits_data):
    """
    Helper that groups commit records by the calendar year of their "commit_time".

    Returns a defaultdict(list) mapping year -> commit records of that year,
    each list keeping the order in which the records appear in commits_data.
    """
    commits_by_year = defaultdict(list)
    for record in commits_data:
        commits_by_year[record["commit_time"].year].append(record)
    return commits_by_year

# Yearly Temporal Couplings Analysis
def analyze_temporal_coupling(commits_data, time_windows):
    """
    Computes temporal coupling separately for every calendar year.

    Commits are grouped by the year of their "commit_time" and every group is
    analysed on its own, so two commits that fall into different years are never
    paired, even if they are only minutes apart.

    The counting itself is delegated to analyze_temporal_coupling_overall, which is
    called once per year. This keeps the yearly and the overall analysis on one
    single implementation of the pairing rules.

    Parameters:
        commits_data: iterable of dicts with the keys "commit_time" and "files".
        time_windows: iterable of window sizes in hours.

    Returns:
        A dictionary year -> {time_window: {file_pair: count}}.
    """
    intervals = divide_into_year_intervals(commits_data)
    # The default factory is kept as it was so that the returned object behaves
    # exactly as before for existing callers.
    yearly_temporal_couplings = defaultdict(list)

    for year, commits in intervals.items():
        yearly_temporal_couplings[year] = analyze_temporal_coupling_overall(commits, time_windows)

    return yearly_temporal_couplings


# Extract top 3 temporal coupling entries to JSON
def save_results_and_collect_dataframe(yearly_temporal_couplings, time_windows, output_dir="./task2/tc"):
    """
    Writes one "<year>.json" file per year into output_dir, containing the three
    most frequently coupled file pairs of every time window, and returns the same
    information as a dataframe (one row per year, time window and file pair).

    Pairs with equal counts keep the order in which they were stored.

    NOTE: a later cell of this notebook defines another function with this very
    name (for logical coupling, without the time_windows parameter). Once that cell
    has run, this definition is no longer reachable under this name.
    """
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    table_rows = []

    for year, couplings_by_window in yearly_temporal_couplings.items():
        json_entries = []

        for window in time_windows:
            # Stable descending sort by count, then keep the first three pairs.
            ranked = list(couplings_by_window[window].items())
            ranked.sort(key=lambda item: item[1], reverse=True)

            for pair, count in ranked[:3]:
                json_entries.append({
                    "file_pair": list(pair),
                    "coupled_commits": {
                        "time_window": window,
                        "commit_count": count
                    }
                })
                table_rows.append({
                    "Year": year,
                    "Time Window (h)": window,
                    "File Pair": f"{pair[0]} & {pair[1]}",
                    "Commit Count": count
                })

        with open(os.path.join(output_dir, f"{year}.json"), "w") as f:
            json.dump(json_entries, f, indent=4)

    return pd.DataFrame(table_rows)


def analyze_temporal_coupling_overall(commits_data, time_windows):
    """
    Analyzes temporal coupling over the entire commit history without dissecting into yearly intervals.

    Every unordered pair of distinct commits whose commit times are at most
    `window` hours apart contributes 1 to each file pair (file of the one commit,
    different file of the other commit) for that window. Only names ending in
    ".js" are considered and a file is never paired with itself.

    A name that occurs several times in the "files" list of one commit is used only
    once for that commit. Without this, a single pair of commits could add more
    than 1 to the same file pair and inflate the reported commit count.

    Parameters:
        commits_data: iterable of dicts with the keys "commit_time" and "files".
        time_windows: iterable of window sizes in hours.

    Returns:
        A dictionary {time_window: defaultdict(int)} mapping each window to the
        counts per alphabetically sorted file-pair tuple.

    The comparison of commits is still pairwise, i.e. quadratic in the number of
    commits that touch at least one ".js" file.
    """
    temporal_couplings = {window: defaultdict(int) for window in time_windows}

    # Reduce every commit once to (commit time, distinct .js names). Doing the
    # filtering here instead of in the innermost loop avoids repeating it for every
    # file pair and window; commits without .js files cannot contribute and are dropped.
    prepared = []
    for commit in commits_data:
        js_files = list(dict.fromkeys(
            name for name in commit["files"] if name.endswith(".js")
        ))
        if js_files:
            prepared.append((commit["commit_time"], js_files))

    for i, (time_i, files_i) in enumerate(prepared):
        for j in range(i + 1, len(prepared)):  # j > i: every commit pair exactly once
            time_j, files_j = prepared[j]
            time_diff = abs((time_j - time_i).total_seconds()) / 3600

            # Counters of all windows this commit pair falls into (boundary inclusive).
            matching = [temporal_couplings[window] for window in time_windows if time_diff <= window]
            if not matching:
                continue

            for file1 in files_i:
                for file2 in files_j:
                    if file1 == file2:  # Exclude self-pairs
                        continue
                    pair = tuple(sorted((file1, file2)))
                    for counts in matching:
                        counts[pair] += 1

    return temporal_couplings


# Extract top 3 temporal coupling entries to JSON
def save_overall_results(temporal_couplings, time_windows, output_file="./task2/tc/overall.json"):
    """
    Saves top 3 temporal coupling entries for the entire commit history into a JSON file,
    including all time windows. Also returns a dataframe for analysis and reporting.

    Parameters:
        temporal_couplings: dictionary {time_window: {file_pair: count}}.
        time_windows: the windows to report, in output order.
        output_file: path of the JSON file. Missing parent directories are created;
            a bare file name (no directory part) is written to the current
            working directory.

    Returns:
        A dataframe with the columns "Time Window (h)", "File Pair" and
        "Commit Count", one row per reported pair (empty if nothing was reported).
    """
    # dirname is "" for a bare file name, and os.makedirs("") raises - so only
    # create the directory when there actually is one in the path.
    parent_dir = os.path.dirname(output_file)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    overall_results = []
    table_rows = []

    for window in time_windows:
        top_coupled = sorted(
            temporal_couplings[window].items(), key=lambda x: x[1], reverse=True
        )[:3]  # Top 3 file pairs for each time window

        for pair, count in top_coupled:
            overall_results.append({
                "file_pair": list(pair),
                "coupled_commits": {
                    "time_window": window,
                    "commit_count": count
                }
            })
            table_rows.append({
                "Time Window (h)": window,
                "File Pair": f"{pair[0]} & {pair[1]}",
                "Commit Count": count
            })

    with open(output_file, "w") as f:
        json.dump(overall_results, f, indent=4)

    return pd.DataFrame(table_rows)

In [5]:
######################
# TEMPORAL COUPLING
######################

# Window sizes in hours: the analysis functions compare the time difference of two
# commits, converted to hours, against each of these values.
time_windows = [24, 48, 72]

# Yearly Analysis
# Writes one JSON file per year (default directory ./task2/tc) and keeps the top
# entries as a dataframe.
# NOTE: a later cell redefines save_results_and_collect_dataframe with the signature
# (yearly_logical_couplings, output_dir). Re-running this cell after that cell has
# been executed would bind time_windows to output_dir instead of calling the
# temporal version.
yearly_temporal_couplings = analyze_temporal_coupling(commits_data, time_windows)
df_temporal_coupling = save_results_and_collect_dataframe(yearly_temporal_couplings, time_windows)

# Overall Analysis
# Same analysis over the whole history; the result is written to the default
# file ./task2/tc/overall.json.
temporal_couplings = analyze_temporal_coupling_overall(commits_data, time_windows)
df_overall = save_overall_results(temporal_couplings, time_windows)

In [8]:
# Yearly Logical Couplings Analysis
def analyze_logical_coupling(commits_data):
    """
    Analyzes logical coupling (files committed together) separately for every
    calendar year.

    Commits are grouped by the year of their "commit_time". The counting per year
    is delegated to analyze_logical_coupling_overall, so the yearly and the overall
    analysis share one single implementation of the counting rules.

    Parameters:
        commits_data: iterable of dicts with the keys "commit_time" and "files".

    Returns:
        A defaultdict year -> {file_pair: count}. A year in which no file pair was
        counted at all gets no entry.
    """
    intervals = divide_into_year_intervals(commits_data)
    yearly_logical_couplings = defaultdict(lambda: defaultdict(int))

    for year, commits in intervals.items():
        year_counts = analyze_logical_coupling_overall(commits)
        # Years without any counted pair are left out, exactly as before, so that
        # no empty report is produced for them downstream.
        if year_counts:
            yearly_logical_couplings[year] = year_counts

    return yearly_logical_couplings

# Extract top 3 logical coupling entries to JSON
def save_results_and_collect_dataframe(yearly_logical_couplings, output_dir="./task2/lc"):
    """
    Writes one "<year>.json" file per year into output_dir, containing the three
    most frequently co-committed file pairs of that year, and returns the same
    information as a dataframe (one row per year and file pair).

    Pairs with equal counts keep the order in which they were stored.

    NOTE: an earlier cell of this notebook defines a function with this very name
    for temporal coupling (with an additional time_windows parameter). Defining
    this one replaces it.
    """
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    table_rows = []

    for year, pair_counts in yearly_logical_couplings.items():
        # Stable descending sort by count, then keep the first three pairs.
        ranked = list(pair_counts.items())
        ranked.sort(key=lambda item: item[1], reverse=True)

        json_entries = []
        for pair, count in ranked[:3]:
            json_entries.append({
                "file_pair": list(pair),
                "commit_count": count
            })
            table_rows.append({
                "Year": year,
                "File Pair": f"{pair[0]} & {pair[1]}",
                "Commit Count": count
            })

        with open(os.path.join(output_dir, f"{year}.json"), "w") as f:
            json.dump(json_entries, f, indent=4)

    return pd.DataFrame(table_rows)


def analyze_logical_coupling_overall(commits_data):
    """
    Analyzes logical coupling over the entire commit history by finding files frequently committed together,
    excluding self-pairs.

    For every commit, each unordered pair of distinct ".js" names in its "files"
    list contributes exactly 1. A name that occurs several times in the same commit
    is used only once, so the count of a pair is the number of commits that
    contain both names rather than the number of name combinations.

    Parameters:
        commits_data: iterable of dicts with the key "files".

    Returns:
        A defaultdict(int) mapping alphabetically sorted file-pair tuples to counts.
    """
    logical_couplings = defaultdict(int)

    for commit in commits_data:
        # Distinct .js names in first-seen order (dict keys keep insertion order).
        js_files = list(dict.fromkeys(
            name for name in commit["files"] if name.endswith(".js")
        ))
        for i, file1 in enumerate(js_files):
            # Only names after position i: each unordered pair is visited once and,
            # because the names are distinct, never is a self-pair.
            for file2 in js_files[i + 1:]:
                pair = tuple(sorted((file1, file2)))
                logical_couplings[pair] += 1

    return logical_couplings

# Extract top 3 logical coupling entries to JSON
def save_overall_logical_couplings(logical_couplings, output_file="./task2/lc/overall.json"):
    """
    Saves top 3 logical coupling entries for the entire commit history into a JSON file.
    Also returns a dataframe for analysis and reporting.

    Parameters:
        logical_couplings: dictionary {file_pair: count}.
        output_file: path of the JSON file. Missing parent directories are created;
            a bare file name (no directory part) is written to the current
            working directory.

    Returns:
        A dataframe with the columns "File Pair" and "Commit Count", one row per
        reported pair (empty if there is nothing to report).
    """
    # dirname is "" for a bare file name, and os.makedirs("") raises - so only
    # create the directory when there actually is one in the path.
    parent_dir = os.path.dirname(output_file)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    # Get the top 3 file pairs with the highest commit counts
    top_coupled = sorted(
        logical_couplings.items(), key=lambda x: x[1], reverse=True
    )[:3]

    overall_results = [
        {"file_pair": list(pair), "commit_count": count}
        for pair, count in top_coupled
    ]

    with open(output_file, "w") as f:
        json.dump(overall_results, f, indent=4)

    return pd.DataFrame([
        {"File Pair": f"{pair[0]} & {pair[1]}", "Commit Count": count}
        for pair, count in top_coupled
    ])

In [9]:
######################
# LOGICAL COUPLING
######################

# Yearly
# Counts co-committed .js file pairs per year and writes one JSON file per year
# (default directory ./task2/lc).
# NOTE: save_results_and_collect_dataframe refers here to the logical-coupling
# version defined in the preceding cell, which replaced the temporal-coupling
# function of the same name.
yearly_logical_couplings = analyze_logical_coupling(commits_data)
df_logical_coupling = save_results_and_collect_dataframe(yearly_logical_couplings)

# Overall
# Same analysis over the whole history; the result is written to the default
# file ./task2/lc/overall.json.
logical_couplings = analyze_logical_coupling_overall(commits_data)
df_overall_lc = save_overall_logical_couplings(logical_couplings)