In [19]:
!pip install gitpython



In [20]:
import os
import time

import git
import pandas as pd
from google.colab import drive

In [21]:
# Mount Google Drive
drive.mount('/content/drive')

# Wait until the Drive contents are visible. Poll with a short sleep instead
# of spinning the CPU, and give up after a bounded time so the notebook
# cannot hang forever if the path never appears.
mount_deadline = time.monotonic() + 60
while not os.path.exists('/content/drive/My Drive/'):
    if time.monotonic() > mount_deadline:
        raise TimeoutError(
            "Google Drive did not appear at '/content/drive/My Drive/' within 60 seconds"
        )
    time.sleep(0.5)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [22]:
# Drive folder that holds the cloned repositories and the exported data.
# The trailing slash matters: other cells append to this value by plain
# string concatenation.
folder_path = (
    '/content/drive/My Drive/'
    'IMT Atlantique/Semester 2/Project Complex/Coding/'
    'salsa.debian.org/'
)

# Folder creation is currently disabled:
#os.makedirs(folder_path, exist_ok=True)

Clone repositories from salsa.debian.org

In [23]:
# Function to get the size of a directory
def get_directory_size(path):
    """Return the combined size in bytes of all files found under ``path``.

    The tree is traversed with ``os.walk`` and the size of every listed file
    is added up. Entries whose size cannot be read (for example a dangling
    symbolic link, or a file removed while the walk is in progress) are
    skipped instead of aborting the whole measurement.

    Args:
        path: Directory to measure.

    Returns:
        Total size in bytes; 0 when ``path`` yields no files.
    """
    total_size = 0
    for dirpath, _dirnames, filenames in os.walk(path):
        for filename in filenames:
            filepath = os.path.join(dirpath, filename)
            try:
                total_size += os.path.getsize(filepath)
            except OSError:
                # getsize follows symlinks, so a broken link (or a file that
                # vanished mid-walk) raises here; ignore that entry.
                continue
    return total_size

In [24]:
# Function to count the number of commits in a repository
def count_commits(repo_path):
    """Return how many commits ``iter_commits()`` yields for a repository.

    The commits are counted while streaming through the iterator, so the
    full list of commit objects is never held in memory at once.

    Args:
        repo_path: Path of the local Git repository.

    Returns:
        Number of commits produced by ``git.Repo(repo_path).iter_commits()``.
    """
    repo = git.Repo(repo_path)
    return sum(1 for _ in repo.iter_commits())

Fetch commit history

In [25]:
# Function to fetch commit history
def fetch_commit_history(repo_path):
    """Collect one record per commit yielded by the repository at ``repo_path``.

    Args:
        repo_path: Path of the local Git repository.

    Returns:
        A list of dicts, in the order ``iter_commits()`` yields the commits,
        each with the keys ``commit_id``, ``author``, ``email``,
        ``timestamp`` and ``message``.
    """
    def _as_record(entry):
        # Flatten the commit attributes used downstream into a plain dict.
        return {
            "commit_id": entry.hexsha,
            "author": entry.author.name,
            "email": entry.author.email,
            "timestamp": entry.authored_datetime,
            "message": entry.message
        }

    return [_as_record(entry) for entry in git.Repo(repo_path).iter_commits()]

In [26]:
# Local checkout path of each repository, built under the Drive folder.
# ("/content/ubuntu-dev-tools" is currently left out.)
repository_paths = [
    folder_path + "/repos/" + checkout_name
    for checkout_name in ("dpkg", "glibc", "systemd", "apt")
]

In [27]:
# List of repositories to clone
repositories = [
    "https://salsa.debian.org/dpkg-team/dpkg.git",
    "https://salsa.debian.org/glibc-team/glibc.git",
    "https://salsa.debian.org/systemd-team/systemd.git",
    "https://salsa.debian.org/apt-team/apt.git"
    #"https://salsa.debian.org/debian/ubuntu-dev-tools.git"
]

# Commit columns of every per-repository dataframe. Fixing them up front keeps
# the schema identical even when a repository yields no commit records.
commit_columns = ["commit_id", "author", "email", "timestamp", "message"]

# Commit-history dataframe per repository, keyed by repository name
repo_dataframes = {}

# Clone (when needed) and collect the commit history of each repository
for repo_url in repositories:
    # Repository name is the last URL segment without a trailing ".git".
    # Only the suffix is removed, so a name containing ".git" elsewhere
    # is left intact.
    repo_name = repo_url.rstrip("/").split("/")[-1]
    if repo_name.endswith(".git"):
        repo_name = repo_name[:-len(".git")]
    repo_path = f"{folder_path}/repos/{repo_name}"

    # Clone only when nothing exists at the target path yet
    if not os.path.exists(repo_path):
        git.Repo.clone_from(repo_url, repo_path)
        print(f"Cloned {repo_name}")
    else:
        print(f"Repository {repo_name} already exists, skipping cloning.")

    # Defaults kept when the path turns out not to be a valid Git repository
    repo_size = 0
    commits_count = 0
    commit_history = []

    try:
        # Get repository size
        repo_size = get_directory_size(repo_path)

        # Walk the history once and derive the commit count from it instead
        # of iterating over every commit a second time.
        commit_history = fetch_commit_history(repo_path)
        commits_count = len(commit_history)
    except git.exc.InvalidGitRepositoryError:
        print(f"Invalid Git repository: {repo_name}")

    # Create dataframe: one row per commit plus repository-level columns
    df = pd.DataFrame(commit_history, columns=commit_columns)
    df['repository'] = repo_name
    df['repository_size'] = repo_size
    df['commits_count'] = commits_count

    # Store dataframe
    repo_dataframes[repo_name] = df

Repository dpkg already exists, skipping cloning.
Repository glibc already exists, skipping cloning.
Repository systemd already exists, skipping cloning.
Cloned apt


In [None]:
print("Dataframes created for each repository:")

# Make sure the output folder exists; to_csv does not create missing
# parent directories.
output_dir = os.path.join(folder_path, "data")
os.makedirs(output_dir, exist_ok=True)

for repo_name, df in repo_dataframes.items():
    # One CSV per repository, named after the repository
    file_name = f'{repo_name}.csv'
    file_path = os.path.join(output_dir, file_name)

    # Export the data to the CSV file, without the dataframe index
    df.to_csv(file_path, index=False)