In [1]:
!pip install pydriller pandas transformers torch sentencepiece
!pip install GitPython google-generativeai sentence-transformers scikit-learn tqdm

import pandas as pd
from pydriller import Repository
from transformers import T5ForConditionalGeneration, AutoTokenizer, T5Tokenizer
import torch
import re
import time
import os
from tqdm import tqdm
from sentence_transformers import SentenceTransformer, util
import warnings
import google.generativeai as genai

Collecting pydriller
  Downloading PyDriller-2.9-py3-none-any.whl.metadata (1.3 kB)
Collecting lizard (from pydriller)
  Downloading lizard-1.17.31-py2.py3-none-any.whl.metadata (16 kB)
Collecting pathspec (from lizard->pydriller)
  Downloading pathspec-0.12.1-py3-none-any.whl.metadata (21 kB)
Downloading PyDriller-2.9-py3-none-any.whl (36 kB)
Downloading lizard-1.17.31-py2.py3-none-any.whl (82 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m82.1/82.1 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pathspec-0.12.1-py3-none-any.whl (31 kB)
Installing collected packages: pathspec, lizard, pydriller
Successfully installed lizard-1.17.31 pathspec-0.12.1 pydriller-2.9


In [2]:
# Resolve the Gemini API key.
# Preference order: the Colab secret store, then the GEMINI_API_KEY environment variable.
gemini_api_key = None
try:
    from google.colab import userdata
    gemini_api_key = userdata.get('GEMINI_API_KEY')
except Exception:
    # Deliberately broad: covers "not running on Colab" (ImportError) as well as
    # Colab's own failures for a missing or inaccessible secret, which are not
    # KeyError subclasses (NOTE(review): confirm against the installed
    # google.colab version). In every case we fall through to the environment.
    gemini_api_key = None

if gemini_api_key:
    # Mirror the Colab secret into the environment for anything that reads it there.
    os.environ['GEMINI_API_KEY'] = gemini_api_key
else:
    # Assigning None to os.environ would raise TypeError, so only read here.
    gemini_api_key = os.getenv("GEMINI_API_KEY")

if not gemini_api_key:
    raise ValueError("GEMINI_API_KEY not found")

In [3]:
# --- Run configuration -------------------------------------------------------
# Repository whose history is traversed by analyze_repository.
REPO_URL = "https://github.com/agronholm/apscheduler"
# A commit counts as bug-fixing when any of these appears as a substring of its
# lower-cased message (so 'fix' also matches e.g. 'prefix').
BUG_KEYWORDS = ['fix', 'bug', 'error', 'correct', 'patch', 'defect', 'issue', 'repair']

# Traversal stops once this many bug-fixing commits have been collected.
COMMIT_LIMIT = 10
MAX_DIFF_LINES = 300 # Skip files with diffs larger than this

# Model name handed to SentenceTransformer when embedding diffs and messages.
EMBEDDING_MODEL = 'microsoft/codebert-base'
# A message is a "hit" when its cosine similarity to the diff is >= this value.
SIMILARITY_THRESHOLD = 0.8 # Threshold for a "hit"

# Output files: per-commit metadata and per-file analysis with precision scores.
OUTPUT_CSV_BUG_COMMITS = "bug_commits.csv"
OUTPUT_CSV_FILE_ANALYSIS = "final_analysis_with_files.csv"

# Gemini client used by get_llm_response.
genai.configure(api_key=gemini_api_key)
llm_model = genai.GenerativeModel(model_name="gemini-1.5-flash")
# NOTE(review): this silences every warning for the rest of the session, not
# only the noisy library ones.
warnings.filterwarnings("ignore")

def get_llm_response(prompt_text, max_retries=2, backoff_seconds=1.0):
    """Send a prompt to the configured LLM and return its stripped text reply.

    Failed calls are retried with exponential backoff so that a transient
    failure (for example a rate limit) does not immediately end up in the
    dataset as an error string.

    Args:
        prompt_text: Prompt passed to ``llm_model.generate_content``.
        max_retries: Number of additional attempts after the first failure.
            Values below zero are treated as zero.
        backoff_seconds: Delay before the first retry; doubled on each
            subsequent retry. Use 0 to retry without waiting.

    Returns:
        The response text with surrounding whitespace removed, or the string
        ``"LLM_ERROR: <exception>"`` when every attempt failed. This function
        never raises for LLM failures; callers should check for the
        ``LLM_ERROR`` prefix.
    """
    attempts = max(1, max_retries + 1)
    last_error = None

    for attempt in range(attempts):
        try:
            response = llm_model.generate_content(prompt_text)
            # Reading .text sits inside the try so a failure there is handled
            # the same way as a failed request.
            return response.text.strip()
        except Exception as e:
            last_error = e
            is_last_attempt = attempt == attempts - 1
            if not is_last_attempt:
                time.sleep(backoff_seconds * (2 ** attempt))

    return f"LLM_ERROR: {last_error}"

def analyze_repository(repo_url, keywords, commit_limit):
    """
    Walk a repository's history once with PyDriller, collecting bug-fixing
    commits and, for each qualifying Python file they touch, two LLM-written
    summaries of the diff.

    A commit is treated as bug-fixing when any entry of `keywords` occurs as a
    substring of its lower-cased message. Traversal stops once `commit_limit`
    such commits have been recorded. Commit-level rows are written to
    OUTPUT_CSV_BUG_COMMITS; the per-file rows are returned as a DataFrame.
    """
    print(f"Starting analysis of {repo_url}...")

    commit_rows = []
    file_rows = []
    bug_fixes_seen = 0

    history = Repository(repo_url).traverse_commits()
    for commit in tqdm(history, desc="Processing commits"):

        # Enough bug-fixing commits collected: stop walking the history.
        if bug_fixes_seen >= commit_limit:
            break

        # Skip anything whose message carries none of the keywords.
        message_lower = commit.msg.lower()
        if not any(keyword in message_lower for keyword in keywords):
            continue

        bug_fixes_seen += 1

        # Commit-level record; entries without a new path are left out.
        touched_paths = []
        for changed in commit.modified_files:
            if changed.new_path:
                touched_paths.append(changed.new_path)

        commit_rows.append({
            'Hash': commit.hash,
            'Message': commit.msg,
            'Hashes of parents': commit.parents,
            'Is a merge commit?': commit.merge,
            'List of modified files': touched_paths
        })

        # File-level records: non-test Python files with a non-empty diff of
        # at most MAX_DIFF_LINES lines.
        for changed in commit.modified_files:
            path = changed.new_path
            if not path or not path.endswith('.py'):
                continue
            if 'test' in path.lower():
                continue
            diff_text = changed.diff
            if not diff_text or len(diff_text.splitlines()) > MAX_DIFF_LINES:
                continue

            fenced_diff = f"```diff\n{diff_text}\n```"

            # 1. Generic LLM message (RQ2)
            summary_prompt = "Summarize the following code change in one sentence:\n" + fenced_diff
            summary_message = get_llm_response(summary_prompt)

            # 2. Rectified message (RQ3)
            rectifier_prompt = (
                "You are an expert developer. Based ONLY on the following diff, "
                "write a concise, one-sentence commit message summary.\n\n"
                f"Format: `Fix: [Description of change] in {path} at lines [line numbers].`\n\n"
                "**Diff:**\n"
                + fenced_diff
                + "\n\n**Your Response:**"
            )
            rectifier_message = get_llm_response(rectifier_prompt)

            file_rows.append({
                'Hash': commit.hash,
                'Developer Message': commit.msg,
                'Filename': path,
                'Source Code (before)': changed.source_code_before,
                'Source Code (current)': changed.source_code,
                'Diff': diff_text,
                'LLM Inference': summary_message,
                'Rectified Message': rectifier_message
            })

    # Persist the commit-level table; hand the file-level table to the caller.
    pd.DataFrame(commit_rows).to_csv(OUTPUT_CSV_BUG_COMMITS, index=False)
    print(f"Bug commit data saved to '{OUTPUT_CSV_BUG_COMMITS}'")

    return pd.DataFrame(file_rows)

def evaluate_precision_and_hit_rate(analysis_df, threshold):
    """
    Score each commit message against its diff using embedding cosine similarity.

    For every row, the diff and the three messages (developer, generic LLM,
    rectified) are embedded with the SentenceTransformer named by
    EMBEDDING_MODEL. "Precision" is the cosine similarity between a row's diff
    embedding and that row's message embedding; a "hit" is a precision of at
    least `threshold`.

    Side effects:
        * `analysis_df` is annotated IN PLACE with 'Precision (...)' and
          'Hit (...)' columns for Developer, LLM and Rectifier.
        * The annotated frame is written to OUTPUT_CSV_FILE_ANALYSIS.

    Args:
        analysis_df: DataFrame with 'Diff', 'Developer Message',
            'LLM Inference' and 'Rectified Message' columns.
        threshold: Minimum cosine similarity for a message to count as a hit.

    Returns:
        None when `analysis_df` is empty (nothing is evaluated or written).
        Otherwise a dict of the form::

            {
                'total_files': <int>,
                'hits': {'Developer': <int>, 'LLM': <int>, 'Rectifier': <int>},
                'hit_rates': {'Developer': <pct>, 'LLM': <pct>, 'Rectifier': <pct>},
            }

        where each hit rate is a percentage in the range 0-100.
    """
    if analysis_df.empty:
        print("\nAnalysis DataFrame is empty. Skipping evaluation.")
        return None

    print("\nEvaluating message precision using semantic similarity...")
    model = SentenceTransformer(EMBEDDING_MODEL)

    # Label used in the output column names -> source column holding the message.
    message_columns = {
        'Developer': 'Developer Message',
        'LLM': 'LLM Inference',
        'Rectifier': 'Rectified Message',
    }

    def _embed(column):
        # astype(str) keeps missing values from breaking the encoder input.
        return model.encode(analysis_df[column].astype(str).tolist(), show_progress_bar=True)

    # Generate embeddings
    print("   Generating embeddings for diffs and messages...")
    diff_embeddings = _embed('Diff')
    message_embeddings = {label: _embed(column) for label, column in message_columns.items()}

    # Calculate Cosine Similarity (Precision). Only the diagonal of the
    # similarity matrix is needed: row i's diff against row i's message.
    print("   Calculating precision scores...")
    for label, embeddings in message_embeddings.items():
        analysis_df[f'Precision ({label})'] = util.cos_sim(diff_embeddings, embeddings).diagonal().numpy()

    # Calculate Hits (kept as a second pass so the column order is all
    # precision columns first, then all hit columns).
    for label in message_columns:
        analysis_df[f'Hit ({label})'] = analysis_df[f'Precision ({label})'] >= threshold

    analysis_df.to_csv(OUTPUT_CSV_FILE_ANALYSIS, index=False)
    print(f"Final analysis with precision scores saved to '{OUTPUT_CSV_FILE_ANALYSIS}'.")

    # Summarise per research question (RQ1: Developer, RQ2: LLM, RQ3: Rectifier)
    # and hand the numbers back to the caller instead of discarding them.
    total_files = len(analysis_df)
    hits = {label: int(analysis_df[f'Hit ({label})'].sum()) for label in message_columns}
    hit_rates = {label: (count / total_files) * 100 for label, count in hits.items()}

    return {
        'total_files': total_files,
        'hits': hits,
        'hit_rates': hit_rates,
    }

if __name__ == "__main__":
    # Mine the repository: bug-fixing commits plus per-file LLM summaries.
    file_analysis_df = analyze_repository(
        repo_url=REPO_URL,
        keywords=BUG_KEYWORDS,
        commit_limit=COMMIT_LIMIT,
    )

    # Score every message against its diff and write the final CSV.
    evaluate_precision_and_hit_rate(
        analysis_df=file_analysis_df,
        threshold=SIMILARITY_THRESHOLD,
    )

Starting analysis of https://github.com/agronholm/apscheduler...


Processing commits: 47it [00:35,  1.31it/s]


Bug commit data saved to 'bug_commits.csv'

Evaluating message precision using semantic similarity...




config.json:   0%|          | 0.00/498 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/499M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

   Generating embeddings for diffs and messages...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

   Calculating precision scores...
Final analysis with precision scores saved to 'final_analysis_with_files.csv'.


In [8]:
# Report hit counts for the three research questions.
# The counts are read from the 'Hit (...)' columns that
# evaluate_precision_and_hit_rate adds to `file_analysis_df`, so this cell does
# not depend on variables that only exist inside that function and it survives
# Restart & Run All.
total_files = len(file_analysis_df)

if total_files == 0 or 'Hit (Developer)' not in file_analysis_df.columns:
    # Nothing was evaluated (empty analysis frame), so there are no rates to show.
    print("\nNo evaluated file modifications to report.")
else:
    dev_hits = int(file_analysis_df['Hit (Developer)'].sum())
    llm_hits = int(file_analysis_df['Hit (LLM)'].sum())
    rectifier_hits = int(file_analysis_df['Hit (Rectifier)'].sum())

    print(f"\nTotal file modifications analyzed: {total_files}")
    print("\nRQ1 (Developer eval): Do developers use a precise commit message?")
    print(f"  - Hit Count: {dev_hits}/{total_files}")
    print(f"  - Hit Rate: {dev_hits * 100/total_files:.2f}%")
    print("\nRQ2 (LLM eval): Does the LLM generate a precise commit message?")
    print(f"  - Hit Count: {llm_hits}/{total_files}")
    print(f"  - Hit Rate: {llm_hits*100/total_files:.2f}%")
    print("\nRQ3 (Rectifier eval): To what extent were you able to rectify the message?")
    print(f"  - Hit Count: {rectifier_hits}/{total_files}")
    print(f"  - Hit Rate: {rectifier_hits * 100 /total_files:.2f} %")
    # The rectified message is produced by the LLM from a prompt that asks for
    # the filename and line numbers, so its hit rate is measured, not fixed.
    print("  - Note: The rectified message is LLM-generated from a prompt that requests the filename and line numbers; its hit rate is measured, not guaranteed to be 100%.")


Total file modifications analyzed: 537

RQ1 (Developer eval): Do developers use a precise commit message?
  - Hit Count: 94/537
  - Hit Rate: 17.50%

RQ2 (LLM eval): Does the LLM generate a precise commit message?
  - Hit Count: 185/537
  - Hit Rate: 34.45%

RQ3 (Rectifier eval): To what extent were you able to rectify the message?
  - Hit Count: 522/537
  - Hit Rate: 97.21 %
  - Note: The rectifier hit rate is 100% by design, as it systematically adds the filename to each message.


In [None]:
# Preview a previously saved per-file analysis CSV.
# NOTE(review): this absolute /content path is not one of the files written by
# this notebook (bug_commits.csv / final_analysis_with_files.csv) — presumably
# an artifact uploaded from an earlier run; confirm before relying on it.
bugs_df = pd.read_csv('/content/bug_fixing_files_analysis.csv')
# Widen text columns so long messages and diffs are readable in the preview.
pd.set_option("display.max_colwidth", 200)
bugs_df.head()

Unnamed: 0,Hash,Message,Filename,Source Code (before),Source Code (current),Diff,LLM Inference (fix type),Rectified Message
0,18595f205c0f6d5b99510eba983e9437c350bee1,Miscellaneous fixes and improvements towards the 1.0 release,src/apscheduler/expressions.py,"import re\nfrom calendar import weekday, monthrange\n\nfrom apscheduler.util import *\n\n\nclass AllExpression(object):\n value_re = re.compile(r'\*(?:/(?P<step>\d+))?$')\n\n def __init__(se...","""""""\nThis module contains the expressions applicable for CronTrigger's fields.\n""""""\nimport re\nfrom calendar import weekday, monthrange\n\nfrom apscheduler.util import *\n\n__all__ = ['AllExpress...","@@ -1,8 +1,13 @@\n+""""""\n+This module contains the expressions applicable for CronTrigger's fields.\n+""""""\n import re\n from calendar import weekday, monthrange\n \n from apscheduler.util import *\...",add more examples to crontrigger.py,[src/apscheduler/expressions.py]: add more examples to crontrigger.py
1,18595f205c0f6d5b99510eba983e9437c350bee1,Miscellaneous fixes and improvements towards the 1.0 release,src/apscheduler/scheduler.py,"from threading import Thread, Event, RLock\nfrom datetime import datetime, timedelta\nfrom logging import getLogger\nimport weakref\n\nfrom apscheduler.util import time_difference\nfrom apschedule...","from threading import Thread, Event, Lock\nfrom datetime import datetime, timedelta\nfrom logging import getLogger\n\nfrom apscheduler.util import time_difference\nfrom apscheduler.triggers import...","@@ -1,7 +1,6 @@\n-from threading import Thread, Event, RLock\n+from threading import Thread, Event, Lock\n from datetime import datetime, timedelta\n from logging import getLogger\n-import weakref...",add tests for apscheduler.job,[src/apscheduler/scheduler.py]: add tests for apscheduler.job
2,18595f205c0f6d5b99510eba983e9437c350bee1,Miscellaneous fixes and improvements towards the 1.0 release,src/apscheduler/triggers.py,"from datetime import datetime, timedelta\nfrom math import ceil\n\nfrom apscheduler.expressions import *\nfrom apscheduler.util import *\n\n__all__ = ('CronTrigger', 'DateTrigger', 'IntervalTrigge...","""""""\nTriggers determine the times when a job should be executed.\n""""""\nfrom datetime import datetime, timedelta\nfrom math import ceil\n\nfrom apscheduler.expressions import *\nfrom apscheduler.ut...","@@ -1,3 +1,6 @@\n+""""""\n+Triggers determine the times when a job should be executed.\n+""""""\n from datetime import datetime, timedelta\n from math import ceil\n \n@@ -25,9 +28,9 @@ class CronTrigger...",add missing classes to crontrigger,[src/apscheduler/triggers.py]: add missing classes to crontrigger
3,18595f205c0f6d5b99510eba983e9437c350bee1,Miscellaneous fixes and improvements towards the 1.0 release,src/apscheduler/util.py,"from time import mktime\nfrom calendar import monthrange, weekday\n\n__all__ = ('min_values', 'max_values', 'asint', 'get_actual_maximum',\n 'get_date_field', 'timedelta_seconds', 'time_...","from datetime import date, datetime, timedelta\nfrom calendar import monthrange, weekday\n\n__all__ = ('MIN_VALUES', 'MAX_VALUES', 'asint', 'get_actual_maximum',\n 'get_date_field', 'con...","@@ -1,13 +1,14 @@\n-from time import mktime\n+from datetime import date, datetime, timedelta\n from calendar import monthrange, weekday\n \n-__all__ = ('min_values', 'max_values', 'asint', 'get_ac...",add missing docstrings,[src/apscheduler/util.py]: add missing docstrings
4,22ca832ff9cafac58c74da170958dc1fae974538,Corrected the parameter names,src/apscheduler/scheduler.py,"""""""\nThis module is the main part of the library, and is the only module that\nregular users should be concerned with.\n""""""\n\nfrom threading import Thread, Event, Lock\nfrom datetime import datet...","""""""\nThis module is the main part of the library, and is the only module that\nregular users should be concerned with.\n""""""\n\nfrom threading import Thread, Event, Lock\nfrom datetime import datet...","@@ -126,8 +126,8 @@ class Scheduler(object):\n self.thread.join(timeout)\n self.jobs = []\n \n- def cron_schedule(self, years='*', months='*', days='*', days_of_week='*',\n-...",add_cron_job decorator,[src/apscheduler/scheduler.py]: add_cron_job decorator
