<a href="https://colab.research.google.com/github/samarth2015/CS202_A1/blob/main/llama3_commit_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:

!pip install -q -U transformers bitsandbytes accelerate torch pandas

In [None]:

import os
from google.colab import userdata
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from tqdm.auto import tqdm

In [None]:

# --- Files -------------------------------------------------------------------
# Input CSV with one row per commit, and the CSV written after verification.
CSV_FILE_PATH = "commit_analysis.csv"
OUTPUT_CSV_PATH = "commits_analysis_rectified.csv"

# --- Columns read from the input CSV -----------------------------------------
DIFF_COLUMN = "Diff"
ACTUAL_MSG_COLUMN = "Message"
PREDICTED_MSG_COLUMN = "LLM Inference (fix type)"

# --- Column added by this notebook -------------------------------------------
NEW_COLUMN_NAME = "Rectified Message"

# --- Hugging Face model used as the judge ------------------------------------
MODEL_ID = "google/gemma-3-1b-it"

In [None]:

# Read the Hugging Face token through google.colab.userdata and export it as
# HF_TOKEN in the process environment for the model download below.
hf_token = userdata.get('HF_TOKEN')
os.environ['HF_TOKEN'] = hf_token

# Report the accelerator in use. The lookup is guarded because
# torch.cuda.get_device_name(0) raises on a runtime without a CUDA device,
# which would abort the cell before the model is loaded.
if torch.cuda.is_available():
    print(f"{torch.cuda.get_device_name(0)}")
else:
    print("No CUDA GPU detected; the model will be loaded on CPU and inference will be slow.")

print(f"\nLoading model: {MODEL_ID}...")
print("This may take a few minutes...")

# Tokenizer and weights both come from MODEL_ID; device_map="auto" delegates
# device placement to the library.
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    dtype=torch.bfloat16, # Use bfloat16 for better performance on modern GPUs
    device_map="auto",
)

# Generation pipeline shared by the rest of the notebook. Only a one-letter
# verdict is expected back, so the reply is capped at 10 new tokens, and
# sampling is disabled so repeated runs give the same answer.
text_gen_pipeline = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=10,
    do_sample=False,
)

print("Model loaded successfully!")



✅ GPU detected: Tesla T4

Loading model: google/gemma-3-1b-it...
This may take a few minutes...


tokenizer_config.json:   0%|          | 0.00/1.16M [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.69M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/33.4M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/35.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/662 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/899 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.00G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/215 [00:00<?, ?B/s]

Device set to use cuda:0
The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


✅ Model loaded successfully!


In [None]:
# Load the commit dataset. Later cells read the columns named by DIFF_COLUMN,
# ACTUAL_MSG_COLUMN and PREDICTED_MSG_COLUMN from this frame.
df = pd.read_csv(CSV_FILE_PATH)

def _parse_choice(response):
    """Extract the model's 'A'/'B' verdict from the first non-empty line of its reply.

    Returns 'A' or 'B', or None when the line does not identify a single choice.
    """
    non_empty_lines = [line for line in str(response).splitlines() if line.strip()]
    if not non_empty_lines:
        return None

    # Treat punctuation and markdown as separators so that "**B**", "'A'" and
    # "B." all reduce to a bare letter token.
    tokens = "".join(ch if ch.isalnum() else " " for ch in non_empty_lines[0]).split()
    if not tokens:
        return None

    # Only upper-case standalone letters count as votes inside a sentence, so the
    # English article "a" and words such as "Both" or "Answer" are not mistaken
    # for a verdict.
    votes = {token for token in tokens if token in ("A", "B")}
    if len(votes) == 1:
        return votes.pop()

    # Otherwise accept a leading standalone letter in either case (e.g. "b").
    leading = tokens[0].upper()
    return leading if leading in ("A", "B") else None


def get_better_message(diff, msg_a, msg_b, generator=None, max_diff_length=4096):
    """
    Asks the language model to choose the better commit message based on the diff.

    Args:
        diff: The code diff shown to the model; it is converted with str() when
            it has to be truncated.
        msg_a: First candidate message. It is also the fallback returned when
            the reply cannot be interpreted or inference fails.
        msg_b: Second candidate message.
        generator: Optional callable used instead of the module-level
            `text_gen_pipeline`. It is called as
            `generator(messages, return_full_text=False)` and must return a
            sequence whose first item has a 'generated_text' key.
        max_diff_length: Maximum number of characters of the diff placed in the
            prompt; longer diffs are cut and marked as truncated.

    Returns:
        `msg_a` or `msg_b`, whichever the model selected (`msg_a` on any
        unexpected reply or error).
    """
    # Truncate the diff if it's too long to keep the prompt size bounded.
    # Note that the limit is in characters, not tokens.
    diff_text = str(diff)
    if len(diff_text) > max_diff_length:
        diff = diff_text[:max_diff_length] + "\n... (diff truncated)"

    # This prompt is carefully crafted to guide the model towards a clear, parsable answer.
    prompt = f"""You are an expert software developer responsible for maintaining high-quality commit histories. Your task is to analyze a code diff and determine which of two commit messages is better.

A superior commit message is clear, concise, accurate, and follows conventions like starting with an imperative verb (e.g., 'Fix', 'Add', 'Update').

**Code Diff:**
```diff
{diff}
```

**Commit Messages:**
* **Message A:** "{msg_a}"
* **Message B:** "{msg_b}"

**Task:** Which message provides a better description of the code change? Respond with only the letter 'A' or 'B' on the first line to indicate your choice.
"""

    messages = [
        {"role": "system", "content": "You are a helpful and precise code review assistant."},
        {"role": "user", "content": prompt}
    ]

    try:
        # Resolve the default inside the try block so that a missing pipeline is
        # reported and handled like any other inference failure.
        if generator is None:
            generator = text_gen_pipeline
        response = generator(messages, return_full_text=False)[0]['generated_text']
    except Exception as e:
        # Deliberate best effort: one failing row must not abort the whole run.
        print(f"\nAn error occurred during model inference: {e}. Defaulting to Message A.")
        return msg_a # Default to the original message on error

    choice = _parse_choice(response)
    if choice == 'A':
        return msg_a
    if choice == 'B':
        return msg_b

    # If the model gives an unexpected (or empty) response, default to the original message
    print(f"\nWarning: Model returned unexpected choice: '{response}'. Defaulting to Message A.")
    return msg_a

results = []
print(f"\nStarting verification for {len(df)} commits...")

# Register tqdm's progress-bar variants of the pandas apply methods
tqdm.pandas(desc="Verifying Commits")


def _rectify_row(row):
    """Return the preferred message for one row of the frame.

    Rows with a missing diff, original message or predicted message are not
    sent to the model; their original message is kept as-is.
    """
    if (
        pd.isna(row[DIFF_COLUMN])
        or pd.isna(row[ACTUAL_MSG_COLUMN])
        or pd.isna(row[PREDICTED_MSG_COLUMN])
    ):
        return row[ACTUAL_MSG_COLUMN]

    return get_better_message(
        row[DIFF_COLUMN],
        row[ACTUAL_MSG_COLUMN],
        row[PREDICTED_MSG_COLUMN],
    )


# Row-wise apply with a progress bar; the result is stored in the new column
df[NEW_COLUMN_NAME] = df.progress_apply(_rectify_row, axis=1)




Reading data from commit_analysis.csv...

🚀 Starting verification for 1336 commits...


Verifying Commits:   0%|          | 0/1336 [00:00<?, ?it/s]

`generation_config` default values have been modified to match model-specific defaults: {'do_sample': True}. If this is not desired, please set these values explicitly.
You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


In [None]:

print("\nVerification complete.")

# Write the whole frame (including the new column) to disk, without the index
df.to_csv(OUTPUT_CSV_PATH, index=False)
print(f"\n Success! The verified commit messages have been saved to '{OUTPUT_CSV_PATH}'.")

# Show the first rows of the original, predicted and chosen messages together
print("\nHere's a preview of the results:")
preview_columns = [ACTUAL_MSG_COLUMN, PREDICTED_MSG_COLUMN, NEW_COLUMN_NAME]
print(df[preview_columns].head())


Verification complete.

🎉 Success! The verified commit messages have been saved to 'commits_analysis_rectified.csv'.

Here's a preview of the results:
                                             Message  \
0                                   #1 general fixes   
1                                     #1 Minor fixes   
2                                #2 Fixes unit tests   
3  Fix expected value for the test_debian_extract...   
4  Fixed pycodestyle error and re-worded variable...   

    LLM Inference (fix type)          Rectified Message  
0       add missing comments       add missing comments  
1       add missing comments       add missing comments  
2               add test for               add test for  
3  add tests for debian-test  add tests for debian-test  
4          add more comments          add more comments  


In [None]:
# Count the rows where the chosen message equals the LLM-predicted message,
# i.e. how often the model preferred the prediction over the original.
# Rows whose original and predicted messages are identical are included in
# this count as well. Column names come from the configuration constants so
# the check follows any renaming done there.
matches = df[df[NEW_COLUMN_NAME] == df[PREDICTED_MSG_COLUMN]]
num_matches = len(matches)

print(f"Number of times '{NEW_COLUMN_NAME}' matches '{PREDICTED_MSG_COLUMN}': {num_matches} out of {len(df)}")

Number of times 'Rectified Message' matches 'Message' or 'LLM Inference (fix type)': 1319
