In [None]:

from pydriller import Repository
import csv

In [None]:


# Accumulator for bug-fix commits: process_commit() appends one dict per matching commit.
data = []
def is_bug_fix(commit):
    """Return True when the commit message looks like a bug fix.

    Heuristic only: a case-insensitive substring search for 'fix' or 'bug'
    in ``commit.msg``, so words such as 'prefix' or 'debug' match as well.
    """
    message = commit.msg.lower()
    return any(keyword in message for keyword in ('fix', 'bug'))


def process_commit(commit):
    """Append a summary of ``commit`` to the module-level ``data`` list.

    Commits that ``is_bug_fix`` rejects are ignored. The summary holds the
    hash, the message, the parent hashes as provided by ``commit.parents``,
    the merge flag as a string, and the modified file names joined by ', '.
    """
    if not is_bug_fix(commit):
        return

    touched_files = [modified.filename for modified in commit.modified_files]
    data.append({
        'Hash': commit.hash,
        'Message': commit.msg,
        'Parent Hashes': commit.parents,
        'Is Merge Commit': str(commit.merge),
        'Modified Files': ', '.join(touched_files),
    })

# Traverse the vulnerablecode repository and record its bug-fix commits in `data`.
vulnerablecode_repo = Repository('https://github.com/aboutcode-org/vulnerablecode')
for commit in vulnerablecode_repo.traverse_commits():
    process_commit(commit)

In [13]:
# Write the collected bug-fix commits to a CSV file.
# The encoding is set explicitly: commit messages may contain non-ASCII text,
# and the platform default encoding is not guaranteed to be able to encode it.
with open('bug_fixes.csv', 'w', newline='', encoding='utf-8') as csvfile:
    fieldnames = ['Hash', 'Message', 'Parent Hashes', 'Is Merge Commit', 'Modified Files']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

    writer.writeheader()
    writer.writerows(data)

        

In [14]:
# Number of bug-fix commit records collected in `data`.
len(data)

461

In [None]:

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Checkpoint shared by the tokenizer and the seq2seq model used for diff -> message inference.
MODEL_CHECKPOINT = "mamiksik/CommitPredictorT5"

tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_CHECKPOINT)




  from .autonotebook import tqdm as notebook_tqdm


In [None]:

def predict_commit_message(diff):
    """Generate a commit message for ``diff`` with the module-level model.

    The diff is tokenized (truncated to 512 tokens), passed to
    ``model.generate`` with an output limit of 50 tokens, and the first
    generated sequence is decoded without special tokens.
    """
    encoded = tokenizer(diff, truncation=True, max_length=512, return_tensors="pt")
    generated = model.generate(**encoded, max_length=50)
    first_sequence = generated[0]
    return tokenizer.decode(first_sequence, skip_special_tokens=True)


def process(commit, writer, max_diff_chars=100000):
    """Write one CSV row per modified file of a bug-fix commit.

    Args:
        commit: commit object exposing ``hash``, ``msg`` and ``modified_files``;
            each modified file exposes ``filename``, ``diff``,
            ``source_code_before`` and ``source_code``.
        writer: object with a ``writerow(dict)`` method (a ``csv.DictWriter``
            in this notebook) whose fields match the keys written below.
        max_diff_chars: files whose diff is longer than this many characters
            are skipped. Defaults to the previously hard-coded 100000.

    Commits rejected by ``is_bug_fix`` produce no rows.
    """
    if not is_bug_fix(commit):
        return

    for modified_file in commit.modified_files:
        diff = modified_file.diff
        # Skip only the oversized file. The previous `return` here silently
        # dropped every remaining file of the commit as well.
        if len(diff) > max_diff_chars:
            continue

        llm_inference = predict_commit_message(diff)
        writer.writerow({
            'Hash': commit.hash,
            'Message': commit.msg,
            'Filename': modified_file.filename,
            'Source Code (before)': modified_file.source_code_before,
            'Source Code (current)': modified_file.source_code,
            'Diff': diff,
            'LLM Inference (fix type)': llm_inference,
        })
           


In [None]:
output_csv_file = 'commit_analysis.csv'
# Column order of the per-file report; the names match the keys written by process().
csv_headers = [
    'Hash',
    'Message',
    'Filename',
    'Source Code (before)',
    'Source Code (current)',
    'Diff',
    'LLM Inference (fix type)',
]

# Rows are written while the history is traversed, so the file stays open for the whole run.
with open(output_csv_file, 'w', newline='', encoding='utf-8') as f:
    writer = csv.DictWriter(f, fieldnames=csv_headers)
    writer.writeheader()

    print(f"Starting analysis. Writing data to {output_csv_file}...")

    repository = Repository('https://github.com/aboutcode-org/vulnerablecode')
    for commit in repository.traverse_commits():
        process(commit, writer)

print("Analysis complete.")

Starting analysis. Writing data to commit_analysis.csv...


In [None]:
# Sanity-check the report: count its rows and the distinct commits they belong to.
# The source-code columns can exceed the csv module's default field size limit
# (131072 characters), so raise it before reading. 2**31 - 1 fits a C long on
# every platform.
csv.field_size_limit(2**31 - 1)

count = 0
commit_hashes = set()
# Read with the same encoding and newline handling the file was written with;
# newline='' lets the csv module handle line breaks embedded in quoted fields.
with open('commit_analysis.csv', 'r', newline='', encoding='utf-8') as f:
    reader = csv.DictReader(f)
    for row in reader:
        count += 1  # one row per modified file, not per commit
        commit_hashes.add(row['Hash'])

print(f"Total number of commits processed: {len(commit_hashes)}")
print(f"Total number of file rows written: {count}")