In [1]:
!pip install -q radon "transformers[torch]" sacrebleu pandas numpy

import pandas as pd
import numpy as np
from radon.visitors import ComplexityVisitor
from radon.metrics import mi_visit
from radon.raw import analyze
from transformers import RobertaTokenizer, RobertaModel
import torch
import sacrebleu
import warnings

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m52.8/52.8 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.1/104.1 kB[0m [31m10.1 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
# Install all required libraries
!pip install -q radon "transformers[torch]" sacrebleu pandas numpy

import pandas as pd
import numpy as np
import ast
from radon.visitors import ComplexityVisitor
from radon.metrics import mi_visit
from radon.raw import analyze
from transformers import RobertaTokenizer, RobertaModel
import torch
import sacrebleu
import warnings

# Suppress warnings for a cleaner output
warnings.filterwarnings('ignore')

In [8]:
# Read the bug-fixing files analysis CSV into the working frame
df = pd.read_csv('/content/bug_fixing_files_analysis (2).csv')

# Replace missing source snapshots with empty strings so both source columns
# hold str values only
for source_col in ('Source Code (before)', 'Source Code (current)'):
    df[source_col] = df[source_col].fillna('')

In [12]:
def get_radon_metrics(code):
    """Compute radon size, complexity and maintainability metrics for one source string.

    Args:
        code: Source text of a single file. Non-string or blank input is
            treated as missing.

    Returns:
        A ``(loc, cc, mi)`` tuple where ``loc`` is ``analyze(code).loc``,
        ``cc`` is the sum of ``complexity`` over the blocks reported by
        ``ComplexityVisitor.from_code(code)``, and ``mi`` is
        ``mi_visit(code, multi=True)``. Returns ``(nan, nan, nan)`` when the
        input is missing, does not parse with the running interpreter, or any
        radon call raises.
    """
    missing = (np.nan, np.nan, np.nan)

    if not isinstance(code, str) or not code.strip():
        return missing

    try:
        # Reject source the running interpreter cannot parse before handing it to radon
        ast.parse(code)
        loc = analyze(code).loc
        # NOTE(review): radon documents `blocks` as functions, classes and their
        # methods, so a class and each of its methods all contribute to this sum
        # -- confirm this aggregate is the intended CC figure.
        # sum() over an empty sequence is already 0, so no explicit guard is needed.
        cc = sum(block.complexity for block in ComplexityVisitor.from_code(code).blocks)
        mi = mi_visit(code, multi=True)
        return loc, cc, mi
    except Exception:
        # Deliberate best-effort: SyntaxError is a subclass of Exception, so this
        # single clause covers both parse failures and radon failures.
        return missing

before_metric_cols = ['LOC_Before', 'CC_Before', 'MI_Before']
after_metric_cols = ['LOC_After', 'CC_After', 'MI_After']

# get_radon_metrics returns one (loc, cc, mi) tuple per file. Building each frame
# from the list of tuples avoids constructing a pd.Series per row and also works
# when df has no rows. dtype=float keeps every metric column float64 whether or
# not NaNs occur.
metrics_before = pd.DataFrame(
    df['Source Code (before)'].map(get_radon_metrics).tolist(),
    index=df.index,
    columns=before_metric_cols,
    dtype=float,
)
metrics_after = pd.DataFrame(
    df['Source Code (current)'].map(get_radon_metrics).tolist(),
    index=df.index,
    columns=after_metric_cols,
    dtype=float,
)

# Combine with main dataframe. Metric columns left behind by an earlier run of
# this cell are dropped first, so re-running the cell does not append duplicate
# column labels to df.
df = pd.concat(
    [
        df.drop(columns=before_metric_cols + after_metric_cols, errors='ignore'),
        metrics_before,
        metrics_after,
    ],
    axis=1,
)

In [13]:
# Prefer the GPU when torch can see one; otherwise run on the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Tokenizer and encoder are loaded from the same CodeBERT checkpoint
tokenizer = RobertaTokenizer.from_pretrained("microsoft/codebert-base")
model = RobertaModel.from_pretrained("microsoft/codebert-base")
model = model.to(device)

def get_semantic_similarity(code1, code2):
    """Cosine similarity between the pooled model outputs of two code strings.

    Each string is tokenized separately (truncated to at most 512 tokens, so
    text beyond that limit does not influence the score), passed through the
    module-level ``model`` without gradient tracking, and the two
    ``pooler_output`` tensors are compared with cosine similarity.

    Args:
        code1: First code string.
        code2: Second code string.

    Returns:
        The similarity as a Python float, or ``nan`` when either input is not
        a non-blank string or any step raises.
    """
    for snippet in (code1, code2):
        if not (isinstance(snippet, str) and snippet.strip()):
            return np.nan

    def _embed(text):
        # Tokenize one string, move the tensors to the model's device, encode it
        encoded = tokenizer(
            text,
            return_tensors='pt',
            padding=True,
            truncation=True,
            max_length=512,
        ).to(device)
        return model(**encoded).pooler_output

    try:
        with torch.no_grad():
            first_vec = _embed(code1)
            second_vec = _embed(code2)
        similarity = torch.nn.functional.cosine_similarity(first_vec, second_vec)
        return similarity.item()
    except Exception:
        return np.nan

def get_token_similarity(code_before, code_after):
    """Token-level similarity of two code strings based on sacreBLEU.

    ``code_after`` is scored as the single hypothesis against ``code_before``
    as the single reference, and the resulting BLEU score is divided by 100.

    Args:
        code_before: Reference code string.
        code_after: Hypothesis code string.

    Returns:
        The BLEU score divided by 100, or ``nan`` when either input is not a
        non-blank string or scoring raises.
    """
    def _has_text(value):
        # True only for strings with at least one non-whitespace character
        return isinstance(value, str) and bool(value.strip())

    if not (_has_text(code_before) and _has_text(code_after)):
        return np.nan

    try:
        result = sacrebleu.corpus_bleu([code_after], [[code_before]], force=True)
        return result.score / 100.0
    except Exception:
        return np.nan

# (progress message, destination column, scoring function) for each similarity metric
similarity_steps = (
    ("Calculating Semantic Similarity (CodeBERT)...", 'Semantic_Similarity', get_semantic_similarity),
    ("Calculating Token Similarity (BLEU)...", 'Token_Similarity', get_token_similarity),
)

for banner, target_col, metric in similarity_steps:
    print(banner)
    # Score every row's before/current source pair with the current metric
    df[target_col] = df.apply(
        lambda row, metric=metric: metric(row['Source Code (before)'], row['Source Code (current)']),
        axis=1,
    )

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/498 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/499M [00:00<?, ?B/s]

Calculating Semantic Similarity (CodeBERT)...


model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Calculating Token Similarity (BLEU)...


In [14]:
# Every one of these must be populated for a row to be kept
metric_cols = [
    'LOC_Before', 'CC_Before', 'MI_Before',
    'LOC_After', 'CC_After', 'MI_After',
    'Semantic_Similarity', 'Token_Similarity',
]

initial_rows = len(df)
# Keep only rows where no metric is NaN; copy so later column additions do not touch df
rows_with_all_metrics = df[metric_cols].notna().all(axis=1)
df_clean = df.loc[rows_with_all_metrics].copy()
final_rows = len(df_clean)
dropped_rows = initial_rows - final_rows

print(f"  - Started with {initial_rows} rows.")
print(f"  - Dropped {dropped_rows} rows due to unrecoverable parsing/processing failures.")
print(f"  - Proceeding with {final_rows} clean rows for final analysis.\n")

# Commit-level summary of the retained rows
total_commits = df_clean['Hash'].nunique()
files_by_commit = df_clean.groupby('Hash')['Filename'].count()
avg_files_per_commit = files_by_commit.mean()

print(f"Total unique commits: {total_commits}")
print(f"Total files analyzed: {len(df_clean)}")
print(f"Average modified files per commit: {avg_files_per_commit:.2f}\n")

print("Distribution of fix types:")
fix_type_counts = df_clean['LLM Inference (fix type)'].value_counts()
print(fix_type_counts)

  - Started with 537 rows.
  - Dropped 77 rows due to unrecoverable parsing/processing failures.
  - Proceeding with 460 clean rows for final analysis.

Total unique commits: 249
Total files analyzed: 460
Average modified files per commit: 1.85

Distribution of fix types:
LLM Inference (fix type)
update scheduler.py                       27
add missing import                        21
update crontrigger.py                     16
add missing docstrings                     8
add missing docstring                      7
                                          ..
update check_callable_args.py              1
update twisted_executor.py                 1
update pool executor.py                    1
improve error message when running job     1
fix scheduler.next_wakeup_time             1
Name: count, Length: 333, dtype: int64


In [27]:
# Calculate change metrics ONLY on the clean dataframe (after minus before)
df_clean['LOC_Change'] = df_clean['LOC_After'] - df_clean['LOC_Before']
df_clean['CC_Change'] = df_clean['CC_After'] - df_clean['CC_Before']
df_clean['MI_Change'] = df_clean['MI_After'] - df_clean['MI_Before']

# Classification: a similarity at or above its threshold is labelled 'Minor',
# anything below is labelled 'Major'
semantic_threshold = 0.80
token_threshold = 0.75
df_clean['Semantic_Classification'] = np.where(df_clean['Semantic_Similarity'] >= semantic_threshold, 'Minor', 'Major')
df_clean['Token_Classification'] = np.where(df_clean['Token_Similarity'] >= token_threshold, 'Minor', 'Major')

# Compute the disagreement mask once and reuse it for both the flag column and
# the summary percentage
classes_differ = df_clean["Semantic_Classification"] != df_clean["Token_Classification"]
df_clean["Classes_Agree"] = np.where(classes_differ, "No", "Yes")

# Guard the division so an empty df_clean reports nan instead of raising
# ZeroDivisionError
if len(df_clean):
    mismatch_pct = round(int(classes_differ.sum()) * 100 / len(df_clean), 3)
else:
    mismatch_pct = float('nan')
print(f'               Percent of Mismatches in Semantic and Token Similarity based Classification = {mismatch_pct}\n\n\n')
df_clean.head()

               Percent of Mismatches in Semantic and Token Similarity based Classification = 6.304





Unnamed: 0,Hash,Message,Filename,Source Code (before),Source Code (current),Diff,LLM Inference (fix type),Rectified Message,LOC_Before,CC_Before,...,CC_After,MI_After,Semantic_Similarity,Token_Similarity,LOC_Change,CC_Change,MI_Change,Semantic_Classification,Token_Classification,Classes_Agree
0,18595f205c0f6d5b99510eba983e9437c350bee1,Miscellaneous fixes and improvements towards t...,src/apscheduler/expressions.py,"import re\nfrom calendar import weekday, month...","""""""\nThis module contains the expressions appl...","@@ -1,8 +1,13 @@\n+""""""\n+This module contains ...",add more examples to crontrigger.py,[src/apscheduler/expressions.py]: add more exa...,85.0,32.0,...,40.0,47.634007,0.998991,0.767796,28.0,8.0,0.257701,Minor,Minor,Yes
2,18595f205c0f6d5b99510eba983e9437c350bee1,Miscellaneous fixes and improvements towards t...,src/apscheduler/triggers.py,"from datetime import datetime, timedelta\nfrom...","""""""\nTriggers determine the times when a job s...","@@ -1,3 +1,6 @@\n+""""""\n+Triggers determine the...",add missing classes to crontrigger,[src/apscheduler/triggers.py]: add missing cla...,161.0,50.0,...,46.0,56.431032,0.99833,0.900892,7.0,-4.0,0.956032,Minor,Minor,Yes
3,18595f205c0f6d5b99510eba983e9437c350bee1,Miscellaneous fixes and improvements towards t...,src/apscheduler/util.py,from time import mktime\nfrom calendar import ...,"from datetime import date, datetime, timedelta...","@@ -1,13 +1,14 @@\n-from time import mktime\n+...",add missing docstrings,[src/apscheduler/util.py]: add missing docstrings,69.0,8.0,...,14.0,70.670553,0.99923,0.615186,35.0,6.0,-3.637128,Minor,Major,No
4,22ca832ff9cafac58c74da170958dc1fae974538,Corrected the parameter names,src/apscheduler/scheduler.py,"""""""\nThis module is the main part of the libra...","""""""\nThis module is the main part of the libra...","@@ -126,8 +126,8 @@ class Scheduler(object):\n...",add_cron_job decorator,[src/apscheduler/scheduler.py]: add_cron_job d...,369.0,63.0,...,63.0,54.356636,1.0,0.988278,0.0,0.0,0.0,Minor,Minor,Yes
5,4c689d37f61c2c91003fc54127c40a94f9279bbf,Fixed a corner case which caused the cron trig...,src/apscheduler/triggers.py,"""""""\nTriggers determine the times when a job s...","""""""\nTriggers determine the times when a job s...","@@ -11,7 +11,6 @@ __all__ = ('CronTrigger', 'D...",add more documentation to crontrigger,[src/apscheduler/triggers.py]: add more docume...,176.0,47.0,...,48.0,55.499082,0.999967,0.981549,-2.0,1.0,-0.364937,Minor,Minor,Yes


In [None]:
#Entire code in one step

!pip install -q radon "transformers[torch]" sacrebleu pandas numpy

import pandas as pd
import numpy as np
import ast
from radon.visitors import ComplexityVisitor
from radon.metrics import mi_visit
from radon.raw import analyze
from transformers import RobertaTokenizer, RobertaModel
import torch
import sacrebleu

# Read the bug-fixing files analysis CSV into the working frame
df = pd.read_csv('/content/bug_fixing_files_analysis.csv')

# Replace missing source snapshots with empty strings so both source columns
# hold str values only
for source_col in ('Source Code (before)', 'Source Code (current)'):
    df[source_col] = df[source_col].fillna('')

def get_radon_metrics(code):
    """Compute radon size, complexity and maintainability metrics for one source string.

    Args:
        code: Source text of a single file. Non-string or blank input is
            treated as missing.

    Returns:
        A ``(loc, cc, mi)`` tuple where ``loc`` is ``analyze(code).loc``,
        ``cc`` is the sum of ``complexity`` over the blocks reported by
        ``ComplexityVisitor.from_code(code)``, and ``mi`` is
        ``mi_visit(code, multi=True)``. Returns ``(nan, nan, nan)`` when the
        input is missing, does not parse with the running interpreter, or any
        radon call raises.
    """
    missing = (np.nan, np.nan, np.nan)

    if not isinstance(code, str) or not code.strip():
        return missing

    try:
        # Reject source the running interpreter cannot parse before handing it to radon
        ast.parse(code)
        loc = analyze(code).loc
        # NOTE(review): radon documents `blocks` as functions, classes and their
        # methods, so a class and each of its methods all contribute to this sum
        # -- confirm this aggregate is the intended CC figure.
        # sum() over an empty sequence is already 0, so no explicit guard is needed.
        cc = sum(block.complexity for block in ComplexityVisitor.from_code(code).blocks)
        mi = mi_visit(code, multi=True)
        return loc, cc, mi
    except Exception:
        # Deliberate best-effort: SyntaxError is a subclass of Exception, so this
        # single clause covers both parse failures and radon failures.
        return missing

before_metric_cols = ['LOC_Before', 'CC_Before', 'MI_Before']
after_metric_cols = ['LOC_After', 'CC_After', 'MI_After']

# get_radon_metrics returns one (loc, cc, mi) tuple per file. Building each frame
# from the list of tuples avoids constructing a pd.Series per row and also works
# when df has no rows. dtype=float keeps every metric column float64 whether or
# not NaNs occur.
metrics_before = pd.DataFrame(
    df['Source Code (before)'].map(get_radon_metrics).tolist(),
    index=df.index,
    columns=before_metric_cols,
    dtype=float,
)
metrics_after = pd.DataFrame(
    df['Source Code (current)'].map(get_radon_metrics).tolist(),
    index=df.index,
    columns=after_metric_cols,
    dtype=float,
)

# Metric columns left behind by an earlier run are dropped before concatenating,
# so re-running this cell does not append duplicate column labels to df.
df = pd.concat(
    [
        df.drop(columns=before_metric_cols + after_metric_cols, errors='ignore'),
        metrics_before,
        metrics_after,
    ],
    axis=1,
)

# Use the GPU when torch can see one; tokenizer and encoder come from the same
# CodeBERT checkpoint.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = RobertaTokenizer.from_pretrained("microsoft/codebert-base")
model = RobertaModel.from_pretrained("microsoft/codebert-base").to(device)

def get_semantic_similarity(code1, code2):
    """Cosine similarity between the pooled model outputs of two code strings.

    Each string is tokenized separately (truncated to at most 512 tokens, so
    text beyond that limit does not influence the score), passed through the
    module-level ``model`` without gradient tracking, and the two
    ``pooler_output`` tensors are compared with cosine similarity.

    Args:
        code1: First code string.
        code2: Second code string.

    Returns:
        The similarity as a Python float, or ``nan`` when either input is not
        a non-blank string or any step raises.
    """
    for snippet in (code1, code2):
        if not (isinstance(snippet, str) and snippet.strip()):
            return np.nan

    def _embed(text):
        # Tokenize one string, move the tensors to the model's device, encode it
        encoded = tokenizer(
            text,
            return_tensors='pt',
            padding=True,
            truncation=True,
            max_length=512,
        ).to(device)
        return model(**encoded).pooler_output

    try:
        with torch.no_grad():
            first_vec = _embed(code1)
            second_vec = _embed(code2)
        similarity = torch.nn.functional.cosine_similarity(first_vec, second_vec)
        return similarity.item()
    except Exception:
        return np.nan

def get_token_similarity(code_before, code_after):
    """Token-level similarity of two code strings based on sacreBLEU.

    ``code_after`` is scored as the single hypothesis against ``code_before``
    as the single reference, and the resulting BLEU score is divided by 100.

    Args:
        code_before: Reference code string.
        code_after: Hypothesis code string.

    Returns:
        The BLEU score divided by 100, or ``nan`` when either input is not a
        non-blank string or scoring raises.
    """
    def _has_text(value):
        # True only for strings with at least one non-whitespace character
        return isinstance(value, str) and bool(value.strip())

    if not (_has_text(code_before) and _has_text(code_after)):
        return np.nan

    try:
        result = sacrebleu.corpus_bleu([code_after], [[code_before]], force=True)
        return result.score / 100.0
    except Exception:
        return np.nan

# (progress message, destination column, scoring function) for each similarity metric
similarity_steps = (
    ("Calculating Semantic Similarity (CodeBERT)...", 'Semantic_Similarity', get_semantic_similarity),
    ("Calculating Token Similarity (BLEU)...", 'Token_Similarity', get_token_similarity),
)

for banner, target_col, metric in similarity_steps:
    print(banner)
    # Score every row's before/current source pair with the current metric
    df[target_col] = df.apply(
        lambda row, metric=metric: metric(row['Source Code (before)'], row['Source Code (current)']),
        axis=1,
    )

# Every one of these must be populated for a row to be kept
metric_cols = [
    'LOC_Before', 'CC_Before', 'MI_Before',
    'LOC_After', 'CC_After', 'MI_After',
    'Semantic_Similarity', 'Token_Similarity',
]

initial_rows = len(df)
# Keep only rows where no metric is NaN; copy so later column additions do not touch df
rows_with_all_metrics = df[metric_cols].notna().all(axis=1)
df_clean = df.loc[rows_with_all_metrics].copy()
final_rows = len(df_clean)
dropped_rows = initial_rows - final_rows

print(f"  - Started with {initial_rows} rows.")
print(f"  - Dropped {dropped_rows} rows due to unrecoverable parsing/processing failures.")
print(f"  - Proceeding with {final_rows} clean rows for final analysis.\n")

# Commit-level summary of the retained rows
total_commits = df_clean['Hash'].nunique()
files_by_commit = df_clean.groupby('Hash')['Filename'].count()
avg_files_per_commit = files_by_commit.mean()

print(f"Total unique commits: {total_commits}")
print(f"Total files analyzed: {len(df_clean)}")
print(f"Average modified files per commit: {avg_files_per_commit:.2f}\n")

print("Distribution of fix types:")
fix_type_counts = df_clean['LLM Inference (fix type)'].value_counts()
print(fix_type_counts)

# Change columns: each is the "after" metric minus the matching "before" metric
change_specs = {
    'LOC_Change': ('LOC_After', 'LOC_Before'),
    'CC_Change': ('CC_After', 'CC_Before'),
    'MI_Change': ('MI_After', 'MI_Before'),
}
for change_col, (after_col, before_col) in change_specs.items():
    df_clean[change_col] = df_clean[after_col] - df_clean[before_col]

# Classification: similarity at or above the threshold is 'Minor', otherwise 'Major'
semantic_threshold = 0.80
token_threshold = 0.75

semantic_is_minor = df_clean['Semantic_Similarity'] >= semantic_threshold
df_clean['Semantic_Classification'] = np.where(semantic_is_minor, 'Minor', 'Major')

token_is_minor = df_clean['Token_Similarity'] >= token_threshold
df_clean['Token_Classification'] = np.where(token_is_minor, 'Minor', 'Major')

# Flag whether the two classifications give the same label for a row
labels_match = df_clean["Semantic_Classification"] == df_clean["Token_Classification"]
df_clean["Classes_Agree"] = np.where(labels_match, "Yes", "No")

print(df_clean.head())