In [1]:
# Install dependencies
!pip install transformers datasets torch scikit-learn

# Import required libraries
import torch
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
import pandas as pd
import re





In [2]:
# import pandas as pd
# import re

# # Load CSV with error skipping
# file_path = "/content/Bug_data.csv"
# df = pd.read_csv(
#     file_path,
#     quoting=3,  # QUOTE_NONE
#     quotechar='"',
#     escapechar='\\',
#     on_bad_lines='warn',
#     engine='python'  # More lenient parsing
# )

# # Fill NaNs in text columns with empty string
# df["Column1.title"] = df["Column1.title"].fillna("")
# df["Column1.body"] = df["Column1.body"].fillna("")

# # Drop rows with missing labels
# df = df.dropna(subset=["Column1.label"])

# # Define clean_text function
# def clean_text(text):
#     text = str(text).lower()
#     text = re.sub(r"http\S+|www\S+", "", text)  # Remove URLs
#     text = re.sub(r"\[.?\]|\{.?\}|\(|\)", "", text)  # Remove markdown symbols
#     text = re.sub(r"[^a-z0-9\s]", "", text)  # Keep only alphanumeric & spaces
#     text = re.sub(r"\s+", " ", text).strip()  # Remove extra spaces
#     return text

# # Apply cleaning to both title and body, then concatenate
# df["cleaned_text"] = df["Column1.title"].apply(clean_text) + " " + df["Column1.body"].apply(clean_text)

# # Rename label column for clarity
# df = df.rename(columns={"Column1.label": "label"})

# # Final record count
# print(f"Final number of records: {len(df)}")


# # Check raw row count
# raw_df = pd.read_csv(file_path, quoting=3, on_bad_lines='warn')
# print(f"Total rows loaded initially: {len(raw_df)}")

# # Check for expected columns
# print("Columns in dataset:", raw_df.columns.tolist())

# # Check missing values (if any)
# print("Missing values:\n", raw_df.isnull().sum())

# # Check number of rows with non-null labels
# non_null_labels = raw_df["Column1.label"].notnull().sum()
# print(f"Rows with non-null labels: {non_null_labels}")


In [3]:
input_file = "/content/Bug_data.csv"
output_file = "/content/Bug_data_cleaned_fixed.csv"

cleaned_lines = []
total_lines = 0
fixed_lines = 0
skipped_lines = 0

with open(input_file, "r", encoding="utf-8") as infile:
    for line in infile:
        total_lines += 1
        parts = [p.strip().strip('"') for p in line.strip().split(",")]

        if len(parts) == 3:
            cleaned_lines.append(",".join(parts) + "\n")
        elif len(parts) > 3:
            # Fix: Combine everything except the last field into title/body
            label = parts[-1]
            content = " ".join(parts[:-1])  # Merge the rest
            # Split content into title/body (basic heuristic: first sentence as title)
            if "." in content:
                split_index = content.find(".") + 1
                title = content[:split_index].strip()
                body = content[split_index:].strip()
            else:
                title = content
                body = ""

            cleaned_line = f'"{title}","{body}","{label}"\n'
            cleaned_lines.append(cleaned_line)
            fixed_lines += 1
        else:
            skipped_lines += 1  # Less than 3 fields, likely bad row

# Save fixed dataset
with open(output_file, "w", encoding="utf-8") as outfile:
    outfile.writelines(cleaned_lines)

print(f"Total lines processed     : {total_lines}")
print(f"Valid original lines      : {total_lines - fixed_lines - skipped_lines}")
print(f"Fixed lines (merged cols) : {fixed_lines}")
print(f"Skipped bad lines         : {skipped_lines}")
print(f"Cleaned file saved as     : {output_file}")

import pandas as pd

df = pd.read_csv("/content/Bug_data_cleaned_fixed.csv", header=None, names=["Column1.title", "Column1.body", "Column1.label"])
print(f"Records loaded after cleaning and fixing: {len(df)}")

from google.colab import files
files.download('/content/Bug_data_cleaned_fixed.csv')


Total lines processed     : 150001
Valid original lines      : 55262
Fixed lines (merged cols) : 94739
Skipped bad lines         : 0
Cleaned file saved as     : /content/Bug_data_cleaned_fixed.csv
Records loaded after cleaning and fixing: 150001


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [4]:
# import os
# os.kill(os.getpid(), 9)


In [5]:
# !pip install -U --force-reinstall transformers
# import transformers
# print(transformers.__version__)

In [8]:
# Install latest dependencies
!pip install --upgrade transformers datasets evaluate scikit-learn --quiet

# Imports
import pandas as pd
import re
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
from sklearn.metrics import precision_recall_fscore_support, classification_report
import evaluate

# Load CSV
file_path = "/content/Bug_data_cleaned_fixed.csv"
df = pd.read_csv(file_path)

# Text cleaning
def clean_text(text):
    text = str(text).lower()
    text = re.sub(r"http\S+|www\S+", "", text)
    text = re.sub(r"\[.*?\]|\{.*?\}|\(|\)", "", text)
    text = re.sub(r"[^a-z0-9\s]", "", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text

# Apply cleaning
df["cleaned_text"] = df["Column1.title"].apply(clean_text) + " " + df["Column1.body"].apply(clean_text)
df = df.rename(columns={"Column1.label": "label"})

# Convert to HF dataset
dataset = Dataset.from_pandas(df[["cleaned_text", "label"]])

# Tokenizer
tokenizer = AutoTokenizer.from_pretrained("microsoft/codebert-base")
def tokenize_function(examples):
    return tokenizer(examples["cleaned_text"], padding="max_length", truncation=True, max_length=256)

tokenized_dataset = dataset.map(tokenize_function, batched=True)

# Train/test split
split_dataset = tokenized_dataset.train_test_split(test_size=0.2)
train_dataset = split_dataset["train"]
test_dataset = split_dataset["test"]

# Load model
model = AutoModelForSequenceClassification.from_pretrained("microsoft/codebert-base", num_labels=2)

# Metric computation function
accuracy_metric = evaluate.load("accuracy")
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    accuracy = accuracy_metric.compute(predictions=predictions, references=labels)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average='binary')

    # Print classification report
    print("\nClassification Report:\n")
    print(classification_report(labels, predictions, digits=4))

    return {
        "accuracy": accuracy["accuracy"],
        "precision": precision,
        "recall": recall,
        "f1": f1
    }

# Training arguments
training_args = TrainingArguments(
    output_dir="./codebert_bug_classifier",
    # evaluation_strategy="epoch",
    # save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=1,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=500,
    fp16=True  # Enable mixed precision
)

# Trainer setup
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

# Train model
print(f"Training samples: {len(train_dataset)}")
print(f"Testing samples : {len(test_dataset)}")
trainer.train()

# Final evaluation
trainer.evaluate()


Map:   0%|          | 0/150000 [00:00<?, ? examples/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at microsoft/codebert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Training samples: 120000
Testing samples : 30000


[34m[1mwandb[0m: Currently logged in as: [33m22071a1288[0m ([33mteamml[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss
500,0.4346
1000,0.3758
1500,0.3458
2000,0.3448
2500,0.3421
3000,0.3321
3500,0.3367
4000,0.3282
4500,0.3276
5000,0.3314



Classification Report:

              precision    recall  f1-score   support

           0     0.8451    0.8690    0.8569     13361
           1     0.8924    0.8720    0.8821     16639

    accuracy                         0.8707     30000
   macro avg     0.8687    0.8705    0.8695     30000
weighted avg     0.8713    0.8707    0.8709     30000



{'eval_loss': 0.32238438725471497,
 'eval_accuracy': 0.8707,
 'eval_precision': 0.8923739237392374,
 'eval_recall': 0.8720475990143638,
 'eval_f1': 0.8820936806589866,
 'eval_runtime': 103.9417,
 'eval_samples_per_second': 288.623,
 'eval_steps_per_second': 18.039,
 'epoch': 1.0}

In [None]:
import transformers
print(transformers.__version__)

In [None]:
!pip install evaluate
import numpy as np
import evaluate


# Load accuracy metric
accuracy_metric = evaluate.load("accuracy")

# Define function to compute accuracy
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)  # Convert logits to class labels

    # Compute accuracy
    accuracy = accuracy_metric.compute(predictions=predictions, references=labels)

    return {"accuracy": accuracy["accuracy"]}  # Ensure correct key

# Update Trainer with accuracy computation
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics  # Add metric calculation
)

# Evaluate again
results = trainer.evaluate()

# Print available keys for debugging
print("Available keys in results:", results.keys())

# Print accuracy if available
if "accuracy" in results:
    print(f"Model Accuracy: {results['accuracy'] * 100:.2f}%")
else:
    print("Error: Accuracy key not found in evaluation results.")


In [9]:
import numpy as np
import evaluate

# Load accuracy metric
accuracy_metric = evaluate.load("accuracy")

# Define function to compute accuracy
def compute_metrics(eval_pred):
    logits, labels = eval_pred

    # Debugging: Print structure of eval_pred
    print("Logits shape:", np.array(logits).shape)
    print("Labels shape:", np.array(labels).shape)
    print("Sample logits:", logits[:2])  # Print first two samples for inspection
    print("Sample labels:", labels[:2])

    # Convert logits to class predictions
    predictions = np.argmax(logits, axis=-1)

    # Compute accuracy
    accuracy = accuracy_metric.compute(predictions=predictions, references=labels)

    return {"accuracy": accuracy["accuracy"]}  # Ensure correct key

# Update Trainer with accuracy computation
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

# Run evaluation again
results = trainer.evaluate()
print("Evaluation Results:", results)
print(f"Model Accuracy: {results['eval_accuracy'] * 100:.2f}%")

  trainer = Trainer(


Logits shape: (30000, 2)
Labels shape: (30000,)
Sample logits: [[-2.3457031  2.4335938]
 [ 1.8457031 -2.0410156]]
Sample labels: [1 0]
Evaluation Results: {'eval_loss': 0.32238438725471497, 'eval_model_preparation_time': 0.0052, 'eval_accuracy': 0.8707, 'eval_runtime': 107.6705, 'eval_samples_per_second': 278.628, 'eval_steps_per_second': 17.414}
Model Accuracy: 87.07%


In [10]:
import torch

# Move model to the same device it's trained on
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Define test input (title + body)
test_text = "issue type:  bug report       ansible version:  ansible 1.8.1, but present at least since 1.5.0       environment:  ubuntu 14.10, with ansible in a virtualenv  jinja2==2.7.3 markupsafe==0.23 pyyaml==3.11 ansible==1.8.1 argparse==1.2.1 ecdsa==0.11 paramiko==1.15.1 pycrypto==2.6.1 wsgiref==0.1.2       summary:  similar to gh-5914, the copy module fails when there's json with variables in  the content field, _and_ it is run with  with_items .       steps to reproduce:      yaml - hosts: localhost   tasks:   - copy:         content: \  {{item}} \          dest: /tmp/bug     with_items:     - \ 123\            expected results:  a file containing   \ 123\   .       actual results:      play  localhost                                                                   gathering facts                                                                  ok:  localhost   task:  copy                                                                      failed:  localhost  =>  item=123  => {\ failed\ : true, \ item\ : \ 123\ } msg: could not write content temp file: expected a character buffer object  fatal: all hosts have already failed -- aborting  play recap                                                                                  to retry, use: --limit @/home/username/jsonbug.retry  localhost                  : ok=1    changed=0    unreachable=0    failed=1"


# Tokenize the input and move it to the same device as the model
inputs = tokenizer(test_text, padding="max_length", truncation=True, max_length=256, return_tensors="pt").to(device)

# Get model prediction
model.eval()  # Set model to evaluation mode
with torch.no_grad():
    outputs = model(**inputs)

# Extract logits and convert to class label
logits = outputs.logits
predicted_class = torch.argmax(logits, dim=-1).item()

# Print result
if predicted_class == 0:
    print("Prediction: This issue is classified as a BUG 🐞")
else:
    print("Prediction: This issue is NOT a bug ✅")


Prediction: This issue is classified as a BUG 🐞


In [11]:
from transformers import AutoTokenizer

# Define directory where to save the model
save_directory = "/content/codebert_bug_classifier_model"

# Save the trained model
trainer.save_model(save_directory)

# Save tokenizer
tokenizer.save_pretrained(save_directory)


('/content/codebert_bug_classifier_model/tokenizer_config.json',
 '/content/codebert_bug_classifier_model/special_tokens_map.json',
 '/content/codebert_bug_classifier_model/vocab.json',
 '/content/codebert_bug_classifier_model/merges.txt',
 '/content/codebert_bug_classifier_model/added_tokens.json',
 '/content/codebert_bug_classifier_model/tokenizer.json')

In [12]:
import shutil

# Create a ZIP of the folder
shutil.make_archive(save_directory, 'zip', save_directory)
from google.colab import files
files.download(save_directory + ".zip")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>