In [9]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE
from transformers import BertTokenizer, BertModel
import torch
import numpy as np

# Pick the compute device: use CUDA when PyTorch reports it, else fall back to the CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Read the dataset from the CSV file into a DataFrame
data = pd.read_csv("combined_dataset.csv")

# Load the pretrained BERT tokenizer and encoder, then place the encoder on `device`
# (nn.Module.to returns the same module, so `model` is the moved encoder)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')
model = model.to(device)

def get_embeddings(text):
    """Return a mean-pooled BERT embedding for ``text``.

    The text is tokenized (truncated to at most 128 tokens), moved to the
    module-level ``device``, and run through the module-level ``model``. The
    final-layer token vectors are then averaged over the sequence dimension.

    Args:
        text: Input handed to the tokenizer. The notebook calls this with one
            value of ``data['input_text']`` at a time. If a batch is passed,
            only the embedding of the first item is returned.

    Returns:
        A 1-D NumPy array (on the CPU) holding the mean of
        ``last_hidden_state`` over the token axis for the first sequence.
    """
    # Tokenize the input text and move the resulting tensors to the model's device
    inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=128).to(device)
    # Inference only: disabling autograd avoids building (and holding memory for)
    # a computation graph on every call
    with torch.no_grad():
        outputs = model(**inputs)
    # Average the token embeddings and move the result back to the CPU
    embeddings = torch.mean(outputs.last_hidden_state, dim=1).detach().cpu().numpy()
    return embeddings[0]

# Generate embeddings for the entire dataset (one get_embeddings call per row)
data['embeddings'] = data['input_text'].apply(get_embeddings)

# Join the two target columns into a single label so that SMOTE balances
# the (priority, resolution time) pairs rather than each column separately
combined_target = data['priority_binned'] + "_" + data['bug_resolution_time']

# Remember which original pair each joint label stands for. Decoding through this
# table (instead of splitting the joint label on "_") keeps working when a
# priority or resolution label itself contains an underscore.
label_pairs = data[['priority_binned', 'bug_resolution_time']].drop_duplicates()
pair_by_label = dict(zip(
    label_pairs['priority_binned'] + "_" + label_pairs['bug_resolution_time'],
    zip(label_pairs['priority_binned'], label_pairs['bug_resolution_time']),
))
if len(pair_by_label) != len(label_pairs):
    # e.g. ("a_b", "c") and ("a", "b_c") both join to "a_b_c"
    raise ValueError(
        "Different (priority_binned, bug_resolution_time) pairs produce the same "
        "combined label; the joint label cannot be decoded unambiguously."
    )

# Encode combined target using LabelEncoder
le_combined = LabelEncoder()
combined_target_encoded = le_combined.fit_transform(combined_target)

# Prepare the embeddings as a 2-D feature matrix (one row per sample)
X = np.stack(data['embeddings'].values)

# Apply SMOTE to the combined target
smote = SMOTE(sampling_strategy='auto', random_state=42)
X_res, y_res = smote.fit_resample(X, combined_target_encoded)

# Decode the combined target back into separate priority and resolution time columns
combined_res = le_combined.inverse_transform(y_res)
priority_res, resolution_res = zip(*(pair_by_label[label] for label in combined_res))

# Build a DataFrame from the resampled output: one column per embedding dimension
# plus the two decoded label columns
synthetic_data = pd.DataFrame(X_res, columns=[f'embedding_{i}' for i in range(X_res.shape[1])])
synthetic_data['priority_binned'] = priority_res
synthetic_data['bug_resolution_time'] = resolution_res






In [11]:
# Inspect the column layout of the resampled frame: the embedding_<i> feature
# columns followed by the two label columns
synthetic_data.columns

Index(['embedding_0', 'embedding_1', 'embedding_2', 'embedding_3',
       'embedding_4', 'embedding_5', 'embedding_6', 'embedding_7',
       'embedding_8', 'embedding_9',
       ...
       'embedding_760', 'embedding_761', 'embedding_762', 'embedding_763',
       'embedding_764', 'embedding_765', 'embedding_766', 'embedding_767',
       'priority_binned', 'bug_resolution_time'],
      dtype='object', length=770)