In [1]:
import pandas as pd
import re
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import BertTokenizer, BertModel
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.utils.class_weight import compute_class_weight
from imblearn.over_sampling import SMOTE
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from torch.optim import AdamW
from torch.nn.utils import clip_grad_norm_
import numpy as np
from torch.cuda.amp import autocast, GradScaler
from imblearn.combine import SMOTEENN 


In [2]:

# Load your dataset
file_path = 'F:\\finetunining sample\\combined_dataset.csv'
dataset = pd.read_csv(file_path)
# dataset = dataset.iloc[:10000]

# Initialize the tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Preprocessing steps
dataset['input_text'] = dataset['input_text'].str.lower()
dataset['input_text'] = dataset['input_text'].apply(lambda x: re.sub(r'[^a-zA-Z0-9\s.,!?]', '', x))
dataset['input_text_tokens'] = dataset['input_text'].apply(lambda x: tokenizer.tokenize(x))
max_length = 128

dataset['input_text_padded'] = dataset['input_text_tokens'].apply(
    lambda x: tokenizer.convert_tokens_to_ids(x)[:max_length] + [0] * (max_length - len(x))
)




In [3]:

# Encode the target variables
label_encoder_priority = LabelEncoder()
label_encoder_resolution = LabelEncoder()

dataset['priority_encoded'] = label_encoder_priority.fit_transform(dataset['priority_binned'])
dataset['resolution_encoded'] = label_encoder_resolution.fit_transform(dataset['bug_resolution_time'])

In [5]:
dataset['priority_encoded'].value_counts()

priority_encoded
1    10912
2     9405
0     2335
Name: count, dtype: int64

In [8]:
dataset['resolution_encoded'].value_counts()

resolution_encoded
2    1984
0    1653
1    1033
Name: count, dtype: int64

In [70]:

# Encode the target variables
label_encoder_priority = LabelEncoder()
label_encoder_resolution = LabelEncoder()

dataset['priority_encoded'] = label_encoder_priority.fit_transform(dataset['priority_binned'])
dataset['resolution_encoded'] = label_encoder_resolution.fit_transform(dataset['bug_resolution_time'])

# Split dataset into train, validation, and test sets
train_df, test_df = train_test_split(dataset, test_size=0.2, random_state=42, stratify=dataset[['priority_encoded', 'resolution_encoded']])
val_df, test_df = train_test_split(test_df, test_size=0.5, random_state=42, stratify=test_df[['priority_encoded', 'resolution_encoded']])

# Convert columns to tensors
def convert_to_tensor_with_mask(df, feature_col, target_col):
    features = torch.tensor(df[feature_col].tolist(), dtype=torch.long).cuda()
    attention_mask = (features != 0).long().cuda()
    targets = torch.tensor(df[target_col].tolist(), dtype=torch.long).cuda()
    return features, attention_mask, targets

train_features, train_attention_mask, train_priority = convert_to_tensor_with_mask(train_df, 'input_text_padded', 'priority_encoded')
train_features, train_attention_mask, train_resolution = convert_to_tensor_with_mask(train_df, 'input_text_padded', 'resolution_encoded')

val_features, val_attention_mask, val_priority = convert_to_tensor_with_mask(val_df, 'input_text_padded', 'priority_encoded')
val_features, val_attention_mask, val_resolution = convert_to_tensor_with_mask(val_df, 'input_text_padded', 'resolution_encoded')


In [71]:

# SMOTE for class imbalance
X_train = np.array(train_df['input_text_padded'].tolist())
y_train_priority = np.array(train_df['priority_encoded'])
y_train_resolution = np.array(train_df['resolution_encoded'])

print(f"Original X_train size: {X_train.shape}")
print(f"Original y_train_priority size: {y_train_priority.shape}")
print(f"Original y_train_resolution size: {y_train_resolution.shape}")
# Apply SMOTE to each target separately
smote_priority = SMOTE(random_state=42)
X_train_smote, y_train_priority_smote = smote_priority.fit_resample(X_train, y_train_priority)

smote_resolution = SMOTE(random_state=42)
X_train_smote, y_train_resolution_smote = smote_resolution.fit_resample(X_train, y_train_resolution)


Original X_train size: (18121, 128)
Original y_train_priority size: (18121,)
Original y_train_resolution size: (18121,)


In [73]:
pd.Series(y_train_priority_smote).value_counts()

1    8730
2    8730
0    8730
Name: count, dtype: int64

In [74]:
pd.Series(y_train_resolution_smote).value_counts()

1    7294
0    7294
2    7294
Name: count, dtype: int64

In [75]:

val = pd.Series(y_train_priority_smote)
index_vals = list(val[val==1].index[:pd.Series(y_train_resolution_smote).value_counts()[0]]) + list(val[val==0].index[:pd.Series(y_train_resolution_smote).value_counts()[0]]) + list(val[val==2].index[:pd.Series(y_train_resolution_smote).value_counts()[0]])
val = val.iloc[index_vals]
y_train_priority_smote = np.array(val)

print(f"SMOTE X_train_smote size: {X_train_smote.shape}")
print(f"SMOTE y_train_priority_smote size: {y_train_priority_smote.shape}")
print(f"SMOTE y_train_resolution_smote size: {y_train_resolution_smote.shape}")

SMOTE X_train_smote size: (21882, 128)
SMOTE y_train_priority_smote size: (21882,)
SMOTE y_train_resolution_smote size: (21882,)
