In [2]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
import re
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from sklearn.preprocessing import LabelEncoder
import keras_tuner as kt  # Import Keras Tuner

# Specify the path to the pickle files
file_path = 'C://Users//rishi//Downloads//pytracebugs_dataset_v1//buggy_dataset//bugfixes_train.pickle'
file_path1 = 'C://Users//rishi//Downloads//pytracebugs_dataset_v1//buggy_dataset//bugfixes_test.pickle'
file_path2 = 'C://Users//rishi//Downloads//pytracebugs_dataset_v1//buggy_dataset//bugfixes_valid.pickle'

# Load the datasets
data = pd.read_pickle(file_path)
data1 = pd.read_pickle(file_path1)
data2 = pd.read_pickle(file_path2)

# Combine the datasets into one dataframe
df = pd.concat([data, data2, data1], ignore_index=True)

# Remove error types with fewer than 3 occurrences
error_counts = df['traceback_type'].value_counts()
error_types_to_remove = error_counts[error_counts < 10].index  # Remove traceback types with less than 3 occurrences
df = df[~df['traceback_type'].isin(error_types_to_remove)]

# Prepare features and labels
X = df['before_merge']  # Features
y = df['traceback_type']  # Labels

# Stratified split to ensure each traceback_type is represented in all sets
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)  # 80% training, stratified split
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp)  # 10% validation, 10% testing

# Vectorization: Convert text data to numerical vectors using TfidfVectorizer
vectorizer = TfidfVectorizer(max_features=10000)  # Limit to top 10000 features for efficiency
X_train_vec = vectorizer.fit_transform(X_train)
X_val_vec = vectorizer.transform(X_val)
X_test_vec = vectorizer.transform(X_test)

# Convert sparse matrices to dense format to avoid sparse indexing issue
X_train_vec_dense = X_train_vec.toarray()
X_val_vec_dense = X_val_vec.toarray()
X_test_vec_dense = X_test_vec.toarray()

# Convert labels to numerical values
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_val_encoded = label_encoder.transform(y_val)
y_test_encoded = label_encoder.transform(y_test)

# Function to create model for KerasTuner
def build_model(hp):
    model = Sequential()
    
    # Hyperparameter tuning for number of neurons in each layer
    model.add(Dense(units=hp.Int('units_1', min_value=128, max_value=512, step=128), input_dim=X_train_vec_dense.shape[1], activation='relu'))
    model.add(Dropout(hp.Float('dropout_1', min_value=0.4, max_value=0.7, step=0.1)))  # Hyperparameter for dropout
    model.add(Dense(units=hp.Int('units_2', min_value=64, max_value=256, step=64), activation='relu'))
    model.add(Dropout(hp.Float('dropout_2', min_value=0.4, max_value=0.7, step=0.1)))  # Hyperparameter for dropout
    model.add(Dense(units=hp.Int('units_3', min_value=32, max_value=128, step=32), activation='relu'))
    model.add(Dropout(hp.Float('dropout_3', min_value=0.4, max_value=0.7, step=0.1)))  # Hyperparameter for dropout
    
    # Output layer
    model.add(Dense(len(label_encoder.classes_), activation='softmax'))
    
    # Hyperparameter tuning for learning rate of Adam optimizer
    model.compile(optimizer=Adam(learning_rate=hp.Float('learning_rate', min_value=1e-5, max_value=1e-3, sampling='LOG')), 
                  loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    
    return model

# Initialize KerasTuner
tuner = kt.Hyperband(build_model,
                     objective='val_accuracy',
                     max_epochs=10,
                     factor=3,
                     directory='kt_dir',
                     project_name='bug_traceback')

# Perform the hyperparameter search
tuner.search(X_train_vec_dense, y_train_encoded, epochs=10, batch_size=32, validation_data=(X_val_vec_dense, y_val_encoded))

# Get the best model and hyperparameters
best_model = tuner.get_best_models(num_models=1)[0]
best_hyperparameters = tuner.get_best_hyperparameters(num_trials=1)[0]

print("Best hyperparameters found: ", best_hyperparameters)

# Train the best model
history = best_model.fit(X_train_vec_dense, y_train_encoded, epochs=10, batch_size=32, validation_data=(X_val_vec_dense, y_val_encoded))

# Evaluate the best model on the test set
y_pred_encoded = best_model.predict(X_test_vec_dense)
y_pred = label_encoder.inverse_transform(y_pred_encoded.argmax(axis=1))  # Convert predictions back to original labels

# Evaluation
print(classification_report(y_test, y_pred))


Trial 30 Complete [00h 03m 41s]
val_accuracy: 0.3345487713813782

Best val_accuracy So Far: 0.4206928014755249
Total elapsed time: 00h 29m 23s
Best hyperparameters found:  <keras_tuner.src.engine.hyperparameters.hyperparameters.HyperParameters object at 0x000002A839A9A950>
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
                                                              precision    recall  f1-score   support

                                  ArtifactNotRegisteredError       0.00      0.00      0.00         1
                                              AssertionError       0.37      0.30      0.33        92
                                              AttributeError       0.51      0.52      0.51       401
                                             BrokenPipeError       0.50      0.40      0.44         5
                                          CalledProcessError       0.00      0.00      0.00         1
  

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
from sklearn.metrics import accuracy_score
print(accuracy_score(y_test, y_pred))