<a href="https://colab.research.google.com/github/sahil301290/Multiclass-BugPriority/blob/main/Multi_Target_Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

EFFICIENT MULTI-TARGET CLASSIFICATION FOR BUG PRIORITY AND RESOLUTION TIME PREDICTION

1. Getting preprocessed data

In [None]:
!git clone 'https://github.com/smadarab/multilabel-classification.git'
!cp /content/multilabel-classification/preprocessed_data.csv /content/preprocessed_data.csv
!rm -rf "/content/multilabel-classification"
!pip install -q scikit-multilearn

2. Download the precomputed embeddings: ["BERT", "ROBERTA", "GPT1", "GPT2", "GPT2-MEDIUM", "GPT2-LARGE", "GPT2-XL"]

In [None]:
!gdown 19YPN0XAPsJjIHHVWMOxwg7C2T6zb9Qfs -O Embeddings.zip
!unzip -q Embeddings.zip -d Embeddings
!rm -rf Embeddings.zip
!ls "/content/Embeddings"

3. Dimensions of generated Embeddings

In [None]:
def print_embeddings_shapes(embeddings_dir, embedding_folders=None):
    """Print the shape of ``embeddings.npy`` in each embedding sub-folder.

    Args:
        embeddings_dir: Directory that contains one sub-folder per embedding model.
        embedding_folders: Optional iterable of sub-folder names to inspect.
            Defaults to the seven model folders used by this notebook.

    For every folder, prints either the array shape or a "not found" message
    when ``<embeddings_dir>/<folder>/embeddings.npy`` does not exist.
    """
    # Imported locally because this cell appears before the notebook's import
    # cell; without these, running the cells in order raises NameError.
    import os

    import numpy as np

    if embedding_folders is None:
        embedding_folders = ("BERT", "ROBERTA", "GPT1", "GPT2", "GPT2-MEDIUM", "GPT2-LARGE", "GPT2-XL")

    for folder in embedding_folders:
        embedding_path = os.path.join(embeddings_dir, folder, 'embeddings.npy')
        if not os.path.exists(embedding_path):
            print(f"Embeddings file not found in {folder}")
            continue
        embeddings = np.load(embedding_path)
        print(f"Shape of embeddings in {folder}: {embeddings.shape}")

# Print the shapes of the embedding files found under /content/Embeddings.
embeddings_dir = "/content/Embeddings"
print_embeddings_shapes(embeddings_dir)

4. Import Libraries

In [None]:
import os
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.metrics import roc_curve, precision_recall_curve, auc, confusion_matrix, classification_report, accuracy_score, matthews_corrcoef, f1_score, precision_score, recall_score
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE
from skmultilearn.problem_transform import BinaryRelevance, ClassifierChain, LabelPowerset
from sklearn.decomposition import PCA
from joblib import Parallel, delayed
import logging
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

logging.basicConfig(level=logging.INFO)

5. Load Data

In [None]:
def load_data(data_path):
    """Read the CSV at *data_path* and normalise the ``time`` column name.

    The file is decoded as ISO-8859-15 and a lowercase ``time`` column, if
    present, is renamed to ``Time``.

    Returns:
        The loaded DataFrame, or ``None`` (after logging an error) when the
        file does not exist.
    """
    try:
        frame = pd.read_csv(data_path, encoding='ISO-8859-15')
    except FileNotFoundError:
        logging.error("File not found. Please check the path and try again.")
        return None

    frame = frame.rename(columns={"time": "Time"})
    logging.info("Data loaded successfully.")
    return frame

6. Data Preprocessing

In [None]:
def create_combined_class(data):
    """Add a ``Time_Priority`` column of the form ``"<Time>_<Priority>"``.

    The column is written onto *data* in place and the same DataFrame is
    returned.
    """
    time_text = data['Time'].astype(str)
    priority_text = data['Priority'].astype(str)
    data['Time_Priority'] = time_text + '_' + priority_text
    return data

def encode_combined_class(data):
    """Integer-encode the ``Time_Priority`` column with a ``LabelEncoder``.

    The codes are stored in a new ``Time_Priority_Encoded`` column on *data*
    (in place).

    Returns:
        A ``(data, encoder)`` tuple; the fitted encoder lets callers map the
        codes back to the original labels.
    """
    combined_labels = data['Time_Priority']
    encoder = LabelEncoder().fit(combined_labels)
    data['Time_Priority_Encoded'] = encoder.transform(combined_labels)
    return data, encoder

7. Correlation Calculation and Plotting

In [None]:
def calculate_pearson_correlation(data):
    """Return (and log) the Pearson correlation matrix of ``Time`` and ``Priority``."""
    time_and_priority = data[['Time', 'Priority']]
    correlation_matrix = time_and_priority.corr(method='pearson')
    logging.info(f"Pearson correlation matrix:\n{correlation_matrix}")
    return correlation_matrix

def plot_correlation_heatmap(correlation_matrix, filename_base):
    """Draw an annotated heatmap of *correlation_matrix* and save it.

    The figure is written to ``<filename_base>.png`` and
    ``<filename_base>.pdf`` (both at 600 dpi) and then shown.
    """
    plt.figure(figsize=(8, 6))
    # Fixed colour limits so the colour scale always spans the full [-1, 1] range.
    sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', vmin=-1, vmax=1)
    plt.title('Correlation Heatmap between Time and Priority')
    plt.tight_layout()
    for extension in ("png", "pdf"):
        plt.savefig(f"{filename_base}.{extension}", dpi=600)
    plt.show()

8. Class Distribution Plotting

In [None]:
def plot_class_distribution(data, column, title, filename_base):
    """Plot and save a bar chart of how often each value of *column* occurs.

    Args:
        data: DataFrame holding the column to count.
        column: Name of the column whose values are counted.
        title: Text appended to "Class Distribution " in the plot title.
        filename_base: Output path without extension; ``.png`` and ``.pdf``
            files are written at 600 dpi.
    """
    fig = plt.figure(figsize=(10, 6))
    sns.countplot(data=data, x=column, palette="viridis")
    plt.title(f"Class Distribution {title}")
    plt.xlabel(f"{column} Class")
    plt.ylabel("Count")

    ax = plt.gca()
    for bar in ax.patches:
        height = bar.get_height()
        # Bar heights are floats; format without decimals so counts read "12", not "12.0".
        ax.annotate(f'{height:.0f}', (bar.get_x() + bar.get_width() / 2., height), ha='center', va='bottom')

    plt.tight_layout()
    plt.savefig(f"{filename_base}.png", dpi=600)
    plt.savefig(f"{filename_base}.pdf", dpi=600)
    plt.show()
    # Release the figure: this function is called repeatedly and open figures accumulate.
    plt.close(fig)

9. SMOTE Function

In [None]:
def apply_smote(X, y):
    """Oversample ``(X, y)`` with SMOTE (``random_state=42``) and log the new shapes.

    Returns:
        The resampled ``(X_resampled, y_resampled)`` pair.
    """
    sampler = SMOTE(random_state=42)
    X_resampled, y_resampled = sampler.fit_resample(X, y)

    for message in (
        "SMOTE applied.",
        f"X_resampled shape: {X_resampled.shape}",
        f"y_resampled shape: {y_resampled.shape}",
    ):
        logging.info(message)

    return X_resampled, y_resampled

10. Split Combined Class

In [None]:
def split_combined_class(data, combined_col):
    """Split a ``"<Time>_<Priority>"`` column back into integer columns.

    Args:
        data: DataFrame containing *combined_col*; modified in place.
        combined_col: Name of the column holding the combined labels.

    Returns:
        The same DataFrame with integer ``Time`` and ``Priority`` columns.

    Raises:
        ValueError: If a label part is not an integral number.
    """
    def _to_int(parts):
        # Parse via a numeric dtype first: labels built from float columns look
        # like "1.0", which a direct astype(int) on strings rejects.
        numeric = pd.to_numeric(parts)
        as_int = numeric.astype(int)
        if not (numeric == as_int).all():
            raise ValueError(f"Non-integer class label found in column '{combined_col}'")
        return as_int

    time_priority_split = data[combined_col].str.split('_', expand=True)
    data['Time'] = _to_int(time_priority_split[0])
    data['Priority'] = _to_int(time_priority_split[1])
    logging.info("Combined class split into Time and Priority.")
    logging.info(data.head())
    return data

11. Load Embeddings

In [None]:
def load_embeddings(embedding_dir):
    """Load and return the array stored in ``<embedding_dir>/embeddings.npy``."""
    embeddings_file = os.path.join(embedding_dir, 'embeddings.npy')
    embeddings = np.load(embeddings_file)
    return embeddings

12. Dimensionality Reduction with PCA

In [None]:
def apply_dimensionality_reduction(X_train_resampled, X_val_vect, X_test_vect, n_components, results_dir="Results"):
    """Fit PCA on the training matrix and project all three splits.

    The PCA is fitted on *X_train_resampled* only; the validation and test
    matrices are transformed with that fitted model. A cumulative
    explained-variance plot is saved as PNG and PDF inside *results_dir*.

    Args:
        X_train_resampled: Training feature matrix used to fit the PCA.
        X_val_vect: Validation feature matrix.
        X_test_vect: Test feature matrix.
        n_components: Number of principal components to keep.
        results_dir: Directory for the plot files; created if missing.

    Returns:
        ``(X_train_pca, X_val_pca, X_test_pca, explained_variance)`` where
        ``explained_variance`` is the per-component explained variance ratio.
    """
    # Make sure the output directory exists so savefig cannot fail when this
    # function is used without a prior call that creates it.
    os.makedirs(results_dir, exist_ok=True)

    reducer = PCA(n_components=n_components)
    X_train_pca = reducer.fit_transform(X_train_resampled)
    X_val_pca = reducer.transform(X_val_vect)
    X_test_pca = reducer.transform(X_test_vect)

    explained_variance = reducer.explained_variance_ratio_

    fig = plt.figure(figsize=(8, 5))
    # Start the x-axis at 1 so each point lines up with the number of
    # components it accumulates (the default index would start at 0).
    component_counts = np.arange(1, len(explained_variance) + 1)
    plt.plot(component_counts, np.cumsum(explained_variance))
    plt.xlabel('Number of Components')
    plt.ylabel('Variance Explained')
    plt.title(f'Explained Variance by PCA Components (n_components={n_components})')
    plt.grid()
    plt.tight_layout()
    plt.savefig(os.path.join(results_dir, f'explained_variance_pca_{n_components}.png'), dpi=600)
    plt.savefig(os.path.join(results_dir, f'explained_variance_pca_{n_components}.pdf'), dpi=600)
    plt.show()
    # Release the figure; this runs once per embedding method.
    plt.close(fig)

    return X_train_pca, X_val_pca, X_test_pca, explained_variance

13. Hyperparameter Tuning with Cross-Validation

In [None]:
def hyperparameter_tuning(model, param_grid, X, y, cv, search_type='grid'):
    """Tune *model* with cross-validated search and return the best estimator.

    Args:
        model: Estimator to tune.
        param_grid: Parameter grid (or distributions) handed to the search.
        X: Training features.
        y: Training targets.
        cv: Cross-validation splitter or fold count.
        search_type: ``'grid'`` for ``GridSearchCV`` or ``'random'`` for
            ``RandomizedSearchCV`` (50 iterations, ``random_state=42``).

    Returns:
        The ``best_estimator_`` of the fitted search.

    Raises:
        ValueError: If *search_type* is neither ``'grid'`` nor ``'random'``.
    """
    # Validate up front: previously an unknown value left `search` unbound and
    # surfaced as an UnboundLocalError at the fit call.
    if search_type not in ('grid', 'random'):
        raise ValueError(f"Unknown search_type {search_type!r}; expected 'grid' or 'random'.")

    if search_type == 'grid':
        search = GridSearchCV(model, param_grid, cv=cv, scoring='accuracy', n_jobs=-1)
    else:
        search = RandomizedSearchCV(model, param_grid, n_iter=50, cv=cv, scoring='accuracy', n_jobs=-1, random_state=42)

    search.fit(X, y)
    logging.info(f"Best parameters for {model.__class__.__name__}: {search.best_params_}")
    logging.info(f"Best cross-validation accuracy: {search.best_score_}")
    return search.best_estimator_

14. Model Evaluation

In [None]:
def evaluate_model(transformed_model, X_test_pca, y_test, model_name, model_results_dir):
    """Evaluate a fitted multi-target model on the test set.

    For each target (``Time`` then ``Priority``) this saves a classification
    report, computes accuracy, macro precision/recall/F1 and MCC, and, when
    probabilities are available, draws the aggregated metric plots.

    Args:
        transformed_model: Fitted model whose ``predict`` output has one
            column per target, in the order ``Time``, ``Priority``.
        X_test_pca: Test feature matrix.
        y_test: DataFrame with ``Time`` and ``Priority`` columns (true labels).
        model_name: Name used in log messages and output file names.
        model_results_dir: Existing directory that receives all output files.

    Returns:
        Dict mapping each target name to a dict with keys ``accuracy``,
        ``precision``, ``recall``, ``f1_score``, ``mcc`` and ``report``.
    """
    def _dense(matrix):
        # Accept sparse results (which expose .toarray()) as well as dense ones.
        return matrix.toarray() if hasattr(matrix, 'toarray') else np.asarray(matrix)

    logging.info(f"Evaluating model: {model_name}")
    predictions = _dense(transformed_model.predict(X_test_pca))

    proba_predictions = None
    if hasattr(transformed_model, 'predict_proba'):
        # The wrapper can expose predict_proba even though the wrapped estimator
        # cannot produce probabilities (e.g. SVC() with its default settings, as
        # used in main()); treat that as "no probabilities" instead of crashing.
        try:
            proba_predictions = _dense(transformed_model.predict_proba(X_test_pca))
        except (AttributeError, NotImplementedError) as exc:
            logging.warning(f"Probabilities unavailable for {model_name}: {exc}")

    if proba_predictions is not None:
        np.save(os.path.join(model_results_dir, f'{model_name}_probabilities.npy'), proba_predictions)
        logging.info(f"Probabilities saved for {model_name}")

    y_test_combined = np.hstack((y_test['Time'].values.reshape(-1, 1), y_test['Priority'].values.reshape(-1, 1)))

    results = {}
    labels = ['Time', 'Priority']

    for i, label in enumerate(labels):
        y_true = y_test_combined[:, i]
        y_pred = predictions[:, i]
        report = save_classification_reports(y_true, y_pred, model_results_dir, label)

        results[label] = {
            "accuracy": accuracy_score(y_true, y_pred),
            "precision": precision_score(y_true, y_pred, average='macro'),
            "recall": recall_score(y_true, y_pred, average='macro'),
            "f1_score": f1_score(y_true, y_pred, average='macro'),
            "mcc": matthews_corrcoef(y_true, y_pred),
            "report": report
        }
        if proba_predictions is not None:
            # Column i of the probability matrix is used as the score for target i.
            y_scores = proba_predictions[:, i] if proba_predictions.ndim > 1 else proba_predictions
            plot_metrics_aggregated(y_true, y_scores, y_pred, model_name, model_results_dir, label)

        # Log the classification report
        logging.info(f"Classification Report for {model_name} - {label}:\n{pd.DataFrame(report).transpose()}")

    return results

15. Plotting

In [None]:
def plot_metrics_aggregated(y_true, y_scores, y_pred, model_name, model_results_dir, label):
    """Plot ROC curves, precision-recall curves and a confusion matrix side by side.

    One ROC and one precision-recall curve is drawn per class present in
    *y_true* (one-vs-rest), all using the same score vector.

    Args:
        y_true: True class labels for one target.
        y_scores: Scores per sample; if 2-D, only the first column is used.
        y_pred: Predicted class labels for the same target.
        model_name: Name used in the figure title and file names.
        model_results_dir: Existing directory for the PNG/PDF output.
        label: Target name (e.g. ``Time``) used in the title and file names.
    """
    y_true = np.asarray(y_true)
    y_scores = np.asarray(y_scores)
    # NOTE(review): every class curve is scored with this single vector;
    # confirm that is the intended score for multi-class targets.
    scores = y_scores if y_scores.ndim == 1 else y_scores[:, 0]

    fig, axs = plt.subplots(1, 3, figsize=(18, 6))

    # Iterate over the labels actually present rather than range(n_classes):
    # with labels that do not start at 0 (e.g. 1..5) the old loop plotted an
    # empty "class 0" and skipped the highest class.
    for cls in np.unique(y_true):
        y_true_bin = (y_true == cls).astype(int)
        fpr, tpr, _ = roc_curve(y_true_bin, scores)
        precision, recall, _ = precision_recall_curve(y_true_bin, scores)
        roc_auc = auc(fpr, tpr)
        pr_auc = auc(recall, precision)

        axs[0].plot(fpr, tpr, lw=2, label=f'Class {cls} (area = {roc_auc:.2f})')
        axs[1].plot(recall, precision, lw=2, label=f'Class {cls} (area = {pr_auc:.2f})')

    # Diagonal reference line for the ROC panel.
    axs[0].plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    axs[0].set_xlim([0.0, 1.0])
    axs[0].set_ylim([0.0, 1.05])
    axs[0].set_xlabel('False Positive Rate')
    axs[0].set_ylabel('True Positive Rate')
    axs[0].set_title('ROC', fontsize=14)
    axs[0].legend(loc="lower right")

    axs[1].set_xlabel('Recall')
    axs[1].set_ylabel('Precision')
    axs[1].set_ylim([0.0, 1.05])
    axs[1].set_xlim([0.0, 1.0])
    axs[1].set_title('Precision-Recall Curve', fontsize=14)
    axs[1].legend(loc="lower left")

    matrix = confusion_matrix(y_true, y_pred)
    cax = axs[2].matshow(matrix, cmap="Blues")
    fig.colorbar(cax, ax=axs[2])
    axs[2].set_xlabel('Predicted label')
    axs[2].set_ylabel('True label')
    axs[2].set_title('Confusion Matrix', fontsize=14)
    # Write each cell count at its (column, row) position.
    for (row, col), val in np.ndenumerate(matrix):
        axs[2].text(col, row, f'{val}', ha='center', va='center', color='black')

    plt.subplots_adjust(top=0.85)
    fig.suptitle(f'Performance Metrics for {model_name} - {label}', fontsize=16)
    plt.tight_layout()
    plt.savefig(os.path.join(model_results_dir, f'{model_name}_{label}_metrics_aggregated.png'), dpi=600)
    plt.savefig(os.path.join(model_results_dir, f'{model_name}_{label}_metrics_aggregated.pdf'), dpi=600)
    plt.show()
    # Release the figure: this is called for every model/target combination.
    plt.close(fig)

16. Creating Results Directory and Saving Results

In [None]:
def create_results_directory(path):
    """Create directory *path*, including missing parents; an existing directory is left untouched."""
    os.makedirs(name=path, exist_ok=True)

def save_classification_reports(y_true, y_pred, model_results_dir, label):
    """Write the classification report for one target to CSV.

    The CSV (``classification_report_<label>.csv`` in *model_results_dir*)
    holds the transposed report plus an ``mcc`` column carrying the Matthews
    correlation coefficient on every row.

    Returns:
        The classification report as a dict.
    """
    report = classification_report(y_true, y_pred, output_dict=True)
    csv_path = os.path.join(model_results_dir, f'classification_report_{label}.csv')

    report_table = pd.DataFrame(report).transpose()
    report_table['mcc'] = matthews_corrcoef(y_true, y_pred)
    report_table.to_csv(csv_path)

    return report

def save_results_to_csv(results, filename):
    """Write *results* (anything ``pd.DataFrame`` accepts) to *filename* as CSV without the index."""
    pd.DataFrame(results).to_csv(filename, index=False)

17. Main Function:

      - Loads and preprocesses the data.
      - Splits the data into train, validation, and test sets.
      - Applies SMOTE to the training set.
      - Loads embeddings for each method and processes them.
      - Applies PCA for dimensionality reduction.
      - Performs cross-validation, hyperparameter tuning, model training, and evaluation.
      - Saves the performance results to CSV files.
      - Logs the completion of processing.


In [None]:
def main():
    """Run the full multi-target experiment and write all results under ``Results``.

    Steps:
        1. Load ``/content/preprocessed_data.csv`` and keep the ``Priority``,
           ``Summary`` and ``Time`` columns without missing values.
        2. Build and encode the combined ``Time_Priority`` class; plot the
           Time/Priority correlation and the class distribution.
        3. Split the row index 40:40:20 into train, validation and test.
        4. For every embedding method: apply SMOTE to the training split,
           reduce all splits with PCA, tune each base model, wrap it in each
           problem transformation, fit, and evaluate on the test split.
        5. Save one summary row per (embedding, model, target) to
           ``Results/model_performance_summary.csv``.

    Logs an error and returns early when the data file cannot be loaded.
    """
    import io  # local import: only needed to capture DataFrame.info() output

    results_dir = "Results"
    create_results_directory(results_dir)

    data_path = "/content/preprocessed_data.csv"
    data = load_data(data_path)

    if data is None:
        logging.error("Data loading failed.")
        return

    logging.info(data.head())
    data = data[['Priority', 'Summary', 'Time']].dropna()
    # DataFrame.info() writes its summary to a buffer and returns None, so
    # capture the text instead of logging the return value.
    info_buffer = io.StringIO()
    data.info(buf=info_buffer)
    logging.info(info_buffer.getvalue())

    # Create the combined Time_Priority class
    data = create_combined_class(data)
    logging.info("Combined class created.")
    logging.info(data.head())

    # Encode the combined Time_Priority class
    data, label_encoder = encode_combined_class(data)

    # Calculate Pearson correlation between Time and Priority
    correlation_matrix = calculate_pearson_correlation(data)

    # Plot correlation heatmap
    plot_correlation_heatmap(correlation_matrix, os.path.join(results_dir, 'correlation_heatmap_time_priority'))

    # Plot the distribution of the combined Time_Priority class before SMOTE
    plot_class_distribution(data, 'Time_Priority', "Time_Priority Before SMOTE", os.path.join(results_dir, 'class_distribution_time_priority_before_smote'))

    # Prepare the dataset for SMOTE
    y = data['Time_Priority_Encoded']

    # Split the dataset into train, validation, and test sets (40:40:20)
    train_val_idx, test_idx = train_test_split(data.index, test_size=0.2, random_state=42)
    train_idx, val_idx = train_test_split(train_val_idx, test_size=0.5, random_state=42)

    X_train, X_val, X_test = data.loc[train_idx], data.loc[val_idx], data.loc[test_idx]
    y_train, y_val, y_test = y.loc[train_idx], y.loc[val_idx], y.loc[test_idx]

    logging.info(f"X_train shape: {X_train.shape}, X_val shape: {X_val.shape}, X_test shape: {X_test.shape}")
    logging.info(f"y_train shape: {y_train.shape}, y_val shape: {y_val.shape}, y_test shape: {y_test.shape}")

    # Maps the results sub-folder name to the embeddings folder name.
    vectorizing_methods = {
        'bert': 'BERT',
        'roberta': 'ROBERTA',
        'gpt1': 'GPT1',
        'gpt2': 'GPT2',
        'gpt2-medium': 'GPT2-MEDIUM',
        'gpt2-large': 'GPT2-LARGE',
        'gpt2-xl': 'GPT2-XL'
    }

    models = {
        "SVC": SVC(),
        "Random Forest": RandomForestClassifier(),
        "Logistic Regression": LogisticRegression(),
        "KNeighbors": KNeighborsClassifier(),
        "Decision Tree": DecisionTreeClassifier(),
        "Gradient Boosting": GradientBoostingClassifier(),
        "AdaBoost": AdaBoostClassifier(),
        "GaussianNB": GaussianNB(),
        "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss')
    }

    param_grids = {
        'SVC': {
            'C': [0.1, 1, 10, 100],
            'kernel': ['linear', 'rbf', 'poly'],
            'gamma': ['scale', 'auto']
        },
        'Random Forest': {
            'n_estimators': [10, 50, 100, 200],
            'max_depth': [None, 10, 20, 30],
            'min_samples_split': [2, 5, 10]
        },
        'Logistic Regression': {
            'C': [0.1, 1, 10, 100],
            'max_iter': [100, 500, 1000]
        },
        'KNeighbors': {
            'n_neighbors': [3, 5, 7, 9],
            'weights': ['uniform', 'distance'],
            'algorithm': ['ball_tree', 'kd_tree', 'brute']
        },
        'Decision Tree': {
            'max_depth': [None, 10, 20, 30],
            'min_samples_split': [2, 5, 10]
        },
        'Gradient Boosting': {
            'learning_rate': [0.01, 0.1, 0.2],
            'n_estimators': [100, 200, 300],
            'max_depth': [3, 4, 5]
        },
        'AdaBoost': {
            'learning_rate': [0.01, 0.1, 1],
            'n_estimators': [50, 100, 200]
        },
        'GaussianNB': {
            # No hyperparameters are tuned for GaussianNB.
        },
        'XGBoost': {
            'learning_rate': [0.01, 0.1, 0.2],
            'n_estimators': [100, 200, 300],
            'max_depth': [3, 4, 5]
        }
    }

    transformations = {
        "Binary Relevance": BinaryRelevance,
        "Classifier Chains": ClassifierChain,
        "Label Powerset": LabelPowerset
    }

    all_results = []

    for method, embedding_dir in vectorizing_methods.items():
        logging.info(f"Using embeddings from {method}...")
        embeddings_dir = os.path.join("Embeddings", embedding_dir)
        embeddings = load_embeddings(embeddings_dir)

        # Split the embeddings into train, validation, and test sets
        # NOTE(review): train_idx/val_idx/test_idx are index labels of `data`
        # taken after dropna(), but are used here as row positions of the
        # embeddings array - confirm the embedding rows line up with them.
        X_train_vect = embeddings[train_idx]
        X_val_vect = embeddings[val_idx]
        X_test_vect = embeddings[test_idx]

        # Apply SMOTE on the vectorized data
        X_train_resampled, y_train_resampled = apply_smote(X_train_vect, y_train)

        # Convert the resampled data back to a DataFrame for plotting
        y_train_resampled_df = pd.DataFrame(y_train_resampled, columns=['Time_Priority_Encoded'])
        y_train_resampled_df['Time_Priority'] = label_encoder.inverse_transform(y_train_resampled_df['Time_Priority_Encoded'])

        # Plot the distribution of the combined Time_Priority class after SMOTE
        method_results_dir = os.path.join(results_dir, method)
        create_results_directory(method_results_dir)
        plot_class_distribution(y_train_resampled_df, 'Time_Priority', f"Time_Priority After SMOTE {method}", os.path.join(method_results_dir, 'class_distribution_time_priority_after_smote'))

        # Split Time and Priority back from Time_Priority for all sets
        y_train_resampled_df = split_combined_class(y_train_resampled_df, 'Time_Priority')
        y_val_df = split_combined_class(pd.DataFrame({'Time_Priority': label_encoder.inverse_transform(y_val)}, columns=['Time_Priority']), 'Time_Priority')
        y_test_df = split_combined_class(pd.DataFrame({'Time_Priority': label_encoder.inverse_transform(y_test)}, columns=['Time_Priority']), 'Time_Priority')

        # Plot the distribution of 'Time' and 'Priority' separately after SMOTE
        plot_class_distribution(y_train_resampled_df, 'Time', f"Time After SMOTE {method}", os.path.join(method_results_dir, 'class_distribution_time_after_smote'))
        plot_class_distribution(y_train_resampled_df, 'Priority', f"Priority After SMOTE {method}", os.path.join(method_results_dir, 'class_distribution_priority_after_smote'))

        logging.info("Data after splitting combined class:")
        logging.info(y_train_resampled_df.head())
        logging.info(y_val_df.head())
        logging.info(y_test_df.head())

        # Apply dimensionality reduction with PCA with 25 components
        n_components = 25
        X_train_pca, X_val_pca, X_test_pca, explained_variances = apply_dimensionality_reduction(X_train_resampled, X_val_vect, X_test_vect, n_components)

        cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

        # Two-column (Time, Priority) training target; identical for every
        # model and transformation, so build it once per embedding method.
        y_train_targets = np.hstack((y_train_resampled_df['Time'].values.reshape(-1, 1), y_train_resampled_df['Priority'].values.reshape(-1, 1)))

        for model_name, model in models.items():
            # Only search when there is something to search over; an empty grid
            # (GaussianNB) would just cross-validate the default model.
            grid = param_grids.get(model_name)
            if grid:
                logging.info(f"Performing hyperparameter tuning for {model_name}...")
                best_model = hyperparameter_tuning(model, grid, X_train_pca, y_train_resampled_df['Time_Priority_Encoded'], cv, search_type='grid')
            else:
                best_model = model

            for transform_name, transform in transformations.items():
                full_model_name = f"{transform_name} with {model_name}"
                model_specific_results_dir = os.path.join(method_results_dir, full_model_name.replace(' ', '_'))
                os.makedirs(model_specific_results_dir, exist_ok=True)
                logging.info(f"Training {full_model_name} model...")
                transformed_model = transform(best_model)
                transformed_model.fit(X_train_pca, y_train_targets)
                model_results = evaluate_model(transformed_model, X_test_pca, y_test_df, full_model_name, model_specific_results_dir)
                all_results.append({
                    "Embedding Method": method,
                    "Model": full_model_name,
                    "Results": model_results
                })
                logging.info(f"{full_model_name} model trained.")
                # The per-target classification report is already logged by
                # evaluate_model, so only the accuracy is logged here.
                for label in model_results:
                    logging.info(f"{full_model_name} - {label} accuracy: {model_results[label]['accuracy']}")

    # Prepare results for CSV export
    csv_results = []
    for result in all_results:
        embedding_method = result["Embedding Method"]
        model = result["Model"]
        for label, metrics in result["Results"].items():
            csv_results.append({
                "Embedding Method": embedding_method,
                "Model": model,
                "Label": label,
                "Accuracy": metrics["accuracy"],
                "Precision": metrics["precision"],
                "Recall": metrics["recall"],
                "F1 Score": metrics["f1_score"],
                "MCC": metrics["mcc"]
            })

    # Save all results to a CSV file
    results_csv_path = os.path.join(results_dir, "model_performance_summary.csv")
    save_results_to_csv(csv_results, results_csv_path)

    logging.info("Processing complete.")

# Run the pipeline only when this code is executed directly, not when imported.
if __name__ == "__main__":
    main()