In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB  # Importing MultinomialNB
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer
from transformers import DistilBertTokenizer, DistilBertModel
import torch
from sklearn.model_selection import StratifiedKFold
from torch import nn
import torch.nn.functional as F

# Placeholder "GCN": a plain two-layer feed-forward classifier (no graph input is used)
class GCNModel(nn.Module):
    """Feed-forward stand-in for a graph convolutional network.

    The input is projected to ``hidden_dim`` features, passed through a ReLU,
    projected to ``output_dim`` scores and normalised with a log-softmax.
    """

    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()
        self.gc = nn.Linear(input_dim, hidden_dim)
        self.relu = nn.ReLU()
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        """Return log-probabilities, normalised along dimension 1 of the output."""
        hidden = self.relu(self.gc(x))
        scores = self.fc(hidden)
        return F.log_softmax(scores, dim=1)

# LSTM-based sequence classifier
class RNNModel(nn.Module):
    """Single LSTM layer (batch-first) followed by a linear classification head."""

    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()
        self.rnn = nn.LSTM(input_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        """Classify each sequence from the LSTM output at its final time step.

        Inputs that are not already float32 (e.g. integer tensors) are cast
        to float32 before being fed to the LSTM. Returns log-probabilities
        normalised along dimension 1.
        """
        inputs = x if x.dtype == torch.float32 else x.to(torch.float32)

        sequence_out, _state = self.rnn(inputs)
        # With batch_first=True, index -1 on axis 1 selects the last time step
        final_step = sequence_out[:, -1, :]
        return F.log_softmax(self.fc(final_step), dim=1)

def load_and_preprocess_data(data_path, threshold=10):
    """Load a bug-report CSV and label each report by how long the fix took.

    Args:
        data_path: Location of the CSV, passed straight to ``pandas.read_csv``.
            The file must provide ``creation_date``, ``resolution_date`` and
            ``short_description`` columns.
        threshold: Largest fix time, in days, that is still labelled
            ``'short-lived'``; longer fix times are labelled ``'long-lived'``.

    Returns:
        A DataFrame with two added columns: ``bug_fix_time`` (days between
        creation and resolution) and ``bug_class``. Rows without a computable
        fix time or without a ``short_description`` are dropped.
    """
    bug_reports = pd.read_csv(data_path)

    # Parse both date columns leniently: an unparseable value becomes NaT and
    # the row is dropped below, instead of one bad cell aborting the whole load.
    for date_column in ('creation_date', 'resolution_date'):
        bug_reports[date_column] = pd.to_datetime(bug_reports[date_column], errors='coerce')

    bug_reports['bug_fix_time'] = (bug_reports['resolution_date'] - bug_reports['creation_date']).dt.days

    # Take an explicit copy so the column assignment below writes to an
    # independent frame (avoids pandas' SettingWithCopyWarning).
    bug_reports = bug_reports.dropna(subset=['bug_fix_time', 'short_description']).copy()

    bug_reports['bug_class'] = np.where(bug_reports['bug_fix_time'] <= threshold, 'short-lived', 'long-lived')

    return bug_reports

def extract_tfidf_features(bug_reports):
    """Vectorise the ``short_description`` column with TF-IDF.

    The vectoriser is fitted on the rows it is given, keeps at most 128 terms
    and removes English stop words. The sparse result is converted to a dense
    array before being returned.
    """
    descriptions = bug_reports['short_description']
    vectorizer = TfidfVectorizer(max_features=128, stop_words='english')
    sparse_matrix = vectorizer.fit_transform(descriptions)
    return sparse_matrix.toarray()

def extract_distilbert_features(bug_reports):
    """Embed every ``short_description`` with pretrained DistilBERT.

    Each description is tokenised (with truncation) and run through
    ``distilbert-base-uncased`` on its own; its feature vector is the mean of
    the final hidden states over the token axis.

    Args:
        bug_reports: DataFrame with a ``short_description`` column.

    Returns:
        A ``torch.Tensor`` built from the stacked per-report feature vectors.
    """
    tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
    model = DistilBertModel.from_pretrained('distilbert-base-uncased')
    # Feature extraction only: make sure dropout layers are in inference mode.
    model.eval()

    def extract_distilbert_feature(text):
        inputs = tokenizer(text, return_tensors="pt", truncation=True)
        # No gradients are needed here; skipping autograd avoids building and
        # holding a computation graph for every single report.
        with torch.no_grad():
            outputs = model(**inputs)
        # mean over tokens leaves a leading batch axis of size 1; drop it
        return outputs.last_hidden_state.mean(dim=1).squeeze(0).numpy()

    distilbert_features = np.array(bug_reports['short_description'].apply(extract_distilbert_feature).tolist())
    distilbert_features = torch.tensor(distilbert_features)

    return distilbert_features

def extract_gcn_features(bug_reports):
    """Return placeholder "GCN" features: one random 256-value row per report.

    Only ``len(bug_reports)`` is used; the values come from ``torch.randn``
    and carry no information about the reports themselves.
    """
    num_reports = len(bug_reports)
    feature_width = 256  # placeholder dimensionality
    return torch.randn(num_reports, feature_width)

def extract_rnn_features(bug_reports):
    """Return placeholder "RNN" features for every report.

    Random sequences stand in for real padded text input and are passed
    through a freshly initialised (untrained) ``RNNModel`` with gradients
    disabled. Only ``len(bug_reports)`` is read from the argument.
    """
    # Dummy stand-in for padded texts: (reports, 10 steps, 50 values per step)
    num_reports = len(bug_reports)
    dummy_sequences = torch.randn(num_reports, 10, 50)

    # Untrained model sized to the last axis of the dummy input
    model = RNNModel(input_dim=dummy_sequences.shape[-1], hidden_dim=128, output_dim=2)

    with torch.no_grad():
        return model(dummy_sequences)

def train_and_evaluate_classifier(X_train, X_test, y_train, y_test, classifier, feature_type):
    """Fit ``classifier`` on the training split and score it on the test split.

    Args:
        X_train, X_test: Feature matrices for the two splits.
        y_train, y_test: Matching label sequences.
        classifier: Estimator exposing ``fit`` and ``predict``.
        feature_type: Name of the feature set, copied into the result.

    Returns:
        A dict with keys ``'Classifier'`` (the estimator's class name),
        ``'Feature Type'``, ``'Accuracy'``, ``'Balanced Accuracy'`` and the
        weighted-average ``'Precision'``, ``'Recall'`` and ``'F1-score'``.
    """
    classifier.fit(X_train, y_train)
    predictions = classifier.predict(X_test)

    # Balanced accuracy is the mean per-class recall. The confusion matrix has
    # one row per label found in y_test OR in the predictions, so a class that
    # is only ever predicted produces an all-zero row. Average over rows with
    # at least one true sample instead of dividing by zero or mis-indexing.
    cm = confusion_matrix(y_test, predictions)
    support = cm.sum(axis=1)
    has_support = support > 0
    per_class_recall = np.diag(cm)[has_support] / support[has_support]
    balanced_accuracy = np.mean(per_class_recall)

    acc_score = accuracy_score(y_test, predictions)
    class_report = classification_report(y_test, predictions, output_dict=True)

    # Support-weighted averages across classes
    weighted_avg = class_report['weighted avg']

    return {
        'Classifier': classifier.__class__.__name__,
        'Feature Type': feature_type,
        'Accuracy': acc_score,
        'Balanced Accuracy': balanced_accuracy,
        'Precision': weighted_avg['precision'],
        'Recall': weighted_avg['recall'],
        'F1-score': weighted_avg['f1-score']
    }

def plot_bar_chart(accuracies_tfidf, accuracies_distilbert, accuracies_gcn, accuracies_rnn, classifiers):
    """Show one grouped bar chart of balanced accuracy per dataset.

    Every ``accuracies_*`` argument is indexed as
    ``[dataset_index][classifier_index]``, where dataset 0 is Mozilla and
    dataset 1 is Eclipse. Each classifier gets a group of four bars (TF-IDF,
    DistilBERT, GCN, RNN), each annotated with its value to two decimals.
    """
    bar_width = 0.2
    opacity = 0.8

    # (scores, bar colour, legend suffix, x offset inside the classifier group)
    series = (
        (accuracies_tfidf, 'skyblue', 'TF-IDF', 0),
        (accuracies_distilbert, 'lightcoral', 'DistilBERT', bar_width),
        (accuracies_gcn, 'lightgreen', 'GCN', 2 * bar_width),
        (accuracies_rnn, 'gold', 'RNN', 3 * bar_width),
    )

    for dataset_idx, dataset in enumerate(['Mozilla', 'Eclipse']):
        plt.figure(figsize=(14, 6))
        positions = np.arange(len(classifiers))

        for clf_idx, classifier in enumerate(classifiers):
            base_x = positions[clf_idx]
            # Look up all four scores before drawing anything for this group
            heights = [scores[dataset_idx][clf_idx] for scores, _, _, _ in series]

            for height, (_, colour, suffix, offset) in zip(heights, series):
                plt.bar(base_x + offset, height, bar_width,
                        alpha=opacity,
                        color=colour,
                        label=f'{classifier} {suffix}')

            # Value labels sit slightly above the top of each bar
            for height, (_, _, _, offset) in zip(heights, series):
                plt.text(base_x + offset, height + 0.02, f'{height:.2f}', ha='center', va='bottom')

        plt.xlabel('Classifiers')
        plt.ylabel('Balanced Accuracy')
        plt.title(f'Balanced Accuracy Comparison for {dataset} Dataset')
        # Centre each tick under its four-bar group
        plt.xticks(positions + 1.5 * bar_width, classifiers)
        plt.legend(loc='upper left', bbox_to_anchor=(1, 1))
        plt.tight_layout()

        plt.show()

def create_summary_table(results):
    """Build a DataFrame with one row per evaluation result.

    Only the seven reporting fields are copied from each result dict, in a
    fixed order; any other keys are ignored and a missing field raises
    ``KeyError``.
    """
    fields = (
        'Classifier',
        'Feature Type',
        'Accuracy',
        'Balanced Accuracy',
        'Precision',
        'Recall',
        'F1-score',
    )

    rows = [{field: result[field] for field in fields} for result in results]

    return pd.DataFrame(rows)


# Load and preprocess data for different projects
data_paths = [
    r'C:\Users\hp\Desktop\New folder (2)\mozilla_bug_report_data.csv',
    r'C:\Users\hp\Desktop\New folder (2)\eclipse_bug_report_data.csv',
]

classifiers = ['KNN', 'NB', 'NN', 'RF', 'SVM']

# Zero-argument factories: every evaluation gets a fresh, unfitted estimator,
# and an unknown classifier name fails loudly with a KeyError.
classifier_factories = {
    'KNN': lambda: KNeighborsClassifier(n_neighbors=5),
    'NB': MultinomialNB,
    'NN': lambda: MLPClassifier(hidden_layer_sizes=(100,), max_iter=500),
    'RF': lambda: RandomForestClassifier(n_estimators=100, random_state=42),
    'SVM': lambda: SVC(kernel='linear', random_state=42),
}

# MultinomialNB requires non-negative inputs. DistilBERT embeddings, the random
# GCN placeholders and the RNN log-probabilities can all be negative, so NB is
# evaluated on TF-IDF features only.
nb_feature_types = {'TF-IDF'}

# Initialize empty lists to collect results
results = []

for data_path in data_paths:
    print(f'Processing data from: {data_path}')
    bug_reports = load_and_preprocess_data(data_path)

    # NOTE(review): the TF-IDF vocabulary is fitted on all rows before the
    # folds are split, so test-fold text influences the features.
    X_tfidf = extract_tfidf_features(bug_reports)
    X_distilbert = extract_distilbert_features(bug_reports)
    X_gcn = extract_gcn_features(bug_reports)  # Placeholder for GCN features
    X_rnn = extract_rnn_features(bug_reports)  # Placeholder for RNN features
    y = bug_reports['bug_class']

    # Initialize StratifiedKFold
    skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

    for train_index, test_index in skf.split(X_tfidf, y):
        X_train_tfidf, X_test_tfidf = X_tfidf[train_index], X_tfidf[test_index]
        X_train_distilbert, X_test_distilbert = X_distilbert[train_index], X_distilbert[test_index]
        X_train_gcn, X_test_gcn = X_gcn[train_index], X_gcn[test_index]
        X_train_rnn, X_test_rnn = X_rnn[train_index], X_rnn[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        # Evaluation order per classifier: TF-IDF, DistilBERT, GCN, RNN
        feature_sets = [
            ('TF-IDF', X_train_tfidf, X_test_tfidf),
            ('DistilBERT', X_train_distilbert, X_test_distilbert),
            ('GCN', X_train_gcn, X_test_gcn),
            ('RNN', X_train_rnn, X_test_rnn),
        ]

        for classifier_name in classifiers:
            make_classifier = classifier_factories[classifier_name]

            for feature_type, X_train, X_test in feature_sets:
                if classifier_name == 'NB' and feature_type not in nb_feature_types:
                    continue  # Skip MultinomialNB where features may be negative

                result = train_and_evaluate_classifier(
                    X_train, X_test, y_train, y_test, make_classifier(), feature_type
                )
                results.append(result)

# Create summary table
summary_table = create_summary_table(results)

# Print or display the summary table
print(summary_table)

# Optionally, save the summary table to a CSV file
summary_table.to_csv('classifier_comparison_summary.csv', index=False)