In [3]:
import numpy as np
import pandas as pd
import time
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, VotingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score

# Read the dataset from disk and separate the target column from the features
data = pd.read_csv("E:/Shuvo/Dataset_2.csv")
y = data['label']
X = data.drop(columns='label')

# Training fractions to evaluate: 70:30, 80:20 and 90:10 train:test partitions
split_ratios = [0.7, 0.8, 0.9]

# One record per (split ratio, classifier) is appended to each of these lists
accuracy_results, confusion_matrix_results, auc_results, training_testing_times = [], [], [], []

# Evaluate every classifier (plus a soft-voting ensemble of all of them) on each
# train/test partition. Reads X, y and split_ratios; appends one record per
# (split ratio, classifier) to accuracy_results, training_testing_times and,
# when the labels are binary, confusion_matrix_results and auc_results.

# The label set does not change between iterations, so compute it once.
class_labels = np.unique(y)
is_binary = len(class_labels) == 2

for ratio in split_ratios:
    # Split the dataset. `1 - ratio` carries floating-point error
    # (1 - 0.7 == 0.30000000000000004) and train_test_split rounds the test
    # count up, so the unrounded value can add a spurious extra test row when
    # n_samples * test fraction is a whole number. Rounding removes that error.
    # NOTE(review): the split is not stratified; consider stratify=y if the
    # class balance should be the same in every partition.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=round(1 - ratio, 10), random_state=42
    )

    # Create fresh base classifiers for this split so no fitted state carries over
    classifiers = {
        'Random Forest': RandomForestClassifier(random_state=42),
        'Gradient Boosting': GradientBoostingClassifier(random_state=42),
        'AdaBoost': AdaBoostClassifier(random_state=42),
        'Naive Bayes': GaussianNB(),
        # probability=True is required so SVC exposes predict_proba for soft voting / AUC
        'Support Vector Machine': SVC(probability=True, random_state=42),
        'XGBoost': XGBClassifier(random_state=42),
        'Decision Tree': DecisionTreeClassifier(random_state=42)
    }

    # Ensemble model using VotingClassifier over all base classifiers
    ensemble_model = VotingClassifier(
        estimators=list(classifiers.items()),
        voting='soft'  # 'soft' uses predicted probabilities, 'hard' uses predicted class labels
    )

    # Add ensemble model to classifiers so it is evaluated like the others
    classifiers['Ensemble Model'] = ensemble_model

    for clf_name, clf in classifiers.items():
        # Training time measurement. perf_counter() is the high-resolution
        # clock meant for timing short intervals; time.time() is a wall clock
        # whose resolution can be too coarse for millisecond-scale fits.
        start_time = time.perf_counter()
        clf.fit(X_train, y_train)
        end_time = time.perf_counter()
        training_time = end_time - start_time

        # Testing time measurement
        start_time = time.perf_counter()
        y_pred = clf.predict(X_test)
        end_time = time.perf_counter()
        testing_time = end_time - start_time

        # Calculate accuracy
        accuracy = accuracy_score(y_test, y_pred)
        accuracy_results.append({
            'Split Ratio': ratio,
            'Classifier': clf_name,
            'Accuracy': accuracy
        })

        # Calculate confusion matrix metrics (only for binary classification:
        # the tn/fp/fn/tp unpacking is defined for a 2x2 matrix only)
        if is_binary:
            # labels= pins the matrix to 2x2 (sorted label order) even if a
            # class happens to be absent from both y_test and y_pred.
            tn, fp, fn, tp = confusion_matrix(y_test, y_pred, labels=class_labels).ravel()
            tpr = tp / (tp + fn)
            fnr = fn / (tp + fn)
            tnr = tn / (tn + fp)
            fpr = fp / (tn + fp)
            confusion_matrix_results.append({
                'Split Ratio': ratio,
                'Classifier': clf_name,
                'True Positive': tp,
                'False Negative': fn,
                'True Negative': tn,
                'False Positive': fp,
                'True Positive Rate': tpr,
                'False Negative Rate': fnr,
                'True Negative Rate': tnr,
                'False Positive Rate': fpr
            })

        # Record training and testing times
        training_testing_times.append({
            'Split Ratio': ratio,
            'Classifier': clf_name,
            'Training Time (s)': training_time,
            'Testing Time (s)': testing_time
        })

        # Calculate AUC-ROC score (only for binary classification)
        if is_binary:
            # Column 1 of predict_proba is the probability of the second (larger) label
            y_pred_proba = clf.predict_proba(X_test)[:, 1]
            auc = roc_auc_score(y_test, y_pred_proba)
            auc_results.append({
                'Split Ratio': ratio,
                'Classifier': clf_name,
                'AUC-ROC': auc
            })

# Turn each list of result records into a table for display
training_testing_times_df = pd.DataFrame(training_testing_times)
confusion_matrix_df = pd.DataFrame(confusion_matrix_results)
accuracy_df = pd.DataFrame(accuracy_results)

# AUC records are only collected for binary labels; with none, keep None as the placeholder
if auc_results:
    auc_df = pd.DataFrame(auc_results)
else:
    auc_df = None

In [4]:
# Show the accuracy of every classifier for every split ratio.
# The bare last expression makes the notebook render the DataFrame as a table.
print("Accuracy:")
accuracy_df

Accuracy:


Unnamed: 0,Split Ratio,Classifier,Accuracy
0,0.7,Random Forest,0.744589
1,0.7,Gradient Boosting,0.718615
2,0.7,AdaBoost,0.766234
3,0.7,Naive Bayes,0.731602
4,0.7,Support Vector Machine,0.770563
5,0.7,XGBoost,0.701299
6,0.7,Decision Tree,0.69697
7,0.7,Ensemble Model,0.766234
8,0.8,Random Forest,0.772727
9,0.8,Gradient Boosting,0.733766


In [5]:
# Show the confusion-matrix counts (TP/FN/TN/FP) and the rates derived from them
# for every classifier and split ratio.
print("\nConfusion Matrix Metrics:")
confusion_matrix_df


Confusion Matrix Metrics:


Unnamed: 0,Split Ratio,Classifier,True Positive,False Negative,True Negative,False Positive,True Positive Rate,False Negative Rate,True Negative Rate,False Positive Rate
0,0.7,Random Forest,53,27,119,32,0.6625,0.3375,0.788079,0.211921
1,0.7,Gradient Boosting,53,27,113,38,0.6625,0.3375,0.748344,0.251656
2,0.7,AdaBoost,53,27,124,27,0.6625,0.3375,0.821192,0.178808
3,0.7,Naive Bayes,47,33,122,29,0.5875,0.4125,0.807947,0.192053
4,0.7,Support Vector Machine,49,31,129,22,0.6125,0.3875,0.854305,0.145695
5,0.7,XGBoost,54,26,108,43,0.675,0.325,0.715232,0.284768
6,0.7,Decision Tree,53,27,108,43,0.6625,0.3375,0.715232,0.284768
7,0.7,Ensemble Model,56,24,121,30,0.7,0.3,0.801325,0.198675
8,0.8,Random Forest,39,16,80,19,0.709091,0.290909,0.808081,0.191919
9,0.8,Gradient Boosting,35,20,78,21,0.636364,0.363636,0.787879,0.212121


In [6]:
# Show the timing table. Despite the "Training Times" heading, the frame holds
# both the 'Training Time (s)' and 'Testing Time (s)' columns.
print("\nTraining Times:")
training_testing_times_df


Training Times:


Unnamed: 0,Split Ratio,Classifier,Training Time (s),Testing Time (s)
0,0.7,Random Forest,0.539555,0.028923
1,0.7,Gradient Boosting,0.374995,0.001995
2,0.7,AdaBoost,0.231379,0.022937
3,0.7,Naive Bayes,0.003989,0.001996
4,0.7,Support Vector Machine,0.101729,0.022938
5,0.7,XGBoost,0.128653,0.008976
6,0.7,Decision Tree,0.012966,0.001995
7,0.7,Ensemble Model,1.186819,0.109706
8,0.8,Random Forest,0.462759,0.016955
9,0.8,Gradient Boosting,0.314155,0.001995


In [7]:
# Show the AUC-ROC table. auc_df is None when no AUC records were collected
# (AUC is only computed for binary labels), in which case nothing is rendered.
print("AUC-ROC:")
auc_df

AUC-ROC:


Unnamed: 0,Split Ratio,Classifier,AUC-ROC
0,0.7,Random Forest,0.802194
1,0.7,Gradient Boosting,0.80894
2,0.7,AdaBoost,0.828022
3,0.7,Naive Bayes,0.792715
4,0.7,Support Vector Machine,0.814983
5,0.7,XGBoost,0.787997
6,0.7,Decision Tree,0.688866
7,0.7,Ensemble Model,0.80745
8,0.8,Random Forest,0.826814
9,0.8,Gradient Boosting,0.803122
