In [4]:
import numpy as np
import pandas as pd
import time
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, VotingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score

# Read the CSV into a DataFrame, then separate the target column ('label')
# from the remaining columns, which are used as features.
DATA_PATH = "E:/Shuvo/Dataset_1.csv"
data = pd.read_csv(DATA_PATH)
y = data['label']
X = data.drop(columns=['label'])

# Training fractions to evaluate; 0.7, 0.8 and 0.9 correspond to the
# 70:30, 80:20 and 90:10 train/test partitions.
split_ratios = [0.7, 0.8, 0.9]

# Result accumulators: each list receives one dict per (split, classifier) pair
# and is converted to a DataFrame once the experiment has finished.
accuracy_results, confusion_matrix_results = [], []
auc_results, training_testing_times = [], []

# The 2x2 confusion-matrix rates and the AUC-ROC score below are only defined
# for a two-class target. Decide this once from the full label column instead
# of recomputing it for every classifier in every split.
class_labels = np.unique(y)
is_binary = len(class_labels) == 2

for ratio in split_ratios:
    # `ratio` is the training fraction. It is passed as train_size rather than
    # test_size=1 - ratio because the subtraction is inexact in floating point
    # (1 - 0.7 == 0.30000000000000004) and the test size is rounded up, so the
    # test partition could end up one sample larger than intended.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=ratio, random_state=42
    )

    # Fresh, unfitted base classifiers for every split
    classifiers = {
        'Random Forest': RandomForestClassifier(random_state=42),
        'Gradient Boosting': GradientBoostingClassifier(random_state=42),
        'AdaBoost': AdaBoostClassifier(random_state=42),
        'Naive Bayes': GaussianNB(),
        # probability=True is required so SVC exposes predict_proba, which
        # both soft voting and the AUC computation rely on
        'Support Vector Machine': SVC(probability=True, random_state=42),
        'XGBoost': XGBClassifier(random_state=42),
        'Decision Tree': DecisionTreeClassifier(random_state=42),
        'Logistic Regression': LogisticRegression(random_state=42),
        'K-Nearest Neighbors': KNeighborsClassifier()
    }

    # Ensemble model using VotingClassifier. The estimator list is taken
    # before the ensemble itself is added to the dict, so it holds only the
    # base classifiers.
    ensemble_model = VotingClassifier(
        estimators=list(classifiers.items()),
        voting='soft'  # 'soft' uses predicted probabilities, 'hard' uses predicted class labels
    )

    # Add ensemble model to classifiers so it is evaluated like the others
    classifiers['Ensemble Model'] = ensemble_model

    for clf_name, clf in classifiers.items():
        # Training time measurement. perf_counter() is a monotonic,
        # high-resolution clock intended for measuring short intervals;
        # time.time() is wall-clock time and can be coarse or adjusted.
        start_time = time.perf_counter()
        clf.fit(X_train, y_train)
        end_time = time.perf_counter()
        training_time = end_time - start_time

        # Testing time measurement
        start_time = time.perf_counter()
        y_pred = clf.predict(X_test)
        end_time = time.perf_counter()
        testing_time = end_time - start_time

        # Calculate accuracy
        accuracy = accuracy_score(y_test, y_pred)
        accuracy_results.append({
            'Split Ratio': ratio,
            'Classifier': clf_name,
            'Accuracy': accuracy
        })

        # Calculate confusion matrix metrics (only for binary classification:
        # unpacking into tn/fp/fn/tp needs exactly a 2x2 matrix). Passing
        # labels explicitly keeps the matrix 2x2 even when a class is missing
        # from y_test or y_pred in a particular split.
        if is_binary:
            tn, fp, fn, tp = confusion_matrix(
                y_test, y_pred, labels=class_labels
            ).ravel()
            tpr = tp / (tp + fn)
            fnr = fn / (tp + fn)
            tnr = tn / (tn + fp)
            fpr = fp / (tn + fp)
            confusion_matrix_results.append({
                'Split Ratio': ratio,
                'Classifier': clf_name,
                'True Positive': tp,
                'False Negative': fn,
                'True Negative': tn,
                'False Positive': fp,
                'True Positive Rate': tpr,
                'False Negative Rate': fnr,
                'True Negative Rate': tnr,
                'False Positive Rate': fpr
            })

        # Record training and testing times
        training_testing_times.append({
            'Split Ratio': ratio,
            'Classifier': clf_name,
            'Training Time (s)': training_time,
            'Testing Time (s)': testing_time
        })

        # Calculate AUC-ROC score (only for binary classification), using the
        # predicted probability of the second class column
        if is_binary:
            y_pred_proba = clf.predict_proba(X_test)[:, 1]
            auc = roc_auc_score(y_test, y_pred_proba)
            auc_results.append({
                'Split Ratio': ratio,
                'Classifier': clf_name,
                'AUC-ROC': auc
            })

# Turn the accumulated per-classifier records into DataFrames, one per metric family.
accuracy_df, confusion_matrix_df, training_testing_times_df = (
    pd.DataFrame(records)
    for records in (accuracy_results, confusion_matrix_results, training_testing_times)
)

# AUC records are only collected for a binary target; when there are none,
# auc_df is left as None rather than an empty frame.
auc_df = None
if auc_results:
    auc_df = pd.DataFrame(auc_results)


In [5]:
# Print a heading, then end the cell with the bare DataFrame so the notebook
# renders the per-split, per-classifier accuracy table as the cell output.
print("Accuracy:")
accuracy_df

Accuracy:


Unnamed: 0,Split Ratio,Classifier,Accuracy
0,0.7,Random Forest,0.731602
1,0.7,Gradient Boosting,0.722944
2,0.7,AdaBoost,0.753247
3,0.7,Naive Bayes,0.761905
4,0.7,Support Vector Machine,0.761905
5,0.7,XGBoost,0.753247
6,0.7,Decision Tree,0.69697
7,0.7,Logistic Regression,0.779221
8,0.7,K-Nearest Neighbors,0.692641
9,0.7,Ensemble Model,0.761905


In [6]:
# Print a heading (the leading "\n" emits a blank line first), then render the
# table of confusion-matrix counts (TP/FN/TN/FP) and the rates derived from them.
print("\nConfusion Matrix Metrics:")
confusion_matrix_df


Confusion Matrix Metrics:


Unnamed: 0,Split Ratio,Classifier,True Positive,False Negative,True Negative,False Positive,True Positive Rate,False Negative Rate,True Negative Rate,False Positive Rate
0,0.7,Random Forest,48,32,121,30,0.6,0.4,0.801325,0.198675
1,0.7,Gradient Boosting,51,29,116,35,0.6375,0.3625,0.768212,0.231788
2,0.7,AdaBoost,49,31,125,26,0.6125,0.3875,0.827815,0.172185
3,0.7,Naive Bayes,52,28,124,27,0.65,0.35,0.821192,0.178808
4,0.7,Support Vector Machine,48,32,128,23,0.6,0.4,0.847682,0.152318
5,0.7,XGBoost,55,25,119,32,0.6875,0.3125,0.788079,0.211921
6,0.7,Decision Tree,47,33,114,37,0.5875,0.4125,0.754967,0.245033
7,0.7,Logistic Regression,46,34,134,17,0.575,0.425,0.887417,0.112583
8,0.7,K-Nearest Neighbors,39,41,121,30,0.4875,0.5125,0.801325,0.198675
9,0.7,Ensemble Model,52,28,124,27,0.65,0.35,0.821192,0.178808


In [7]:
# Print a heading, then render the timing table. Although the heading only says
# "Training Times", the frame holds both 'Training Time (s)' and
# 'Testing Time (s)' columns, measured in seconds around fit() and predict().
print("\nTraining Times:")
training_testing_times_df


Training Times:


Unnamed: 0,Split Ratio,Classifier,Training Time (s),Testing Time (s)
0,0.7,Random Forest,0.353051,0.019947
1,0.7,Gradient Boosting,0.332152,0.001991
2,0.7,AdaBoost,0.174497,0.018953
3,0.7,Naive Bayes,0.001995,0.000997
4,0.7,Support Vector Machine,0.083811,0.011931
5,0.7,XGBoost,0.12862,0.011969
6,0.7,Decision Tree,0.017951,0.002994
7,0.7,Logistic Regression,0.012967,0.001993
8,0.7,K-Nearest Neighbors,0.00299,0.015961
9,0.7,Ensemble Model,1.128936,0.073803


In [8]:
# Print a heading, then render the AUC-ROC table.
# NOTE(review): auc_df is set to None when no AUC records were collected
# (non-binary target); in that case this cell presumably shows only the heading.
print("AUC-ROC:")
auc_df

AUC-ROC:


Unnamed: 0,Split Ratio,Classifier,AUC-ROC
0,0.7,Random Forest,0.792632
1,0.7,Gradient Boosting,0.806291
2,0.7,AdaBoost,0.759892
3,0.7,Naive Bayes,0.809106
4,0.7,Support Vector Machine,0.817715
5,0.7,XGBoost,0.776904
6,0.7,Decision Tree,0.671233
7,0.7,Logistic Regression,0.815315
8,0.7,K-Nearest Neighbors,0.736589
9,0.7,Ensemble Model,0.8125
