In [3]:
import numpy as np
import pandas as pd
import time
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, VotingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score

# Read the dataset from disk, then separate the target column ('label')
# from the remaining columns, which serve as the feature matrix.
data = pd.read_csv("E:/Shuvo/Dataset_3.csv")
y = data['label']
X = data.drop(columns='label')

# Fractions of the data used for training: 70:30, 80:20 and 90:10 splits.
split_ratios = [0.7, 0.8, 0.9]

# One list of row-dicts per result table; each is filled by the experiment
# loop and converted to a DataFrame afterwards.
accuracy_results, confusion_matrix_results = [], []
auc_results, training_testing_times = [], []

# The label set does not depend on the split, so derive it once up front.
# It is reused to (a) decide whether the binary-only metrics apply and
# (b) pin the confusion-matrix layout.
class_labels = np.unique(y)
is_binary = len(class_labels) == 2

for ratio in split_ratios:
    # `1 - ratio` carries floating-point error (1 - 0.7 == 0.30000000000000004).
    # The test-set size is rounded up, so that error can shift one sample from
    # train to test; rounding the fraction first removes the artefact.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=round(1 - ratio, 10), random_state=42
    )

    # Fresh, unfitted base classifiers for this split.
    classifiers = {
        'Random Forest': RandomForestClassifier(random_state=42),
        'Gradient Boosting': GradientBoostingClassifier(random_state=42),
        'AdaBoost': AdaBoostClassifier(random_state=42),
        'Naive Bayes': GaussianNB(),
        'Support Vector Machine': SVC(probability=True, random_state=42),
        'XGBoost': XGBClassifier(random_state=42),
        'Decision Tree': DecisionTreeClassifier(random_state=42)
    }

    # Soft-voting ensemble over all base classifiers ('soft' averages the
    # predicted probabilities, 'hard' would use the predicted class labels).
    # The estimator list is built before the ensemble is added to the dict,
    # so the ensemble never contains itself.
    ensemble_model = VotingClassifier(
        estimators=list(classifiers.items()),
        voting='soft'
    )
    classifiers['Ensemble Model'] = ensemble_model

    for clf_name, clf in classifiers.items():
        # Time fit and predict with perf_counter: it is monotonic and has a
        # much finer resolution than time.time(), which matters for the
        # millisecond-scale durations measured here.
        start_time = time.perf_counter()
        clf.fit(X_train, y_train)
        training_time = time.perf_counter() - start_time

        start_time = time.perf_counter()
        y_pred = clf.predict(X_test)
        testing_time = time.perf_counter() - start_time

        # Accuracy is defined for any number of classes.
        accuracy_results.append({
            'Split Ratio': ratio,
            'Classifier': clf_name,
            'Accuracy': accuracy_score(y_test, y_pred)
        })

        # Record training and testing times.
        training_testing_times.append({
            'Split Ratio': ratio,
            'Classifier': clf_name,
            'Training Time (s)': training_time,
            'Testing Time (s)': testing_time
        })

        # TN/FP/FN/TP, the derived rates and the AUC below only make sense for
        # a binary target; skip them otherwise instead of failing while
        # unpacking a larger confusion matrix.
        if not is_binary:
            continue

        # Passing `labels` guarantees a 2x2 matrix even when a small test
        # split (or the predictions) happen to contain a single class.
        tn, fp, fn, tp = confusion_matrix(
            y_test, y_pred, labels=class_labels
        ).ravel()
        positives = tp + fn
        negatives = tn + fp
        # A class absent from the test split has undefined rates: report NaN
        # rather than dividing by zero.
        tpr = tp / positives if positives else np.nan
        fnr = fn / positives if positives else np.nan
        tnr = tn / negatives if negatives else np.nan
        fpr = fp / negatives if negatives else np.nan
        confusion_matrix_results.append({
            'Split Ratio': ratio,
            'Classifier': clf_name,
            'True Positive': tp,
            'False Negative': fn,
            'True Negative': tn,
            'False Positive': fp,
            'True Positive Rate': tpr,
            'False Negative Rate': fnr,
            'True Negative Rate': tnr,
            'False Positive Rate': fpr
        })

        # AUC-ROC from the predicted probability of the second class label.
        y_pred_proba = clf.predict_proba(X_test)[:, 1]
        auc_results.append({
            'Split Ratio': ratio,
            'Classifier': clf_name,
            'AUC-ROC': roc_auc_score(y_test, y_pred_proba)
        })

# Turn each list of per-run records into a table (one row per
# split-ratio / classifier combination).
accuracy_df = pd.DataFrame(accuracy_results)
confusion_matrix_df = pd.DataFrame(confusion_matrix_results)
training_testing_times_df = pd.DataFrame(training_testing_times)

# AUC rows are only collected for binary targets; when none were collected
# the table is represented by None rather than an empty frame.
if auc_results:
    auc_df = pd.DataFrame(auc_results)
else:
    auc_df = None

In [4]:
# Show the accuracy table: one row per (split ratio, classifier) pair.
print("Accuracy:")
accuracy_df

Accuracy:


Unnamed: 0,Split Ratio,Classifier,Accuracy
0,0.7,Random Forest,0.714286
1,0.7,Gradient Boosting,0.718615
2,0.7,AdaBoost,0.748918
3,0.7,Naive Bayes,0.744589
4,0.7,Support Vector Machine,0.748918
5,0.7,XGBoost,0.722944
6,0.7,Decision Tree,0.701299
7,0.7,Ensemble Model,0.731602
8,0.8,Random Forest,0.785714
9,0.8,Gradient Boosting,0.785714


In [6]:
# Show the raw confusion-matrix counts (TP/FN/TN/FP) and the rates derived
# from them for every (split ratio, classifier) pair.
print("\nConfusion Matrix Metrics:")
confusion_matrix_df


Confusion Matrix Metrics:


Unnamed: 0,Split Ratio,Classifier,True Positive,False Negative,True Negative,False Positive,True Positive Rate,False Negative Rate,True Negative Rate,False Positive Rate
0,0.7,Random Forest,46,34,119,32,0.575,0.425,0.788079,0.211921
1,0.7,Gradient Boosting,48,32,118,33,0.6,0.4,0.781457,0.218543
2,0.7,AdaBoost,48,32,125,26,0.6,0.4,0.827815,0.172185
3,0.7,Naive Bayes,48,32,124,27,0.6,0.4,0.821192,0.178808
4,0.7,Support Vector Machine,44,36,129,22,0.55,0.45,0.854305,0.145695
5,0.7,XGBoost,55,25,112,39,0.6875,0.3125,0.741722,0.258278
6,0.7,Decision Tree,46,34,116,35,0.575,0.425,0.768212,0.231788
7,0.7,Ensemble Model,48,32,121,30,0.6,0.4,0.801325,0.198675
8,0.8,Random Forest,39,16,82,17,0.709091,0.290909,0.828283,0.171717
9,0.8,Gradient Boosting,39,16,82,17,0.709091,0.290909,0.828283,0.171717


In [7]:
# Show the timing table. Despite the heading, the frame holds both the
# 'Training Time (s)' and the 'Testing Time (s)' column for each run.
print("\nTraining Times:")
training_testing_times_df


Training Times:


Unnamed: 0,Split Ratio,Classifier,Training Time (s),Testing Time (s)
0,0.7,Random Forest,0.411859,0.022974
1,0.7,Gradient Boosting,0.259306,0.002991
2,0.7,AdaBoost,0.18946,0.021942
3,0.7,Naive Bayes,0.003029,0.002955
4,0.7,Support Vector Machine,0.126662,0.025931
5,0.7,XGBoost,0.114692,0.006982
6,0.7,Decision Tree,0.009974,0.002992
7,0.7,Ensemble Model,1.052181,0.120678
8,0.8,Random Forest,0.482708,0.024932
9,0.8,Gradient Boosting,0.272271,0.002993


In [8]:
# Show the AUC-ROC table. `auc_df` is None when no AUC rows were collected
# (the target was not binary), in which case nothing is rendered.
print("AUC-ROC:")
auc_df

AUC-ROC:


Unnamed: 0,Split Ratio,Classifier,AUC-ROC
0,0.7,Random Forest,0.774834
1,0.7,Gradient Boosting,0.792674
2,0.7,AdaBoost,0.769454
3,0.7,Naive Bayes,0.800166
4,0.7,Support Vector Machine,0.793046
5,0.7,XGBoost,0.778311
6,0.7,Decision Tree,0.671606
7,0.7,Ensemble Model,0.798013
8,0.8,Random Forest,0.831864
9,0.8,Gradient Boosting,0.849128
