In [1]:
import numpy as np
import pandas as pd
import time
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score

data = pd.read_csv("E:/R(5)/Dataset_7.csv")
X = data.drop('label', axis=1)
y = data['label']

split_ratios = [0.7, 0.8, 0.9]

accuracy_results = []
confusion_matrix_results = []
auc_results = []
training_testing_times = []

for ratio in split_ratios:
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1 - ratio, random_state=42)

    classifiers = {
        'Random Forest': RandomForestClassifier(random_state=42),
        'Gradient Boosting': GradientBoostingClassifier(random_state=42),
        'AdaBoost': AdaBoostClassifier(random_state=42),
        'Naive Bayes': GaussianNB(),
        'Support Vector Machine': SVC(probability=True),
        'XGBoost': XGBClassifier(random_state=42),
        'Decision Tree': DecisionTreeClassifier(random_state=42)
    }

    for clf_name, clf in classifiers.items():
        start_time = time.time()
        clf.fit(X_train, y_train)
        end_time = time.time()
        training_time = end_time - start_time

        start_time = time.time()
        y_pred = clf.predict(X_test)
        end_time = time.time()
        testing_time = end_time - start_time
        
        accuracy = accuracy_score(y_test, y_pred)
        accuracy_results.append({
            'Split Ratio': ratio,
            'Classifier': clf_name,
            'Accuracy': accuracy
        })
        
        tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
        tpr = tp / (tp + fn)
        fnr = fn / (tp + fn)
        tnr = tn / (tn + fp)
        fpr = fp / (tn + fp)
        confusion_matrix_results.append({
            'Split Ratio': ratio,
            'Classifier': clf_name,
            'True Positive': tp,
            'False Negative': fn,
            'True Negative': tn,
            'False Positive': fp,
            'True Positive Rate': tpr,
            'False Negative Rate': fnr,
            'True Negative Rate': tnr,
            'False Positive Rate': fpr
        })
        
        try:
            probas = clf.predict_proba(X_test)
            auc_value = roc_auc_score(y_test, probas[:, 1])
        except AttributeError:
            auc_value = None
        auc_results.append({
            'Split Ratio': ratio,
            'Classifier': clf_name,
            'AUC Value': auc_value
        })

        training_testing_times.append({
            'Split Ratio': ratio,
            'Classifier': clf_name,
            'Training Time (s)': training_time,
            'Testing Time (s)': testing_time
        })
accuracy_df = pd.DataFrame(accuracy_results)
confusion_matrix_df = pd.DataFrame(confusion_matrix_results)
auc_df = pd.DataFrame(auc_results)
training_testing_times_df = pd.DataFrame(training_testing_times)

In [2]:
print("Accuracy:")
accuracy_df

Accuracy:


Unnamed: 0,Split Ratio,Classifier,Accuracy
0,0.7,Random Forest,0.950413
1,0.7,Gradient Boosting,0.933884
2,0.7,AdaBoost,0.966942
3,0.7,Naive Bayes,0.92562
4,0.7,Support Vector Machine,0.942149
5,0.7,XGBoost,0.942149
6,0.7,Decision Tree,0.933884
7,0.8,Random Forest,0.9375
8,0.8,Gradient Boosting,0.925
9,0.8,AdaBoost,0.975


In [3]:
print("\nConfusion Matrix Metrics:")
confusion_matrix_df


Confusion Matrix Metrics:


Unnamed: 0,Split Ratio,Classifier,True Positive,False Negative,True Negative,False Positive,True Positive Rate,False Negative Rate,True Negative Rate,False Positive Rate
0,0.7,Random Forest,72,5,43,1,0.935065,0.064935,0.977273,0.022727
1,0.7,Gradient Boosting,71,6,42,2,0.922078,0.077922,0.954545,0.045455
2,0.7,AdaBoost,75,2,42,2,0.974026,0.025974,0.954545,0.045455
3,0.7,Naive Bayes,72,5,40,4,0.935065,0.064935,0.909091,0.090909
4,0.7,Support Vector Machine,70,7,44,0,0.909091,0.090909,1.0,0.0
5,0.7,XGBoost,72,5,42,2,0.935065,0.064935,0.954545,0.045455
6,0.7,Decision Tree,72,5,41,3,0.935065,0.064935,0.931818,0.068182
7,0.8,Random Forest,48,4,27,1,0.923077,0.076923,0.964286,0.035714
8,0.8,Gradient Boosting,47,5,27,1,0.903846,0.096154,0.964286,0.035714
9,0.8,AdaBoost,50,2,28,0,0.961538,0.038462,1.0,0.0


In [4]:
print("\nAUC Values:")
auc_df


AUC Values:


Unnamed: 0,Split Ratio,Classifier,AUC Value
0,0.7,Random Forest,0.997048
1,0.7,Gradient Boosting,0.992916
2,0.7,AdaBoost,0.997344
3,0.7,Naive Bayes,0.969599
4,0.7,Support Vector Machine,0.987603
5,0.7,XGBoost,0.986718
6,0.7,Decision Tree,0.933442
7,0.8,Random Forest,0.995879
8,0.8,Gradient Boosting,0.989011
9,0.8,AdaBoost,0.995879


In [5]:
print("\nTraining Times:")
training_testing_times_df


Training Times:


Unnamed: 0,Split Ratio,Classifier,Training Time (s),Testing Time (s)
0,0.7,Random Forest,0.290226,0.023969
1,0.7,Gradient Boosting,0.299198,0.001996
2,0.7,AdaBoost,0.207964,0.022939
3,0.7,Naive Bayes,0.004986,0.001995
4,0.7,Support Vector Machine,0.023943,0.002978
5,0.7,XGBoost,0.097736,0.006981
6,0.7,Decision Tree,0.007979,0.002992
7,0.8,Random Forest,0.356499,0.027929
8,0.8,Gradient Boosting,0.304189,0.002989
9,0.8,AdaBoost,0.233339,0.04887
