In [1]:
import numpy as np
import pandas as pd
import time
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score

data = pd.read_csv("E:/R(5)/Dataset_6.csv")
X = data.drop('label', axis=1)
y = data['label']

split_ratios = [0.7, 0.8, 0.9]

accuracy_results = []
confusion_matrix_results = []
auc_results = []
training_testing_times = []

for ratio in split_ratios:
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1 - ratio, random_state=42)

    classifiers = {
        'Random Forest': RandomForestClassifier(random_state=42),
        'Gradient Boosting': GradientBoostingClassifier(random_state=42),
        'AdaBoost': AdaBoostClassifier(random_state=42),
        'Naive Bayes': GaussianNB(),
        'Support Vector Machine': SVC(probability=True),
        'XGBoost': XGBClassifier(random_state=42),
        'Decision Tree': DecisionTreeClassifier(random_state=42)
    }

    for clf_name, clf in classifiers.items():
        start_time = time.time()
        clf.fit(X_train, y_train)
        end_time = time.time()
        training_time = end_time - start_time

        start_time = time.time()
        y_pred = clf.predict(X_test)
        end_time = time.time()
        testing_time = end_time - start_time
        
        accuracy = accuracy_score(y_test, y_pred)
        accuracy_results.append({
            'Split Ratio': ratio,
            'Classifier': clf_name,
            'Accuracy': accuracy
        })
        
        tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
        tpr = tp / (tp + fn)
        fnr = fn / (tp + fn)
        tnr = tn / (tn + fp)
        fpr = fp / (tn + fp)
        confusion_matrix_results.append({
            'Split Ratio': ratio,
            'Classifier': clf_name,
            'True Positive': tp,
            'False Negative': fn,
            'True Negative': tn,
            'False Positive': fp,
            'True Positive Rate': tpr,
            'False Negative Rate': fnr,
            'True Negative Rate': tnr,
            'False Positive Rate': fpr
        })
        
        try:
            probas = clf.predict_proba(X_test)
            auc_value = roc_auc_score(y_test, probas[:, 1])
        except AttributeError:
            auc_value = None
        auc_results.append({
            'Split Ratio': ratio,
            'Classifier': clf_name,
            'AUC Value': auc_value
        })

        training_testing_times.append({
            'Split Ratio': ratio,
            'Classifier': clf_name,
            'Training Time (s)': training_time,
            'Testing Time (s)': testing_time
        })

accuracy_df = pd.DataFrame(accuracy_results)
confusion_matrix_df = pd.DataFrame(confusion_matrix_results)
auc_df = pd.DataFrame(auc_results)
training_testing_times_df = pd.DataFrame(training_testing_times)

In [2]:
print("Accuracy:")
accuracy_df

Accuracy:


Unnamed: 0,Split Ratio,Classifier,Accuracy
0,0.7,Random Forest,0.983471
1,0.7,Gradient Boosting,0.983471
2,0.7,AdaBoost,0.983471
3,0.7,Naive Bayes,0.942149
4,0.7,Support Vector Machine,0.942149
5,0.7,XGBoost,0.975207
6,0.7,Decision Tree,0.950413
7,0.8,Random Forest,0.975
8,0.8,Gradient Boosting,0.9875
9,0.8,AdaBoost,0.975


In [3]:
print("\nConfusion Matrix Metrics:")
confusion_matrix_df


Confusion Matrix Metrics:


Unnamed: 0,Split Ratio,Classifier,True Positive,False Negative,True Negative,False Positive,True Positive Rate,False Negative Rate,True Negative Rate,False Positive Rate
0,0.7,Random Forest,76,1,43,1,0.987013,0.012987,0.977273,0.022727
1,0.7,Gradient Boosting,76,1,43,1,0.987013,0.012987,0.977273,0.022727
2,0.7,AdaBoost,75,2,44,0,0.974026,0.025974,1.0,0.0
3,0.7,Naive Bayes,72,5,42,2,0.935065,0.064935,0.954545,0.045455
4,0.7,Support Vector Machine,70,7,44,0,0.909091,0.090909,1.0,0.0
5,0.7,XGBoost,75,2,43,1,0.974026,0.025974,0.977273,0.022727
6,0.7,Decision Tree,74,3,41,3,0.961039,0.038961,0.931818,0.068182
7,0.8,Random Forest,51,1,27,1,0.980769,0.019231,0.964286,0.035714
8,0.8,Gradient Boosting,52,0,27,1,1.0,0.0,0.964286,0.035714
9,0.8,AdaBoost,51,1,27,1,0.980769,0.019231,0.964286,0.035714


In [4]:
print("\nAUC Values:")
auc_df


AUC Values:


Unnamed: 0,Split Ratio,Classifier,AUC Value
0,0.7,Random Forest,0.999115
1,0.7,Gradient Boosting,0.998524
2,0.7,AdaBoost,0.99941
3,0.7,Naive Bayes,0.972255
4,0.7,Support Vector Machine,0.987898
5,0.7,XGBoost,0.99941
6,0.7,Decision Tree,0.946429
7,0.8,Random Forest,0.998626
8,0.8,Gradient Boosting,0.99794
9,0.8,AdaBoost,0.999313


In [5]:
print("\nTraining Times:")
training_testing_times_df


Training Times:


Unnamed: 0,Split Ratio,Classifier,Training Time (s),Testing Time (s)
0,0.7,Random Forest,0.329635,0.023936
1,0.7,Gradient Boosting,0.355897,0.003988
2,0.7,AdaBoost,0.277303,0.029922
3,0.7,Naive Bayes,0.004987,0.002992
4,0.7,Support Vector Machine,0.029918,0.002991
5,0.7,XGBoost,0.090758,0.005986
6,0.7,Decision Tree,0.006981,0.002991
7,0.8,Random Forest,0.43384,0.022939
8,0.8,Gradient Boosting,0.460795,0.00299
9,0.8,AdaBoost,0.264811,0.030918
