In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_curve, auc, roc_auc_score
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# Read the cars table from disk and preview its first rows.
DATA_PATH = './datasets/cars.csv'
df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,class
0,vhigh,vhigh,2,2,small,low,unacc
1,vhigh,vhigh,2,2,small,med,unacc
2,vhigh,vhigh,2,2,small,high,unacc
3,vhigh,vhigh,2,2,med,low,unacc
4,vhigh,vhigh,2,2,med,med,unacc


In [5]:
# Count missing entries per column (isnull is pandas' alias of isna).
df.isnull().sum()

buying      0
maint       0
doors       0
persons     0
lug_boot    0
safety      0
class       0
dtype: int64

In [8]:
# Frequency of each distinct value in the 'buying' column.
buying_column = df["buying"]
buying_column.value_counts()

buying
vhigh    432
high     432
med      432
low      432
Name: count, dtype: int64

In [6]:
# Separate the predictors from the 'class' target column.
# The predictors previewed by df.head() hold string categories such as 'vhigh'
# and 'small', while scikit-learn estimators require numeric input. One-hot
# encode them here so the notebook runs from a fresh kernel; any column that
# is already numeric passes through get_dummies unchanged.
X = pd.get_dummies(df.drop(columns='class'), dtype=float)
# The target is left as string labels; the classifiers used below accept them
# directly and the metric reports stay human-readable.
y = df['class']

In [7]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split reproducible.
split_parts = train_test_split(X, y, test_size=0.3, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [9]:
# Base learners, all seeded with the same value for reproducibility.
SEED = 42
dt_clf = DecisionTreeClassifier(random_state=SEED)
lr_clf = LogisticRegression(random_state=SEED, max_iter=1000)
# probability=True makes the SVM expose predict_proba, which the
# soft-voting ensemble and the ROC plot further down rely on.
svm_clf = SVC(random_state=SEED, probability=True)

In [None]:
# Train every base learner on the same training split.
for base_model in (dt_clf, lr_clf):
    base_model.fit(X_train, y_train)
# Kept as the cell's final expression so the fitted estimator is still displayed.
svm_clf.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:
def print_metrics(y_true, y_pred, classifier_name):
    """Print a short classification report for one model.

    Emits accuracy, weighted precision, weighted recall, weighted F1 (each
    formatted to two decimals) and the confusion matrix to stdout.

    Args:
        y_true: Ground-truth labels.
        y_pred: Labels predicted by the classifier.
        classifier_name: Name shown in the report header.

    Returns:
        None. The report is printed only.
    """
    weighted = {"average": "weighted"}
    # (label, metric function, extra keyword arguments), in print order.
    report_rows = (
        ("Accuracy", accuracy_score, {}),
        ("Precision", precision_score, weighted),
        ("Sensitivity (Recall)", recall_score, weighted),
        ("F1 Score", f1_score, weighted),
    )

    print(f"\nPerformance Metrics for {classifier_name}:")
    for row_label, metric_fn, metric_kwargs in report_rows:
        # Each metric is evaluated right before it is printed.
        print(f"{row_label}: {metric_fn(y_true, y_pred, **metric_kwargs):.2f}")
    print("Confusion Matrix:\n", confusion_matrix(y_true, y_pred))

In [None]:
# Report hold-out metrics for each base learner in turn.
for model_label, fitted_model in (
    ("Decision Tree", dt_clf),
    ("Logistic Regression", lr_clf),
    ("SVM", svm_clf),
):
    print_metrics(y_test, fitted_model.predict(X_test), model_label)

In [None]:
# Combine the three base learners; voting='soft' makes the ensemble decide from
# the members' predicted class probabilities rather than their hard labels.
# The member names are the prefixes used when tuning (e.g. 'svm__C').
ensemble_members = [
    ('dt', dt_clf),
    ('lr', lr_clf),
    ('svm', svm_clf),
]
voting_clf = VotingClassifier(estimators=ensemble_members, voting='soft')

In [None]:
# Fit the ensemble on the training split, then predict the held-out rows.
# fit() returns the estimator itself, so the two steps can be chained.
y_pred_voting = voting_clf.fit(X_train, y_train).predict(X_test)

In [None]:
# Same report as for the base learners, now for the ensemble predictions.
print_metrics(
    y_true=y_test,
    y_pred=y_pred_voting,
    classifier_name="Voting Ensemble",
)

In [None]:
# Tune the SVM member of the soft-voting ensemble with 3-fold cross-validation.
# `gamma` only affects the RBF kernel, so it is searched only there; crossing it
# with the linear kernel would refit the same linear model once per gamma value
# (18 candidates instead of the 12 distinct ones below).
param_grid = [
    {
        'svm__kernel': ['rbf'],
        'svm__C': [0.1, 1, 10],
        'svm__gamma': [1, 0.1, 0.01],
    },
    {
        'svm__kernel': ['linear'],
        'svm__C': [0.1, 1, 10],
    },
]
grid_search = GridSearchCV(voting_clf, param_grid, cv=3, scoring='accuracy')
grid_search.fit(X_train, y_train)

In [None]:
# Summarize the winning hyper-parameters and their mean cross-validated accuracy.
best_params = grid_search.best_params_
print("\nBest parameters for Voting Ensemble:", best_params)
best_cv_accuracy = grid_search.best_score_
print(f"Best accuracy from Grid Search: {best_cv_accuracy:.2f}")

In [None]:
# Plot one ROC curve per fitted model on a shared set of axes.
#
# The target holds string class labels, so a fixed `pos_label=1` never matches
# any sample, and column 1 of predict_proba covers only the second entry of
# `classes_`. Instead, binarize the true labels one-vs-rest against each
# model's own `classes_` (which defines the column order of predict_proba) and
# micro-average over all classes. This works for any number of classes.
plt.figure(figsize=(14, 7))
y_test_labels = np.asarray(y_test)
for clf, name in zip([dt_clf, lr_clf, svm_clf, voting_clf], ['Decision Tree', 'Logistic Regression', 'SVM', 'Voting Ensemble']):
    y_pred_proba = clf.predict_proba(X_test)
    # Indicator matrix: entry (i, k) is 1 when sample i belongs to clf.classes_[k].
    y_test_onehot = (y_test_labels[:, None] == np.asarray(clf.classes_)[None, :]).astype(int)
    # Flattening both matrices treats every (sample, class) pair as one binary
    # decision, which is the micro-averaged one-vs-rest ROC.
    fpr, tpr, _ = roc_curve(y_test_onehot.ravel(), y_pred_proba.ravel())
    roc_auc = auc(fpr, tpr)
    plt.plot(fpr, tpr, label=f'{name} (AUC = {roc_auc:.2f})')

# Diagonal reference line: the expected curve of random guessing.
plt.plot([0, 1], [0, 1], linestyle='--', color='grey', label='Chance')

plt.title('ROC Curves')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(loc='lower right')
plt.show()