## **Car Dataset**

In [None]:
#include libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

#load data and checking contents
data = pd.read_csv("car.data.csv")
print(data)

#check for null values
print("\nChecking NULL values:\n",data.isnull().sum())


     buying  maint  doors persons lug_boot safety  class
0     vhigh  vhigh      2       2    small    low  unacc
1     vhigh  vhigh      2       2    small    med  unacc
2     vhigh  vhigh      2       2    small   high  unacc
3     vhigh  vhigh      2       2      med    low  unacc
4     vhigh  vhigh      2       2      med    med  unacc
...     ...    ...    ...     ...      ...    ...    ...
1723    low    low  5more    more      med    med   good
1724    low    low  5more    more      med   high  vgood
1725    low    low  5more    more      big    low  unacc
1726    low    low  5more    more      big    med   good
1727    low    low  5more    more      big   high  vgood

[1728 rows x 7 columns]

Checking NULL values:
 buying      0
maint       0
doors       0
persons     0
lug_boot    0
safety      0
class       0
dtype: int64


In [None]:
#preprocessing data
categorical_cols = data.select_dtypes(include=['object']).columns.tolist()
print("Categorical Columns:", categorical_cols)
print("\n")

#label encoding features
from sklearn.preprocessing import LabelEncoder

labelEncoder = LabelEncoder()

for col in categorical_cols:
  data[col] = labelEncoder.fit_transform(data[col])

print(data)

Categorical Columns: ['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety', 'class']


      buying  maint  doors  persons  lug_boot  safety  class
0          3      3      0        0         2       1      2
1          3      3      0        0         2       2      2
2          3      3      0        0         2       0      2
3          3      3      0        0         1       1      2
4          3      3      0        0         1       2      2
...      ...    ...    ...      ...       ...     ...    ...
1723       1      1      3        2         1       2      1
1724       1      1      3        2         1       0      3
1725       1      1      3        2         0       1      2
1726       1      1      3        2         0       2      1
1727       1      1      3        2         0       0      3

[1728 rows x 7 columns]


In [None]:
#selecting target variable and features
X = data.drop('class', axis = 1)
y = data['class']

#splitting data for training, validating and testing
X_train = X.iloc[:1209]
X_validation = X.iloc[1209:1468]
X_test = X.iloc[1468:]
print(X_validation)
y_train = y.iloc[:1209]
y_validation = y.iloc[1209:1468]
y_test = y.iloc[1468:]

      buying  maint  doors  persons  lug_boot  safety
1209       2      1      0        2         1       1
1210       2      1      0        2         1       2
1211       2      1      0        2         1       0
1212       2      1      0        2         0       1
1213       2      1      0        2         0       2
...      ...    ...    ...      ...       ...     ...
1463       1      0      2        0         1       0
1464       1      0      2        0         0       1
1465       1      0      2        0         0       2
1466       1      0      2        0         0       0
1467       1      0      2        1         2       1

[259 rows x 6 columns]


### **Evaluation Metrics**

In [None]:
#Accuracy
def calc_accuracy(y_true, y_pred):
  correct = 0
  total = len(y_true)

  for true_label, predicted_label in zip(y_true, y_pred):
    if true_label == predicted_label:
      correct += 1

  accuracy = correct / total
  return accuracy

#Confusion Matrix
def confusion_matrix(y_true, y_pred):
  TN, TP, FN, FP = 0, 0, 0, 0

  for true_label, predicted_label in zip(y_true, y_pred):
    if true_label == 0 and predicted_label == 0:
      TN += 1
    elif true_label == 0 and predicted_label == 1:
      FP += 1
    elif true_label == 1 and predicted_label == 0:
      FN += 1
    elif true_label == 1 and predicted_label == 1:
      TP += 1

  return (TN, TP, FN, FP)

#Average Precision
def avg_precision(y_true, y_pred):
  data = list(zip(y_true, y_pred))
  data.sort(key=lambda x: x[1], reverse=True)

  # Initialize variables
  num_positives = sum(y_true)
  num_examples = len(y_true)
  true_positives = 0
  precision_sum = 0
  recall_sum = 0

  # Calculate precision and recall at each threshold
  for i in range(num_examples):
    if data[i][0] == 1:
      true_positives += 1
      precision = true_positives / (i + 1)
      recall = true_positives / num_positives
      precision_sum += precision
      recall_sum += recall

  # Calculate Average Precision (AP) using the precision-recall curve
  if num_positives == 0:
    average_precision = 0
  else:
    average_precision = precision_sum / num_positives

  return average_precision


#Average Recall
def avg_recall(y_true, y_pred):
  total_positives = sum(y_true)  # Total number of positive samples
  thresholds = sorted(set(y_pred), reverse=True)  # Unique sorted thresholds

  recall_values = []
  for threshold in thresholds:
    y_pred_thresholded = [1 if pred >= threshold else 0 for pred in y_pred]
    true_positives = sum([1 for true, pred in zip(y_true, y_pred_thresholded) if true == 1 and pred == 1])
    recall = true_positives / total_positives
    recall_values.append(recall)

  average_recall = sum(recall_values) / len(recall_values)

  return average_recall

#Average F1-Score
def avg_f1(y_true, y_pred):
  total_positives = sum(y_true)  # Total number of positive samples
  thresholds = sorted(set(y_pred), reverse=True)  # Unique sorted thresholds

  f1_scores = []
  for threshold in thresholds:
    y_pred_thresholded = [1 if pred >= threshold else 0 for pred in y_pred]

  # Calculate precision and recall
  true_positives = sum([1 for true, pred in zip(y_true, y_pred_thresholded) if true == 1 and pred == 1])
  predicted_positives = sum(y_pred_thresholded)

  if predicted_positives == 0:
    precision = 0.0
  else:
    precision = true_positives / predicted_positives

  recall = true_positives / total_positives

  # Calculate F1-score
  if precision + recall == 0:
    f1 = 0.0
  else:
    f1 = 2 * (precision * recall) / (precision + recall)

  f1_scores.append(f1)

  average_f1_score = sum(f1_scores) / len(f1_scores)

  return average_f1_score

### **Decision Tree**

In [None]:
#Training the dataset
from sklearn.tree import DecisionTreeClassifier

#Define the Decision Tree Classifier
dt_classifier = DecisionTreeClassifier()

dt = dt_classifier.fit(X_train, y_train)

In [None]:
#Validating the model using validation set
y_pred = dt.predict(X_validation)

acc = calc_accuracy(y_validation, y_pred)
print("Accuracy: ", acc)
ap = avg_precision(y_validation, y_pred)
print("Average Precision: ", ap)
ar = avg_recall(y_validation, y_pred)
print("Average Recall: ", ar)
af = avg_f1(y_validation, y_pred)
print("Average F1-Score: ", af)
TN, TP, FN, FP = confusion_matrix(y_validation, y_pred)
print("Confusion Matrix:")
print(f"TN: {TN}   FP: {FP}")
print(f"FN: {FN}   TP: {TP}")

Accuracy:  0.752895752895753
Average Precision:  0.0022793963841093528
Average Recall:  0.015903307888040712
Average F1-Score:  0.06134969325153374
Confusion Matrix:
TN: 18   FP: 0
FN: 15   TP: 5


In [None]:
#Validating
from sklearn.model_selection import GridSearchCV

# Define the hyperparameter grid for grid search
param_grid = {
    'max_depth': [3, 5, 7],
    'min_samples_split': [2, 5, 10],
    'criterion': ['gini', 'entropy']
}

# Initialize GridSearchCV with cross-validation (e.g., 5-fold)
grid_search = GridSearchCV(estimator=dt_classifier, param_grid=param_grid, cv=5, scoring='accuracy')

# Fit GridSearchCV on the combined training and validation sets
X_train_new = X.iloc[:1468]
y_train_new = y.iloc[:1468]
grid_search.fit(X_train_new, y_train_new)

# Get the best model on the test set
best_dt_model = grid_search.best_estimator_

print(best_dt_model)

DecisionTreeClassifier(max_depth=7)


In [None]:
#Testing best model using test set
y_pred = best_dt_model.predict(X_test)

acc = calc_accuracy(y_test, y_pred)
print("Accuracy: ", acc)
ap = avg_precision(y_test, y_pred)
print("Average Precision: ", ap)
ar = avg_recall(y_test, y_pred)
print("Average Recall: ", ar)
af = avg_f1(y_test, y_pred)
print("Average F1-Score: ", af)
TN, TP, FN, FP = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(f"TN: {TN}   FP: {FP}")
print(f"FN: {FN}   TP: {TP}")

Accuracy:  0.7769230769230769
Average Precision:  0.017221224472013914
Average Recall:  0.04434250764525994
Average F1-Score:  0.13218390804597702
Confusion Matrix:
TN: 36   FP: 0
FN: 40   TP: 0


### **XGB Classifier**

In [None]:
#Training the dataset
import xgboost as xgb

#Define the XGBClassifier
xgb_classifier = xgb.XGBClassifier(objective='multi:softmax', num_class=3, random_state=42)

xgbc = xgb_classifier.fit(X_train, y_train)

In [None]:
#Validating the model using validation set
y_pred = xgbc.predict(X_validation)

acc = calc_accuracy(y_validation, y_pred)
print("Accuracy: ", acc)
ap = avg_precision(y_validation, y_pred)
print("Average Precision: ", ap)
ar = avg_recall(y_validation, y_pred)
print("Average Recall: ", ar)
af = avg_f1(y_validation, y_pred)
print("Average F1-Score: ", af)
TN, TP, FN, FP = confusion_matrix(y_validation, y_pred)
print("Confusion Matrix:")
print(f"TN: {TN}   FP: {FP}")
print(f"FN: {FN}   TP: {TP}")

Accuracy:  0.7374517374517374
Average Precision:  0.0022786012242673767
Average Recall:  0.016963528413910092
Average F1-Score:  0.06134969325153374
Confusion Matrix:
TN: 19   FP: 0
FN: 20   TP: 0


In [None]:
#Validating
#Define the hyperparameter grid for grid searching
param_grid = {
    'max_depth': [3, 5, 7],
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [100, 200, 300]
}

grid_search = GridSearchCV(estimator=xgb_classifier, param_grid=param_grid, cv=3, scoring='accuracy')
grid_search.fit(X_train_new, y_train_new)

# Get the best XGBClassifier model from GridSearchCV
best_xgb_model = grid_search.best_estimator_

print(best_xgb_model)

XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=0.001, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=5, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              multi_strategy=None, n_estimators=300, n_jobs=None, num_class=3,
              num_parallel_tree=None, ...)


In [None]:
#Testing best model using test set
y_pred = best_xgb_model.predict(X_test)

acc = calc_accuracy(y_test, y_pred)
print("Accuracy: ", acc)
ap = avg_precision(y_test, y_pred)
print("Average Precision: ", ap)
ar = avg_recall(y_test, y_pred)
print("Average Recall: ", ar)
af = avg_f1(y_test, y_pred)
print("Average F1-Score: ", af)
TN, TP, FN, FP = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(f"TN: {TN}   FP: {FP}")
print(f"FN: {FN}   TP: {TP}")

Accuracy:  0.6153846153846154
Average Precision:  0.010691224374171587
Average Recall:  0.052752293577981654
Average F1-Score:  0.13218390804597702
Confusion Matrix:
TN: 18   FP: 0
FN: 46   TP: 0
