In [None]:
# Swapnil Saha Shawon (2022533042)
# Tamanna Rahman (2021450642)
# Syeda Mashiat Tabassum (2031356642)

## **Car Dataset**

In [None]:
#include libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

#load data and checking contents
data = pd.read_csv("car.data.csv")
print(data)

#check for null values
print("\nChecking NULL values:\n",data.isnull().sum())

     buying  maint  doors persons lug_boot safety  class
0     vhigh  vhigh      2       2    small    low  unacc
1     vhigh  vhigh      2       2    small    med  unacc
2     vhigh  vhigh      2       2    small   high  unacc
3     vhigh  vhigh      2       2      med    low  unacc
4     vhigh  vhigh      2       2      med    med  unacc
...     ...    ...    ...     ...      ...    ...    ...
1723    low    low  5more    more      med    med   good
1724    low    low  5more    more      med   high  vgood
1725    low    low  5more    more      big    low  unacc
1726    low    low  5more    more      big    med   good
1727    low    low  5more    more      big   high  vgood

[1728 rows x 7 columns]

Checking NULL values:
 buying      0
maint       0
doors       0
persons     0
lug_boot    0
safety      0
class       0
dtype: int64


In [None]:
#preprocessing data
categorical_cols = data.select_dtypes(include=['object']).columns.tolist()
print("Categorical Columns:", categorical_cols)
print("\n")

#label encoding features
from sklearn.preprocessing import LabelEncoder

labelEncoder = LabelEncoder()

for col in categorical_cols:
  data[col] = labelEncoder.fit_transform(data[col])

print(data)

Categorical Columns: ['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety', 'class']


      buying  maint  doors  persons  lug_boot  safety  class
0          3      3      0        0         2       1      2
1          3      3      0        0         2       2      2
2          3      3      0        0         2       0      2
3          3      3      0        0         1       1      2
4          3      3      0        0         1       2      2
...      ...    ...    ...      ...       ...     ...    ...
1723       1      1      3        2         1       2      1
1724       1      1      3        2         1       0      3
1725       1      1      3        2         0       1      2
1726       1      1      3        2         0       2      1
1727       1      1      3        2         0       0      3

[1728 rows x 7 columns]


In [None]:
#selecting target variable and features
X = data.drop('class', axis = 1)
y = data['class']

#splitting data for training, validating and testing
X_train = X.iloc[:1209]
X_validation = X.iloc[1209:1468]
X_test = X.iloc[1468:]
print(X_validation)
y_train = y.iloc[:1209]
y_validation = y.iloc[1209:1468]
y_test = y.iloc[1468:]

      buying  maint  doors  persons  lug_boot  safety
1209       2      1      0        2         1       1
1210       2      1      0        2         1       2
1211       2      1      0        2         1       0
1212       2      1      0        2         0       1
1213       2      1      0        2         0       2
...      ...    ...    ...      ...       ...     ...
1463       1      0      2        0         1       0
1464       1      0      2        0         0       1
1465       1      0      2        0         0       2
1466       1      0      2        0         0       0
1467       1      0      2        1         2       1

[259 rows x 6 columns]


### **Evaluation Metrics**

In [None]:
#Accuracy
def calc_accuracy(y_true, y_pred):
  correct = 0
  total = len(y_true)

  for true_label, predicted_label in zip(y_true, y_pred):
    if true_label == predicted_label:
      correct += 1

  accuracy = correct / total
  return accuracy

#Confusion Matrix
def confusion_matrix(y_true, y_pred):
  TN, TP, FN, FP = 0, 0, 0, 0

  for true_label, predicted_label in zip(y_true, y_pred):
    if true_label == 0 and predicted_label == 0:
      TN += 1
    elif true_label == 0 and predicted_label == 1:
      FP += 1
    elif true_label == 1 and predicted_label == 0:
      FN += 1
    elif true_label == 1 and predicted_label == 1:
      TP += 1

  return (TN, TP, FN, FP)

#Average Precision
def avg_precision(y_true, y_pred):
  data = list(zip(y_true, y_pred))
  data.sort(key=lambda x: x[1], reverse=True)

  # Initialize variables
  num_positives = sum(y_true)
  num_examples = len(y_true)
  true_positives = 0
  precision_sum = 0
  recall_sum = 0

  # Calculate precision and recall at each threshold
  for i in range(num_examples):
    if data[i][0] == 1:
      true_positives += 1
      precision = true_positives / (i + 1)
      recall = true_positives / num_positives
      precision_sum += precision
      recall_sum += recall

  # Calculate Average Precision (AP) using the precision-recall curve
  if num_positives == 0:
    average_precision = 0
  else:
    average_precision = precision_sum / num_positives

  return average_precision


#Average Recall
def avg_recall(y_true, y_pred):
  total_positives = sum(y_true)  # Total number of positive samples
  thresholds = sorted(set(y_pred), reverse=True)  # Unique sorted thresholds

  recall_values = []
  for threshold in thresholds:
    y_pred_thresholded = [1 if pred >= threshold else 0 for pred in y_pred]
    true_positives = sum([1 for true, pred in zip(y_true, y_pred_thresholded) if true == 1 and pred == 1])
    recall = true_positives / total_positives
    recall_values.append(recall)

  average_recall = sum(recall_values) / len(recall_values)

  return average_recall

#Average F1-Score
def avg_f1(y_true, y_pred):
  total_positives = sum(y_true)  # Total number of positive samples
  thresholds = sorted(set(y_pred), reverse=True)  # Unique sorted thresholds

  f1_scores = []
  for threshold in thresholds:
    y_pred_thresholded = [1 if pred >= threshold else 0 for pred in y_pred]

  # Calculate precision and recall
  true_positives = sum([1 for true, pred in zip(y_true, y_pred_thresholded) if true == 1 and pred == 1])
  predicted_positives = sum(y_pred_thresholded)

  if predicted_positives == 0:
    precision = 0.0
  else:
    precision = true_positives / predicted_positives

  recall = true_positives / total_positives

  # Calculate F1-score
  if precision + recall == 0:
    f1 = 0.0
  else:
    f1 = 2 * (precision * recall) / (precision + recall)

  f1_scores.append(f1)

  average_f1_score = sum(f1_scores) / len(f1_scores)

  return average_f1_score

### **ZeroR Classifier**

In [None]:
# Build ZeroR Classifier
from collections import Counter

class ZeroR:
    def __init__(self):
        self.majority_class = None

    def fit(self, X, y):
        # Find the majority class
        self.majority_class = Counter(y).most_common(1)[0][0]

    def predict(self, X):
        # Return the majority class for all instances
        return [self.majority_class] * len(X)


# Initialize the ZeroR model
zr = ZeroR()

# Train the model
zr.fit(X_train, y_train)


In [None]:
# Evaluating with Training set
y_pred = zr.predict(X_train)

acc = calc_accuracy(y_train, y_pred)
print("Accuracy: ", acc)
ap = avg_precision(y_train, y_pred)
print("Average Precision: ", ap)
ar = avg_recall(y_train, y_pred)
print("Average Recall: ", ar)
af = avg_f1(y_train, y_pred)
print("Average F1-Score: ", af)
TN, TP, FN, FP = confusion_matrix(y_train, y_pred)
print("Confusion Matrix:")
print(f"TN: {TN}   FP: {FP}")
print(f"FN: {FN}   TP: {TP}")

Accuracy:  0.7485525227460711
Average Precision:  2.6875850566252678e-06
Average Recall:  0.0016172506738544475
Average F1-Score:  0.0019582245430809398
Confusion Matrix:
TN: 0   FP: 0
FN: 0   TP: 0


In [None]:
# Evaluating with Validation set
y_pred = zr.predict(X_validation)

acc = calc_accuracy(y_validation, y_pred)
print("Accuracy: ", acc)
ap = avg_precision(y_validation, y_pred)
print("Average Precision: ", ap)
ar = avg_recall(y_validation, y_pred)
print("Average Recall: ", ar)
af = avg_f1(y_validation, y_pred)
print("Average F1-Score: ", af)
TN, TP, FN, FP = confusion_matrix(y_validation, y_pred)
print("Confusion Matrix:")
print(f"TN: {TN}   FP: {FP}")
print(f"FN: {FN}   TP: {TP}")

Accuracy:  0.6216216216216216
Average Precision:  0.011895452912296833
Average Recall:  0.05089058524173028
Average F1-Score:  0.06134969325153374
Confusion Matrix:
TN: 0   FP: 0
FN: 0   TP: 0


In [None]:
# Evaluating with Testing set
y_pred = zr.predict(X_test)

acc = calc_accuracy(y_test, y_pred)
print("Accuracy: ", acc)
ap = avg_precision(y_test, y_pred)
print("Average Precision: ", ap)
ar = avg_recall(y_test, y_pred)
print("Average Recall: ", ar)
af = avg_f1(y_test, y_pred)
print("Average F1-Score: ", af)
TN, TP, FN, FP = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(f"TN: {TN}   FP: {FP}")
print(f"FN: {FN}   TP: {TP}")

Accuracy:  0.5538461538461539
Average Precision:  0.014082758306997751
Average Recall:  0.10550458715596331
Average F1-Score:  0.13218390804597702
Confusion Matrix:
TN: 0   FP: 0
FN: 0   TP: 0


### **OneR Classifier**

In [None]:
from collections import defaultdict

class OneRClassifier:
    def __init__(self):
        self.best_attribute = None
        self.mapping = {}

    def fit(self, X, y):
        if len(X) != len(y):
            raise ValueError("Number of samples in X and y must be the same")

        min_error = float('inf')

        for i in range(len(X[0])):
            attribute_values = defaultdict(list)
            for j in range(len(X)):
                attribute_values[X[j][i]].append(y[j])

            error = sum(len(attribute_values[val]) - max(self.count_classes(attribute_values[val]).values()) for val in attribute_values)

            if error < min_error:
                min_error = error
                self.best_attribute = i
                self.mapping = {val: max(self.count_classes(attribute_values[val]), key=self.count_classes(attribute_values[val]).get) for val in attribute_values}

    def predict(self, X):
        if self.best_attribute is None:
            raise ValueError("The model has not been trained yet. Call fit() before predict()")

        predictions = []
        for sample in X:
            predictions.append(self.mapping[sample[self.best_attribute]])

        return predictions

    def count_classes(self, classes):
        counts = defaultdict(int)
        for c in classes:
            counts[c] += 1
        return counts

# Initialize the OneR model
oner_model = OneRClassifier()

y_train_bool = [bool(val) for val in y_train]

# Train the model
oner_model.fit(X_train, y_train)

# Make predictions
predictions = oner_model.predict(X_test)


KeyError: 0

### **K-Nearest-Neighbor Classifiers**

In [None]:
# Training the dataset
from sklearn.neighbors import KNeighborsClassifier

#Define the KNN Classifier
knn_classifier = KNeighborsClassifier()

knn = knn_classifier.fit(X_train, y_train)

In [None]:
#Validating the model using training set
y_pred = knn.predict(X_train)

acc = calc_accuracy(y_train, y_pred)
print("Accuracy: ", acc)
ap = avg_precision(y_train, y_pred)
print("Average Precision: ", ap)
ar = avg_recall(y_train, y_pred)
print("Average Recall: ", ar)
af = avg_f1(y_train, y_pred)
print("Average F1-Score: ", af)
TN, TP, FN, FP = confusion_matrix(y_train, y_pred)
print("Confusion Matrix:")
print(f"TN: {TN}   FP: {FP}")
print(f"FN: {FN}   TP: {TP}")

Accuracy:  0.9487179487179487
Average Precision:  3.024655875698897e-06
Average Recall:  0.0006738544474393531
Average F1-Score:  0.0019582245430809398
Confusion Matrix:
TN: 238   FP: 0
FN: 1   TP: 2


In [None]:
#Validating the model using validation set
y_pred = knn.predict(X_validation)

acc = calc_accuracy(y_validation, y_pred)
print("Accuracy: ", acc)
ap = avg_precision(y_validation, y_pred)
print("Average Precision: ", ap)
ar = avg_recall(y_validation, y_pred)
print("Average Recall: ", ar)
af = avg_f1(y_validation, y_pred)
print("Average F1-Score: ", af)
TN, TP, FN, FP = confusion_matrix(y_validation, y_pred)
print("Confusion Matrix:")
print(f"TN: {TN}   FP: {FP}")
print(f"FN: {FN}   TP: {TP}")

Accuracy:  0.7451737451737451
Average Precision:  0.0025423228051409076
Average Recall:  0.02544529262086514
Average F1-Score:  0.06134969325153374
Confusion Matrix:
TN: 32   FP: 0
FN: 20   TP: 0


In [None]:
#Validating
from sklearn.model_selection import GridSearchCV

# Define the hyperparameter grid for grid search
param_grid = {
    'n_neighbors': [3, 5, 7, 9],
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan']
}

# Initialize GridSearchCV with cross-validation (e.g., 5-fold)
grid_search = GridSearchCV(knn_classifier, param_grid, cv=5, scoring='accuracy')

# Fit GridSearchCV on the combined training and validation sets
X_train_new = X.iloc[:74]
y_train_new = y.iloc[:74]
grid_search.fit(X_train_new, y_train_new)

# Get the best model on the test set
best_knn_model = grid_search.best_estimator_

print(best_knn_model)

KNeighborsClassifier(metric='euclidean', n_neighbors=3)


In [None]:
#Testing best model using test set
y_pred = knn.predict(X_test)

acc = calc_accuracy(y_test, y_pred)
print("Accuracy: ", acc)
ap = avg_precision(y_test, y_pred)
print("Average Precision: ", ap)
ar = avg_recall(y_test, y_pred)
print("Average Recall: ", ar)
af = avg_f1(y_test, y_pred)
print("Average F1-Score: ", af)
TN, TP, FN, FP = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(f"TN: {TN}   FP: {FP}")
print(f"FN: {FN}   TP: {TP}")

Accuracy:  0.6
Average Precision:  0.010657299012418442
Average Recall:  0.05504587155963303
Average F1-Score:  0.13218390804597702
Confusion Matrix:
TN: 12   FP: 0
FN: 44   TP: 0


### **Naive Bayesian Classifier**

In [None]:
#Training the dataset
from sklearn.naive_bayes import GaussianNB

nb_classifier = GaussianNB()
nb = nb_classifier.fit(X_train, y_train)

In [None]:
#Validating the model using training set
y_pred = nb.predict(X_train)

acc = calc_accuracy(y_train, y_pred)
print("Accuracy: ", acc)
ap = avg_precision(y_train, y_pred)
print("Average Precision: ", ap)
ar = avg_recall(y_train, y_pred)
print("Average Recall: ", ar)
af = avg_f1(y_train, y_pred)
print("Average F1-Score: ", af)
TN, TP, FN, FP = confusion_matrix(y_train, y_pred)
print("Confusion Matrix:")
print(f"TN: {TN}   FP: {FP}")
print(f"FN: {FN}   TP: {TP}")

Accuracy:  0.7741935483870968
Average Precision:  2.9824899027571626e-06
Average Recall:  0.0008086253369272237
Average F1-Score:  0.0019582245430809398
Confusion Matrix:
TN: 85   FP: 2
FN: 0   TP: 3


In [None]:
#Validating the model using validation set
y_pred = nb.predict(X_validation)

acc = calc_accuracy(y_validation, y_pred)
print("Accuracy: ", acc)
ap = avg_precision(y_validation, y_pred)
print("Average Precision: ", ap)
ar = avg_recall(y_validation, y_pred)
print("Average Recall: ", ar)
af = avg_f1(y_validation, y_pred)
print("Average F1-Score: ", af)
TN, TP, FN, FP = confusion_matrix(y_validation, y_pred)
print("Confusion Matrix:")
print(f"TN: {TN}   FP: {FP}")
print(f"FN: {FN}   TP: {TP}")

Accuracy:  0.6486486486486487
Average Precision:  0.014052520714109732
Average Recall:  0.037319762510602206
Average F1-Score:  0.06134969325153374
Confusion Matrix:
TN: 8   FP: 0
FN: 4   TP: 0


In [None]:
#Testing best model using test set
y_pred = nb.predict(X_test)

acc = calc_accuracy(y_test, y_pred)
print("Accuracy: ", acc)
ap = avg_precision(y_test, y_pred)
print("Average Precision: ", ap)
ar = avg_recall(y_test, y_pred)
print("Average Recall: ", ar)
af = avg_f1(y_test, y_pred)
print("Average F1-Score: ", af)
TN, TP, FN, FP = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(f"TN: {TN}   FP: {FP}")
print(f"FN: {FN}   TP: {TP}")

Accuracy:  0.5307692307692308
Average Precision:  0.013981771841030336
Average Recall:  0.0871559633027523
Average F1-Score:  0.13218390804597702
Confusion Matrix:
TN: 5   FP: 0
FN: 16   TP: 0


### **Support vector machine (SVM)**

In [None]:
# Training the dataset
from sklearn.svm import SVC

#Define the SVC Classifier
svm_classifier = SVC()

svm = svm_classifier.fit(X_train, y_train)

In [None]:
#Validating the model using training set
y_pred = svm.predict(X_train)

acc = calc_accuracy(y_train, y_pred)
print("Accuracy: ", acc)
ap = avg_precision(y_train, y_pred)
print("Average Precision: ", ap)
ar = avg_recall(y_train, y_pred)
print("Average Recall: ", ar)
af = avg_f1(y_train, y_pred)
print("Average F1-Score: ", af)
TN, TP, FN, FP = confusion_matrix(y_train, y_pred)
print("Confusion Matrix:")
print(f"TN: {TN}   FP: {FP}")
print(f"FN: {FN}   TP: {TP}")

Accuracy:  0.9330024813895782
Average Precision:  2.681267684164944e-06
Average Recall:  0.0008086253369272237
Average F1-Score:  0.0019582245430809398
Confusion Matrix:
TN: 234   FP: 0
FN: 3   TP: 0


In [None]:
#Validating the model using validation set
y_pred = svm.predict(X_validation)

acc = calc_accuracy(y_validation, y_pred)
print("Accuracy: ", acc)
ap = avg_precision(y_validation, y_pred)
print("Average Precision: ", ap)
ar = avg_recall(y_validation, y_pred)
print("Average Recall: ", ar)
af = avg_f1(y_validation, y_pred)
print("Average F1-Score: ", af)
TN, TP, FN, FP = confusion_matrix(y_validation, y_pred)
print("Confusion Matrix:")
print(f"TN: {TN}   FP: {FP}")
print(f"FN: {FN}   TP: {TP}")

Accuracy:  0.6756756756756757
Average Precision:  0.002482398848584853
Average Recall:  0.02544529262086514
Average F1-Score:  0.06134969325153374
Confusion Matrix:
TN: 23   FP: 0
FN: 20   TP: 0


In [None]:
#Testing best model using test set
y_pred = svm.predict(X_test)

acc = calc_accuracy(y_test, y_pred)
print("Accuracy: ", acc)
ap = avg_precision(y_test, y_pred)
print("Average Precision: ", ap)
ar = avg_recall(y_test, y_pred)
print("Average Recall: ", ar)
af = avg_f1(y_test, y_pred)
print("Average F1-Score: ", af)
TN, TP, FN, FP = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(f"TN: {TN}   FP: {FP}")
print(f"FN: {FN}   TP: {TP}")

Accuracy:  0.5538461538461539
Average Precision:  0.01091128877792928
Average Recall:  0.052752293577981654
Average F1-Score:  0.13218390804597702
Confusion Matrix:
TN: 14   FP: 0
FN: 46   TP: 0
