In [49]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
import numpy as np

#Assigning column names which are not included in the car.data file
column_names = ["buying", "maint", "doors", "persons", "lug_boot", "safety", "class"]
df = pd.read_csv('car.data', header = None, names = column_names)
df

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,class
0,vhigh,vhigh,2,2,small,low,unacc
1,vhigh,vhigh,2,2,small,med,unacc
2,vhigh,vhigh,2,2,small,high,unacc
3,vhigh,vhigh,2,2,med,low,unacc
4,vhigh,vhigh,2,2,med,med,unacc
...,...,...,...,...,...,...,...
1723,low,low,5more,more,med,med,good
1724,low,low,5more,more,med,high,vgood
1725,low,low,5more,more,big,low,unacc
1726,low,low,5more,more,big,med,good


Given the requirements, we will not be using the 'persons' column as a feature for our model.

In [50]:
features = ["maint", "doors", "lug_boot", "safety", "class"]
X = df[features]
y = df["buying"]

In [51]:
print(y.value_counts())

buying
vhigh    432
high     432
med      432
low      432
Name: count, dtype: int64


There are no class imbalances.

In [52]:
X_encoded = pd.get_dummies(X)
le = LabelEncoder()
y_encoded = le.fit_transform(y)

In [53]:
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y_encoded, test_size=0.2, random_state=42)

In [54]:
classifier = DecisionTreeClassifier(random_state = 42)
classifier.fit(X_train, y_train)

In [55]:
y_prediction = classifier.predict(X_test)
print(classification_report(y_test, y_prediction))

              precision    recall  f1-score   support

           0       0.09      0.14      0.11        92
           1       0.17      0.22      0.19        83
           2       0.01      0.01      0.01        77
           3       0.11      0.03      0.05        94

    accuracy                           0.10       346
   macro avg       0.10      0.10      0.09       346
weighted avg       0.10      0.10      0.09       346



Performance is poor based on current training, perform hyperparameter tuning(GridSearch).

In [71]:
param_grid = {
    "criterion": ["gini", "entropy", "log_loss"],
    "max_depth": [3, 5, 7, 9, 11, 13],
    "min_samples_split": [2, 5, 10, 20],
    "min_samples_leaf": [1, 2, 4, 8],
    "max_features": [None, "sqrt", "log2"]
}

from sklearn.model_selection import GridSearchCV

grid_search = GridSearchCV(
    estimator = classifier,
    param_grid = param_grid,
    scoring = 'accuracy',
    cv = 20,
    n_jobs =-1
)

grid_search.fit(X_train, y_train)

print(grid_search.best_params_)
print(grid_search.best_score_)

best_model = grid_search.best_estimator_



{'criterion': 'gini', 'max_depth': 5, 'max_features': None, 'min_samples_leaf': 8, 'min_samples_split': 2}
0.31259834368530015


In [72]:
new_y_pred = best_model.predict(X_test)
print(classification_report(y_test, new_y_pred))

              precision    recall  f1-score   support

           0       0.29      0.46      0.35        92
           1       0.22      0.25      0.23        83
           2       0.19      0.19      0.19        77
           3       0.46      0.13      0.20        94

    accuracy                           0.26       346
   macro avg       0.29      0.26      0.25       346
weighted avg       0.30      0.26      0.25       346



There is an improvement in the metrics after hyperparameter tuning.

In [58]:
prediction_sample = pd.DataFrame({
    "maint": ["high"],
    "doors": ["4"],
    "lug_boot": ["big"],
    "safety": ["high"],
    "class": ["good"]
})

prediction_sample_encoded = pd.get_dummies(prediction_sample)


prediction_sample_encoded = prediction_sample_encoded.reindex(columns=X_train.columns, fill_value=False)

predicted_numeric = best_model.predict(prediction_sample_encoded)
predicted_label = le.inverse_transform(predicted_numeric)

print(f"Based on the parameters, the buying price of the car is {predicted_label[0]}.")


Based on the parameters, the buying price of the car is low.
