In [1]:
import pandas as pd

In [4]:
df = pd.read_csv("car.data", names=["buying", "maint", "doors", "persons", "lug_boot", "safety", "class"])
df.head()

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,class
0,vhigh,vhigh,2,2,small,low,unacc
1,vhigh,vhigh,2,2,small,med,unacc
2,vhigh,vhigh,2,2,small,high,unacc
3,vhigh,vhigh,2,2,med,low,unacc
4,vhigh,vhigh,2,2,med,med,unacc


In [5]:
df.describe()

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,class
count,1728,1728,1728,1728,1728,1728,1728
unique,4,4,4,3,3,3,4
top,vhigh,vhigh,2,2,small,low,unacc
freq,432,432,432,576,576,576,1210


In [8]:
df.dtypes

buying      object
maint       object
doors       object
persons     object
lug_boot    object
safety      object
class       object
dtype: object

### Encode

all of the attributes are objects, so it has to be transfered to categories

looks like all of them has ordinal meaning, so lets encode it by categories

In [28]:
df["doors"].value_counts()

doors
2        432
3        432
4        432
5more    432
Name: count, dtype: int64

In [29]:
df_enc = df.copy()
df_enc["buying"] = df["buying"].map({"low": 0, "med":1, "high": 2, "vhigh": 3})
df_enc["maint"] = df["maint"].map({"low": 0, "med":1, "high": 2, "vhigh": 3})
df_enc["doors"] = df["doors"].map({"2": 0, "3":1, "4": 2, "5more": 3})
df_enc["persons"] = df["persons"].map({"2": 0, "4": 1, "more": 2})
df_enc["lug_boot"] = df["lug_boot"].map({"small": 0, "med":1, "big": 2})
df_enc["safety"] = df["safety"].map({"low": 0, "med":1, "high": 2})


In [32]:
df_enc.isna().sum()

buying      0
maint       0
doors       0
persons     0
lug_boot    0
safety      0
class       0
dtype: int64

### Make predictions

In [44]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
import numpy as np

In [35]:
X = df_enc.drop(["class"], axis=1)
y = df_enc["class"]

X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.1, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=0.1, random_state=42)

#### Model tuning

In [53]:
models = []
for i in range(10):
    np.random.seed(42 + i)
    max_depth = np.random.randint(1,10)
    criterion = np.random.choice(['gini', 'entropy', 'log_loss'])
    min_samples_split = np.random.randint(2,10)

    model = DecisionTreeClassifier(max_depth=max_depth, criterion=criterion, min_samples_split=min_samples_split)
    models.append(model)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_valid)
    res = f1_score(y_valid, y_pred, average="weighted")
    print(res)

0.8919909588365685
0.8690711733344948
0.8709213971641717
0.825861552153058
0.9128654004954584
0.9192385641903947
0.5088327755805699
0.9265212971937844
0.5088327755805699
0.9128654004954584


Looks like best model is the 8th, so take 7th index from models

In [55]:
best_model = models[7]
best_model

Lets test our model on test set

In [58]:
y_pred = model.predict(X_test)
res = f1_score(y_test, y_pred, average="weighted")
res

0.9314730391583884

The f1 score is even better than in validation set, nice!