## Car Evaluation

This dataset is composed of 1728 records and 6 different attributes which are buying price, price of maintenance, number of doors, capacity in terms of persons to carry, the relative size of luggage boot and the estimated safety value of each car. There are no missing values ​​in the dataset, which is an advantage!

In [None]:
!pip install plotly flaml\[notebook]

In [2]:
# Built-in libraries
import pickle
from pathlib import Path

# Data analysis
import pandas as pd
import plotly.express as px

# Machine learning
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from flaml import AutoML

In [3]:
for path in Path('./datasets').rglob('*'):
    print(path.name)

titanic.csv
car_evaluation.csv
netflix.csv
used_car_prices.csv
youtube_trends_us.csv


In [4]:
df = pd.read_csv('./datasets/car_evaluation.csv', header=None)
df.head()

Unnamed: 0,0,1,2,3,4,5,6
0,vhigh,vhigh,2,2,small,low,unacc
1,vhigh,vhigh,2,2,small,med,unacc
2,vhigh,vhigh,2,2,small,high,unacc
3,vhigh,vhigh,2,2,med,low,unacc
4,vhigh,vhigh,2,2,med,med,unacc


In [5]:
df.rename(columns={
    0: 'buying',
    1: 'maint',
    2: 'doors',
    3: 'persons',
    4: 'lug_boot',
    5: 'safety',
    6: 'class',
}, inplace=True)

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1728 entries, 0 to 1727
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   buying    1728 non-null   object
 1   maint     1728 non-null   object
 2   doors     1728 non-null   object
 3   persons   1728 non-null   object
 4   lug_boot  1728 non-null   object
 5   safety    1728 non-null   object
 6   class     1728 non-null   object
dtypes: object(7)
memory usage: 94.6+ KB


In [7]:
df.describe()

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,class
count,1728,1728,1728,1728,1728,1728,1728
unique,4,4,4,3,3,3,4
top,vhigh,vhigh,2,2,small,low,unacc
freq,432,432,432,576,576,576,1210


In [10]:
df.duplicated().sum()

0

In [8]:
df.nunique()

buying      4
maint       4
doors       4
persons     3
lug_boot    3
safety      3
class       4
dtype: int64

In [11]:
df.groupby(by=['class']).agg({
    'buying': ['min', 'max'],
    'maint': ['min', 'max'],
    'lug_boot': ['min', 'max'],
    'safety': ['min', 'max'],
})

Unnamed: 0_level_0,buying,buying,maint,maint,lug_boot,lug_boot,safety,safety
Unnamed: 0_level_1,min,max,min,max,min,max,min,max
class,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
acc,high,vhigh,high,vhigh,big,small,high,med
good,low,med,low,med,big,small,high,med
unacc,high,vhigh,high,vhigh,big,small,high,med
vgood,low,med,high,med,big,med,high,high


In [12]:
px.histogram(data_frame=df, x='class', color='safety', title='Car Class Distribution Chart')

In [13]:
le = LabelEncoder()
for col in df.columns:
    print(f'Transforming column `{col}`...')
    df[col] = le.fit_transform(df[col])

Transforming column `buying`...
Transforming column `maint`...
Transforming column `doors`...
Transforming column `persons`...
Transforming column `lug_boot`...
Transforming column `safety`...
Transforming column `class`...


Correlations after encoding the labels:

In [16]:
px.imshow(df.corr())

In [18]:
X = df.drop(columns=['class'])
y = df['class']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2)
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((1382, 6), (1382,), (346, 6), (346,))

I want to use `FLAML` to figure out the best model on this dataset!

In [25]:
with open('./exports/car_evaluation_model.pkl', 'rb') as f:
    automl = pickle.load(f)
automl

AutoML(append_log=False, auto_augment=True, custom_hp={},
       cv_score_agg_func=None, early_stop=False, ensemble=False,
       estimator_list='auto', eval_method='auto', fit_kwargs_by_estimator={},
       hpo_method='auto', keep_search_state=False, learner_selector='sample',
       log_file_name='', log_training_metric=False, log_type='better',
       max_iter=None, mem_thres=4294967296, metric='auto',
       metric_constraints=[], min_sample_size=10000, model_history=False,
       n_concurrent_trials=1, n_jobs=-1, n_splits=5, pred_time_limit=inf,
       preserve_checkpoint=True, retrain_full=True, sample=True,
       skip_transform=False, split_ratio=0.1, ...)

In [None]:
automl = AutoML()
automl.fit(X_train, y_train, task='classification', time_budget=1*60)

In [21]:
automl.model.estimator

ExtraTreesClassifier(max_features=1.0, max_leaf_nodes=80, n_estimators=7,
                     n_jobs=-1)

In [26]:
y_pred = automl.predict(X_test)
y_pred

array([1, 0, 0, 2, 2, 2, 3, 0, 2, 0, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 0, 3,
       2, 2, 2, 2, 2, 0, 3, 0, 2, 2, 2, 2, 0, 2, 2, 2, 2, 0, 2, 2, 3, 2,
       0, 2, 2, 0, 0, 2, 2, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2,
       2, 2, 2, 0, 2, 2, 2, 2, 2, 3, 0, 2, 2, 2, 2, 2, 2, 1, 1, 2, 0, 2,
       2, 0, 2, 2, 2, 2, 2, 2, 2, 0, 2, 0, 2, 2, 2, 2, 2, 2, 1, 2, 3, 2,
       2, 0, 2, 2, 0, 2, 2, 0, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 2, 2,
       2, 2, 3, 0, 2, 0, 2, 1, 2, 1, 2, 2, 2, 2, 0, 3, 0, 2, 2, 2, 2, 3,
       2, 0, 2, 2, 1, 0, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 2,
       2, 0, 0, 2, 0, 2, 2, 2, 0, 3, 2, 2, 0, 0, 2, 2, 2, 2, 0, 2, 3, 3,
       2, 2, 1, 2, 2, 2, 2, 2, 0, 2, 2, 2, 0, 2, 2, 2, 2, 1, 2, 0, 0, 2,
       2, 2, 2, 2, 2, 0, 2, 2, 3, 2, 0, 2, 2, 0, 0, 0, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 1, 2, 2, 0, 2, 2, 2, 3, 2, 2, 0, 0, 0, 0, 2, 0, 2, 2,
       2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 0, 3, 2, 2, 1, 0, 2, 0, 0, 0,
       1, 2, 0, 0, 2, 2, 0, 2, 2, 2, 2, 2, 2, 3, 0,

In [27]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.89      0.99      0.94        67
           1       0.94      0.83      0.88        18
           2       1.00      0.98      0.99       244
           3       0.94      0.94      0.94        17

    accuracy                           0.97       346
   macro avg       0.94      0.93      0.94       346
weighted avg       0.97      0.97      0.97       346



In [24]:
with open('./exports/car_evaluation_model.pkl', 'wb') as f:
    pickle.dump(automl, f)