In [1]:
#include libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

#load data and checking contents
data = pd.read_csv("car.data.csv")
print(data)

#check for null values
print("\nChecking NULL values:\n",data.isnull().sum())


     buying  maint  doors persons lug_boot safety  class
0     vhigh  vhigh      2       2    small    low  unacc
1     vhigh  vhigh      2       2    small    med  unacc
2     vhigh  vhigh      2       2    small   high  unacc
3     vhigh  vhigh      2       2      med    low  unacc
4     vhigh  vhigh      2       2      med    med  unacc
...     ...    ...    ...     ...      ...    ...    ...
1723    low    low  5more    more      med    med   good
1724    low    low  5more    more      med   high  vgood
1725    low    low  5more    more      big    low  unacc
1726    low    low  5more    more      big    med   good
1727    low    low  5more    more      big   high  vgood

[1728 rows x 7 columns]

Checking NULL values:
 buying      0
maint       0
doors       0
persons     0
lug_boot    0
safety      0
class       0
dtype: int64


In [2]:
#preprocessing data
categorical_cols = data.select_dtypes(include=['object']).columns.tolist()
print("Categorical Columns:", categorical_cols)
print("\n")

#label encoding features
from sklearn.preprocessing import LabelEncoder

labelEncoder = LabelEncoder()

for col in categorical_cols:
  data[col] = labelEncoder.fit_transform(data[col])

print(data)

Categorical Columns: ['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety', 'class']


      buying  maint  doors  persons  lug_boot  safety  class
0          3      3      0        0         2       1      2
1          3      3      0        0         2       2      2
2          3      3      0        0         2       0      2
3          3      3      0        0         1       1      2
4          3      3      0        0         1       2      2
...      ...    ...    ...      ...       ...     ...    ...
1723       1      1      3        2         1       2      1
1724       1      1      3        2         1       0      3
1725       1      1      3        2         0       1      2
1726       1      1      3        2         0       2      1
1727       1      1      3        2         0       0      3

[1728 rows x 7 columns]


In [3]:
#selecting target variable and features
X = data.drop('class', axis = 1)
y = data['class']

#splitting data for training, validating and testing
X_train = X.iloc[:1209]
X_validation = X.iloc[1209:1468]
X_test = X.iloc[1468:]
print(X_validation)
y_train = y.iloc[:1209]
y_validation = y.iloc[1209:1468]
y_test = y.iloc[1468:]

      buying  maint  doors  persons  lug_boot  safety
1209       2      1      0        2         1       1
1210       2      1      0        2         1       2
1211       2      1      0        2         1       0
1212       2      1      0        2         0       1
1213       2      1      0        2         0       2
...      ...    ...    ...      ...       ...     ...
1463       1      0      2        0         1       0
1464       1      0      2        0         0       1
1465       1      0      2        0         0       2
1466       1      0      2        0         0       0
1467       1      0      2        1         2       1

[259 rows x 6 columns]


In [13]:
#implementing decision tree
from sklearn.tree import DecisionTreeRegressor

regressor = DecisionTreeRegressor(random_state=42)

regressor.fit(X_train, y_train)

# Make predictions on the testing set
#y_pred = regressor.predict(X_test)


############################################

from sklearn.model_selection import GridSearchCV

# Define the parameter grid
param_grid = {
    'max_depth': [3, 5, 7],
    'min_samples_split': [2, 5, 10]
}

# Instantiate GridSearchCV with your model and parameter grid
grid_search = GridSearchCV(regressor, param_grid, cv=5, scoring='neg_mean_squared_error')

# Fit the grid search to the validation data
grid_search.fit(X_validation, y_validation)

# Get the best model and its parameters
best_model = grid_search.best_estimator_
best_params = grid_search.best_params_

########################################################

y_pred = best_model.predict(X_test)
y_pred_reshaped = y_pred.reshape(-1,1)
y_pred_final = y_pred_reshaped.astype(int)


# Evaluate the model's performance on the testing set
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score

# mse = mean_squared_error(y_test, y_pred)
# print("Mean Squared Error on Testing Set:", mse)

acc = accuracy_score(y_test, y_pred_final)
print("Accuracy on Testing Set:", acc)
# print(y_pred.shape)
# print(y_test.shape)
# print(y_pred)
# print(y_test)

Accuracy on Testing Set: 0.7384615384615385
