In [144]:
import xgboost as xgb
from xgboost import XGBClassifier
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, mean_squared_error, r2_score
import numpy as np
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
import category_encoders as ce
import warnings

warnings.filterwarnings("ignore")
# load data
categories = ["buying", "main", "doors", "lug_boot", "safety", "class"]
x_cat = categories[1:]
df = pd.read_csv("car.data", names=categories)
# Maintenance = High
# Number of doors = 4
# Lug Boot Size = Big
# Safety = High
# Class Value = Good
section5 = pd.DataFrame.from_dict(
    {"main": [2], "doors": [3], "lug_boot": [3], "safety": [3], "class": [3]}
)

In [182]:
# encode data 
df_temp = df.copy()
df_temp = df_temp[categories]
for cat in categories:
    df_temp[cat] = df_temp[cat].astype("category")
df_temp_cols = df_temp.columns
df_temp[df_temp_cols] = df_temp[df_temp_cols].apply(lambda x: x.cat.codes)

# create result set and feature set
X, y = df_temp.iloc[:, 1:], df_temp.iloc[:, 0]

# create Dmatrix for optimised performance
df_matrix = xgb.DMatrix(data=X, label=y, enable_categorical=True)

# train test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=123
)

# create classifier object
xg_classifier = xgb.XGBClassifier(use_label_encoder=False)

# fit the model and check accuracy
xg_classifier.fit(X_train, y_train)
y_pred = xg_classifier.predict(X_test)

# accuracy score
print(accuracy_score(y_test, y_pred), xg_classifier.predict(section5[x_cat]))



0.09248554913294797 [1]


In [181]:
# create result set and feature set
X = df[x_cat]
y = df["buying"]

# train test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=69
)
# encode data
encoder = ce.OrdinalEncoder(cols=x_cat)
X_train = encoder.fit_transform(X_train)
X_test = encoder.fit_transform(X_test)

# max depth of 3 seems to be the best
for i in range(1, 10):
    forest = RandomForestClassifier(max_depth=i, n_estimators=1000, random_state=69)

    forest.fit(X_train, y_train)

    y_pred = forest.predict(X_test)

    print(accuracy_score(y_test, y_pred), forest.predict(section5[x_cat]))

0.26011560693641617 ['low']
0.24855491329479767 ['low']
0.2947976878612717 ['low']
0.25722543352601157 ['low']
0.20809248554913296 ['low']
0.13872832369942195 ['low']
0.13005780346820808 ['low']
0.11849710982658959 ['low']
0.11271676300578035 ['low']


In [148]:
# trying to remove one feature
y = df["buying"]
best_categories = x_cat
best_accuracy = 0
best_tree = None
for i in range(len(x_cat)):
    tempCat = x_cat[:i] + x_cat[i + 1 :]
    X = df[tempCat]

    # train test split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=69
    )

    # encode data
    encoder = ce.OrdinalEncoder(cols=tempCat)
    X_train = encoder.fit_transform(X_train)
    X_test = encoder.fit_transform(X_test)

    tree_gini = DecisionTreeClassifier(criterion="gini", max_depth=3, random_state=69)

    tree_gini.fit(X_train, y_train)

    y_pred = tree_gini.predict(X_test)
    tmp_acc = accuracy_score(y_test, y_pred)

    if tmp_acc > best_accuracy:
        best_accuracy = tmp_acc
        best_categories = tempCat
        best_tree = tree_gini

In [166]:
sec5_pred = best_tree.predict(section5[best_categories])
sec5_pred[0]

'low'

In [179]:
from sklearn.model_selection import KFold

kf_df = df[categories]

rn = range(1, 1728)
kf = KFold(n_splits=100, shuffle=False)

tree_gini = DecisionTreeClassifier(criterion="gini", max_depth=3, random_state=69)

best_trees = {}

for train_index, test_index in kf.split(rn):
    X_train = kf_df.iloc[train_index].loc[:, x_cat]
    X_test = kf_df.iloc[test_index].loc[:, x_cat]
    y_train = kf_df.iloc[train_index].loc[:, 'buying']
    y_test = kf_df.iloc[test_index].loc[:, 'buying']

    encoder = ce.OrdinalEncoder(cols=x_cat)
    X_train = encoder.fit_transform(X_train)
    X_test = encoder.fit_transform(X_test)

    tree_gini.fit(X_train, y_train)

    best_trees[accuracy_score(y_test, tree_gini.predict(X_test))] = tree_gini

In [180]:
best_accuracy = 0
best_tree = None
for k, v in best_trees.items():
    if k > best_accuracy:
        best_accuracy = k
        best_tree = v
if best_tree:
    print(best_tree.predict(section5[x_cat]), best_accuracy)

['low'] 0.7058823529411765


In [183]:
# train with all data
X = df[x_cat]
encoder = ce.OrdinalEncoder(cols=x_cat)
X = encoder.fit_transform(X)
y = df['buying']
tree_gini = DecisionTreeClassifier(criterion="gini", max_depth=3, random_state=69)
tree_gini.fit(X, y)
tree_gini.predict(section5[x_cat])

array(['low'], dtype=object)