In [None]:
# import the libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz, plot_tree

In [None]:
# Read the cars dataset from the Kaggle input folder and preview the frame
df = pd.read_csv(filepath_or_buffer='../input/cars-brand-prediction/cars.csv')
df

In [None]:
# Count missing values per column (isna is the canonical spelling of isnull)
df.isna().sum()

We can see that there are no null values in the dataset.

In [None]:
# Print the frame summary: column dtypes, non-null counts and memory usage
df.info()

Our target variable is in a categorical format, so we will convert it into a numerical format using LabelEncoder.

In [None]:
# Encoder used below to map the brand labels to integer codes
le = LabelEncoder()

In [None]:
# Encode the brand labels as integer codes for the classifier.
# The guard keeps this cell idempotent: re-running it on the already-encoded
# column would refit `le` on integers and lose the original class names that
# the tree visualisations read from `le.classes_`.
if not pd.api.types.is_numeric_dtype(df['brand']):
    df['brand'] = le.fit_transform(df['brand'])
df['brand']

Now we will split our data into independent variables (the features) and the dependent variable (the target).

In [None]:
# Feature matrix: every column except the target
X = df.drop(columns='brand')
X

In [None]:
# Target vector: the brand column
Y = df.loc[:, 'brand']
Y

Now we will split our data into training and testing sets.

In [None]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable
train_x, test_x, train_y, test_y = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=99,
)

In [None]:
# Shapes of the four split pieces, in the order they were unpacked
tuple(part.shape for part in (train_x, test_x, train_y, test_y))

Now we will fit a decision tree classifier.

In [None]:
# Fix the seed: scikit-learn randomly permutes the candidate features at every
# split, so an unseeded tree can report different importances and accuracy on
# each run of the notebook.
dt = DecisionTreeClassifier(random_state=99)

In [None]:
# Train the tree on the training split
dt.fit(X=train_x, y=train_y)

We will check how much each feature contributes by looking at the tree's feature importances, and then keep only the important features.

In [None]:
# One row per feature with its importance (rounded to 3 decimals), highest first
features = pd.DataFrame(
    {'Features': train_x.columns, 'Importance': dt.feature_importances_.round(3)}
).sort_values(by='Importance', ascending=False)

In [None]:
# Display the importance table built in the previous cell
features

We can see that the cylinders feature is not at all important to the model, so we will drop it.

In [None]:
# Rebuild the feature matrix from `df` instead of from `X` itself, so this cell
# can be re-run without a KeyError on the already-dropped 'cylinders' column.
X = df.drop(columns=['brand', 'cylinders'])
X

In [None]:
# Same 70/30 split and seed as before, now on the current feature matrix
train_x, test_x, train_y, test_y = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=99,
)

In [None]:
# Shapes of the four split pieces, in the order they were unpacked
tuple(part.shape for part in (train_x, test_x, train_y, test_y))

In [None]:
# Fit a fresh tree on the current training split. The fixed seed makes the
# fitted tree (and therefore the reported accuracy) reproducible across runs,
# since scikit-learn randomly permutes the features at each split.
dt = DecisionTreeClassifier(random_state=99)
dt.fit(train_x, train_y)

In [None]:
# Predicted brand code for every row of the test split
y_pred = dt.predict(X=test_x)
y_pred

In [None]:
# Side-by-side table of true and predicted labels, indexed like test_y
results = test_y.to_frame(name='Actual').assign(Predicted=y_pred)
results

In [None]:
# Mean accuracy of the tree on the test split
dt.score(X=test_x, y=test_y)

In [None]:
# Same accuracy computed from the stored predictions
accuracy_score(y_true=test_y, y_pred=y_pred)

We will now visualise how this decision tree works.

In [None]:
# Export the fitted tree as DOT source text. With out_file left at its default
# (None), export_graphviz returns the DOT string instead of writing a file.
# Rendering it needs the separate Graphviz dependency; alternatively the text
# can be saved to a .dot file and converted to png/jpg with an external tool.
graph_dot = export_graphviz(
    dt,
    filled=True,
    rounded=True,
    special_characters=True,
    feature_names=list(train_x.columns),
    class_names=list(le.classes_),
)

In [None]:
# graphviz.Source renders the DOT text on its own, so no matplotlib figure is
# created here (a plt.figure() call would only emit an empty figure).
# The import stays in this cell so the rest of the notebook still runs when the
# optional graphviz package is not installed.
import graphviz
graph = graphviz.Source(graph_dot)
graph

In [None]:
# Draw the same tree with matplotlib. The names are passed as plain lists, as
# in the export_graphviz call: some scikit-learn releases validate these
# arguments as lists and reject a pandas Index / NumPy array.
plt.figure(figsize=(20, 20))
plot_tree(
    dt,
    feature_names=list(train_x.columns),
    class_names=list(le.classes_),
    filled=True,
);  # trailing semicolon hides the list-of-annotations repr in the cell output