## Decision Tree and Random Forest Projects

In [1]:
#Import main libraries
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns 

We will use the **Car Evaluation Data Set** from Kaggle: https://www.kaggle.com/datasets/elikplim/car-evaluation-data-set

In [2]:
#Load Dataset
df_car = pd.read_csv("../data/raw/car_evaluation.csv", header=None)

### Exploratory Analysis

In [3]:
## Visualize data
df_car.head()

Unnamed: 0,0,1,2,3,4,5,6
0,vhigh,vhigh,2,2,small,low,unacc
1,vhigh,vhigh,2,2,small,med,unacc
2,vhigh,vhigh,2,2,small,high,unacc
3,vhigh,vhigh,2,2,med,low,unacc
4,vhigh,vhigh,2,2,med,med,unacc


In [4]:
## Look for the shape
df_car.shape

(1728, 7)

In [5]:
## Change header names
col_names=["buying","maint","doors","persons","lug_boot","safety","class"]
df_car.columns = col_names

In [6]:
## Visualize data types
df_car.dtypes

buying      object
maint       object
doors       object
persons     object
lug_boot    object
safety      object
class       object
dtype: object

In [7]:
## Explore targeting value
df_car['class'].value_counts()

class
unacc    1210
acc       384
good       69
vgood      65
Name: count, dtype: int64

In [8]:
#Verify missing values
df_car.isnull().sum()

buying      0
maint       0
doors       0
persons     0
lug_boot    0
safety      0
class       0
dtype: int64

### Data Processing

In [9]:
#Define X and y
X = df_car.drop(['class'], axis = 1)
y = df_car['class']

In [10]:
from sklearn.model_selection import train_test_split

In [11]:
#30% for testing
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.30, random_state = 42)

### Model Training with Decision Tree

In [None]:
## Convert our categorical data types
import category_encoders as ce

encoder = ce.OrdinalEncoder(cols = ['buying', 'maint', 'doors', 'persons','lug_boot', 'safety'])

X_train = encoder.fit_transform(X_train)

X_test = encoder.transform(X_test)

In [13]:
X_train.head()

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety
1178,1,1,1,1,1,1
585,2,2,2,2,2,2
1552,3,1,2,1,3,3
1169,1,1,1,3,1,1
1033,1,2,3,3,1,3


In [14]:
#Import decision tree
from sklearn.tree import DecisionTreeClassifier

#Model creation
tree = DecisionTreeClassifier(max_depth=2, random_state = 0)

In [15]:
tree.fit(X_train, y_train)

In [16]:
# Predictions
y_train_pred_tree = tree.predict(X_train)
y_test_pred_tree = tree.predict(X_test)

#### Evaluate tree model

In [18]:
from sklearn.metrics import accuracy_score

#Accuracy for train
train_accuracy_tree = accuracy_score(y_train, y_train_pred_tree)

#Accuracy for test
test_accuracy_tree = accuracy_score(y_test, y_test_pred_tree)

print('El accuracy en train es:', train_accuracy_tree)
print('El accuracy en test es:', test_accuracy_tree)

El accuracy en train es: 0.7733664185277088
El accuracy en test es: 0.7591522157996147


In [25]:
import seaborn as sns

importances = tree.feature_importances_
columns = X.columns
sns.barplot(importances,columns ,saturation = 2.0, edgecolor ='black', linewidth = 2)
plt.title('Importancia de cada Feature')
plt.show()

TypeError: barplot() takes from 0 to 1 positional arguments but 2 positional arguments (and 1 keyword-only argument) were given

### Model Training with Randon forest