### **In this notebook I perform an exploratory data analysis of the Car Evaluation dataset from the UCI Machine Learning Repository and train several classifiers on it.**

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline 
import seaborn as sns


# header=None: the first line of the file is read as data, not as column names.
# Column names are assigned explicitly in a later cell.
data = pd.read_csv('../input/car-evaluation-data-set/car_evaluation.csv', header = None)  

In [None]:
# Preview the first rows of the raw frame.
data.head()

In [None]:
# Column labels in file order; 'class' is the column later used as the target.
col_names = [
    'buying',
    'maint',
    'doors',
    'persons',
    'lug_boot',
    'safety',
    'class',
]
data.columns = col_names

col_names

In [None]:
# Summary statistics per column.
data.describe()

In [None]:
# drop_duplicates() returns a new frame and leaves `data` untouched, so the
# result has to be assigned back for the de-duplication to take effect.
data = data.drop_duplicates()

# Show the resulting size instead of dumping the whole frame.
data.shape

In [None]:
# Column dtypes and non-null counts.
data.info()

In [None]:
# Frequency table of every column, printed one after another.
for _, column_values in data.items():
    print(column_values.value_counts())

In [None]:
# Separate the input features from the target column.
target_col = 'class'
X = data.drop(columns=[target_col])
y = data[target_col]

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. stratify=y keeps the class proportions
# the same in both splits; random_state makes the split reproducible.
split_parts = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=42,
    stratify=y,
)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# Check of the stratified split: class counts in the 20% test set.
print(y_test.value_counts())

We will use an ordinal encoder, since the categories of every feature have a natural order (e.g. low < med < high < vhigh).

In [None]:
# Category levels of each feature, listed from lowest to highest.
_ordered_levels = {
    'buying': ['low', 'med', 'high', 'vhigh'],
    'maint': ['low', 'med', 'high', 'vhigh'],
    'doors': ['2', '3', '4', '5more'],
    'persons': ['2', '4', 'more'],
    'lug_boot': ['small', 'med', 'big'],
    'safety': ['low', 'med', 'high'],
}

# One {'col': ..., 'mapping': {level: rank}} entry per feature, with ranks
# starting at 0 for the lowest level.
mapping = [
    {'col': column, 'mapping': {level: rank for rank, level in enumerate(levels)}}
    for column, levels in _ordered_levels.items()
]

In [None]:
import category_encoders as ce

# Columns to encode; the names must match the DataFrame columns exactly.
feature_cols = ['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety']

encoder = ce.OrdinalEncoder(cols=feature_cols, mapping=mapping)

# Fit on the training split only, then apply the already-fitted encoder to the
# test split so both splits are guaranteed to share one encoding.
X_train = encoder.fit_transform(X_train)
X_test = encoder.transform(X_test)

We will generate synthetic samples with SMOTE to balance the classes in the training set.

In [None]:
from imblearn.over_sampling import SMOTE

# Only the training split is oversampled; the test split is left as it is.
sm = SMOTE(random_state=42)

# fit_resample is the current imbalanced-learn method name; the older
# fit_sample alias is no longer available in recent releases.
X_train, y_train = sm.fit_resample(X_train, y_train)

In [None]:
# Class counts in the training target after resampling.
print(y_train.value_counts())

# Decision Tree: GINI criterion

In [None]:
from sklearn.metrics import log_loss
from sklearn.tree import DecisionTreeClassifier

# Filled by tree_scores(): one log-loss value per evaluated max_depth.
training_loss, test_loss = [], []


def tree_scores(i):
    """Fit a Gini decision tree with ``max_depth=i`` and record its log-loss.

    Appends the training log-loss to the module-level list ``training_loss``
    and the test log-loss to ``test_loss``. Returns None.
    """
    model = DecisionTreeClassifier(criterion='gini', max_depth=i, random_state=42)
    model.fit(X_train, y_train)

    # log_loss needs class probabilities, hence predict_proba instead of predict.
    proba_test = model.predict_proba(X_test)
    proba_train = model.predict_proba(X_train)

    training_loss.append(log_loss(y_train, proba_train))
    test_loss.append(log_loss(y_test, proba_test))


# Evaluate depths 1 through 10.
for i in range(1, 11):
    tree_scores(i)

In [None]:
# x-axis: the evaluated max_depth values, one per recorded loss (1..N).
# Stored under its own name so the target series `y` is not overwritten.
depths = list(range(1, len(training_loss) + 1))

# Training and test log-loss against tree depth
plt.plot(depths, training_loss, label = "training loss ")
plt.plot(depths, test_loss, label = "test loss ")

plt.xlabel('Max Depth')
plt.ylabel('Log-Loss')
plt.title('Log-Loss plot ')
plt.legend()
plt.show()

In [None]:
from sklearn.model_selection import cross_val_score

# NOTE(review): X_train/y_train were oversampled with SMOTE before this point,
# so the folds contain synthetic samples — the score may be optimistic; confirm.
clf_gini = DecisionTreeClassifier(criterion='gini', max_depth=7, random_state=42)

# Mean score over 10 cross-validation folds.
fold_scores = cross_val_score(clf_gini, X_train, y_train, cv=10)
print('Cross-Validation Score:', fold_scores.mean())

In [None]:

# Final Gini tree. max_depth=7 was presumably chosen from the log-loss plot — TODO confirm.
clf_gini = DecisionTreeClassifier(criterion='gini', max_depth=7, random_state = 42)

# Fit on the training data; as the last expression, the fitted estimator is displayed.
clf_gini.fit(X_train, y_train)

In [None]:
# Predicted class labels for the test split.
y_pred_gini = clf_gini.predict(X_test)

In [None]:
# Score of the fitted Gini tree on both splits; a large gap hints at overfitting.
train_score = clf_gini.score(X_train, y_train)
test_score = clf_gini.score(X_test, y_test)

print(f'Training set score: {train_score:.4f}')

print(f'Test set score: {test_score:.4f}')

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import graphviz
from sklearn import tree




dot_data = tree.export_graphviz(clf_gini, out_file=None, 
                              feature_names=X_train.columns,  
                              class_names=y_train,  
                              filled=True, rounded=True,  
                              special_characters=True)

graph = graphviz.Source(dot_data) 

graph 

In [None]:
# Only confusion_matrix is imported: plot_confusion_matrix was never used in
# this notebook and no longer exists in scikit-learn >= 1.2, where importing it
# makes the cell fail.
from sklearn.metrics import confusion_matrix

# Confusion matrix of the Gini tree on the test split
# (true labels first, predicted labels second).
cf_matrix = confusion_matrix(y_test, y_pred_gini)

cf_matrix

In [None]:
# confusion_matrix orders its rows/columns by the sorted labels that occur in
# y_true or y_pred, so the tick labels are built the same way. unique() returns
# labels in order of first appearance and would label the axes incorrectly.
class_labels = sorted(set(y_test) | set(y_pred_gini))

fig, ax = plt.subplots(figsize=(20, 10))
# annot=True writes the count into each cell; fmt='d' prints it as a plain integer.
sns.heatmap(cf_matrix, annot=True, fmt='d', ax=ax)

# labels, title and ticks
ax.set_xlabel('Predicted labels')
ax.set_ylabel('True labels')
ax.set_title('Confusion Matrix')
ax.xaxis.set_ticklabels(class_labels)
ax.yaxis.set_ticklabels(class_labels);

In [None]:
from sklearn.metrics import classification_report

# Text report of the Gini tree's per-class metrics on the test split.
gini_report = classification_report(y_test, y_pred_gini)
print(gini_report)

# Decision Tree: ENTROPY criterion

In [None]:
# Reset the loss histories before the entropy sweep.
training_loss, test_loss = [], []


def tree_scores(i):
    """Fit an entropy-criterion decision tree with ``max_depth=i`` and log its losses.

    Side effects: appends the training log-loss to ``training_loss`` and the
    test log-loss to ``test_loss`` (module-level lists). Returns None.
    """
    model = DecisionTreeClassifier(criterion='entropy', max_depth=i, random_state=42)
    model.fit(X_train, y_train)

    # Class probabilities are required by log_loss.
    proba_test = model.predict_proba(X_test)
    proba_train = model.predict_proba(X_train)

    training_loss.append(log_loss(y_train, proba_train))
    test_loss.append(log_loss(y_test, proba_test))


# Sweep max_depth from 1 to 10.
for i in range(1, 11):
    tree_scores(i)

In [None]:
# x-axis: the evaluated max_depth values, one per recorded loss (1..N).
# Stored under its own name so the target series `y` is not overwritten.
depths = list(range(1, len(training_loss) + 1))

# Training and test log-loss against tree depth
plt.plot(depths, training_loss, label = "training loss ")
plt.plot(depths, test_loss, label = "test loss ")

plt.xlabel('Max Depth')
plt.ylabel('Log-Loss')
plt.title('Log-Loss plot ')
plt.legend()
plt.show()

In [None]:
# NOTE(review): the training data was oversampled with SMOTE earlier, so folds
# contain synthetic samples — the score may be optimistic; confirm.
clf_en = DecisionTreeClassifier(criterion='entropy', max_depth=6, random_state=42)

# Mean score over 20 cross-validation folds.
fold_scores = cross_val_score(clf_en, X_train, y_train, cv=20)
print('Cross-Validation Score:', fold_scores.mean())

In [None]:
# Final entropy tree. max_depth=6 was presumably chosen from the log-loss plot — TODO confirm.
clf_en = DecisionTreeClassifier(criterion='entropy', max_depth=6, random_state = 42)

# Fit on the training data; as the last expression, the fitted estimator is displayed.
clf_en.fit(X_train, y_train)

In [None]:
# Predicted class labels for the test split.
y_pred_en = clf_en.predict(X_test)

In [None]:
# Score of the fitted entropy tree on the training and test splits.
train_score = clf_en.score(X_train, y_train)
test_score = clf_en.score(X_test, y_test)

print(f'Training set score: {train_score:.4f}')

print(f'Test set score: {test_score:.4f}')

In [None]:
# class_names needs one name per class, in the classifier's own class order
# (clf_en.classes_). Passing the whole label column would look up individual
# training samples instead. Plain lists are used for both name arguments
# because they are accepted across scikit-learn versions.
dot_data = tree.export_graphviz(clf_en, out_file=None,
                                feature_names=list(X_train.columns),
                                class_names=[str(label) for label in clf_en.classes_],
                                filled=True, rounded=True,
                                special_characters=True)

graph = graphviz.Source(dot_data)

# Last expression: render the tree in the notebook.
graph

In [None]:
# Confusion matrix of the entropy tree on the test split
# (true labels first, predicted labels second).
cm = confusion_matrix(y_test, y_pred_en)

print('Confusion matrix\n\n', cm)

In [None]:
# confusion_matrix orders its rows/columns by the sorted labels that occur in
# y_true or y_pred, so the tick labels are built the same way. unique() returns
# labels in order of first appearance and would label the axes incorrectly.
class_labels = sorted(set(y_test) | set(y_pred_en))

fig, ax = plt.subplots(figsize=(20, 10))
# annot=True writes the count into each cell; fmt='d' prints it as a plain integer.
sns.heatmap(cm, annot=True, fmt='d', ax=ax)

# labels, title and ticks
ax.set_xlabel('Predicted labels')
ax.set_ylabel('True labels')
ax.set_title('Confusion Matrix')
ax.xaxis.set_ticklabels(class_labels)
ax.yaxis.set_ticklabels(class_labels);

In [None]:
# Text report of the entropy tree's per-class metrics on the test split.
print(classification_report(y_test, y_pred_en))

# GRADIENT BOOSTING

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# Reset the loss histories before the gradient-boosting sweep.
training_loss, test_loss = [], []


def tree_scores(i):
    """Fit a gradient-boosting classifier with ``max_depth=i`` and log its losses.

    Side effects: appends the training log-loss to ``training_loss`` and the
    test log-loss to ``test_loss`` (module-level lists). Returns None.
    """
    model = GradientBoostingClassifier(max_depth=i, random_state=42)
    # The target is handed over as a flat 1-D array.
    model.fit(X_train, y_train.values.ravel())

    # Class probabilities are required by log_loss.
    proba_test = model.predict_proba(X_test)
    proba_train = model.predict_proba(X_train)

    training_loss.append(log_loss(y_train, proba_train))
    test_loss.append(log_loss(y_test, proba_test))


# Sweep max_depth from 1 to 10.
for i in range(1, 11):
    tree_scores(i)

In [None]:
# x-axis: the evaluated max_depth values, one per recorded loss (1..N).
# Stored under its own name so the target series `y` is not overwritten.
depths = list(range(1, len(training_loss) + 1))

# Training and test log-loss against tree depth
plt.plot(depths, training_loss, label = "training loss ")
plt.plot(depths, test_loss, label = "test loss ")

plt.xlabel('Max Depth')
plt.ylabel('Log-Loss')
plt.title('Log-Loss plot ')
plt.legend()
plt.show()

In [None]:
# NOTE(review): the training data was oversampled with SMOTE earlier, so folds
# contain synthetic samples — the score may be optimistic; confirm.
clf_gb = GradientBoostingClassifier(max_depth=5, random_state=42)

# Mean score over 10 cross-validation folds.
fold_scores = cross_val_score(clf_gb, X_train, y_train.values.ravel(), cv=10)
print('Cross-Validation Score:', fold_scores.mean())

In [None]:
# Final gradient-boosting model. max_depth=5 was presumably chosen from the log-loss plot — TODO confirm.
clf_gb = GradientBoostingClassifier( max_depth=5, random_state = 42)

# Fit on the training data, passing the target as a flat 1-D array.
clf_gb.fit(X_train, y_train.values.ravel())

In [None]:
# Predicted class labels for the test split.
y_pred_gb = clf_gb.predict(X_test)

In [None]:
# Score of the fitted gradient-boosting model on the training and test splits.
train_score = clf_gb.score(X_train, y_train)
test_score = clf_gb.score(X_test, y_test)

print(f'Training set score: {train_score:.4f}')

print(f'Test set score: {test_score:.4f}')

In [None]:
# estimators_[0, 0] picks a single tree out of the fitted ensemble (row 0,
# column 0). The individual trees of a gradient-boosting classifier are
# regression trees, so no class_names are passed: they would not be drawn.
# feature_names is a plain list, which every scikit-learn version accepts.
dot_data = tree.export_graphviz(clf_gb.estimators_[0, 0], out_file=None,
                                feature_names=list(X_train.columns),
                                filled=True, rounded=True,
                                special_characters=True)

graph = graphviz.Source(dot_data)

# Last expression: render the tree in the notebook.
graph

In [None]:
# Confusion matrix of the gradient-boosting model on the test split
# (true labels first, predicted labels second).
cm = confusion_matrix(y_test, y_pred_gb)

print('Confusion matrix\n\n', cm)

In [None]:
# confusion_matrix orders its rows/columns by the sorted labels that occur in
# y_true or y_pred, so the tick labels are built the same way. pd.unique()
# returns labels in order of first appearance and would label the axes incorrectly.
class_labels = sorted(set(y_test) | set(y_pred_gb))

fig, ax = plt.subplots(figsize=(20, 10))
# annot=True writes the count into each cell; fmt='d' prints it as a plain integer.
sns.heatmap(cm, annot=True, fmt='d', ax=ax)

# labels, title and ticks
ax.set_xlabel('Predicted labels')
ax.set_ylabel('True labels')
ax.set_title('Confusion Matrix')
ax.xaxis.set_ticklabels(class_labels)
ax.yaxis.set_ticklabels(class_labels);

In [None]:
# Text report of the gradient-boosting model's per-class metrics on the test split.
print(classification_report(y_test, y_pred_gb))

# XGBOOST

In [None]:
import xgboost as xgb

# NOTE(review): `clf` is not referenced by any other visible cell; the models
# used below are created as `clf_xgb`. Looks like a leftover — confirm before removing.
clf = xgb.XGBClassifier(max_depth=2, n_jobs = 4)

In [None]:
# Reset the loss histories before the XGBoost sweep.
training_loss, test_loss = [], []


def tree_scores(i):
    """Fit an XGBoost classifier with ``max_depth=i`` and log its losses.

    Side effects: appends the training log-loss to ``training_loss`` and the
    test log-loss to ``test_loss`` (module-level lists). Returns None.
    """
    model = xgb.XGBClassifier(max_depth=i, random_state=42, n_jobs=4)
    # NOTE(review): y_train still holds the original string class labels (no
    # label encoding is applied in the visible cells). Some xgboost releases
    # expect integer-encoded classes — confirm against the installed version.
    model.fit(X_train, y_train.values.ravel())

    # Class probabilities are required by log_loss.
    proba_test = model.predict_proba(X_test)
    proba_train = model.predict_proba(X_train)

    training_loss.append(log_loss(y_train, proba_train))
    test_loss.append(log_loss(y_test, proba_test))


# Sweep max_depth from 1 to 10.
for i in range(1, 11):
    tree_scores(i)

In [None]:
# x-axis: the evaluated max_depth values, one per recorded loss (1..N).
# Stored under its own name so the target series `y` is not overwritten.
depths = list(range(1, len(training_loss) + 1))

# Training and test log-loss against tree depth
plt.plot(depths, training_loss, label = "training loss ")
plt.plot(depths, test_loss, label = "test loss ")

plt.xlabel('Max Depth')
plt.ylabel('Log-Loss')
plt.title('Log-Loss plot ')
plt.legend()
plt.show()

In [None]:
# NOTE(review): the training data was oversampled with SMOTE earlier, so folds
# contain synthetic samples — the score may be optimistic; confirm.
clf_xgb = xgb.XGBClassifier(max_depth=4, random_state=42, n_jobs=4)

# Mean score over 10 cross-validation folds.
fold_scores = cross_val_score(clf_xgb, X_train, y_train.values.ravel(), cv=10)
print('Cross-Validation Score:', fold_scores.mean())

In [None]:
# Final XGBoost model. max_depth=4 was presumably chosen from the log-loss plot — TODO confirm.
clf_xgb = xgb.XGBClassifier( max_depth=4, random_state = 42, n_jobs = 4)

# Fit on the training data, passing the target as a flat 1-D array.
clf_xgb.fit(X_train, y_train.values.ravel())

In [None]:
# Predicted class labels for the test split.
y_pred_xgb = clf_xgb.predict(X_test)

In [None]:
# Score of the fitted XGBoost model on the training and test splits.
train_score = clf_xgb.score(X_train, y_train)
test_score = clf_xgb.score(X_test, y_test)

print(f'Training set score: {train_score:.4f}')

print(f'Test set score: {test_score:.4f}')

In [None]:
# Draw a single boosted tree on a large canvas so the node text stays readable.
# num_trees=4 selects which tree to draw — presumably an arbitrary pick for illustration; TODO confirm.
fig, ax = plt.subplots(figsize=(30, 30))
xgb.plot_tree(clf_xgb, num_trees=4, ax=ax)
plt.show()

In [None]:
# Confusion matrix of the XGBoost model on the test split
# (true labels first, predicted labels second).
cm = confusion_matrix(y_test, y_pred_xgb)

print('Confusion matrix\n\n', cm)

In [None]:
# confusion_matrix orders its rows/columns by the sorted labels that occur in
# y_true or y_pred, so the tick labels are built the same way. pd.unique()
# returns labels in order of first appearance and would label the axes incorrectly.
class_labels = sorted(set(y_test) | set(y_pred_xgb))

fig, ax = plt.subplots(figsize=(20, 10))
# annot=True writes the count into each cell; fmt='d' prints it as a plain integer.
sns.heatmap(cm, annot=True, fmt='d', ax=ax)

# labels, title and ticks
ax.set_xlabel('Predicted labels')
ax.set_ylabel('True labels')
ax.set_title('Confusion Matrix')
ax.xaxis.set_ticklabels(class_labels)
ax.yaxis.set_ticklabels(class_labels);

In [None]:
# Text report of the XGBoost model's per-class metrics on the test split.
print(classification_report(y_test, y_pred_xgb))