In [1]:
import warnings
warnings.filterwarnings('ignore')

In [48]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier

from sklearn.model_selection import GridSearchCV,StratifiedKFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, roc_curve, roc_auc_score, confusion_matrix, plot_confusion_matrix
import matplotlib.pyplot as plt

In [26]:
df = pd.read_csv('data/car_evaluation.csv', header=None, names=['buying','maintenance','doors','persons','lug_boot','safety','decision'])
df.head()

Unnamed: 0,buying,maintenance,doors,persons,lug_boot,safety,decision
0,vhigh,vhigh,2,2,small,low,unacc
1,vhigh,vhigh,2,2,small,med,unacc
2,vhigh,vhigh,2,2,small,high,unacc
3,vhigh,vhigh,2,2,med,low,unacc
4,vhigh,vhigh,2,2,med,med,unacc


In [27]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1728 entries, 0 to 1727
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   buying       1728 non-null   object
 1   maintenance  1728 non-null   object
 2   doors        1728 non-null   object
 3   persons      1728 non-null   object
 4   lug_boot     1728 non-null   object
 5   safety       1728 non-null   object
 6   decision     1728 non-null   object
dtypes: object(7)
memory usage: 94.6+ KB


In [28]:
df['buying'].value_counts()

med      432
low      432
vhigh    432
high     432
Name: buying, dtype: int64

In [29]:
df['decision'].value_counts()

unacc    1210
acc       384
good       69
vgood      65
Name: decision, dtype: int64

In [60]:
df['persons'].value_counts()

more    576
2       576
4       576
Name: persons, dtype: int64

In [63]:
df['doors'].value_counts()

3        432
5more    432
2        432
4        432
Name: doors, dtype: int64

In [61]:
df['persons']=df['persons'].replace({'more':6})

In [64]:
df['doors']=df['doors'].replace({'5more':5})

In [30]:
get_d=pd.get_dummies(df[['buying','maintenance','lug_boot','safety']],drop_first=True)

In [31]:
df = pd.concat([df,get_d], axis=1)

In [65]:
df

Unnamed: 0,doors,persons,decision,buying_low,buying_med,buying_vhigh,maintenance_low,maintenance_med,maintenance_vhigh,lug_boot_med,lug_boot_small,safety_low,safety_med
0,2,2,unacc,0,0,1,0,0,1,0,1,1,0
1,2,2,unacc,0,0,1,0,0,1,0,1,0,1
2,2,2,unacc,0,0,1,0,0,1,0,1,0,0
3,2,2,unacc,0,0,1,0,0,1,1,0,1,0
4,2,2,unacc,0,0,1,0,0,1,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1723,5,6,good,1,0,0,1,0,0,1,0,0,1
1724,5,6,vgood,1,0,0,1,0,0,1,0,0,0
1725,5,6,unacc,1,0,0,1,0,0,0,0,1,0
1726,5,6,good,1,0,0,1,0,0,0,0,0,1


In [33]:
df = df.drop(['buying','maintenance','lug_boot','safety'], axis=1)

In [35]:
from sklearn.preprocessing import LabelEncoder
y=df['decision']
label_encoder = LabelEncoder().fit(y)
y = label_encoder.transform(y)

In [42]:
np.unique(y,return_counts=True )

(array([0, 1, 2, 3]), array([ 384,   69, 1210,   65], dtype=int64))

In [68]:
X=df.drop('decision', axis=1)

In [71]:
from sklearn.preprocessing import LabelBinarizer

In [72]:
class DecisionTree_Classifier:
    def __init__(self):
        self.DTC = DecisionTreeClassifier()
    
    def get_params_for_dtc(self, train_x, train_y):
        try:
            self.param_grid_for_dtc={"criterion":['gini','entropy'], 'max_depth':range(1,10),
                                     "min_samples_split": range(1,10), "min_samples_leaf":range(1,5),
                                     "max_features":['auto', 'sqrt','log2']}
            self.str_kfold_cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=7)
            self.grid = GridSearchCV(estimator=self.DTC, param_grid=self.param_grid_for_dtc, verbose=0, cv=self.str_kfold_cv)
            self.grid.fit(train_x,train_y)
            
            self.criterions = self.grid.best_params_['criterion']
            self.max_depth = self.grid.best_params_['max_depth']
            self.min_samples_split = self.grid.best_params_['min_samples_split']
            self.min_samples_leaf = self.grid.best_params_['min_samples_leaf']
            self.max_features = self.grid.best_params_['max_features']
            
            self.dtc = DecisionTreeClassifier(criterion=self.criterions, max_depth=self.max_depth, min_samples_split=self.min_samples_split,
                                              min_samples_leaf = self.min_samples_leaf, max_features=self.max_features)
            self.dtc.fit(train_x,train_y)
            print(f'DecisionTreeClassifier best params:{str(self.grid.best_params_)}.')
            return self.dtc
       
        except Exception as e:
            print(f'Exception occured in get_params_for_dtc method. Exception message: {str(e)}')
            print('LogisticReg Parameter tuning  failed.')
            
    def dtc_model(self, train_x, test_x, train_y, test_y):
        try:
            self.dtc = self.get_params_for_dtc(train_x, train_y)
            self.predict_dtc = self.dtc.predict(test_x)
            self.predict_prob_dtc = self.dtc.predict_proba(test_x)#[:,1]
#             print(self.predict_prob_dtc)
            self.conf_mat = confusion_matrix(test_y, self.predict_dtc)
            print(f"Confusion matrix: \n {self.conf_mat}")
            print(classification_report(test_y,self.predict_dtc))
            #if len(test_y.unique()) == 1: #if there is only one label in y, then roc_auc_score returns error. We will use accuracy in that case
            self.dtc_score = accuracy_score(test_y, self.predict_dtc)
            print(f'Accuracy for DecisionTreeClassifier:{str(self.dtc_score)}')
            #else:
            
            lb = LabelBinarizer()
            lb.fit(test_y)
            y_test = lb.transform(test_y)
            y_pred = lb.transform(self.predict_dtc)
            self.dtc_AUC_score = roc_auc_score(y_test, y_pred, average='macro')
#             auc={}
#             auc += roc_auc_score(test_y, self.predict_prob_dtc,  multi_class='ovo')
            print(f'AUC for DecisionTreeClassifier: {self.dtc_AUC_score}')
#             fpr , tpr , thresholds   = roc_curve(test_y, self.predict_dtc)
#             auc_score = auc(fpr,tpr) same as roc_auc_score for each class
            return self.dtc
        except Exception as e:
            print(f'Exception occured in dtc_model methods. Exception message: {str(e)}')
            raise Exception()            

In [73]:
X_train, X_test, y_train, y_test = train_test_split(X.values,y, test_size=0.2, random_state=123)
dtc = DecisionTree_Classifier()
dtc.dtc_model(X_train, X_test, y_train, y_test)

DecisionTreeClassifier best params:{'criterion': 'gini', 'max_depth': 9, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 2}.
Confusion matrix: 
 [[ 57   3  17   7]
 [  5   7   0   1]
 [ 26   0 211   0]
 [  8   1   0   3]]
              precision    recall  f1-score   support

           0       0.59      0.68      0.63        84
           1       0.64      0.54      0.58        13
           2       0.93      0.89      0.91       237
           3       0.27      0.25      0.26        12

    accuracy                           0.80       346
   macro avg       0.61      0.59      0.60       346
weighted avg       0.81      0.80      0.81       346

Accuracy for DecisionTreeClassifier:0.8034682080924855
AUC for DecisionTreeClassifier: 0.7520682441597508


DecisionTreeClassifier(max_depth=9, max_features='sqrt', min_samples_leaf=2)

In [84]:
class RandomForest_Classifier:
    def __init__(self):
        self.RFC = RandomForestClassifier()
    
    def get_params_for_rfc(self, train_x, train_y):
        try:
            self.param_grid_for_rfc={"n_estimators": [10, 50, 100, 130], "criterion": ['gini', 'entropy'],
                                     "max_depth": range(10, 15, 15), "max_features": ['auto', 'log2']}
            
            #self.str_kfold_cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=7)
            self.grid = GridSearchCV(estimator=self.RFC, param_grid=self.param_grid_for_rfc, verbose=0, cv=5)
            self.grid.fit(train_x,train_y)
            
            self.criterion = self.grid.best_params_['criterion']
            self.max_depth = self.grid.best_params_['max_depth']
            self.max_features = self.grid.best_params_['max_features']
            self.n_estimators = self.grid.best_params_['n_estimators']
            
            self.rfc = RandomForestClassifier(criterion=self.criterion, max_depth=self.max_depth, n_estimators=self.n_estimators,
                                              max_features=self.max_features)
            self.rfc.fit(train_x,train_y)
            print(f'RandomForestClassifier best params:{str(self.grid.best_params_)}.')
            return self.rfc
       
        except Exception as e:
            print(f'Exception occured in get_params_for_dtc method. Exception message: {str(e)}')
            print('RandomForestClassifier Parameter tuning  failed.')
            
    def rfc_model(self, train_x, test_x, train_y, test_y):
        try:
            self.rfc = self.get_params_for_rfc(train_x, train_y)
            self.predict_rfc = self.rfc.predict(test_x)
            self.predict_prob_rfc = self.rfc.predict_proba(test_x)#[:,1]
#             print(self.predict_prob_rfc)
            self.conf_mat = confusion_matrix(test_y, self.predict_rfc)
            print(f"Confusion matrix: \n {self.conf_mat}")
            print(classification_report(test_y,self.predict_rfc))
            #if len(test_y.unique()) == 1: #if there is only one label in y, then roc_auc_score returns error. We will use accuracy in that case
            self.rfc_score = accuracy_score(test_y, self.predict_rfc)
            print(f'Accuracy for RandomForest_Classifier:{str(self.rfc_score)}')
            #else:
            
            lb = LabelBinarizer()
            lb.fit(test_y)
            y_test = lb.transform(test_y)
            y_pred = lb.transform(self.predict_rfc)
            self.rfc_AUC_score = roc_auc_score(y_test, y_pred, average='macro')
#             auc={}
#             auc += roc_auc_score(test_y, self.predict_prob_rfc,  multi_class='ovo')
            print(f'AUC for RandomForest_Classifier: {self.rfc_AUC_score}')
#             fpr , tpr , thresholds   = roc_curve(test_y, self.predict_rfc)
#             auc_score = auc(fpr,tpr) same as roc_auc_score for each class
            return self.rfc
        except Exception as e:
            print(f'Exception occured in rfc_model methods. Exception message: {str(e)}')
            raise Exception()            

In [85]:
X_train, X_test, y_train, y_test = train_test_split(X.values,y, test_size=0.2, random_state=123)
rfc = RandomForest_Classifier()
rfc.rfc_model(X_train, X_test, y_train, y_test)

RandomForestClassifier best params:{'criterion': 'entropy', 'max_depth': 10, 'max_features': 'log2', 'n_estimators': 100}.
Confusion matrix: 
 [[ 79   1   3   1]
 [  0  11   0   2]
 [  7   1 229   0]
 [  0   1   0  11]]
              precision    recall  f1-score   support

           0       0.92      0.94      0.93        84
           1       0.79      0.85      0.81        13
           2       0.99      0.97      0.98       237
           3       0.79      0.92      0.85        12

    accuracy                           0.95       346
   macro avg       0.87      0.92      0.89       346
weighted avg       0.96      0.95      0.95       346

Accuracy for RandomForest_Classifier:0.953757225433526
AUC for RandomForest_Classifier: 0.9496637363832778


RandomForestClassifier(criterion='entropy', max_depth=10, max_features='log2')