## Libraries

In [None]:
import pandas as pd
import numpy as np
from numpy import asarray
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn import tree
from sklearn.tree import DecisionTreeClassifier

from sklearn import metrics
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.linear_model import LogisticRegression

from sklearn.utils import compute_class_weight

import warnings
warnings.filterwarnings('ignore')

pd.set_option('display.max_rows', 50)

In [None]:
import os

# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for input_path in input_paths:
    print(input_path)


## Read data and rename columns

In [None]:
# Location of the car-evaluation CSV inside the Kaggle input directory.
filename='/kaggle/input/car-evaluation-data-set/car_evaluation.csv'
# header=None: the first row is read as data, so the columns get integer labels
# (they are renamed to descriptive names in the next cell).
df = pd.read_csv(filename, header=None)

In [None]:
# Give the positional columns 0-6 descriptive names; "y" is the column later
# used as the classification target.
column_names = ["buying_price", "maintenance_cost", "door_no",
                "people_no", "lug_boot", "safety", "y"]
df = df.rename(columns=dict(enumerate(column_names)))

In [None]:
df.head()

# EDA 

In [None]:
df.shape

## Null values

In [None]:
df.isnull().sum()

## Initial Stats 

In [None]:
df.describe()

## Check column values

In [None]:
df.buying_price.unique()

In [None]:
df.maintenance_cost.unique()

In [None]:
df.door_no.unique()

In [None]:
df.people_no.unique()

In [None]:
df.y.unique()

In [None]:
df.dtypes

In [None]:
# The open-ended categories are not numeric, so map them onto '5' before the
# columns are converted to numbers: '5more' doors -> '5', 'more' people -> '5'.
df['door_no'] = df['door_no'].replace({'5more': '5'})
df['people_no'] = df['people_no'].replace({'more': '5'})

In [None]:
# Parse the two count-like columns as numbers.
numeric_cols = ["door_no", "people_no"]
df[numeric_cols] = df[numeric_cols].apply(pd.to_numeric)

In [None]:
# Cast the remaining categorical columns (and the target) to strings.
categorical_cols = ["buying_price", "maintenance_cost", "lug_boot", "safety", "y"]
df[categorical_cols] = df[categorical_cols].astype("str")

In [None]:
df.head()

## Univariate analysis

In [None]:
def plot(column):
    """Draw a bar chart of how often each value of ``df[column]`` occurs.

    Reads the notebook-level DataFrame ``df``.

    Parameters
    ----------
    column : str
        Name of the column in ``df`` whose value frequencies are plotted.

    Returns
    -------
    The result of ``plt.show()``.
    """
    # Labels and heights must come from the same Series. Pairing
    # ``df[column].unique()`` (order of first appearance) with
    # ``value_counts()`` (ordered by frequency) attaches counts to the wrong
    # labels whenever those two orders differ.
    counts = df[column].value_counts()
    plt.bar(counts.index, counts.values)
    plt.ylabel('Distribution by ' + column)
    return plt.show()

In [None]:
plot('y')

In [None]:
plot('buying_price')

In [None]:
plot('maintenance_cost')

In [None]:
plot('door_no')

In [None]:
plot('people_no')

In [None]:
plot('safety')

In [None]:
sns.pairplot(df)

## Bivariate Analysis

In [None]:
def grp_brplt(col1):
    """Plot, for each value of ``col1``, the number of rows per target class ``y``.

    Reads the notebook-level DataFrame ``df``, counts rows for every
    (``y``, ``col1``) combination and draws them as grouped bars with the
    count written above each bar.

    Parameters
    ----------
    col1 : str
        Name of the feature column in ``df`` to break down by target class.

    Returns
    -------
    The axes object returned by ``sns.barplot``.
    """
    # One row per (target class, feature value) pair with its row count.
    pair_counts = df.groupby(['y', col1]).size().to_frame('total').reset_index()

    plt.figure(figsize=(10, 8))
    ax = plt.subplot()
    ax = sns.barplot(
        data=pair_counts,
        x=pair_counts[col1],
        y=pair_counts["total"],
        hue=pair_counts["y"],
    )

    # Write each bar's height just above its top edge, centred horizontally.
    for bar in ax.patches:
        bar_height = bar.get_height()
        bar_centre = bar.get_x() + bar.get_width() / 2.
        ax.annotate(
            f"{bar_height:.1f}",
            (bar_centre, bar_height),
            ha='center',
            va='center',
            xytext=(0, 9),
            textcoords='offset points',
        )

    ax.set_title('Distribution of ' + col1 + ' per target variable', fontsize=20)
    # Anchor the legend outside the plotting area, to the right.
    ax.legend(loc='center right', bbox_to_anchor=(1.25, 0.5), ncol=1, title='y')
    return ax

In [None]:
grp_brplt("maintenance_cost")

In [None]:
grp_brplt("buying_price")

In [None]:
grp_brplt("door_no")

In [None]:
grp_brplt("people_no")

In [None]:
grp_brplt("lug_boot")

In [None]:
grp_brplt("safety")

# Data transformation 

## Ordinal encoding 

In [None]:
# Binary version of the target: every non-'unacc' label listed here becomes
# 'acc'; anything else becomes 'unacc'.
acceptable_labels = ['acc', 'good', 'vgood']
is_acceptable = df['y'].isin(acceptable_labels)
df['y1'] = np.where(is_acceptable, 'acc', 'unacc')

The code below declares category lists that fix the order in which the ordinal encoder assigns values to each variable.
Because the variables do not all share the same set of values, two category lists are declared.
For the target columns the encoding order does not matter, so the encoder's default order is kept.
The `if` statement inside the loop selects which category list applies to each feature.

In [None]:
category_lmh = asarray(['low', 'med', 'high', 'vhigh'])
category_smb = asarray(['small', 'med', 'big'])

inputlist = ["buying_price", "maintenance_cost", "lug_boot", "safety", "y", "y1"]

# Explicit category order for each feature. Columns that are not listed here
# (the two target columns) are encoded with the encoder's own default order.
explicit_order = {
    "buying_price": category_lmh,
    "maintenance_cost": category_lmh,
    "safety": category_lmh,
    "lug_boot": category_smb,
}

outputlist = []
for column in inputlist:
    # Each encoded column is stored next to its source as '<column>_ordenc'.
    output = column + '_ordenc'
    outputlist.append(output)

    if column in explicit_order:
        enc = preprocessing.OrdinalEncoder(categories=[explicit_order[column]])
    else:
        enc = preprocessing.OrdinalEncoder()

    df[output] = enc.fit_transform(df[[column]])
        

In [None]:
df

## Target encoding 

In [None]:
def trgenc(column):
    """Add two mean-target encodings of ``column`` to the notebook-level ``df``.

    For every distinct value of ``df[column]`` the mean of the encoded target
    is computed and mapped back onto the rows, creating two new columns:

    * ``<column>_trg_bin_enc``   - mean of ``y1_ordenc`` (binary target)
    * ``<column>_trg_multi_enc`` - mean of ``y_ordenc`` (multi-class target)

    NOTE(review): the means are computed over every row of ``df``, i.e. before
    the train/test split done inside the model helpers, so test-set targets
    contribute to the encoded features.

    Parameters
    ----------
    column : str
        Name of the feature column in ``df`` to encode.

    Returns
    -------
    None
        ``df`` is modified in place.
    """
    # Per-category mean of both encoded targets, computed in one pass.
    target_means = df.groupby(column)[['y1_ordenc', 'y_ordenc']].mean()

    # Binary target first, then multi-class, so column order is stable.
    df[column + '_trg_bin_enc'] = df[column].map(target_means['y1_ordenc'])
    df[column + '_trg_multi_enc'] = df[column].map(target_means['y_ordenc'])

    return None

In [None]:
targetenccol=["buying_price", "maintenance_cost", "door_no", "people_no" ,"lug_boot", "safety"]

for column in targetenccol:
        trgenc(column)

# Classification

## Random Forest

In [None]:
def rf(X, y, n_estimators, max_depth, min_samples_split, random_state, class_weight):
    """Fit a random forest on a train split and summarise it on the test split.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix; its column names label the feature importances.
    y : array-like
        Target labels aligned with ``X``.
    n_estimators, max_depth, min_samples_split, random_state, class_weight
        Passed unchanged to ``RandomForestClassifier``.

    Returns
    -------
    tuple
        ``(cmdf, df_classification_report, feature_importances)``:
        the confusion matrix as a DataFrame, the classification report as a
        DataFrame sorted by ascending precision, and the feature importances
        sorted in descending order.
    """
    # The split always uses seed 0; `random_state` only seeds the forest.
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    forest = RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        random_state=random_state,
        class_weight=class_weight,
    )
    forest.fit(X_train, y_train)
    predictions = forest.predict(X_test)

    # Feature importances, most important first.
    importance_table = pd.DataFrame(
        forest.feature_importances_,
        index=X_train.columns,
        columns=['importance'],
    )
    importance_table = importance_table.sort_values('importance', ascending=False)

    # Classification report as a DataFrame, lowest precision first.
    report_dict = metrics.classification_report(y_test, predictions, output_dict=True)
    report_table = pd.DataFrame(report_dict).transpose()
    report_table = report_table.sort_values(by=['precision'], ascending=True)

    # Confusion matrix as a DataFrame.
    confusion_table = pd.DataFrame(confusion_matrix(y_test, predictions))

    return (confusion_table, report_table, importance_table)

In [None]:
# Binary-target run using all six target-encoded features.
binary_features = [
    'buying_price_trg_bin_enc', 'maintenance_cost_trg_bin_enc',
    'door_no_trg_bin_enc', 'people_no_trg_bin_enc',
    'lug_boot_trg_bin_enc', 'safety_trg_bin_enc',
]
X = df[binary_features].copy()

y = df['y1_ordenc']

# Keyword arguments make it explicit which hyperparameter each number sets.
rf(X, y, n_estimators=12, max_depth=12, min_samples_split=300, random_state=0, class_weight=None)

In [None]:
rf (X, y, 40, 3, 350, 42, None)

##  Optimize Random Forest based on feature importance 

In [None]:
X = df[['people_no_trg_bin_enc', 'safety_trg_bin_enc', 'maintenance_cost_trg_bin_enc']].copy()
y = df['y1_ordenc']
rf (X, y, 40, 3, 350, 42, None)

##  Random Forest on 4 values target  

In [None]:
set(df['y_ordenc'])

In [None]:
X = df[['buying_price_trg_multi_enc', 'maintenance_cost_trg_multi_enc', 'safety_trg_multi_enc', 
        'people_no_trg_multi_enc', 'door_no_trg_multi_enc', 'lug_boot_trg_multi_enc']].copy()

y = df['y_ordenc']
rf (X, y, 50, 3, 350, 42, None)

##  Random forest on 4 values target based on feature importance

In [None]:
X = df[['people_no_trg_multi_enc', 'safety_trg_multi_enc']].copy()
y = df['y_ordenc']
rf (X, y, 50, 3, 146, 0, None)

#  Decision Tree

In [None]:
def treeclf(X, y, random_state, min_samples_split, max_depth, class_weight):
    """Fit a decision tree on a train split and summarise it on the test split.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix; its column names label the feature importances.
    y : array-like
        Target labels aligned with ``X``.
    random_state, min_samples_split, max_depth, class_weight
        Passed unchanged to ``tree.DecisionTreeClassifier``. Note the order:
        ``min_samples_split`` comes before ``max_depth``.

    Returns
    -------
    tuple
        ``(cmdf, df_classification_report, feature_importances)``:
        the confusion matrix as a DataFrame, the classification report as a
        DataFrame sorted by ascending precision, and the feature importances
        sorted in descending order.
    """
    # The split always uses seed 0; `random_state` only seeds the tree.
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    clf = tree.DecisionTreeClassifier(
        random_state=random_state,
        min_samples_split=min_samples_split,
        max_depth=max_depth,
        class_weight=class_weight,
    )
    clf = clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    # Feature importances, most important first.
    feature_importances = pd.DataFrame(
        clf.feature_importances_,
        index=X_train.columns,
        columns=['importance'],
    ).sort_values('importance', ascending=False)

    # Classification report as a DataFrame, lowest precision first.
    report = metrics.classification_report(y_test, y_pred, output_dict=True)
    df_classification_report = pd.DataFrame(report).transpose()
    df_classification_report = df_classification_report.sort_values(by=['precision'], ascending=True)

    # Confusion matrix as a DataFrame. It is computed once, here; an earlier
    # call whose result was discarded has been removed.
    cm = confusion_matrix(y_test, y_pred)
    cmdf = pd.DataFrame(cm)

    return (cmdf, df_classification_report, feature_importances)

##  DT on 4 values target

In [None]:
# Four-class target run using all six target-encoded features.
multi_features = [
    'buying_price_trg_multi_enc', 'maintenance_cost_trg_multi_enc', 'safety_trg_multi_enc',
    'people_no_trg_multi_enc', 'door_no_trg_multi_enc', 'lug_boot_trg_multi_enc',
]
X = df[multi_features].copy()

y = df['y_ordenc']
# Keyword arguments make it explicit which hyperparameter each number sets.
treeclf(X, y, random_state=42, min_samples_split=70, max_depth=20, class_weight=None)

In [None]:
treeclf (X, y, 42, 80, 20, None)

##  DT with feature importance on 4 values target

In [None]:
X = df[['buying_price_trg_multi_enc', 'safety_trg_multi_enc','people_no_trg_multi_enc']].copy()
y = df['y_ordenc']
treeclf (X, y, 42, 70, 40, None)

In [None]:
treeclf (X, y, 42, 170, 40, None)

Based on the results above for Random Forest and Decision Trees, I conclude from the recall that the decision tree performs better than the Random Forest when no adjustment is made for the class imbalance.
The best results are obtained with a decision tree using random_state=42, min_samples_split=70, max_depth=20.

Starting from this decision tree, I now go back to the dataset to address the imbalance problem and check how the decision tree model performs.

# Imbalance Problem 

## Find Class weights 

In [None]:
np.unique(df['y_ordenc'])

In [None]:
# 'balanced' class weights for the four-class target, one weight per class label.
# `classes` and `y` are passed by keyword: current scikit-learn releases accept
# them only as keyword arguments and raise a TypeError for positional use, while
# the keyword form also works on older releases.
classWeights = compute_class_weight(
    'balanced',
    classes=np.unique(df['y_ordenc']),
    y=np.array(df['y_ordenc']),
)
classWeights

In [None]:
# Transforming the class weight array into a dictionary to pass it in function.
# The mapping is built from `classWeights` instead of retyping the printed
# numbers, so it stays in sync with the data. `classWeights` holds one weight per
# label, in the order of the sorted unique labels it was computed from.
class_weight = dict(zip(np.unique(df['y_ordenc']), classWeights))

## Decision Trees with Class Weights

In [None]:
X = df[['buying_price_trg_multi_enc', 'maintenance_cost_trg_multi_enc', 'safety_trg_multi_enc', 
        'people_no_trg_multi_enc', 'door_no_trg_multi_enc', 'lug_boot_trg_multi_enc']].copy()

y = df['y_ordenc']
treeclf (X, y, 42, 100, 20, class_weight)

In [None]:
treeclf (X, y, 42, 100, 20, None)

## Random Forest with Class Weights 

In [None]:
X = df[['buying_price_trg_multi_enc', 'maintenance_cost_trg_multi_enc', 'safety_trg_multi_enc', 
        'people_no_trg_multi_enc', 'door_no_trg_multi_enc', 'lug_boot_trg_multi_enc']].copy()

y = df['y_ordenc']
rf (X, y, 50, 3, 350, 42, class_weight)

Conclusion: Overall, the Random Forest with class weights performs slightly better than the Decision Tree with class weights. The reason is the recall for class 0.

# Logistic Regression for Multi Class with OnevsRest

In [None]:
def lgovr(X, y, random_state, solver, max_iter, multi_class, class_weight):
    """Fit a logistic regression on a train split and summarise it on the test split.

    Parameters
    ----------
    X : array-like
        Feature matrix.
    y : array-like
        Target labels aligned with ``X``.
    random_state, solver, max_iter, multi_class, class_weight
        Passed unchanged to ``LogisticRegression``.

    Returns
    -------
    tuple
        ``(cmdf, df_classification_report)``: the confusion matrix as a
        DataFrame and the classification report as a DataFrame sorted by
        ascending precision.
    """
    # The split always uses seed 0; `random_state` only seeds the model.
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    model = LogisticRegression(
        random_state=random_state,
        solver=solver,
        max_iter=max_iter,
        multi_class=multi_class,
        class_weight=class_weight,
    )
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)

    # Classification report as a DataFrame, lowest precision first.
    report_dict = metrics.classification_report(y_test, predictions, output_dict=True)
    report_table = pd.DataFrame(report_dict).transpose()
    report_table = report_table.sort_values(by=['precision'], ascending=True)

    # Confusion matrix as a DataFrame.
    confusion_table = pd.DataFrame(confusion_matrix(y_test, predictions))

    return (confusion_table, report_table)

In [None]:
X = df[['buying_price_trg_multi_enc', 'maintenance_cost_trg_multi_enc', 'safety_trg_multi_enc', 
        'people_no_trg_multi_enc', 'door_no_trg_multi_enc', 'lug_boot_trg_multi_enc']].copy()

y = df['y_ordenc']
lgovr (X, y, 14, 'saga', 200, 'ovr', class_weight)

In [None]:
def lgovr(X, y, multi_class, class_weight):
    """Fit a default-configured logistic regression and summarise it on the test split.

    NOTE(review): this rebinds the name ``lgovr``. Once this cell has run, the
    earlier seven-argument definition is no longer reachable, so calls using
    that signature will fail.

    Parameters
    ----------
    X : array-like
        Feature matrix.
    y : array-like
        Target labels aligned with ``X``.
    multi_class, class_weight
        Passed unchanged to ``LogisticRegression``; every other setting is
        left at the estimator's default.

    Returns
    -------
    tuple
        ``(cmdf, df_classification_report)``: the confusion matrix as a
        DataFrame and the classification report as a DataFrame sorted by
        ascending precision.
    """
    # The train/test split always uses seed 0.
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    estimator = LogisticRegression(multi_class=multi_class, class_weight=class_weight)
    estimator.fit(X_train, y_train)
    y_hat = estimator.predict(X_test)

    # Classification report as a DataFrame, lowest precision first.
    per_class_scores = pd.DataFrame(
        metrics.classification_report(y_test, y_hat, output_dict=True)
    ).transpose().sort_values(by=['precision'], ascending=True)

    # Confusion matrix as a DataFrame.
    confusion_frame = pd.DataFrame(confusion_matrix(y_test, y_hat))

    return (confusion_frame, per_class_scores)

In [None]:
lgovr (X, y,'ovr', class_weight)