# Classification and Regression Trees (CART) 

Classification and Regression Trees (CART) adalah salah satu metode nonparametrik yang dapat menggambarkan hubungan antara variabel independen terhadap variabel dependennya.  CART terdiri dari dua jenis analisis, yaitu pohon klasifikasi, dan pohon regresi. Metode ini dapat diterapkan pada data dengan variabel dan ukuran objek yang besar. Oleh karena itu, metode CART didefinisikan dalam bentuk seperti pohon. Jika variabel terikatnya kontinu, maka pohon yang dihasilkan disebut pohon regresi (regression tree). 

# Prosedur CART

# Library

In [70]:
import pandas as pd
import numpy as n

from sklearn.model_selection import train_test_split #Membagi data training dan data testing
from sklearn.metrics import classification_report # untuk evaluasi model 
from sklearn import tree #Untuk model decision tree 

import plotly.express as px  # Untuk Visualisasi Data
import plotly.graph_objects as go # Untuk Visualisasi Data
import graphviz # untuk menggambarkan decision tree graph

# Impor Data

Data yang digunakan adalah data Wine Quality Dataset yang diambil dari dataset kaggle

In [72]:
df=pd.read_csv('WineQT.csv')#membaca data
df#Menampilkan Data

Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,pH,sulphates,alcohol,quality,Id
0,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5,0
1,7.8,0.880,0.00,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,9.8,5,1
2,7.8,0.760,0.04,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,9.8,5,2
3,11.2,0.280,0.56,1.9,0.075,17.0,60.0,0.99800,3.16,0.58,9.8,6,3
4,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1138,6.3,0.510,0.13,2.3,0.076,29.0,40.0,0.99574,3.42,0.75,11.0,6,1592
1139,6.8,0.620,0.08,1.9,0.068,28.0,38.0,0.99651,3.42,0.82,9.5,6,1593
1140,6.2,0.600,0.08,2.0,0.090,32.0,44.0,0.99490,3.45,0.58,10.5,5,1594
1141,5.9,0.550,0.10,2.2,0.062,39.0,51.0,0.99512,3.52,0.76,11.2,6,1595


In [73]:
df=df[pd.isnull(df['quality'])==False]#Membuat catatan di mana target quality=NaN
df=df.fillna(df.mean())#mengganti nilai missing menjadi rata-rata
df

Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,pH,sulphates,alcohol,quality,Id
0,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5,0
1,7.8,0.880,0.00,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,9.8,5,1
2,7.8,0.760,0.04,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,9.8,5,2
3,11.2,0.280,0.56,1.9,0.075,17.0,60.0,0.99800,3.16,0.58,9.8,6,3
4,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1138,6.3,0.510,0.13,2.3,0.076,29.0,40.0,0.99574,3.42,0.75,11.0,6,1592
1139,6.8,0.620,0.08,1.9,0.068,28.0,38.0,0.99651,3.42,0.82,9.5,6,1593
1140,6.2,0.600,0.08,2.0,0.090,32.0,44.0,0.99490,3.45,0.58,10.5,5,1594
1141,5.9,0.550,0.10,2.2,0.062,39.0,51.0,0.99512,3.52,0.76,11.2,6,1595


# Fungsi Klasifikasi CART

In [74]:
def fitting(X, y, criterion, splitter, mdepth, clweight, minleaf):

    # membagi data training dan data testing
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

    # membuat model
    model = tree.DecisionTreeClassifier(criterion=criterion, 
                                        splitter=splitter, 
                                        max_depth=mdepth,
                                        class_weight=clweight,
                                        min_samples_leaf=minleaf, 
                                        random_state=0, 
                                  )
    clf = model.fit(X_train, y_train)

    # menentukan klas pada data training
    pred_labels_tr = model.predict(X_train)
    # menentukan klas pada data training
    pred_labels_te = model.predict(X_test)

    # Summary Pohon dan Evaluasi Model
    print('-----------------Summary Pohon -----------------')
    print('Kelas: ', clf.classes_)
    print('Kedalaman pohon: ', clf.tree_.max_depth)
    print('Jumlah Daun: ', clf.tree_.n_leaves)
    print('Jumlah Fitur: ', clf.n_features_in_)
    print('--------------------------------------------------------')
    print("")
    
    print('---------------- Evaluasi dengan Data Testing----------------')
    score_te = model.score(X_test, y_test)
    print('Akurasi: ', score_te)
    # Hasil klasifikasi untuk mengevaluasi model
    print(classification_report(y_test, pred_labels_te))
    print('--------------------------------------------------------')
    print("")
    
    print('---------------- Evaluasi dengan Data Training ----------------')
    score_tr = model.score(X_train, y_train)
    print('Akurasi: ', score_tr)
    # Hasil klasifikasi untuk mengevaluasi model
    print(classification_report(y_train, pred_labels_tr))
    print('--------------------------------------------------------')
    
    # menampilkan plot pohon
    dot_data = tree.export_graphviz(clf, out_file=None, 
                                feature_names=X.columns, 
                                class_names=[str(list(clf.classes_)[0]), str(list(clf.classes_)[1])],
                                filled=True, 
                                rounded=True, 
                               ) 
    graph = graphviz.Source(dot_data)
    
    # Return data yang relevan untuk plotting grafik
    return X_train, X_test, y_train, y_test, clf, graph

# Melakukan Klasifikasi CART 

In [81]:
# Select data for modeling
X=df[['fixed_acidity','volatile_acidity','citric_acid','residual_sugar','chlorides','free_sulfur_dioxide','total_sulfur_dioxide','density','pH','sulphates','alcohol','Id']]
y=df['quality'].values

X_train, X_test, y_train, y_test, clf, graph = fitting(X, y, 'gini', 'best', 
                                                       mdepth=3, 
                                                       clweight=None, 
                                                       minleaf=1000)

# Plot the tree graph
graph

-----------------Summary Pohon -----------------
Kelas:  [3 4 5 6 7 8]
Kedalaman pohon:  0
Jumlah Daun:  1
Jumlah Fitur:  12
--------------------------------------------------------

---------------- Evaluasi dengan Data Testing----------------
Akurasi:  0.4366812227074236
              precision    recall  f1-score   support

           3       0.00      0.00      0.00         1
           4       0.00      0.00      0.00         7
           5       0.44      1.00      0.61       100
           6       0.00      0.00      0.00        92
           7       0.00      0.00      0.00        27
           8       0.00      0.00      0.00         2

    accuracy                           0.44       229
   macro avg       0.07      0.17      0.10       229
weighted avg       0.19      0.44      0.27       229

--------------------------------------------------------

---------------- Evaluasi dengan Data Training ----------------
Akurasi:  0.4190371991247265
              precision    recal

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


IndexError: list index out of range