Library

In [None]:
#import library
import pandas as pd
import joblib
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_text
from sklearn.tree import export_graphviz
import graphviz
import pydotplus
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score

Read Data

In [None]:
# Load the dataset from a CSV file into a DataFrame and display it.
# NOTE: "YOUR DATASET FILE NAME" is a placeholder; replace it with the path to the real CSV file.
df = pd.read_csv("YOUR DATASET FILE NAME")
df

Eksplorasi Data

In [None]:
# Check the dimensions of the data: (number of rows, number of columns)
df.shape

In [None]:
# Check the data type of every column
df.dtypes

In [None]:
# Count how many rows fall into each category of the gender attribute (JK).
# Series.value_counts() replaces the deprecated top-level pd.value_counts().
print(df['JK'].value_counts())


In [None]:
# Count how many rows fall into each category of every label column.
# Series.value_counts() replaces the deprecated top-level pd.value_counts().
print(df['BBU'].value_counts(), "\n")
print(df['TBU'].value_counts(), "\n")
print(df['BBTB'].value_counts(), "\n")
print(df['IMTU'].value_counts())

In [None]:
# Count the missing values in each column
df.isnull().sum()

In [None]:
# Count the rows that are exact duplicates of an earlier row
df.duplicated().sum()

In [None]:
# Show which rows are duplicates (every occurrence after the first one)
duplicate_mask = df.duplicated()
duplicated_rows = df.loc[duplicate_mask]
print("Data Duplikat:")
print(duplicated_rows)

In [None]:
# Visualise the column distributions with a box plot to spot outliers.
# The frame is passed explicitly as `data=` so it is interpreted as a wide-form
# dataset regardless of which positional parameter seaborn puts first.
plt.figure(figsize=(6, 4))
sns.boxplot(data=df)
plt.title("Box Plot")
plt.show()


Preprocessing Data

In [None]:
# Data selection: remove the IMT attribute and the IMT/age label (IMTU).
# drop(..., errors='ignore') keeps this cell safe to re-run: the original `del df[...]`
# raised KeyError on a second execution because the columns were already gone.
df = df.drop(columns=['IMT', 'IMTU'], errors='ignore')
df

In [None]:
# Data cleaning: handle duplicate rows.
# Keeps the first occurrence of every duplicated row and removes the rest, modifying df in place.
# NOTE: the row index is not reset, so index labels are no longer contiguous if rows were removed.
df.drop_duplicates(keep = 'first', inplace = True)

In [None]:
# Re-count duplicate rows after the clean-up (expected to be 0)
df.duplicated().sum()

In [None]:
# Inspect the data after removing the duplicate rows
df

In [None]:
#Data cleaning
#Outlier handling for attribute BB (currently disabled: every statement in this cell is commented out)
# Calculate Q1 and Q3
#Q1 = df['BB'].quantile(0.25)
#Q3 = df['BB'].quantile(0.75)

# Calculate IQR
#IQR = Q3 - Q1

# Define outlier thresholds
#lower_threshold = Q1 - 1.5 * IQR
#upper_threshold = Q3 + 1.5 * IQR

# Identify outlier values
#outliers = df[(df['BB'] < lower_threshold) | (df['BB'] > upper_threshold)]

# Choose an approach to handle outliers (e.g., replacing with median)
#median_value = df['BB'].median()
#df.loc[outliers.index, 'BB'] = median_value

# Visualisasi dengan Box Plot setelah dilakukan penanganan
#plt.figure(figsize=(6, 4))
#sns.boxplot(df)
#plt.title("Box Plot")
#plt.show()

Data Transformation

In [None]:
# Encode the gender attribute (Jenis Kelamin, JK) as integers.
# The categories are sorted before codes are assigned so that the category -> code
# mapping is the same on every run. Iterating a bare set of strings has no guaranteed
# order, so the original zip(set(...), [0, 1]) could swap 0 and 1 between sessions,
# which would silently invalidate a previously saved model.
map_JK = {category: code for code, category in enumerate(sorted(set(df['JK'])))}
df['JK'] = df['JK'].map(map_JK)
# Print the mapping so the encoding used for this run is visible in the notebook
print(map_JK)

In [None]:
# Display the data after the transformation
df

Pembuatan Model Decision Tree

In [None]:
# Separate the labels from the attributes.
# (Earlier alternative, kept for reference:)
#X = df.drop(['BBU', 'TBU', 'BBTB'], axis=1)
# The three label columns are selected together, so y is a multi-column target.
labels = ['BBU', 'TBU', 'BBTB']
y = df[labels]

In [None]:
# Remove the label columns from df in place so that only the feature attributes remain;
# the following split cell uses df as the feature matrix.
df.drop(columns=labels, inplace=True)

In [None]:
# Split the dataset: 80% for training, 20% for testing (shuffled, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(
    df,
    y,
    test_size=0.2,
    random_state=12,
    shuffle=True,
)

In [None]:
# Report the shape of every part of the train/test split
for caption, part in (
    ("Ukuran x_train :", X_train),
    ("Ukuran x_tes :", X_test),
    ("Ukuran y_train :", y_train),
    ("Ukuran y_tes :", y_test),
):
    print(caption, part.shape)

In [None]:
# Build the decision tree model.
# random_state is fixed so that re-running the notebook yields the same tree and the
# same metrics; without it, ties between equally good splits are broken randomly.
model = DecisionTreeClassifier(random_state=12)
model = model.fit(X_train, y_train)

# Show the learned rules as indented text
tree_rules = export_text(model, feature_names=list(X_train.columns))
print("Decision Tree Rules:")
print(tree_rules)

# Render the decision tree as a Graphviz DOT string (out_file=None returns the string)
# NOTE(review): class_names receives the label *column* names (y_train.columns), not the
# class values of a target; for a multi-output tree scikit-learn appears not to use it -
# confirm whether this argument is intended.
dot_data = export_graphviz(
    model,
    out_file=None,
    feature_names=list(X_train.columns),
    class_names=list(y_train.columns),
    filled=True,
    rounded=True,
    special_characters=True
)
graph = pydotplus.graph_from_dot_data(dot_data)
graph.write_png('decision_tree.png')  # Save the decision tree as an image


Testing

In [None]:
# Testing: predict the labels of the test set, then print the shape of the
# predictions followed by the shape of the ground truth
y_pred = model.predict(X_test)
print(y_pred.shape, y_test.shape, sep="\n")

In [None]:
# Wrap the predictions in a DataFrame for inspection.
# y_pred has one column per label, so the columns are named after the label columns of
# y_test and the rows reuse y_test's index, which makes predictions directly comparable
# with the ground truth. (The original assigned to the misspelled attribute `colums`,
# which left the columns named 0, 1, 2.)
prediksi = pd.DataFrame(y_pred, columns=y_test.columns, index=y_test.index)
prediksi

In [None]:
# Count how often each predicted value occurs in every prediction column
for column, predicted_values in prediksi.items():
    print(f"Jumlah nilai pada kolom {column}:")
    print(predicted_values.value_counts())
    print("\n")

Evaluasi Model

In [None]:
# Compute the confusion matrix separately for every label column
for i, column in enumerate(y_test.columns):
    actual = y_test.iloc[:, i]
    predicted = y_pred[:, i]
    cm = confusion_matrix(actual, predicted)
    print(f"Metrics untuk kolom {column}:", "Confusion Matrix:", cm, sep="\n")

In [None]:
# Compute the accuracy separately for every label column.
# y_pred.T yields one row of predictions per label, in the same order as y_test.columns.
for column, predicted in zip(y_test.columns, y_pred.T):
    accuracy = accuracy_score(y_test[column], predicted)
    print(f"Akurasi untuk kolom {column}: {accuracy * 100:.2f}%")

In [None]:
# Compute the weighted-average precision separately for every label column
for i, column in enumerate(y_test.columns):
    actual = y_test.iloc[:, i]
    predicted = y_pred[:, i]
    precision = precision_score(actual, predicted, average='weighted')
    print(f"Precision untuk kolom {column}: {precision * 100:.2f}%")

In [None]:
# Compute the weighted-average recall separately for every label column.
# y_pred.T yields one row of predictions per label, in the same order as y_test.columns.
for column, predicted in zip(y_test.columns, y_pred.T):
    recall = recall_score(y_test[column], predicted, average='weighted')
    print(f"Recall Score untuk kolom {column}: {recall * 100:.2f}%")

In [None]:
# Compute the weighted-average F1 score separately for every label column
for i, column in enumerate(y_test.columns):
    actual = y_test.iloc[:, i]
    predicted = y_pred[:, i]
    f1 = f1_score(actual, predicted, average='weighted')
    print(f"F-1 Score untuk kolom {column}: {f1 * 100:.2f}%")

In [None]:
# Persist the trained decision tree with joblib.
# NOTE: the file name keeps its ".h5" extension so that anything already loading this
# path keeps working, but joblib writes a pickle-based file, not HDF5. Load it with
# joblib.load(), and only from a trusted source (unpickling can execute arbitrary code).
model_filename = 'decision_tree_model.h5'
joblib.dump(model, model_filename)

# Report the actual path and serializer instead of claiming an ".h5 format"
print(f"Model berhasil disimpan dengan joblib ke {model_filename}")
