In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv('src/gemastik24-data-mining/datasets/breast-cancer.csv') # Ubah sesuai dengan lokasi relatif file breast-cancer.csv
df.head()

Unnamed: 0,Grouping,Age (years),Education,Working status,Marital status,Menarche (years),Menopause,First pregnancy,Parity,Breastfeeding,Highfat,BMI,Ethnicity
0,BC,>= 50,Senior high school,Housewife,Marriage,12 to 13,< 50 years,20-29 years,>= Multiparous,>=12 months,Normal,Obesity,Minangnese
1,Non-BC,>= 50,Vocational degree,Master's student,Single/ widow,>13,< 50 years,>30 years,>= Multiparous,<12 months,High,Normal,Minangnese
2,BC,>= 50,Senior high school,Housewife,Marriage,7 to 11,< 50 years,20-29 years,>= Multiparous,>=12 months,Normal,Normal,Javanese
3,Non-BC,>= 50,Senior high school,Master's student,Marriage,12 to 13,< 50 years,20-29 years,Primiparous,>=12 months,High,Normal,Javanese
4,BC,>= 50,Senior high school,Private servant,Marriage,>13,< 50 years,20-29 years,>= Multiparous,>=12 months,High,Normal,Minangnese


## Klasifikasi dengan Decision Tree

In [3]:
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score

In [4]:
# Mengubah data kategorikal menjadi numerik
label_encoder = LabelEncoder()
for column in df.columns.drop('Grouping'):
    df[column] = label_encoder.fit_transform(df[column])

print(df.head())

# Memisahkan atribut dan label
X = df.drop('Grouping', axis=1)
y = df['Grouping']

# Membagi data menjadi data latih dan data uji
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
print("Jumlah Data Train: {}".format(len(X_train)))
print("Jumlah Data Test: {}".format(len(X_test)))

  Grouping  Age (years)  Education  Working status  Marital status  \
0       BC            1          4               3               0   
1   Non-BC            1          6               4               1   
2       BC            1          4               3               0   
3   Non-BC            1          4               4               0   
4       BC            1          4               5               0   

   Menarche (years)  Menopause  First pregnancy  Parity  Breastfeeding  \
0                 0          0                0       0              1   
1                 2          0                2       0              0   
2                 1          0                0       0              1   
3                 0          0                0       2              1   
4                 2          0                0       0              1   

   Highfat  BMI  Ethnicity  
0        1    1          1  
1        0    0          1  
2        1    0          0  
3        0    0   

In [5]:
# Proses Klasifikasi, dengan semua fitur tanpa tuning parameter
clf = DecisionTreeClassifier(random_state=1)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Tampilkan hasil klasifikasi dengan classification report
print(classification_report(y_test, y_pred))

# Tampilkan akurasi, precision, recall, dan f1 score
print("Akurasi: {}".format(accuracy_score(y_test, y_pred)))
print("Precision: {}".format(precision_score(y_test, y_pred, average='macro')))
print("Recall: {}".format(recall_score(y_test, y_pred, average='macro')))
print("F1 Score: {}".format(f1_score(y_test, y_pred, average='macro')))

              precision    recall  f1-score   support

          BC       0.72      0.73      0.73        45
      Non-BC       0.65      0.63      0.64        35

    accuracy                           0.69        80
   macro avg       0.68      0.68      0.68        80
weighted avg       0.69      0.69      0.69        80

Akurasi: 0.6875
Precision: 0.6822250639386189
Recall: 0.680952380952381
F1 Score: 0.6814779423475075


Top 5 features berdasarkan proses feature selection:
- Highfat: 0.5001669752599721
- Working status: 0.13163254905792693
- Breastfeeding: 0.090600172899282
- First pregnancy: 0.054788830841513654
- BMI: 0.04329839966716594

In [6]:
# Proses Klasifikasi, dengan 5 fitur terbaik tanpa tuning parameter
top_5 = ['Highfat', 'Working status', 'Breastfeeding', 'BMI', 'First pregnancy']
clf = DecisionTreeClassifier(random_state=1)
clf.fit(X_train[top_5], y_train)
y_pred = clf.predict(X_test[top_5])

print(classification_report(y_test, y_pred))

print("Akurasi: {}".format(accuracy_score(y_test, y_pred)))
print("Precision: {}".format(precision_score(y_test, y_pred, average='macro')))
print("Recall: {}".format(recall_score(y_test, y_pred, average='macro')))
print("F1 Score: {}".format(f1_score(y_test, y_pred, average='macro')))

              precision    recall  f1-score   support

          BC       0.75      0.93      0.83        45
      Non-BC       0.88      0.60      0.71        35

    accuracy                           0.79        80
   macro avg       0.81      0.77      0.77        80
weighted avg       0.80      0.79      0.78        80

Akurasi: 0.7875
Precision: 0.8125
Recall: 0.7666666666666666
F1 Score: 0.7717737875482463


In [7]:
# Lakukan cross validation dengan 10 fold dengan StratifiedKFold
skf = StratifiedKFold(n_splits=10, random_state=1, shuffle=True)
clf = DecisionTreeClassifier(random_state=1)
accuracy_scores = cross_val_score(clf, X_train[top_5], y_train, cv=skf, scoring='accuracy')

# Pilih nilai fold dengan akurasi tertinggi
for i, acc in enumerate(accuracy_scores):
	print("Fold ke-{}: {}".format(i+1, acc))

print("Akurasi tertinggi: {}".format(accuracy_scores.max()))

Fold ke-1: 0.78125
Fold ke-2: 0.75
Fold ke-3: 0.78125
Fold ke-4: 0.84375
Fold ke-5: 0.71875
Fold ke-6: 0.8125
Fold ke-7: 0.75
Fold ke-8: 0.65625
Fold ke-9: 0.9375
Fold ke-10: 0.90625
Akurasi tertinggi: 0.9375


In [8]:
# Lakukan cross validation dengan 10 fold, kemudian pilih fold yang memiliki akurasi tertinggi untuk digunakan sebagai data uji
fold_accuracies = []
fold_indices = []

kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=1)

top_5_X_train = X_train[top_5]

# Lakukan 10-fold cross-validation
for fold, (train_index, test_index) in enumerate(kf.split(top_5_X_train, y_train)):
	X_train_fold, X_val_fold = top_5_X_train.iloc[train_index], top_5_X_train.iloc[test_index]
	y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[test_index]

	clf_kflod = DecisionTreeClassifier(random_state=1)
	clf_kflod.fit(X_train_fold, y_train_fold)

	accuracy = clf_kflod.score(X_val_fold, y_val_fold)
	fold_accuracies.append(accuracy)
	fold_indices.append((train_index, test_index))

# Ambil fold dengan akurasi tertinggi
best_fold_index = fold_accuracies.index(max(fold_accuracies))
best_fold_accuracy = fold_accuracies[best_fold_index]
best_fold_train_index, best_fold_val_index = fold_indices[best_fold_index]

# training ulang model dengan fold terbaik
X_train_best_fold = top_5_X_train.iloc[best_fold_train_index]
y_train_best_fold = y_train.iloc[best_fold_train_index]
X_val_best_fold = top_5_X_train.iloc[best_fold_val_index]
y_val_best_fold = y_train.iloc[best_fold_val_index]

clf = DecisionTreeClassifier(random_state=1)
clf.fit(X_train_best_fold, y_train_best_fold)

# Evaluasi model dengan data uji
y_pred = clf.predict(X_test[top_5])
test_accuracy = accuracy_score(y_test, y_pred)

print("Fold terbaik adalah fold ke-{} dengan akurasi {}".format(best_fold_index, best_fold_accuracy))
print("Akurasi pada data uji: {}".format(test_accuracy))

Fold terbaik adalah fold ke-8 dengan akurasi 0.9375
Akurasi pada data uji: 0.775


In [11]:
# Membuat fungsi untuk mekanisme StratifiedKFold di atas supaya memudahkan untuk digunakan kembali
def cross_val_decision_tree(classifier, X_train, y_train, X_test, y_test, top_features, n_splits=10, random_state=1):
	fold_accuracies = []
	fold_indices = []

	kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)

	top_X_train = X_train[top_features]

	for fold, (train_index, test_index) in enumerate(kf.split(top_X_train, y_train)):
		X_train_fold, X_val_fold = top_X_train.iloc[train_index], top_X_train.iloc[test_index]
		y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[test_index]

		classifier.fit(X_train_fold, y_train_fold)

		accuracy = clf_kflod.score(X_val_fold, y_val_fold)
		fold_accuracies.append(accuracy)
		fold_indices.append((train_index, test_index))

	best_fold_index = fold_accuracies.index(max(fold_accuracies))
	best_fold_accuracy = fold_accuracies[best_fold_index]
	best_fold_train_index, best_fold_val_index = fold_indices[best_fold_index]

	X_train_best_fold = top_X_train.iloc[best_fold_train_index]
	y_train_best_fold = y_train.iloc[best_fold_train_index]
	X_val_best_fold = top_X_train.iloc[best_fold_val_index]
	y_val_best_fold = y_train.iloc[best_fold_val_index]

	classifier.fit(X_train_best_fold, y_train_best_fold)

	y_pred = classifier.predict(X_test[top_features])
	test_accuracy = accuracy_score(y_test, y_pred)

	print("Best fold is fold-{} with accuracy {}".format(best_fold_index, best_fold_accuracy))
	print("Test accuracy: {}".format(test_accuracy))

	return clf

Proses di atas menggunakan klasifier Decision Tree dengan parameter default. Pada proses ini, akan dilakukan hyperparameter tuning untuk mencari parameter terbaik yang dapat meningkatkan performa model.

In [10]:
# Membuat klasifier DecisionTreeClassifier dengan parameter yang tidak default
clf = DecisionTreeClassifier(
	criterion='entropy', 
	splitter='best',
	min_samples_split=2,
	max_depth=5,
	class_weight='balanced',
	random_state=42
)

clf.fit(X_train[top_5], y_train)
y_pred = clf.predict(X_test[top_5])

print(classification_report(y_test, y_pred))

print("Akurasi: {}".format(accuracy_score(y_test, y_pred)))
print("Precision: {}".format(precision_score(y_test, y_pred, average='macro')))
print("Recall: {}".format(recall_score(y_test, y_pred, average='macro')))
print("F1 Score: {}".format(f1_score(y_test, y_pred, average='macro')))

              precision    recall  f1-score   support

          BC       0.75      0.96      0.84        45
      Non-BC       0.91      0.60      0.72        35

    accuracy                           0.80        80
   macro avg       0.83      0.78      0.78        80
weighted avg       0.82      0.80      0.79        80

Akurasi: 0.8
Precision: 0.8337147215865751
Recall: 0.7777777777777778
F1 Score: 0.7836375929682218


In [15]:
# Lakukan cross validation dengan 10 fold dengan StratifiedKFold
clf_best = cross_val_decision_tree(clf, X_train, y_train, X_test, y_test, top_5, n_splits=10, random_state=1)

Best fold is fold-8 with accuracy 0.9375
Test accuracy: 0.775


## Klasifikasi dengan Gradient Boosting Tree