In [3]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [20]:
df = pd.read_csv('src/gemastik24-data-mining/datasets/breast-cancer.csv') # Ubah sesuai dengan lokasi relatif file breast-cancer.csv
df.head()

Unnamed: 0,Grouping,Age (years),Education,Working status,Marital status,Menarche (years),Menopause,First pregnancy,Parity,Breastfeeding,Highfat,BMI,Ethnicity
0,BC,>= 50,Senior high school,Housewife,Marriage,12 to 13,< 50 years,20-29 years,>= Multiparous,>=12 months,Normal,Obesity,Minangnese
1,Non-BC,>= 50,Vocational degree,Master's student,Single/ widow,>13,< 50 years,>30 years,>= Multiparous,<12 months,High,Normal,Minangnese
2,BC,>= 50,Senior high school,Housewife,Marriage,7 to 11,< 50 years,20-29 years,>= Multiparous,>=12 months,Normal,Normal,Javanese
3,Non-BC,>= 50,Senior high school,Master's student,Marriage,12 to 13,< 50 years,20-29 years,Primiparous,>=12 months,High,Normal,Javanese
4,BC,>= 50,Senior high school,Private servant,Marriage,>13,< 50 years,20-29 years,>= Multiparous,>=12 months,High,Normal,Minangnese


## Klasifikasi dengan Decision Tree

In [28]:
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score

In [29]:
# Mengubah data kategorikal menjadi numerik
label_encoder = LabelEncoder()
for column in df.columns.drop('Grouping'):
    df[column] = label_encoder.fit_transform(df[column])

print(df.head())

# Memisahkan atribut dan label
X = df.drop('Grouping', axis=1)
y = df['Grouping']

# Membagi data menjadi data latih dan data uji
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
print("Jumlah Data Train: {}".format(len(X_train)))
print("Jumlah Data Test: {}".format(len(X_test)))

  Grouping  Age (years)  Education  Working status  Marital status  \
0       BC            1          4               3               0   
1   Non-BC            1          6               4               1   
2       BC            1          4               3               0   
3   Non-BC            1          4               4               0   
4       BC            1          4               5               0   

   Menarche (years)  Menopause  First pregnancy  Parity  Breastfeeding  \
0                 0          0                0       0              1   
1                 2          0                2       0              0   
2                 1          0                0       0              1   
3                 0          0                0       2              1   
4                 2          0                0       0              1   

   Highfat  BMI  Ethnicity  
0        1    1          1  
1        0    0          1  
2        1    0          0  
3        0    0   

Top 5 features berdasarkan proses feature selection:
- Highfat: 0.28170292293010674
- Working status: 0.15498352704532217
- Education: 0.11428131432539225
- BMI: 0.06910790838998387
- Menarche (years): 0.067373199846954

In [30]:
# Proses Klasifikasi, dengan semua fitur tanpa tuning parameter
clf = DecisionTreeClassifier(random_state=1)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Tampilkan hasil klasifikasi dengan classification report
print(classification_report(y_test, y_pred))

# Tampilkan akurasi, precision, recall, dan f1 score
print("Akurasi: {}".format(accuracy_score(y_test, y_pred)))
print("Precision: {}".format(precision_score(y_test, y_pred, average='macro')))
print("Recall: {}".format(recall_score(y_test, y_pred, average='macro')))
print("F1 Score: {}".format(f1_score(y_test, y_pred, average='macro')))



              precision    recall  f1-score   support

          BC       0.72      0.73      0.73        45
      Non-BC       0.65      0.63      0.64        35

    accuracy                           0.69        80
   macro avg       0.68      0.68      0.68        80
weighted avg       0.69      0.69      0.69        80

Akurasi: 0.6875
Precision: 0.6822250639386189
Recall: 0.680952380952381
F1 Score: 0.6814779423475075


In [32]:
# Proses Klasifikasi, dengan 5 fitur terbaik tanpa tuning parameter
top_5 = ['Highfat', 'Working status', 'Education', 'BMI', 'Menarche (years)']
clf = DecisionTreeClassifier(random_state=1)
clf.fit(X_train[top_5], y_train)
y_pred = clf.predict(X_test[top_5])

print(classification_report(y_test, y_pred))

print("Akurasi: {}".format(accuracy_score(y_test, y_pred)))
print("Precision: {}".format(precision_score(y_test, y_pred, average='macro')))
print("Recall: {}".format(recall_score(y_test, y_pred, average='macro')))
print("F1 Score: {}".format(f1_score(y_test, y_pred, average='macro')))

              precision    recall  f1-score   support

          BC       0.73      0.89      0.80        45
      Non-BC       0.80      0.57      0.67        35

    accuracy                           0.75        80
   macro avg       0.76      0.73      0.73        80
weighted avg       0.76      0.75      0.74        80

Akurasi: 0.75
Precision: 0.7636363636363637
Recall: 0.7301587301587301
F1 Score: 0.7333333333333334


In [45]:
# Lakukan cross validation dengan 10 fold dengan StratifiedKFold
from sklearn.model_selection import StratifiedKFold

skf = StratifiedKFold(n_splits=10, random_state=1, shuffle=True)
clf = DecisionTreeClassifier(random_state=1)
accuracy_scores = cross_val_score(clf, X_train[top_5], y_train, cv=skf, scoring='accuracy')

# Pilih nilai fold dengan akurasi tertinggi
for i, acc in enumerate(accuracy_scores):
	print("Fold ke-{}: {}".format(i+1, acc))

print("Akurasi tertinggi: {}".format(accuracy_scores.max()))

Fold ke-1: 0.78125
Fold ke-2: 0.65625
Fold ke-3: 0.71875
Fold ke-4: 0.8125
Fold ke-5: 0.65625
Fold ke-6: 0.8125
Fold ke-7: 0.65625
Fold ke-8: 0.53125
Fold ke-9: 0.75
Fold ke-10: 0.78125
Akurasi tertinggi: 0.8125


In [43]:
# Lakukan cross validation dengan 10 fold, kemudian pilih fold yang memiliki akurasi tertinggi untuk digunakan sebagai data uji
fold_accuracies = []
fold_indices = []

kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=1)

top_5_X_train = X_train[top_5]

# Lakukan 10-fold cross-validation
for fold, (train_index, test_index) in enumerate(kf.split(top_5_X_train, y_train)):
	X_train_fold, X_val_fold = top_5_X_train.iloc[train_index], top_5_X_train.iloc[test_index]
	y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[test_index]

	clf_kflod = DecisionTreeClassifier(random_state=1)
	clf_kflod.fit(X_train_fold, y_train_fold)

	accuracy = clf_kflod.score(X_val_fold, y_val_fold)
	fold_accuracies.append(accuracy)
	fold_indices.append((train_index, test_index))

# Ambil fold dengan akurasi tertinggi
best_fold_index = fold_accuracies.index(max(fold_accuracies))
best_fold_accuracy = fold_accuracies[best_fold_index]
best_fold_train_index, best_fold_val_index = fold_indices[best_fold_index]

# training ulang model dengan fold terbaik
X_train_best_fold = top_5_X_train.iloc[best_fold_train_index]
y_train_best_fold = y_train.iloc[best_fold_train_index]
X_val_best_fold = top_5_X_train.iloc[best_fold_val_index]
y_val_best_fold = y_train.iloc[best_fold_val_index]

clf = DecisionTreeClassifier(random_state=1)
clf.fit(X_train_best_fold, y_train_best_fold)

# Evaluasi model dengan data uji
y_pred = clf.predict(X_test[top_5])
test_accuracy = accuracy_score(y_test, y_pred)

print("Fold terbaik adalah fold ke-{} dengan akurasi {}".format(best_fold_index, best_fold_accuracy))
print("Akurasi pada data uji: {}".format(test_accuracy))

Fold terbaik adalah fold ke-3 dengan akurasi 0.8125
Akurasi pada data uji: 0.725


## Klasifikasi dengan Gradient Boosting Tree