In [None]:
import optuna
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score

In [None]:
# Location of breast-cancer.csv relative to where the notebook is run from;
# adjust this path to match your checkout.
csv_path = 'src/gemastik24-data-mining/datasets/breast-cancer.csv'
df = pd.read_csv(csv_path)
df.head()

In [None]:
# Convert every feature column to integer codes; the 'Grouping' target column is
# left untouched.
# A separate LabelEncoder is fitted for each column and kept in `label_encoders`,
# so the category <-> code mapping of any column stays available afterwards
# (`label_encoders[column].classes_`, `.inverse_transform(...)`). Re-fitting one
# shared encoder would overwrite that mapping on every iteration and keep only
# the last column's.
# NOTE: the encoders are fitted on the full frame, before the train/test split below.
label_encoders = {}
for column in df.columns.drop('Grouping'):
    label_encoder = LabelEncoder()
    df[column] = label_encoder.fit_transform(df[column])
    label_encoders[column] = label_encoder

print(df.head())

# Separate the feature columns from the label column
X = df.drop('Grouping', axis=1)
y = df['Grouping']

# Split the data into a training set (80%) and a test set (20%) with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
print("Jumlah Data Train: {}".format(len(X_train)))
print("Jumlah Data Test: {}".format(len(X_test)))

In [None]:
# Feature columns that `objective` selects from X_train when cross-validating.
# (Presumably the five best-ranked features from an earlier selection step -- confirm.)
top_5 = [
    'Highfat',
    'Working status',
    'Breastfeeding',
    'BMI',
    'First pregnancy',
]

def objective(trial, random_state=1):
    """Optuna objective: mean 3-fold cross-validation score of one sampled classifier.

    The trial first chooses between a decision tree and a random forest, then
    samples that model's hyperparameters. The model is cross-validated on the
    `top_5` columns of the notebook-level `X_train` against `y_train`, using the
    estimator's default scorer (accuracy for these classifiers).

    Parameter names and distributions are part of the persisted study: they must
    stay identical for every trial that shares a name, so they are declared once
    for both model families.

    Args:
        trial: The `optuna.trial.Trial` used to sample hyperparameters.
        random_state: Seed passed to the classifier so that the same
            hyperparameters always yield the same score; without it the
            optimizer would be comparing noisy values.

    Returns:
        The mean of the three cross-validation fold scores, as a float.
    """
    classifier_name = trial.suggest_categorical(name='classifier', choices=['DecisionTree', 'RandomForest'])

    # Hyperparameters common to both model families.
    shared_params = dict(
        criterion=trial.suggest_categorical(name='criterion', choices=['gini', 'entropy', 'log_loss']),
        min_samples_split=trial.suggest_int(name='min_samples_split', low=2, high=10, step=2),
        max_depth=trial.suggest_int(name='max_depth', low=2, high=10, step=2),
        random_state=random_state,
    )

    if classifier_name == 'DecisionTree':
        clf = DecisionTreeClassifier(
            splitter=trial.suggest_categorical(name='splitter', choices=['best', 'random']),
            **shared_params,
        )
    else:
        clf = RandomForestClassifier(
            n_estimators=trial.suggest_int(name='n_estimators', low=100, high=150, step=10),
            **shared_params,
        )

    fold_scores = cross_val_score(clf, X_train[top_5], y_train, n_jobs=-1, cv=3)
    return fold_scores.mean()

# Create (or reopen) the persistent study and run the hyperparameter search.
# The sampler is seeded so that its own randomness is fixed from run to run
# instead of depending on an arbitrary seed.
# NOTE: the study is stored in breast_cancer.db and reopened via
# load_if_exists=True, so re-running this cell appends another 100 trials to the
# stored study rather than starting from scratch.
study = optuna.create_study(
    direction='maximize',
    study_name='Breast Cancer Classifier',
    storage='sqlite:///breast_cancer.db',
    sampler=optuna.samplers.TPESampler(seed=1),
    load_if_exists=True,
)

study.optimize(objective, n_trials=100)
print(f"Best value: {study.best_value} with best params: {study.best_params}")