<a href="https://colab.research.google.com/github/senemcet/automlcourse/blob/main/heartdata.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [51]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from scipy.stats import loguniform
import warnings
warnings.filterwarnings('ignore')

In [52]:
# Veriyi yükle
df = pd.read_csv("/content/heart_disease_uci.csv")
print(f"\nVeri yüklendi: {df.shape[0]} satır, {df.shape[1]} sütun")
df.head()



✓ Veri yüklendi: 920 satır, 16 sütun


Unnamed: 0,id,age,sex,dataset,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,ca,thal,num
0,1,63,Male,Cleveland,typical angina,145.0,233.0,True,lv hypertrophy,150.0,False,2.3,downsloping,0.0,fixed defect,0
1,2,67,Male,Cleveland,asymptomatic,160.0,286.0,False,lv hypertrophy,108.0,True,1.5,flat,3.0,normal,2
2,3,67,Male,Cleveland,asymptomatic,120.0,229.0,False,lv hypertrophy,129.0,True,2.6,flat,2.0,reversable defect,1
3,4,37,Male,Cleveland,non-anginal,130.0,250.0,False,normal,187.0,False,3.5,downsloping,0.0,normal,0
4,5,41,Female,Cleveland,atypical angina,130.0,204.0,False,lv hypertrophy,172.0,False,1.4,upsloping,0.0,normal,0


In [53]:
# Eksik değerleri kontrol et
print(f"\nEksik değer sayıları:")
print(df.isnull().sum()[df.isnull().sum() > 0])

# Eksik değerleri doldur
for col in df.columns:
    if df[col].dtype != "object":
        df[col].fillna(df[col].median(), inplace=True)
    else:
        df[col].fillna(df[col].mode()[0], inplace=True)

print("\nEksik değerler dolduruldu")


Eksik değer sayıları:
trestbps     59
chol         30
fbs          90
restecg       2
thalch       55
exang        55
oldpeak      62
slope       309
ca          611
thal        486
dtype: int64

Eksik değerler dolduruldu


In [54]:
# Kategorik değişkenleri one-hot encoding

cat_cols = ['cp', 'thal', 'slope', 'restecg']
df = pd.get_dummies(df, columns=cat_cols, drop_first=True)
print(f"\nKategorik değişkenler encode edildi: {cat_cols}")


✓ Kategorik değişkenler encode edildi: ['cp', 'thal', 'slope', 'restecg']


In [55]:
# 'sex' ve 'dataset' sütunlarını encode et
df["sex"] = df["sex"].replace({"Male": 1, "Female": 0})
le = LabelEncoder()
df["dataset"] = le.fit_transform(df["dataset"])
print(" 'sex' ve 'dataset' sütunları encode edildi")

 'sex' ve 'dataset' sütunları encode edildi


In [56]:
# Hedef değişken oluştur (0: hastalık yok, 1: hastalık var)
df['target'] = (df['num'] > 0).astype(int)
df = df.drop(columns=['num', 'id'])

In [57]:
# Sayısal değişkenleri standardize et
num_cols = ['age', 'trestbps', 'chol', 'thalch', 'oldpeak', 'ca']
scaler = StandardScaler()
df[num_cols] = scaler.fit_transform(df[num_cols])
print(f"Sayısal değişkenler standardize edildi: {num_cols}")

 Sayısal değişkenler standardize edildi: ['age', 'trestbps', 'chol', 'thalch', 'oldpeak', 'ca']


In [58]:
# X ve y'yi ayır
X = df.drop("target", axis=1)
y = df["target"]

In [59]:
print(f"\nYeni veri boyutları:")
print(f"   - X : {X.shape}")
print(f"   - y : {y.shape}")
print(f"   - Sınıf dağılımı: {dict(y.value_counts())}")


Yeni veri boyutları:
   - X : (920, 19)
   - y : (920,)
   - Sınıf dağılımı: {1: np.int64(509), 0: np.int64(411)}


In [60]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
print(f"\nVeri ayrıldı:")
print(f"   - Train: {X_train.shape[0]} örneklem")
print(f"   - Test: {X_test.shape[0]} örneklem")


✓ Veri ayrıldı:
   - Train: 736 örneklem
   - Test: 184 örneklem


##RandomizedSearchCV

In [61]:
# Pipeline oluştur
pipeline_random = Pipeline([
    ("scaler", StandardScaler()),
    ("model", LogisticRegression(max_iter=2000, random_state=42))
])

In [62]:
# Parametre dağılımı
param_dist = {
    "model__C": loguniform(1e-3, 1e3),
    "model__penalty": ["l2"],
    "model__solver": ["lbfgs"]
}

In [63]:
# RandomizedSearchCV
random_search = RandomizedSearchCV(
    estimator=pipeline_random,
    param_distributions=param_dist,
    n_iter=30,
    cv=5,
    scoring="accuracy",
    n_jobs=-1,
    verbose=0,
    random_state=42
)

In [64]:
random_search.fit(X_train, y_train)

random_best_score = random_search.best_score_
random_best_params = random_search.best_params_
random_best_model = random_search.best_estimator_

print(f"\nRandomizedSearch Sonuçları:")
print(f"En iyi CV skoru: {random_best_score:.4f}")
print(f"En iyi parametreler: {random_best_params}")


 RandomizedSearch Sonuçları:
   - En iyi CV skoru: 0.8274
   - En iyi parametreler: {'model__C': np.float64(0.012329623163659839), 'model__penalty': 'l2', 'model__solver': 'lbfgs'}


##Optuna

In [65]:
!pip install optuna
import optuna

def objective(trial):
        C = trial.suggest_float("C", 1e-3, 1e3, log=True)

        model = LogisticRegression(
            C=C,
            solver="lbfgs",
            penalty="l2",
            max_iter=2000,
            random_state=42
        )

        score = cross_val_score(model, X_train, y_train, cv=5, scoring="accuracy").mean()
        return score

print("\nOptuna optimizasyonu başlatılıyor...")
study = optuna.create_study(direction="maximize", sampler=optuna.samplers.TPESampler(seed=42))
study.optimize(objective, n_trials=30, show_progress_bar=False)

optuna_best_score = study.best_value
optuna_best_params = study.best_params

print(f"\nOptuna Sonuçları:")
print(f"En iyi CV skoru: {optuna_best_score:.4f}")
print(f"En iyi parametreler: {optuna_best_params}")




[I 2025-12-03 12:51:39,460] A new study created in memory with name: no-name-cbaebf6a-5344-4d67-b40d-88736f96fd48
[I 2025-12-03 12:51:39,526] Trial 0 finished with value: 0.8124471410185696 and parameters: {'C': 0.1767016940294795}. Best is trial 0 with value: 0.8124471410185696.
[I 2025-12-03 12:51:39,610] Trial 1 finished with value: 0.8110774039345469 and parameters: {'C': 506.1576888752306}. Best is trial 0 with value: 0.8124471410185696.



 Optuna optimizasyonu başlatılıyor...


[I 2025-12-03 12:51:39,690] Trial 2 finished with value: 0.8110774039345469 and parameters: {'C': 24.658329458549105}. Best is trial 0 with value: 0.8124471410185696.
[I 2025-12-03 12:51:39,767] Trial 3 finished with value: 0.8097168597168597 and parameters: {'C': 3.907967156822881}. Best is trial 0 with value: 0.8124471410185696.
[I 2025-12-03 12:51:39,823] Trial 4 finished with value: 0.8097812097812097 and parameters: {'C': 0.008632008168602538}. Best is trial 0 with value: 0.8124471410185696.
[I 2025-12-03 12:51:39,879] Trial 5 finished with value: 0.8097812097812097 and parameters: {'C': 0.008629132190071854}. Best is trial 0 with value: 0.8124471410185696.
[I 2025-12-03 12:51:39,933] Trial 6 finished with value: 0.7785438499724213 and parameters: {'C': 0.002231010801867922}. Best is trial 0 with value: 0.8124471410185696.
[I 2025-12-03 12:51:40,011] Trial 7 finished with value: 0.8110774039345469 and parameters: {'C': 157.41890047456639}. Best is trial 0 with value: 0.81244714101


Optuna Sonuçları:
   - En iyi CV skoru: 0.8206
   - En iyi parametreler: {'C': 0.0640299470632694}


##BayesSearchCV

In [66]:
!pip install scikit-optimize
# Gerekli kütüphaneler
from skopt import BayesSearchCV
from skopt.space import Real, Categorical


# Pipeline tanımı
pipeline_bayes = Pipeline([
    ("model", LogisticRegression(max_iter=2000, random_state=42))
])

# BayesSearchCV için parametre alanları
search_spaces = {
    "model__C": Real(1e-3, 1e3, prior="log-uniform"),
    "model__penalty": Categorical(["l2"]),
    "model__solver": Categorical(["lbfgs"])
}

# BayesSearchCV
bayes_search = BayesSearchCV(
    estimator=pipeline_bayes,
    search_spaces=search_spaces,
    n_iter=30,         # 30 deneme
    cv=5,              # 5 katlı cross-validation
    scoring="accuracy",
    n_jobs=-1,         # Tüm CPU çekirdeklerini kullan
    random_state=42
)

# Modeli eğit
bayes_search.fit(X_train, y_train)

# En iyi sonuçlar
bayes_best_score = bayes_search.best_score_
bayes_best_params = bayes_search.best_params_
bayes_best_model = bayes_search.best_estimator_

print(f"\n BayesSearch Sonuçları:")
print(f"En iyi CV skoru: {bayes_best_score:.4f}")
print(f"En iyi parametreler: {bayes_best_params}")



 BayesSearch Sonuçları:
   - En iyi CV skoru: 0.8192
   - En iyi parametreler: OrderedDict({'model__C': 0.05625382383417035, 'model__penalty': 'l2', 'model__solver': 'lbfgs'})


##Random-Optuna-Bayes Karşılaştırılması

In [67]:
best_model_name = None
best_accuracy = 0
best_params = None

results = {
    "RandomSearch": (random_best_score, random_best_params),
    "Optuna": (optuna_best_score, optuna_best_params),
    "Bayes": (bayes_best_score, bayes_best_params),
}

for model, (score, params) in results.items():
    if score > best_accuracy:
        best_accuracy = score
        best_model_name = model
        best_params = params

print(f"\nTüm Yöntemlerin Karşılaştırması:")
print("-" * 60)
for method, (score, params) in sorted(results.items(), key=lambda x: x[1][0], reverse=True):
    print(f"{method:15s}: {score:.4f}")
print("-----")
print("En iyi arama yöntemi:", best_model_name)
print("En yüksek doğruluk (CV Accuracy):", best_accuracy)
print("En iyi hiperparametreler:", best_params)




Tüm Yöntemlerin Karşılaştırması:
------------------------------------------------------------
RandomSearch   : 0.8274
Optuna         : 0.8206
Bayes          : 0.8192
-----
En iyi arama yöntemi: RandomSearch
En yüksek doğruluk (CV Accuracy): 0.8274039345467917
En iyi hiperparametreler: {'model__C': np.float64(0.012329623163659839), 'model__penalty': 'l2', 'model__solver': 'lbfgs'}


In [68]:
#EN İYİ MODELİ TEST ET
best_model = random_best_model

# Test seti üzerinde değerlendir
y_pred = best_model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)

print(f"Test Doğruluğu: {test_accuracy:.4f}")
print(f"\nClassification Report:")
print(classification_report(y_test, y_pred))
print(f"\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

Test Doğruluğu: 0.8315

Classification Report:
              precision    recall  f1-score   support

           0       0.84      0.77      0.80        82
           1       0.83      0.88      0.85       102

    accuracy                           0.83       184
   macro avg       0.83      0.83      0.83       184
weighted avg       0.83      0.83      0.83       184


Confusion Matrix:
[[63 19]
 [12 90]]
