In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_classification
from sklearn.metrics import accuracy_score

# Synthetic stand-in for a heart-disease table: 1000 rows and 8 numeric
# features (5 informative, 1 redundant), generated with class weights 0.7 / 0.3.
X, y = make_classification(
    n_samples=1000,
    n_features=8,
    n_informative=5,
    n_redundant=1,
    n_clusters_per_class=2,
    weights=[0.7, 0.3],
    random_state=42,
)

# Feature columns are named feature_0 .. feature_7; the label is stored in the
# "heart_disease" column.
df = pd.DataFrame(X).add_prefix("feature_").assign(heart_disease=y)

# Report how many rows fall into each class.
print(df["heart_disease"].value_counts())

# Prepare data: separate the feature columns from the target column.
X = df.drop("heart_disease", axis=1)
y = df["heart_disease"]

# Split BEFORE scaling so the held-out rows never influence the scaler's
# mean/variance (fitting the scaler on all rows leaks test-set statistics into
# preprocessing). stratify=y keeps the class ratio the same in the training
# and test partitions, which matters because the labels are generated with
# 0.7 / 0.3 class weights.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Fit the scaler on the training rows only, then reuse those statistics for
# the test rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Kept so that any later code that still reads X_scaled continues to work; it
# is now scaled with training-set statistics rather than full-data statistics.
X_scaled = scaler.transform(X)

# Fit a depth-limited random forest and score it once on the held-out rows.
model = RandomForestClassifier(n_estimators=100, random_state=42, max_depth=5)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print("Test accuracy:", accuracy_score(y_test, y_pred))

# Cross-validation on the training split.
# - StratifiedKFold keeps the class ratio the same in every fold; plain KFold
#   can produce folds with skewed class ratios when the labels are imbalanced.
# - Scaling sits inside the pipeline so each fold's scaler is fitted on that
#   fold's training part only. Standardisation is unaffected by an earlier
#   per-feature affine rescaling, so this stays valid even though X_train has
#   already been scaled once.
# - cross_val_score clones the estimator for every fold, so the already-fitted
#   `model` is not refitted or modified here.
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
cv_model = make_pipeline(StandardScaler(), model)
scores = cross_val_score(cv_model, X_train, y_train, cv=cv, scoring='accuracy')
print("CV scores:", scores)
# Per-fold scores alone are hard to compare; also report their mean and spread.
print(f"CV mean accuracy: {scores.mean():.3f} (+/- {scores.std():.3f})")
