# Praktikum: Analisis Data Wisconsin Breast Cancer

In [None]:

import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import Pipeline


In [None]:

# Read the dataset from the local CSV file into a DataFrame, then preview
# the first rows.
df = pd.read_csv(filepath_or_buffer="wbc.csv")
df.head()


In [None]:
# Print a summary of the DataFrame (columns, non-null counts, dtypes).
df.info()

In [None]:

# The "id" column is not used as a feature; "diagnosis" is the target.
X = df.drop(["id", "diagnosis"], axis=1)
y = df.loc[:, "diagnosis"]
(X.shape, y.shape)


In [None]:

# Encode the diagnosis labels as integers.
# LabelEncoder numbers the classes in sorted order, so labels "B" and "M"
# become B=0 and M=1 (assumes these are the only diagnosis values --
# confirm against the data).
le = LabelEncoder().fit(y)
y = le.transform(y)
y[:10]


In [None]:

# Hold out 20% of the rows for testing; stratify so both splits keep the
# class proportions of y, and fix the seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Baseline pipeline: scale -> keep the 10 best features by ANOVA F-score
# (a first guess for k) -> logistic regression.
pipe = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        ("select", SelectKBest(score_func=f_classif, k=10)),
        ("model", LogisticRegression(max_iter=500)),
    ]
)

# Fit on the training split and report the score on the held-out split.
score = pipe.fit(X_train, y_train).score(X_test, y_test)
score


In [None]:

# Try every possible number of selected features, from 1 up to all columns
# of X, and record the mean 5-fold cross-validation score for each k.
results = {}
for k in range(1, len(X.columns) + 1):
    pipe = Pipeline(
        steps=[
            ("scaler", StandardScaler()),
            ("select", SelectKBest(score_func=f_classif, k=k)),
            ("model", LogisticRegression(max_iter=500)),
        ]
    )
    scores = cross_val_score(pipe, X, y, cv=5)
    results[k] = scores.mean()

# Pick the k with the highest mean score; on ties the smallest k wins
# because max() returns the first maximal entry in insertion order.
best_k, best_score = max(results.items(), key=lambda item: item[1])
best_k, best_score


In [None]:

# Refit the feature selector on the full (standardized) data using the
# best k found above.
X_scaled = StandardScaler().fit_transform(X)
selector = SelectKBest(score_func=f_classif, k=best_k)
X_new = selector.fit(X_scaled, y).transform(X_scaled)

# Map the boolean support mask back to the original column names.
mask = selector.get_support()
selected_features = X.columns[mask]
selected_features



## Kesimpulan
- Jumlah fitur terbaik adalah `best_k`, yaitu nilai `k` dengan rata-rata akurasi 5-fold cross-validation tertinggi.
- Daftar fitur yang terpilih dapat dilihat pada output `selected_features`.
