In [None]:
#1.1
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# Load the breast cancer dataset and separate features from labels.
data = load_breast_cancer()
X = data.data
y = data.target

# Hold out 20% of the samples for testing; the seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Full tree: no depth limit.
full_tree = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)

# Pruned tree: growth is capped at depth 3.
pruned_tree = DecisionTreeClassifier(max_depth=3, random_state=42).fit(X_train, y_train)

# Accuracy of each tree on the training split and on the held-out split.
full_train_acc, full_test_acc = (
    full_tree.score(X_train, y_train),
    full_tree.score(X_test, y_test),
)
pruned_train_acc, pruned_test_acc = (
    pruned_tree.score(X_train, y_train),
    pruned_tree.score(X_test, y_test),
)

# Cell output: (full train, full test, pruned train, pruned test).
full_train_acc, full_test_acc, pruned_train_acc, pruned_test_acc


(1.0, 0.9473684210526315, 0.978021978021978, 0.9473684210526315)

In [None]:
#1.2
from sklearn.ensemble import RandomForestClassifier

# Random forest of 100 trees, seeded for reproducibility and fit on the training split.
rf_model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Accuracy on the training split and on the held-out split.
rf_train_acc, rf_test_acc = (
    rf_model.score(X_train, y_train),
    rf_model.score(X_test, y_test),
)

# Cell output: (train accuracy, test accuracy).
rf_train_acc, rf_test_acc


(1.0, 0.9649122807017544)

In [None]:
#1.3
from sklearn.ensemble import GradientBoostingClassifier

# Hyperparameter values to sweep.
learning_rates = [0.01, 0.1]
n_estimators_list = [50, 100, 200]

# Every (learning_rate, n_estimators) combination, with the learning rate
# varying slowest so the rows come out grouped by learning rate.
param_grid = [
    (rate, count)
    for rate in learning_rates
    for count in n_estimators_list
]

# One (learning_rate, n_estimators, train accuracy, test accuracy) row per fit.
results = []

for lr, n_est in param_grid:
    gb_model = GradientBoostingClassifier(
        learning_rate=lr, n_estimators=n_est, random_state=42
    )
    gb_model.fit(X_train, y_train)

    train_acc = gb_model.score(X_train, y_train)
    test_acc = gb_model.score(X_test, y_test)

    results.append((lr, n_est, train_acc, test_acc))

# Display results, one line per configuration, accuracies to three decimals.
for r in results:
    print(f"learning_rate={r[0]}, n_estimators={r[1]} -> Train Acc: {r[2]:.3f}, Test Acc: {r[3]:.3f}")


learning_rate=0.01, n_estimators=50 -> Train Acc: 0.978, Test Acc: 0.956
learning_rate=0.01, n_estimators=100 -> Train Acc: 0.987, Test Acc: 0.956
learning_rate=0.01, n_estimators=200 -> Train Acc: 0.993, Test Acc: 0.956
learning_rate=0.1, n_estimators=50 -> Train Acc: 1.000, Test Acc: 0.956
learning_rate=0.1, n_estimators=100 -> Train Acc: 1.000, Test Acc: 0.956
learning_rate=0.1, n_estimators=200 -> Train Acc: 1.000, Test Acc: 0.956


In [None]:
#1.4
import pandas as pd

# Get feature names
feature_names = data.feature_names


def top_k_indices(importances, k=5):
    """Return the indices of the k largest values in `importances`, largest first.

    Parameters
    ----------
    importances : array-like exposing ``argsort`` (e.g. a NumPy array)
        Per-feature importance scores.
    k : int, default 5
        Number of indices to return.
    """
    # argsort is ascending, so reverse it before taking the first k.
    return importances.argsort()[::-1][:k]


# Random Forest Feature Importances
rf_importances = rf_model.feature_importances_
rf_top5_idx = top_k_indices(rf_importances)
rf_top5 = [(feature_names[i], rf_importances[i]) for i in rf_top5_idx]

# Gradient Boosting Feature Importances
# The `gb_model` name is rebound on every iteration of the hyperparameter
# sweep, so reading it here would explain whichever configuration happened to
# be fitted last. Fit a dedicated model with explicit hyperparameters instead,
# so the model being explained does not depend on loop order. These values
# match the last configuration of the sweep (learning_rate=0.1,
# n_estimators=200), so the reported importances are unchanged.
gb_final_model = GradientBoostingClassifier(
    learning_rate=0.1,
    n_estimators=200,
    random_state=42,
)
gb_final_model.fit(X_train, y_train)

gb_importances = gb_final_model.feature_importances_
gb_top5_idx = top_k_indices(gb_importances)
gb_top5 = [(feature_names[i], gb_importances[i]) for i in gb_top5_idx]

print("Top 5 features (Random Forest):")
for f, imp in rf_top5:
    print(f"{f}: {imp:.3f}")

print("\nTop 5 features (Gradient Boosting):")
for f, imp in gb_top5:
    print(f"{f}: {imp:.3f}")


Top 5 features (Random Forest):
worst area: 0.154
worst concave points: 0.145
mean concave points: 0.106
worst radius: 0.078
mean concavity: 0.068

Top 5 features (Gradient Boosting):
mean concave points: 0.450
worst concave points: 0.240
worst radius: 0.076
worst perimeter: 0.051
worst texture: 0.040
