In [1]:
#6)Write a Python program to:Load the Breast Cancer dataset using sklearn.datasets.load_breast_cancer()

from sklearn.datasets import load_breast_cancer
from sklearn.ensemble import RandomForestClassifier
import pandas as pd

data = load_breast_cancer()
X = data.data
y = data.target
feature_names = data.feature_names

rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X, y)

importances = rf.feature_importances_

feature_importance_df = pd.DataFrame({
    'Feature': feature_names,
    'Importance': importances
})

feature_importance_df = feature_importance_df.sort_values(
    by='Importance', ascending=False
)

print("Top 5 Most Important Features:")
print(feature_importance_df.head(5))


Top 5 Most Important Features:
                 Feature  Importance
23            worst area    0.139357
27  worst concave points    0.132225
7    mean concave points    0.107046
20          worst radius    0.082848
22       worst perimeter    0.080850


In [4]:
#7)Write a Python program to:Train a Bagging Classifier using Decision Trees on the Iris dataset

from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

iris = load_iris()
X = iris.data
y = iris.target

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train, y_train)
dt_pred = dt.predict(X_test)
dt_accuracy = accuracy_score(y_test, dt_pred)

bagging = BaggingClassifier(
    estimator=DecisionTreeClassifier(),
    n_estimators=50,
    random_state=42
)
bagging.fit(X_train, y_train)
bagging_pred = bagging.predict(X_test)
bagging_accuracy = accuracy_score(y_test, bagging_pred)

print("Accuracy of Single Decision Tree:", dt_accuracy)
print("Accuracy of Bagging Classifier:", bagging_accuracy)


Accuracy of Single Decision Tree: 1.0
Accuracy of Bagging Classifier: 1.0


In [5]:
#8) Write a Python program to:Train a Random Forest Classifier  Tune hyperparameters max_depth and n_estimators using GridSearchCV

from sklearn.datasets import load_breast_cancer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score

data = load_breast_cancer()
X = data.data
y = data.target

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

rf = RandomForestClassifier(random_state=42)

param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 5, 10, 20]
}

grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1
)

grid_search.fit(X_train, y_train)

best_rf = grid_search.best_estimator_

y_pred = best_rf.predict(X_test)

final_accuracy = accuracy_score(y_test, y_pred)

print("Best Hyperparameters:", grid_search.best_params_)
print("Final Model Accuracy:", final_accuracy)


Best Hyperparameters: {'max_depth': None, 'n_estimators': 200}
Final Model Accuracy: 0.9707602339181286


In [6]:
#9)Write a Python program to:Train a Bagging Regressor and a Random Forest Regressor on the California Housing dataset

from sklearn.datasets import fetch_california_housing
from sklearn.ensemble import BaggingRegressor, RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

data = fetch_california_housing()
X = data.data
y = data.target

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

bagging_reg = BaggingRegressor(
    estimator=DecisionTreeRegressor(),
    n_estimators=100,
    random_state=42
)
bagging_reg.fit(X_train, y_train)
bagging_pred = bagging_reg.predict(X_test)
bagging_mse = mean_squared_error(y_test, bagging_pred)

rf_reg = RandomForestRegressor(
    n_estimators=100,
    random_state=42
)
rf_reg.fit(X_train, y_train)
rf_pred = rf_reg.predict(X_test)
rf_mse = mean_squared_error(y_test, rf_pred)

print("Bagging Regressor MSE:", bagging_mse)
print("Random Forest Regressor MSE:", rf_mse)


Bagging Regressor MSE: 0.2568358813508342
Random Forest Regressor MSE: 0.25650512920799395


In [7]:
#10)You are working as a data scientist at a financial institution to predict loan default. You have access to customer demographic and transaction history data

from sklearn.datasets import make_classification
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.model_selection import train_test_split

X, y = make_classification(
    n_samples=5000,
    n_features=20,
    n_informative=10,
    n_redundant=5,
    n_classes=2,
    weights=[0.7, 0.3],
    random_state=42
)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42
)

# 1. Bagging-based model (Random Forest)
rf = RandomForestClassifier(
    n_estimators=200,
    max_depth=8,
    random_state=42
)

# 2. Boosting-based model (Gradient Boosting)
gb = GradientBoostingClassifier(
    n_estimators=150,
    max_depth=3,
    learning_rate=0.05,
    random_state=42
)

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

rf_auc = cross_val_score(rf, X_train, y_train, cv=cv, scoring='roc_auc')
gb_auc = cross_val_score(gb, X_train, y_train, cv=cv, scoring='roc_auc')

print("Random Forest Mean ROC-AUC:", rf_auc.mean())
print("Gradient Boosting Mean ROC-AUC:", gb_auc.mean())

gb.fit(X_train, y_train)

y_pred = gb.predict(X_test)
y_prob = gb.predict_proba(X_test)[:, 1]

print("Final Accuracy:", accuracy_score(y_test, y_pred))
print("Final ROC-AUC:", roc_auc_score(y_test, y_prob))


Random Forest Mean ROC-AUC: 0.9732259464021975
Gradient Boosting Mean ROC-AUC: 0.966355119936867
Final Accuracy: 0.904
Final ROC-AUC: 0.9501782660855888
