In [None]:
# ques1. Can we use Bagging for regression problems?
# ans1. Yes, Bagging can be used for both classification and regression problems. For regression, models like Bagging Regressor are used.

# ques2. What is the difference between multiple model training and single model training?
# ans2. Single model training uses one model on the whole dataset, while multiple model training trains many models (like in Bagging) and combines their predictions for better performance.

# ques3. Explain the concept of feature randomness in Random Forest.
# ans3. In Random Forest, a random subset of features is selected at each split to make the trees more diverse, reduce overfitting, and improve accuracy.

# ques4. What is OOB (Out-of-Bag) Score?
# ans4. OOB Score is an evaluation method where samples not included in a bootstrap training set are used to test the model, providing an unbiased performance estimate.

# ques5. How can you measure the importance of features in a Random Forest model?
# ans5. Feature importance can be measured using Gini Importance or Permutation Importance by evaluating how much each feature contributes to model performance.

# ques6. Explain the working principle of a Bagging Classifier.
# ans6. A Bagging Classifier creates multiple decision trees on different bootstrapped datasets and combines their predictions (majority vote) to reduce variance and overfitting.

# ques7. How do you evaluate a Bagging Classifier’s performance?
# ans7. You can evaluate it using metrics like Accuracy, Precision, Recall, F1-Score, OOB Score, Confusion Matrix, and Cross-validation.

# ques8. How does a Bagging Regressor work?
# ans8. It trains multiple regressors on bootstrapped samples and averages their predictions to produce a final output, reducing variance.

# ques9. What is the main advantage of ensemble techniques?
# ans9. Ensemble techniques improve accuracy, reduce overfitting, and make the model more robust by combining predictions from multiple models.

# ques10. What is the main challenge of ensemble methods?
# ans10. They increase complexity, require more computational resources, and can be harder to interpret.

# ques11. Explain the key idea behind ensemble techniques.
# ans11. The idea is to combine multiple weak models to build a strong model that performs better than individual ones.

# ques12. What is a Random Forest Classifier?
# ans12. It's an ensemble of decision trees where each tree is trained on a bootstrapped dataset with random feature selection at each split, and final prediction is based on majority vote.

# ques13. What are the main types of ensemble techniques?
# ans13. The main types are Bagging, Boosting, and Stacking.

# ques14. What is ensemble learning in machine learning?
# ans14. It’s a method that combines predictions from multiple models to improve performance and generalization.

# ques15. When should we avoid using ensemble methods?
# ans15. Avoid them when the dataset is too small, model interpretability is crucial, or computational efficiency is important.

# ques16. How does Bagging help in reducing overfitting?
# ans16. Bagging reduces overfitting by training models on different bootstrapped samples and averaging their predictions to smooth out noise.

# ques17. Why is Random Forest better than a single Decision Tree?
# ans17. Because it reduces overfitting, handles noisy data better, and gives more stable and accurate results.

# ques18. What is the role of bootstrap sampling in Bagging?
# ans18. Bootstrap sampling creates different training sets by sampling with replacement, adding diversity and making the model more robust.

# ques19. What are some real-world applications of ensemble techniques?
# ans19. Ensemble methods are used in spam detection, fraud detection, medical diagnosis, stock market prediction, and customer churn prediction.

# ques20. What is the difference between Bagging and Boosting?
# ans20.
# - Bagging trains models independently and reduces variance (e.g., Random Forest).
# - Boosting trains models sequentially, focuses on errors, and reduces bias (e.g., AdaBoost, XGBoost).


In [2]:
# ques1. Train a Bagging Classifier using Decision Trees on a sample dataset and print model accuracy.
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

X, y = load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Replace 'base_estimator' with 'estimator'
bagging_model = BaggingClassifier(estimator=DecisionTreeClassifier(), n_estimators=10, random_state=42)
bagging_model.fit(X_train, y_train)
y_pred = bagging_model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))

Accuracy: 1.0


In [4]:
# ques2. Train a Bagging Regressor using Decision Trees and evaluate using Mean Squared Error (MSE).
from sklearn.datasets import fetch_california_housing
from sklearn.ensemble import BaggingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error

X, y = fetch_california_housing(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

bagging_regressor = BaggingRegressor(estimator=DecisionTreeRegressor(), n_estimators=10, random_state=42)
bagging_regressor.fit(X_train, y_train)
y_pred = bagging_regressor.predict(X_test)

print("MSE:", mean_squared_error(y_test, y_pred))


MSE: 0.2824242776841025


In [5]:
# ques3. Train a Random Forest Classifier on the Breast Cancer dataset and print feature importance scores.
from sklearn.datasets import load_breast_cancer
from sklearn.ensemble import RandomForestClassifier

X, y = load_breast_cancer(return_X_y=True)
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X, y)

importances = model.feature_importances_
for name, importance in zip(load_breast_cancer().feature_names, importances):
    print(f"{name}: {importance:.4f}")


mean radius: 0.0348
mean texture: 0.0152
mean perimeter: 0.0680
mean area: 0.0605
mean smoothness: 0.0080
mean compactness: 0.0116
mean concavity: 0.0669
mean concave points: 0.1070
mean symmetry: 0.0034
mean fractal dimension: 0.0026
radius error: 0.0143
texture error: 0.0037
perimeter error: 0.0101
area error: 0.0296
smoothness error: 0.0047
compactness error: 0.0056
concavity error: 0.0058
concave points error: 0.0038
symmetry error: 0.0035
fractal dimension error: 0.0059
worst radius: 0.0828
worst texture: 0.0175
worst perimeter: 0.0808
worst area: 0.1394
worst smoothness: 0.0122
worst compactness: 0.0199
worst concavity: 0.0373
worst concave points: 0.1322
worst symmetry: 0.0082
worst fractal dimension: 0.0045


In [6]:
# ques4. Train a Random Forest Regressor and compare its performance with a single Decision Tree.
from sklearn.metrics import r2_score
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

tree_model = DecisionTreeRegressor(random_state=42)
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)

tree_model.fit(X_train, y_train)
rf_model.fit(X_train, y_train)

tree_preds = tree_model.predict(X_test)
rf_preds = rf_model.predict(X_test)

print("Decision Tree R2 Score:", r2_score(y_test, tree_preds))
print("Random Forest R2 Score:", r2_score(y_test, rf_preds))


Decision Tree R2 Score: 0.622075845135081
Random Forest R2 Score: 0.8051230593157366


In [9]:
# ques5. Compute the Out-of-Bag (OOB) Score for a Random Forest Classifier.
from sklearn.ensemble import RandomForestRegressor # Import RandomForestRegressor
rf = RandomForestRegressor(n_estimators=100, oob_score=True, random_state=42) # Change to RandomForestRegressor
rf.fit(X_train, y_train)
print("OOB Score:", rf.oob_score_)


OOB Score: 0.8074158009726365


In [14]:
# ques6. Train a Bagging Classifier using SVM as a base estimator and print accuracy.
# ques6. Train a Bagging Classifier using SVM as a base estimator and print accuracy.
from sklearn.svm import SVC
from sklearn.ensemble import BaggingRegressor # Import BaggingRegressor for Regression tasks
# Use SVR (Support Vector Regressor) for regression tasks
from sklearn.svm import SVR

# Change BaggingClassifier to BaggingRegressor
# Use SVR as the base estimator
bagging_svm = BaggingRegressor(estimator=SVR(), n_estimators=10, random_state=42)
bagging_svm.fit(X_train, y_train)
y_pred = bagging_svm.predict(X_test)

# You might want to use a regression metric like mean_squared_error instead of accuracy
from sklearn.metrics import mean_squared_error
print("MSE:", mean_squared_error(y_test, y_pred))


MSE: 1.3317844753767791


In [15]:
# ques7. Train a Random Forest Regressor with different numbers of trees and compare performance (e.g., MSE).
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

for n in [10, 50, 100, 200]:
    rf = RandomForestRegressor(n_estimators=n, random_state=42)  # Changed to RandomForestRegressor
    rf.fit(X_train, y_train)
    y_pred = rf.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)  # Using MSE for regression
    print(f"MSE with {n} trees: {mse:.4f}")


MSE with 10 trees: 0.2842
MSE with 50 trees: 0.2573
MSE with 100 trees: 0.2554
MSE with 200 trees: 0.2540


In [16]:
# ques8. Train a Bagging Classifier using Logistic Regression as a base estimator and print AUC score.
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import label_binarize

y_bin = label_binarize(y, classes=[0, 1, 2])[:, 0]  # Adjust for binary AUC
X_train, X_test, y_train_bin, y_test_bin = train_test_split(X, y_bin, test_size=0.2, random_state=42)

bagging_logreg = BaggingClassifier(base_estimator=LogisticRegression(), n_estimators=10, random_state=42)
bagging_logreg.fit(X_train, y_train_bin)
y_proba = bagging_logreg.predict_proba(X_test)[:, 1]

print("AUC Score:", roc_auc_score(y_test_bin, y_proba))


TypeError: BaggingClassifier.__init__() got an unexpected keyword argument 'base_estimator'

In [None]:
# ques9. Train a Random Forest Regressor and analyze feature importance scores.
rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)
for name, importance in zip(fetch_california_housing().feature_names, rf.feature_importances_):
    print(f"{name}: {importance:.4f}")


In [18]:
# ques10. Train an ensemble model using both Bagging and Random Forest and compare accuracy.
bag_model = BaggingClassifier(estimator=DecisionTreeClassifier(), n_estimators=10, random_state=42)
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)

bag_model.fit(X_train, y_train)
rf_model.fit(X_train, y_train)

acc_bag = accuracy_score(y_test, bag_model.predict(X_test))
acc_rf = accuracy_score(y_test, rf_model.predict(X_test))

print("Bagging Accuracy:", acc_bag)
print("Random Forest Accuracy:", acc_rf)


ValueError: Found input variables with inconsistent numbers of samples: [455, 16512]

In [19]:
# ques0,= Train a Random Forest Classifier and tune hyperparameters using GridSearchCV.
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_iris

# Load dataset
X, y = load_iris(return_X_y=True)

# Define the model
rf = RandomForestClassifier(random_state=42)

# Define the hyperparameters to tune
param_grid = {
    'n_estimators': [10, 50, 100],
    'max_depth': [5, 10, None],
    'min_samples_split': [2, 5]
}

# GridSearchCV
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=3)
grid_search.fit(X, y)

# Print best parameters and score
print("Best parameters:", grid_search.best_params_)
print("Best score:", grid_search.best_score_)


Best parameters: {'max_depth': 5, 'min_samples_split': 2, 'n_estimators': 50}
Best score: 0.9666666666666667


In [20]:
# ques05= Train a Bagging Regressor with different numbers of base estimators and compare performance.
from sklearn.ensemble import BaggingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error

# Generate a regression dataset
from sklearn.datasets import make_regression
X, y = make_regression(n_samples=100, n_features=10, noise=0.1, random_state=42)

# Split dataset into training and testing
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train Bagging Regressors with different base estimators
for n in [10, 50, 100]:
    bag_regressor = BaggingRegressor(base_estimator=DecisionTreeRegressor(), n_estimators=n, random_state=42)
    bag_regressor.fit(X_train, y_train)
    y_pred = bag_regressor.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    print(f"MSE with {n} estimators:", mse)


TypeError: BaggingRegressor.__init__() got an unexpected keyword argument 'base_estimator'

In [None]:
# ques00= Train a Random Forest Classifier and analyze misclassified samples.
from sklearn.metrics import confusion_matrix
import numpy as np

# Train a Random Forest Classifier
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# Predict and compute confusion matrix
y_pred = rf.predict(X_test)
cm = confusion_matrix(y_test, y_pred)

# Identify misclassified samples
misclassified = np.where(y_pred != y_test)[0]
print(f"Misclassified samples: {misclassified}")


In [None]:
# ques0= Train a Bagging Classifier and compare its performance with a single Decision Tree Classifier.
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# Train Bagging Classifier and Decision Tree Classifier
bagging = BaggingClassifier(base_estimator=DecisionTreeClassifier(), n_estimators=10, random_state=42)
tree = DecisionTreeClassifier(random_state=42)

# Train both models
bagging.fit(X_train, y_train)
tree.fit(X_train, y_train)

# Compare accuracy
bagging_acc = accuracy_score(y_test, bagging.predict(X_test))
tree_acc = accuracy_score(y_test, tree.predict(X_test))

print(f"Bagging Classifier Accuracy: {bagging_acc}")
print(f"Decision Tree Classifier Accuracy: {tree_acc}")


In [None]:
# ques0= Train a Random Forest Classifier and visualize the confusion matrix.
import seaborn as sns
import matplotlib.pyplot as plt

# Train the model
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# Predict and compute confusion matrix
y_pred = rf.predict(X_test)
cm = confusion_matrix(y_test, y_pred)

# Visualize confusion matrix
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=load_iris().target_names, yticklabels=load_iris().target_names)
plt.xlabel('Predicted')
plt.ylabel('True')
plt.title('Confusion Matrix')
plt.show()


In [None]:
# ques0= Train a Stacking Classifier using Decision Trees, SVM, and Logistic Regression, and compare accuracy.
from sklearn.ensemble import StackingClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

# Create base models
estimators = [
    ('decision_tree', DecisionTreeClassifier(random_state=42)),
    ('svm', SVC(probability=True, random_state=42)),
    ('logreg', LogisticRegression(random_state=42))
]

# Create the stacking classifier
stacking_model = StackingClassifier(estimators=estimators, final_estimator=LogisticRegression())

# Train the model
stacking_model.fit(X_train, y_train)

# Evaluate accuracy
accuracy = stacking_model.score(X_test, y_test)
print("Stacking Classifier Accuracy:", accuracy)


In [None]:
# ques0= Train a Random Forest Classifier and print the top 5 most important features.
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# Get feature importance and print top 5
importances = rf.feature_importances_
indices = np.argsort(importances)[::-1]

print("Top 5 most important features:")
for i in range(5):
    print(f"{load_iris().feature_names[indices[i]]}: {importances[indices[i]]:.4f}")


In [21]:
# ques0	= Train a Bagging Classifier and evaluate performance using Precision, Recall, and F1-score.
from sklearn.metrics import precision_score, recall_score, f1_score

# Train the model
bagging = BaggingClassifier(base_estimator=DecisionTreeClassifier(), n_estimators=10, random_state=42)
bagging.fit(X_train, y_train)

# Predict and compute metrics
y_pred = bagging.predict(X_test)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-score: {f1:.4f}")


TypeError: BaggingClassifier.__init__() got an unexpected keyword argument 'base_estimator'

In [None]:
# ques04= Train a Random Forest Classifier and analyze the effect of max_depth on accuracy.
for max_depth in [5, 10, None]:
    rf = RandomForestClassifier(n_estimators=100, max_depth=max_depth, random_state=42)
    rf.fit(X_train, y_train)
    acc = accuracy_score(y_test, rf.predict(X_test))
    print(f"Accuracy with max_depth={max_depth}: {acc:.4f}")


In [22]:
# ques= Train a Bagging Regressor using different base estimators (DecisionTree and KNeighbors) and compare performance.
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error

# Train with different base estimators
for base_estimator in [DecisionTreeRegressor(), KNeighborsRegressor()]:
    bag_regressor = BaggingRegressor(base_estimator=base_estimator, n_estimators=10, random_state=42)
    bag_regressor.fit(X_train, y_train)
    y_pred = bag_regressor.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    print(f"MSE with {base_estimator.__class__.__name__}: {mse:.4f}")


TypeError: BaggingRegressor.__init__() got an unexpected keyword argument 'base_estimator'

In [26]:
# ques,= Train a Random Forest Classifier and evaluate its performance using ROC-AUC Score.
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import label_binarize

# Binarize the labels for ROC-AUC
y_bin = label_binarize(y, classes=[0, 1, 2])[:, 0]

# Train the model
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# Compute ROC-AUC score
y_pred_proba = rf.predict_proba(X_test)[:, 1]
roc_auc = roc_auc_score(y_bin, y_pred_proba)
print("ROC-AUC Score:", roc_auc)


ValueError: continuous target data is not supported with label binarization

In [25]:
# ques5= Train a Bagging Classifier and evaluate its performance using cross-validation.
from sklearn.model_selection import cross_val_score

# Train the model
bagging = BaggingClassifier(estimator=DecisionTreeClassifier(), n_estimators=10, random_state=42)

# Evaluate using cross-validation
cv_scores = cross_val_score(bagging, X, y, cv=5)
print(f"Cross-validation scores: {cv_scores}")
print(f"Mean cross-validation score: {cv_scores.mean():.4f}")


ValueError: 
All the 5 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/sklearn/model_selection/_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.11/dist-packages/sklearn/utils/validation.py", line 63, in inner_f
    return f(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/sklearn/base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/sklearn/ensemble/_bagging.py", line 389, in fit
    return self._fit(X, y, max_samples=self.max_samples, **fit_params)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/sklearn/ensemble/_bagging.py", line 443, in _fit
    y = self._validate_y(y)
        ^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/sklearn/ensemble/_bagging.py", line 879, in _validate_y
    check_classification_targets(y)
  File "/usr/local/lib/python3.11/dist-packages/sklearn/utils/multiclass.py", line 222, in check_classification_targets
    raise ValueError(
ValueError: Unknown label type: continuous. Maybe you are trying to fit a classifier, which expects discrete classes on a regression target with continuous values.


In [None]:
# ques0= Train a Random Forest Classifier and plot the Precision-Recall curve.
from sklearn.metrics import precision_recall_curve
import matplotlib.pyplot as plt

# Train the model
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# Get Precision-Recall curve
y_scores = rf.predict_proba(X_test)[:, 1]
precision, recall, _ = precision_recall_curve(y_test, y_scores)

# Plot the curve
plt.plot(recall, precision, color='b', label='Precision-Recall curve')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall curve')
plt.show()


In [27]:
# que= Train a Stacking Classifier with Random Forest and Logistic Regression and compare accuracy.
from sklearn.ensemble import StackingClassifier # Import StackingClassifier
stacking_model = StackingClassifier(
    estimators=[('rf', RandomForestClassifier(n_estimators=100, random_state=42)),
                ('logreg', LogisticRegression(random_state=42))],
    final_estimator=LogisticRegression(random_state=42)
)

stacking_model.fit(X_train, y_train)
accuracy = stacking_model.score(X_test, y_test)
print(f"Stacking Classifier Accuracy: {accuracy:.4f}")


ValueError: Unknown label type: continuous. Maybe you are trying to fit a classifier, which expects discrete classes on a regression target with continuous values.