1.  Train a Bagging Classifier using Decision Trees on a sample dataset and print model accuracy.

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets import make_classification
from sklearn.metrics import accuracy_score

# One seed for the synthetic data, the split and the ensemble keeps the run reproducible
SEED = 42

# Synthetic classification problem: 1000 samples, 20 features
X, y = make_classification(n_samples=1000, n_features=20, random_state=SEED)

# Hold out 20% of the samples for evaluation
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=SEED
)

# Bag 50 decision trees ('estimator' replaces the older 'base_estimator' argument name)
bagging_clf = BaggingClassifier(
    estimator=DecisionTreeClassifier(),
    n_estimators=50,
    random_state=SEED,
)

# fit() returns the fitted ensemble, so training and prediction are chained
y_pred = bagging_clf.fit(X_train, y_train).predict(X_test)

# Share of held-out samples that were classified correctly
accuracy = accuracy_score(y_test, y_pred)
print(f'Model Accuracy: {accuracy:.2f}')

2. Train a Bagging Regressor using Decision Trees and evaluate using Mean Squared Error (MSE).

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import BaggingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.datasets import make_regression
from sklearn.metrics import mean_squared_error

# One seed for the synthetic data, the split and the ensemble keeps the run reproducible
SEED = 42

# Synthetic regression problem: 1000 samples, 20 features, small additive noise
X, y = make_regression(n_samples=1000, n_features=20, noise=0.1, random_state=SEED)

# Hold out 20% of the samples for evaluation
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=SEED
)

# Bag 50 regression trees ('estimator' replaces the older 'base_estimator' argument name)
bagging_reg = BaggingRegressor(
    estimator=DecisionTreeRegressor(),
    n_estimators=50,
    random_state=SEED,
)

# fit() returns the fitted ensemble, so training and prediction are chained
y_pred = bagging_reg.fit(X_train, y_train).predict(X_test)

# Mean squared error on the held-out samples
mse = mean_squared_error(y_test, y_pred)
print(f'Model MSE: {mse:.2f}')

3. Train a Random Forest Classifier on the Breast Cancer dataset and print feature importance scores.

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_breast_cancer
import pandas as pd

# Breast Cancer dataset: feature matrix and target vector
data = load_breast_cancer()
X = data.data
y = data.target

# 80/20 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Build and fit a 100-tree random forest in one step (fit() returns the estimator)
rf_clf = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Pair every feature name with its importance and rank from most to least important
feature_importances = pd.DataFrame(
    {'Feature': data.feature_names, 'Importance': rf_clf.feature_importances_}
).sort_values(by='Importance', ascending=False)

# Show the ranked table
print(feature_importances)


4. Train a Random Forest Regressor and compare its performance with a single Decision Tree.

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.datasets import make_regression
from sklearn.metrics import mean_squared_error

# One seed for the synthetic data, the split and both models keeps the run reproducible
SEED = 42

# Synthetic regression problem: 1000 samples, 20 features, small additive noise
X, y = make_regression(n_samples=1000, n_features=20, noise=0.1, random_state=SEED)

# Hold out 20% of the samples for evaluation
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=SEED
)

# Baseline: one decision tree
dt_reg = DecisionTreeRegressor(random_state=SEED)
y_pred_dt = dt_reg.fit(X_train, y_train).predict(X_test)
dt_mse = mean_squared_error(y_test, y_pred_dt)

# Ensemble: random forest of 100 trees
rf_reg = RandomForestRegressor(n_estimators=100, random_state=SEED)
y_pred_rf = rf_reg.fit(X_train, y_train).predict(X_test)
rf_mse = mean_squared_error(y_test, y_pred_rf)

# Report both errors side by side
print(f"Decision Tree MSE: {dt_mse:.2f}")
print(f"Random Forest MSE: {rf_mse:.2f}")

5. Compute the Out-of-Bag (OOB) Score for a Random Forest Classifier.

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_breast_cancer

# Breast Cancer dataset: feature matrix and target vector
data = load_breast_cancer()
X = data.data
y = data.target

# 80/20 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# oob_score=True makes the forest expose `oob_score_` after fitting
rf_clf = RandomForestClassifier(
    n_estimators=100,
    oob_score=True,
    random_state=42,
)
rf_clf.fit(X_train, y_train)

# Report the out-of-bag estimate computed during training
print(f"Out-of-Bag (OOB) Score: {rf_clf.oob_score_:.2f}")

6. Train a Bagging Classifier using SVM as a base estimator and print accuracy.

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import BaggingClassifier
from sklearn.svm import SVC
from sklearn.datasets import make_classification
from sklearn.metrics import accuracy_score

# One seed for the synthetic data, the split and the ensemble keeps the run reproducible
SEED = 42

# Synthetic classification problem: 1000 samples, 20 features
X, y = make_classification(n_samples=1000, n_features=20, random_state=SEED)

# Hold out 20% of the samples for evaluation
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=SEED
)

# Bag 50 default SVCs ('estimator' replaces the older 'base_estimator' argument name)
bagging_clf = BaggingClassifier(
    estimator=SVC(),
    n_estimators=50,
    random_state=SEED,
)

# fit() returns the fitted ensemble, so training and prediction are chained
y_pred = bagging_clf.fit(X_train, y_train).predict(X_test)

# Share of held-out samples that were classified correctly
accuracy = accuracy_score(y_test, y_pred)
print(f'Model Accuracy: {accuracy:.2f}')

7. Train a Random Forest Classifier with different numbers of trees and compare accuracy.

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_breast_cancer
from sklearn.metrics import accuracy_score

# Breast Cancer dataset: feature matrix and target vector
data = load_breast_cancer()
X = data.data
y = data.target

# 80/20 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Forest sizes to compare
n_estimators_list = [10, 50, 100, 200]

# Fit one forest per size on the same split and report its test accuracy
for n_estimators in n_estimators_list:
    rf_clf = RandomForestClassifier(
        n_estimators=n_estimators, random_state=42
    ).fit(X_train, y_train)
    y_pred = rf_clf.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f'Number of Trees: {n_estimators}, Accuracy: {accuracy:.2f}')


8. Train a Bagging Classifier using Logistic Regression as a base estimator and print AUC score.

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import BaggingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import make_classification
from sklearn.metrics import roc_auc_score

# One seed for the synthetic data, the split and the ensemble keeps the run reproducible
SEED = 42

# Synthetic classification problem: 1000 samples, 20 features
X, y = make_classification(n_samples=1000, n_features=20, random_state=SEED)

# Hold out 20% of the samples for evaluation
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=SEED
)

# Base learner: logistic regression with a raised iteration cap
base_logreg = LogisticRegression(solver='lbfgs', max_iter=1000)

# Bag 50 copies of the base learner
bagging_clf = BaggingClassifier(
    estimator=base_logreg,
    n_estimators=50,
    random_state=SEED,
)
bagging_clf.fit(X_train, y_train)

# Column 1 of predict_proba is the probability of the second class (label 1 here)
y_pred_prob = bagging_clf.predict_proba(X_test)[:, 1]

# Area under the ROC curve from the predicted probabilities
auc_score = roc_auc_score(y_test, y_pred_prob)
print(f'Model AUC Score: {auc_score:.2f}')


9. Train a Random Forest Regressor and analyze feature importance scores.

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.datasets import make_regression
import pandas as pd

# One seed for the synthetic data, the split and the forest keeps the run reproducible
SEED = 42

# Synthetic regression problem: 1000 samples, 20 features, small additive noise
X, y = make_regression(n_samples=1000, n_features=20, noise=0.1, random_state=SEED)

# Hold out 20% of the samples for evaluation
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=SEED
)

# Build and fit a 100-tree random forest in one step (fit() returns the estimator)
rf_reg = RandomForestRegressor(n_estimators=100, random_state=SEED).fit(X_train, y_train)

# The synthetic columns have no names, so label them by position
feature_labels = [f'Feature {i}' for i in range(X.shape[1])]

# Pair each label with its importance and rank from most to least important
feature_importances = pd.DataFrame(
    {'Feature': feature_labels, 'Importance': rf_reg.feature_importances_}
).sort_values(by='Importance', ascending=False)

# Show the ranked table
print(feature_importances)


10. Train an ensemble model using both Bagging and Random Forest and compare accuracy.

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import BaggingRegressor, RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.datasets import make_regression
from sklearn.metrics import mean_squared_error

# Both ensembles use the same number of trees so that the comparison reflects the
# method rather than the ensemble size (previously 50 vs 100).
N_ESTIMATORS = 100

# Generate a sample dataset
X, y = make_regression(n_samples=1000, n_features=20, noise=0.1, random_state=42)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a Bagging Regressor
bagging_reg = BaggingRegressor(estimator=DecisionTreeRegressor(), n_estimators=N_ESTIMATORS, random_state=42)
bagging_reg.fit(X_train, y_train)
y_pred_bagging = bagging_reg.predict(X_test)
bagging_mse = mean_squared_error(y_test, y_pred_bagging)
# For regressors, score() returns R^2, which serves as the accuracy-style metric here
bagging_r2 = bagging_reg.score(X_test, y_test)

# Train a Random Forest Regressor
rf_reg = RandomForestRegressor(n_estimators=N_ESTIMATORS, random_state=42)
rf_reg.fit(X_train, y_train)
y_pred_rf = rf_reg.predict(X_test)
rf_mse = mean_squared_error(y_test, y_pred_rf)
rf_r2 = rf_reg.score(X_test, y_test)

# Print the results
print(f"Bagging Regressor MSE: {bagging_mse:.2f}")
print(f"Random Forest Regressor MSE: {rf_mse:.2f}")
print(f"Bagging Regressor R^2: {bagging_r2:.4f}")
print(f"Random Forest Regressor R^2: {rf_r2:.4f}")


11.  Train a Random Forest Classifier and tune hyperparameters using GridSearchCV.

In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_breast_cancer
from sklearn.metrics import accuracy_score

# Breast Cancer dataset: feature matrix and target vector
data = load_breast_cancer()
X = data.data
y = data.target

# 80/20 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Hyperparameter values to try (3 x 3 x 3 x 3 = 81 combinations)
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}

# Unfitted forest that the search configures for each combination
rf_clf = RandomForestClassifier(random_state=42)

# 5-fold cross-validated search scored by accuracy, using all available cores
grid_search = GridSearchCV(
    estimator=rf_clf,
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
    scoring='accuracy',
)
grid_search.fit(X_train, y_train)

# Best estimator found by the search
best_rf_clf = grid_search.best_estimator_

# Evaluate the winner on the held-out test set
y_pred = best_rf_clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print(f'Best Parameters: {grid_search.best_params_}')
print(f'Best Model Accuracy: {accuracy:.2f}')

12. Train a Bagging Regressor with different numbers of base estimators and compare performance.

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import BaggingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.datasets import make_regression
from sklearn.metrics import mean_squared_error

# One seed for the synthetic data, the split and every ensemble keeps the run reproducible
SEED = 42

# Synthetic regression problem: 1000 samples, 20 features, small additive noise
X, y = make_regression(n_samples=1000, n_features=20, noise=0.1, random_state=SEED)

# Hold out 20% of the samples for evaluation
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=SEED
)

# Ensemble sizes to compare
n_estimators_list = [10, 50, 100, 200]

# Fit one bagged-tree ensemble per size on the same split and report its test MSE
for n_estimators in n_estimators_list:
    bagging_reg = BaggingRegressor(
        estimator=DecisionTreeRegressor(),
        n_estimators=n_estimators,
        random_state=SEED,
    )
    y_pred = bagging_reg.fit(X_train, y_train).predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    print(f'Number of Estimators: {n_estimators}, MSE: {mse:.2f}')


13.  Train a Random Forest Classifier and analyze misclassified samples.

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_breast_cancer
from sklearn.metrics import accuracy_score
import numpy as np

# Breast Cancer dataset: feature matrix and target vector
data = load_breast_cancer()
X = data.data
y = data.target

# 80/20 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Build and fit a 100-tree random forest in one step (fit() returns the estimator)
rf_clf = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Predicted labels for the held-out samples
y_pred = rf_clf.predict(X_test)

# Overall test accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f'Model Accuracy: {accuracy:.2f}')

# Positions within the test set where the prediction disagrees with the true label
misclassified_indices = np.flatnonzero(y_test != y_pred)
print(f'Number of Misclassified Samples: {len(misclassified_indices)}')
print('Indices of Misclassified Samples:', misclassified_indices)


14.  Train a Bagging Classifier and compare its performance with a single Decision Tree Classifier.

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets import make_classification
from sklearn.metrics import accuracy_score

# One seed for the synthetic data, the split and both models keeps the run reproducible
SEED = 42

# Synthetic classification problem: 1000 samples, 20 features
X, y = make_classification(n_samples=1000, n_features=20, random_state=SEED)

# Hold out 20% of the samples for evaluation
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=SEED
)

# Baseline: one decision tree
dt_clf = DecisionTreeClassifier(random_state=SEED)
y_pred_dt = dt_clf.fit(X_train, y_train).predict(X_test)
dt_accuracy = accuracy_score(y_test, y_pred_dt)

# Ensemble: 50 bagged decision trees
bagging_clf = BaggingClassifier(
    estimator=DecisionTreeClassifier(),
    n_estimators=50,
    random_state=SEED,
)
y_pred_bagging = bagging_clf.fit(X_train, y_train).predict(X_test)
bagging_accuracy = accuracy_score(y_test, y_pred_bagging)

# Report both accuracies side by side
print(f'Decision Tree Accuracy: {dt_accuracy:.2f}')
print(f'Bagging Classifier Accuracy: {bagging_accuracy:.2f}')


15. Train a Random Forest Classifier and visualize the confusion matrix.

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_breast_cancer
from sklearn.metrics import accuracy_score, confusion_matrix

# Breast Cancer dataset: feature matrix and target vector
data = load_breast_cancer()
X = data.data
y = data.target

# 80/20 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Build and fit a 100-tree random forest in one step (fit() returns the estimator)
rf_clf = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Predicted labels for the held-out samples
y_pred = rf_clf.predict(X_test)

# Overall test accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f'Model Accuracy: {accuracy:.2f}')

# Counts of true label (rows) versus predicted label (columns)
conf_matrix = confusion_matrix(y_test, y_pred)

# Draw the matrix as an annotated heatmap on an explicit Axes
fig, ax = plt.subplots(figsize=(6,5))
sns.heatmap(
    conf_matrix,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=data.target_names,
    yticklabels=data.target_names,
    ax=ax,
)
ax.set_xlabel('Predicted Label')
ax.set_ylabel('True Label')
ax.set_title('Confusion Matrix')
plt.show()


16. Train a Stacking Classifier using Decision Trees, SVM, and Logistic Regression, and compare accuracy.

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import StackingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.datasets import load_breast_cancer

# Load dataset
data = load_breast_cancer()
X, y = data.data, data.target

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define base classifiers
dt = DecisionTreeClassifier(random_state=42)
svm = SVC(probability=True, random_state=42)
logreg = LogisticRegression(max_iter=1000, random_state=42)

# Train individual classifiers and evaluate performance (baseline for the comparison)
for clf, name in zip([dt, svm, logreg], ["Decision Tree", "SVM", "Logistic Regression"]):
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print(f"{name} Accuracy: {accuracy_score(y_test, y_pred):.4f}")

# Define stacking classifier: the three base models feed a logistic-regression meta-learner
stacking_clf = StackingClassifier(
    estimators=[('dt', dt), ('svm', svm), ('logreg', logreg)],
    final_estimator=LogisticRegression(max_iter=1000, random_state=42)
)

# Train and evaluate stacking classifier (once; the cell previously repeated everything twice)
stacking_clf.fit(X_train, y_train)
y_pred_stack = stacking_clf.predict(X_test)
print(f"Stacking Classifier Accuracy: {accuracy_score(y_test, y_pred_stack):.4f}")


17.  Train a Random Forest Classifier and print the top 5 most important features.

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import StackingClassifier, RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.datasets import load_breast_cancer

# Load dataset
data = load_breast_cancer()
X, y = data.data, data.target

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a Random Forest Classifier
# (the duplicated body and the unrelated stacking/baseline training were removed from this cell)
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# Get feature importances
feature_importances = rf.feature_importances_
feature_names = data.feature_names

# Print top 5 most important features
# Sorting (importance, name) pairs in descending order puts the largest importances first
important_features = sorted(zip(feature_importances, feature_names), reverse=True)[:5]
print("Top 5 Most Important Features:")
for importance, name in important_features:
    print(f"{name}: {importance:.4f}")


18.  Train a Bagging Classifier and evaluate performance using Precision, Recall, and F1-score.

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import StackingClassifier, RandomForestClassifier, BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.datasets import load_breast_cancer

# Load dataset
data = load_breast_cancer()
X, y = data.data, data.target

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a Bagging Classifier
# (the stacking and random-forest code copied from earlier exercises was removed from this cell)
bagging_clf = BaggingClassifier(estimator=DecisionTreeClassifier(), n_estimators=50, random_state=42)
bagging_clf.fit(X_train, y_train)
y_pred_bagging = bagging_clf.predict(X_test)

# Evaluate Bagging Classifier performance
# The metrics use their default binary averaging, i.e. they are reported for label 1
precision = precision_score(y_test, y_pred_bagging)
recall = recall_score(y_test, y_pred_bagging)
f1 = f1_score(y_test, y_pred_bagging)

print("Bagging Classifier Performance:")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-score: {f1:.4f}")


19. Train a Random Forest Classifier and analyze the effect of max_depth on accuracy.

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import StackingClassifier, RandomForestClassifier, BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.datasets import load_breast_cancer
import matplotlib.pyplot as plt

# Load dataset
data = load_breast_cancer()
X, y = data.data, data.target

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Analyze the effect of max_depth on accuracy
# (the stacking, feature-importance and bagging code copied from earlier exercises was removed)
max_depth_values = range(1, 21)
accuracy_scores = []

# Fit one forest per depth limit on the same split and record its test accuracy
for max_depth in max_depth_values:
    rf = RandomForestClassifier(n_estimators=100, max_depth=max_depth, random_state=42)
    rf.fit(X_train, y_train)
    y_pred = rf.predict(X_test)
    accuracy_scores.append(accuracy_score(y_test, y_pred))

# Plot max_depth vs accuracy
plt.figure(figsize=(10, 5))
plt.plot(max_depth_values, accuracy_scores, marker='o', linestyle='dashed', color='b')
plt.xlabel('Max Depth')
plt.ylabel('Accuracy')
plt.title('Effect of max_depth on Random Forest Accuracy')
plt.grid()
plt.show()


20.  Train a Bagging Regressor using different base estimators (DecisionTree and KNeighbors) and compare
performance.

In [None]:
# In the BaggingRegressor initialization, replace 'base_estimator' with 'estimator'.

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import StackingClassifier, RandomForestClassifier, BaggingClassifier, BaggingRegressor
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, mean_absolute_error, mean_squared_error
from sklearn.datasets import load_breast_cancer, load_diabetes
from sklearn.neighbors import KNeighborsRegressor
import matplotlib.pyplot as plt

# Load regression dataset
# (the classification, stacking and feature-importance code copied from earlier exercises was removed)
data_reg = load_diabetes()
X_reg, y_reg = data_reg.data, data_reg.target
X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(X_reg, y_reg, test_size=0.2, random_state=42)

# Train Bagging Regressor with Decision Tree and KNeighbors
dt_reg = DecisionTreeRegressor(random_state=42)
knn_reg = KNeighborsRegressor()

# Same ensemble size and seed for both, so only the base estimator differs
bagging_dt = BaggingRegressor(estimator=dt_reg, n_estimators=50, random_state=42)
bagging_knn = BaggingRegressor(estimator=knn_reg, n_estimators=50, random_state=42)

bagging_dt.fit(X_train_reg, y_train_reg)
bagging_knn.fit(X_train_reg, y_train_reg)

# Predictions
y_pred_dt = bagging_dt.predict(X_test_reg)
y_pred_knn = bagging_knn.predict(X_test_reg)

# Evaluate performance
mae_dt = mean_absolute_error(y_test_reg, y_pred_dt)
mse_dt = mean_squared_error(y_test_reg, y_pred_dt)
mae_knn = mean_absolute_error(y_test_reg, y_pred_knn)
mse_knn = mean_squared_error(y_test_reg, y_pred_knn)

print("Bagging Regressor Performance:")
print(f"Decision Tree - MAE: {mae_dt:.4f}, MSE: {mse_dt:.4f}")
print(f"KNeighbors - MAE: {mae_knn:.4f}, MSE: {mse_knn:.4f}")


21.  Train a Random Forest Classifier and evaluate its performance using ROC-AUC Score.

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, roc_curve

# Generate synthetic classification data (1000 samples, 20 features)
X, y = make_classification(n_samples=1000, n_features=20, random_state=42)

# Split data into training (80%) and testing (20%) sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a Random Forest Classifier with 100 trees
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# ROC analysis works on scores rather than hard labels, so take column 1 of
# predict_proba: the predicted probability of the positive class
y_probs = rf.predict_proba(X_test)[:, 1]

# Calculate ROC-AUC score
roc_auc = roc_auc_score(y_test, y_probs)
print(f"ROC-AUC Score: {roc_auc:.4f}")

# Plot the ROC Curve
fpr, tpr, _ = roc_curve(y_test, y_probs)  # thresholds are not needed for the plot
plt.figure(figsize=(8, 6))
# The legend label now closes its parenthesis: "ROC Curve (AUC = 0.xxxx)"
plt.plot(fpr, tpr, label=f'ROC Curve (AUC = {roc_auc:.4f})')
# Dashed diagonal from (0, 0) to (1, 1) as a visual reference line
plt.plot([0, 1], [0, 1], linestyle='--', color='gray')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend()
plt.show()


22.  Train a Bagging Classifier and evaluate its performance using cross-validation.


In [None]:
# Imported in this cell so that it does not rely on imports made by other cells
from sklearn.ensemble import BaggingClassifier
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier

# Train a Bagging Classifier
# Replacing 'base_estimator' with 'estimator' to be compatible with newer scikit-learn versions
bagging_clf = BaggingClassifier(estimator=DecisionTreeClassifier(), n_estimators=100, random_state=42)
# Cross-validated accuracy with cv=5; X_train / y_train must already exist
# from a previously executed cell
scores = cross_val_score(bagging_clf, X_train, y_train, cv=5, scoring='accuracy')

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# Synthetic classification data: 1000 samples with 20 features each.
X, y = make_classification(n_samples=1000, n_features=20, random_state=42)

# Keep 20% of the samples aside as a test set.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Bagging ensemble of 100 decision trees, scored by accuracy with cv=5
# on the training split only.
bagging_clf = BaggingClassifier(
    estimator=DecisionTreeClassifier(),
    n_estimators=100,
    random_state=42,
)
scores = cross_val_score(bagging_clf, X_train, y_train, cv=5, scoring='accuracy')

# Report the per-fold scores and their mean.
print(
    f"Cross-Validation Accuracy Scores: {scores}",
    f"Mean Accuracy: {scores.mean():.4f}",
    sep="\n",
)

# Fit on the whole training split, then measure accuracy on the test set.
y_pred = bagging_clf.fit(X_train, y_train).predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Test Accuracy: {accuracy:.4f}")


23. Train a Random Forest Classifier and plot the Precision-Recall curve.

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_recall_curve, average_precision_score

# Generate synthetic classification data (1000 samples, 20 features)
X, y = make_classification(n_samples=1000, n_features=20, random_state=42)

# Split data into training (80%) and testing (20%) sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a Random Forest Classifier with 100 trees
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# The curve is built from scores rather than hard labels, so take column 1 of
# predict_proba: the predicted probability of the positive class
y_probs = rf.predict_proba(X_test)[:, 1]

# Calculate the Precision-Recall curve and the average precision
precision, recall, _ = precision_recall_curve(y_test, y_probs)  # thresholds are not needed for the plot
avg_precision = average_precision_score(y_test, y_probs)

# Plot the Precision-Recall Curve (recall on the x-axis, precision on the y-axis)
plt.figure(figsize=(8, 6))
# The legend label now closes its parenthesis: "PR Curve (AP = 0.xxxx)"
plt.plot(recall, precision, label=f'PR Curve (AP = {avg_precision:.4f})')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall Curve')
plt.legend()
plt.show()


24. Train a Stacking Classifier with Random Forest and Logistic Regression and compare accuracy.

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Synthetic classification data: 1000 samples with 20 features each.
X, y = make_classification(n_samples=1000, n_features=20, random_state=42)

# Keep 20% of the samples aside as a test set.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# The random forest is the base learner; logistic regression is the
# final (meta) estimator of the stack.
base_clf = RandomForestClassifier(n_estimators=100, random_state=42)
meta_clf = LogisticRegression()

# Baseline: the random forest on its own.
y_pred_base = base_clf.fit(X_train, y_train).predict(X_test)
accuracy_base = accuracy_score(y_test, y_pred_base)
print(f"Random Forest Accuracy: {accuracy_base:.4f}")

# Stack: the same forest as the only base estimator, with passthrough=True.
stacking_clf = StackingClassifier(
    estimators=[('rf', base_clf)],
    final_estimator=meta_clf,
    passthrough=True,
)
y_pred_stack = stacking_clf.fit(X_train, y_train).predict(X_test)
accuracy_stack = accuracy_score(y_test, y_pred_stack)
print(f"Stacking Classifier Accuracy: {accuracy_stack:.4f}")


25. Train a Bagging Regressor with different levels of bootstrap samples and compare performance.

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
from sklearn.ensemble import BaggingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error

# Synthetic regression data: 1000 samples, 20 features, noise=0.1.
X, y = make_regression(n_samples=1000, n_features=20, noise=0.1, random_state=42)

# Keep 20% of the samples aside as a test set.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Values passed as max_samples to each ensemble, and the test MSE collected
# for each of them (same order).
bootstrap_samples = [0.5, 0.7, 1.0]
errors = []

for sample_size in bootstrap_samples:
    # 100 bagged decision trees, each fitted with max_samples=sample_size.
    bagging_reg = BaggingRegressor(
        estimator=DecisionTreeRegressor(),
        n_estimators=100,
        max_samples=sample_size,
        random_state=42,
    )
    y_pred = bagging_reg.fit(X_train, y_train).predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    errors.append(mse)
    print(f"Bagging Regressor (Bootstrap Sample = {sample_size}) MSE: {mse:.4f}")

# Bar chart of the collected errors, one bar per max_samples value.
plt.figure(figsize=(8, 6))
bar_labels = list(map(str, bootstrap_samples))
plt.bar(bar_labels, errors, color=['blue', 'green', 'red'])
plt.xlabel('Bootstrap Sample Size')
plt.ylabel('Mean Squared Error')
plt.title('Performance of Bagging Regressor with Different Bootstrap Samples')
plt.show()
