In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import yaml
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import BaggingClassifier, AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PowerTransformer
from sklearn.svm import SVC

### Data Exploration and Understanding: 

In [None]:
# Read the notebook settings from the YAML configuration file.
with open("config.yml", "r") as file:
    config = yaml.safe_load(file)

# The CSV location is stored under the "dataset" -> "path" keys.
dataset_section = config["dataset"]
dataset = dataset_section["path"]

# Read the CSV into a DataFrame and preview its first rows.
data = pd.read_csv(dataset)
data.head()

## Inspection

In [None]:
# Report the table dimensions, then the per-column dtype / non-null summary.
n_rows, n_cols = data.shape
print((n_rows, n_cols))
data.info()

In [None]:
# Summary statistics of the numeric columns.
summary_stats = data.describe()
print(summary_stats)

# Total number of missing cells across the whole table.
n_missing = data.isnull().sum().sum()
print("Missing values:", n_missing)


In [None]:
# Encode the target as integers: 'M' -> 1, 'B' -> 0.
# The guard makes the cell safe to re-run: mapping an already-encoded column
# a second time would find neither 'M' nor 'B' and turn every value into NaN.
if data['diagnosis'].isin(['M', 'B']).any():
    data['diagnosis'] = data['diagnosis'].map({'M': 1, 'B': 0})

# Visualization of the target variable distribution.
# The column is passed by keyword: recent seaborn versions treat the first
# positional argument as `data`, not as the variable to count.
ax = sns.countplot(x='diagnosis', data=data)
ax.set(title="Target distribution", xlabel="diagnosis (1 = M, 0 = B)")
plt.show()
print(data['diagnosis'].value_counts())

### Separate features and target variable

In [None]:
# Target: the diagnosis column.
y = data['diagnosis']
# Features: every remaining column except the identifier and the target.
X = data.drop(columns=['id', 'diagnosis'])

In [None]:
# Pairwise correlations between all features.
corr_matrix = X.corr()

# Zero the diagonal *before* looking for the strongest partner: every feature
# correlates perfectly with itself, so otherwise idxmax() would simply return
# each column's own name.
off_diagonal = ~np.eye(len(corr_matrix), dtype=bool)
corr_matrix = corr_matrix.where(off_diagonal, 0.0)

# For each feature, the other feature it is most strongly correlated with
# (printed explicitly, since a mid-cell expression is not displayed).
print(corr_matrix.abs().idxmax())

# Heatmap of the off-diagonal correlations; the features are highly correlated.
sns.heatmap(corr_matrix, cmap="coolwarm")
plt.show()

### Scale features

In [None]:
# Standardise every feature, fitting and transforming in two explicit steps.
scaler = StandardScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)

# Put the scaled array back into a labelled DataFrame and preview it.
Xscaled = pd.DataFrame(data=X_scaled, columns=X.columns)
Xscaled.head()

### Check the skewness of the features

In [None]:

# Skewness per feature: 0 means no skew, positive means right skew, negative
# means left skew. Only features with skewness above 0.75 are kept.
skew_sorted = Xscaled.skew().sort_values(ascending=False)
is_heavily_skewed = skew_sorted > 0.75
skew_columns = skew_sorted[is_heavily_skewed]
skew_columns

In [None]:
# Perform Power Transformation on the skewed columns.
# skew_columns is a Series (index = feature names, values = skewness), so the
# column labels must come from its index; indexing the frame with the Series
# itself would look up the float skewness values as column labels and fail.
skewed_features = skew_columns.index.tolist()

power_transformer = PowerTransformer()

# Nothing to transform (and nothing to fit) when no feature passed the threshold.
if skewed_features:
    transformed_data = power_transformer.fit_transform(Xscaled[skewed_features])
    Xscaled[skewed_features] = transformed_data
Xscaled.head()

### Modeling

In [None]:
# Hold out 20% of the preprocessed features for testing. The fixed seed keeps
# the split, and every score computed from it, reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(Xscaled, y, test_size=0.2, random_state=42)

The bagging classifier trains multiple models in parallel on bootstrap samples (drawn with replacement) and averages their predictions, which is useful for reducing variance.
It can help improve the stability and generalization of the models.

Boosting methods are known for their ability to reduce bias and improve the overall performance of the models.

A Dummy Classifier is a type of classifier which does not generate any insight about the data and classifies the given data using only simple rules.
This classifier is useful as a simple baseline to compare with other (real) classifiers.

### Bagging classifier


In [None]:
# Split the data into training and testing sets.
# The preprocessed features (Xscaled) are used rather than the raw X, so the
# scaling / transformation steps above actually reach the models.
X_train, X_test, y_train, y_test = train_test_split(Xscaled, y, test_size=0.2, random_state=42)

# --- Bagging: an ensemble of 10 random forests ---
clf = BaggingClassifier(estimator=RandomForestClassifier(), n_estimators=10, random_state=0)
clf.fit(X_train, y_train)

# Perform 5-fold cross-validation on the BaggingClassifier (training split only)
bagging_scores = cross_val_score(clf, X_train, y_train, cv=5)

# Evaluate the BaggingClassifier on the held-out test split
rf_predictions = clf.predict(X_test)
rf_report = classification_report(y_test, rf_predictions)

# --- Boosting: AdaBoost with 100 estimators ---
ada_classifier = AdaBoostClassifier(n_estimators=100, random_state=42)
ada_classifier.fit(X_train, y_train)

# Perform 5-fold cross-validation on the AdaBoost classifier
ada_scores = cross_val_score(ada_classifier, X_train, y_train, cv=5)

# Evaluate the AdaBoost classifier on the held-out test split
ada_predictions = ada_classifier.predict(X_test)
ada_report = classification_report(y_test, ada_predictions)

# Print the cross-validation scores and evaluation reports
print("BaggingClassifier Cross-Validation Scores:")
print(bagging_scores)
print("BaggingClassifier Evaluation Report:")
print(rf_report)
print("\nAdaBoost Cross-Validation Scores:")
print(ada_scores)
print("AdaBoost Evaluation Report:")
print(ada_report)

# Experiment with different ensemble sizes.
# The model swept here is a RandomForestClassifier, so the printed label names
# it as such. The mean 5-fold accuracy is recorded for each forest size.
rf_scores = []
num_estimators = [50, 100, 200]

for n_estimators in num_estimators:
    rf_classifier = RandomForestClassifier(n_estimators=n_estimators, random_state=42)
    scores = cross_val_score(rf_classifier, X_train, y_train, cv=5)
    rf_scores.append(scores.mean())

# Print the mean cross-validation score for each number of estimators
print("\nRandomForestClassifier Cross-Validation Scores for Different Number of Estimators:")
for n, score in zip(num_estimators, rf_scores):
    print(f"Number of Estimators: {n} | Score: {score}")

In [None]:
# Bagging classifier with different estimators
estimators = [SVC(), RandomForestClassifier(), GradientBoostingClassifier()]

for estimator in estimators:
    # `estimator=` is the current keyword: scikit-learn deprecated
    # `base_estimator=` in 1.2 and removed it in 1.4. It also matches the
    # BaggingClassifier call in the previous cell.
    clf = BaggingClassifier(estimator=estimator, n_estimators=10, random_state=0)
    clf.fit(X_train, y_train)

    # Score the fitted ensemble on the held-out test split.
    pred = clf.predict(X_test)
    accuracy = accuracy_score(y_test, pred)
    report = classification_report(y_test, pred)
    print(f"Bagging with {estimator.__class__.__name__} accuracy: {accuracy}")
    print(f"Classification Report:\n{report}\n")

In Bagging classifiers with different base estimators (SVC, RandomForestClassifier, GradientBoostingClassifier):

**Bagging with SVC:**
Accuracy: 98%
Precision and recall are high for both classes (0 and 1).
F1-score indicates balanced performance.

**Bagging with RandomForestClassifier:**
Accuracy: 94%
Precision and recall are slightly lower compared to SVC.
F1-score is still good but slightly lower than SVC.

**Bagging with GradientBoostingClassifier:**
Accuracy: 97%
Precision and recall are high for both classes.
F1-score indicates balanced performance.

In general, all Bagging classifiers perform well and show relatively high accuracy and balanced precision and recall scores for both classes. The choice of base estimator (SVC, RandomForest, or Gradient Boosting) affects the performance slightly, but overall, Bagging effectively improves model accuracy and robustness by combining multiple base models.

### Boosting classifier

In [None]:
# List of different numbers of estimators
estimator_values = [1, 10, 50, 100, 200]

for n_estimators in estimator_values:
    # The grid holds a single candidate, so GridSearchCV performs no selection
    # here: it cross-validates this one setting and refits it on the full
    # training split.
    param_grid = {'n_estimators': [n_estimators]}

    # The seed is fixed (as for the AdaBoost model in the bagging cell) so the
    # reported numbers are reproducible across runs.
    clf = GridSearchCV(AdaBoostClassifier(random_state=42), param_grid, cv=5, scoring='accuracy')
    clf.fit(X_train, y_train)

    # Predict using the refitted boosting classifier
    boosting_pred = clf.predict(X_test)

    # Calculate accuracy on the held-out test split
    boosting_accuracy = accuracy_score(y_test, boosting_pred)

    # Print results. The mean cross-validated accuracy is shown as well, since
    # the search already computed it on the training data.
    print(f"Boosting with {n_estimators} estimators")
    print(f"Mean CV accuracy: {clf.best_score_}")
    print(f"Accuracy: {boosting_accuracy}")
    print(f"Confusion Matrix:\n{confusion_matrix(y_test, boosting_pred)}")
    print(f"Classification Report:\n{classification_report(y_test, boosting_pred)}\n")


Boosting with 1 estimators:

Accuracy: 0.8947
Confusion Matrix: [[63 8] [ 4 39]]
In this case, using only 1 estimator results in decent accuracy.

Boosting with 10 estimators:
Accuracy: 0.9825
Confusion Matrix: [[71 0] [ 2 41]]
With 10 estimators, the accuracy increases significantly, as seen from both the confusion matrix and classification report. The model is able to predict most of the instances correctly.

Boosting with 50 estimators:
Accuracy: 0.9737
Confusion Matrix: [[70 1] [ 2 41]]
Increasing the number of estimators to 50 slightly lowers the accuracy (from 0.9825 to 0.9737, i.e. one additional misclassification), although not significantly. The model still performs well with very few misclassifications.

Boosting with 100 estimators:

Accuracy: 0.9737
Confusion Matrix: [[70 1] [ 2 41]]
Using 100 estimators, the accuracy remains consistent and the model continues to perform well on the dataset.

Boosting with 200 estimators:
Accuracy: 0.9737
Confusion Matrix: [[70 1] [ 2 41]]
Increasing the number of estimators to 200 does not further improve the accuracy, suggesting that the model may have already reached its optimal performance.

### Dummy classifier

In [None]:
# Create a dummy classifier
dummy_clf = DummyClassifier(strategy="most_frequent")
dummy_clf.fit(X_train, y_train)
dummy_pred = dummy_clf.predict(X_test)
dummy_accuracy = accuracy_score(y_test, dummy_pred)
print(f"Dummy classifier accuracy: {dummy_accuracy}")


In conclusion, the boosting classifier performed the best (98.25% accuracy with 10 estimators), followed closely by the bagging classifier with SVC as the base estimator (98%). The GradientBoostingClassifier-based bagging classifier also performed well but slightly lower than the previous two (97%). The RandomForestClassifier-based bagging classifier showed the lowest performance of the bagging variants (94%) but still achieved respectable results.
 

This assignment was completed with the help of Fatemeh Rakhshanifar.