In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import BaggingClassifier, AdaBoostClassifier
from sklearn.dummy import DummyClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
import matplotlib.pyplot as plt
import seaborn as sns

### Data Exploration and Understanding: 

In [None]:
# Read the breast-cancer CSV (path is relative to the notebook's directory)
data = pd.read_csv(filepath_or_buffer=r'../Data/breast-cancer.csv')

# Preview the first five rows
data.head(n=5)

In [None]:
# Dataset dimensions, printed as a (rows, columns) tuple
n_rows, n_cols = data.shape
print((n_rows, n_cols))
# Column names, dtypes and non-null counts
data.info()

In [None]:
# Summary statistics for the numeric columns
print(data.describe())
# Total number of missing cells across the whole frame
print("Missing values:", data.isna().sum().sum())


In [None]:
# Visualization of the target variable distribution.
# The column is passed by keyword: in seaborn >= 0.12 the first positional
# parameter of countplot is `data`, not `x`, so a positional Series is no
# longer interpreted as the variable to count.
ax = sns.countplot(x='diagnosis', data=data)
ax.set_title('Distribution of diagnosis classes')
plt.show()

### Separate features and target variable

In [None]:
# Target: the diagnosis label column
y = data['diagnosis']
# Features: every remaining column except the row identifier and the label
X = data.drop(columns=['id', 'diagnosis'])


### Scale features

In [None]:
# Standardise every feature to zero mean and unit variance.
# NOTE(review): the scaler is fitted on the full dataset before the
# train/test split, so test-set statistics influence the transformation.
# Consider fitting on the training portion only (e.g. via a Pipeline).
scaler = StandardScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)

### Check the skewness of the features

In [None]:
# Check the skewness of the features.
# Wrap the scaled array in a DataFrame that keeps the original feature names
# and row index, so the skewness report (and everything built from Xscaled
# afterwards) is labelled by feature name rather than by column position.
Xscaled = pd.DataFrame(X_scaled, columns=X.columns, index=X.index)
Xscaled.skew()

### Modeling

In [None]:
# Hold out the default 25% of rows for testing.
# random_state makes the split (and therefore every score reported below)
# reproducible across runs; stratify=y keeps the class proportions of the
# diagnosis label the same in the train and test partitions.
X_train, X_test, y_train, y_test = train_test_split(
    Xscaled, y, random_state=0, stratify=y
)

Bagging (bootstrap aggregating) trains multiple models in parallel, each on a sample of the training data drawn with replacement, and combines their predictions by voting or averaging. This is useful for reducing variance.
It can help improve the stability and generalization of the models.

Boosting methods train models sequentially, with each new model focusing on the errors of the previous ones. They are known for their ability to reduce bias and improve the overall performance of the models.

A Dummy Classifier is a type of classifier which does not generate any insight about the data and classifies the given data using only simple rules.
This classifier is useful as a simple baseline to compare with other (real) classifiers.

### Bagging classifier


In [None]:
# Bagging classifier with different base estimators
estimators = [SVC(), RandomForestClassifier(), GradientBoostingClassifier()]

for estimator in estimators:
    # The base estimator is passed positionally: the keyword was renamed from
    # `base_estimator` to `estimator` in scikit-learn 1.2 and the old name was
    # removed in 1.4, whereas the first positional slot works on both.
    clf = BaggingClassifier(estimator, n_estimators=10, random_state=0)
    clf.fit(X_train, y_train)
    pred = clf.predict(X_test)
    accuracy = accuracy_score(y_test, pred)
    report = classification_report(y_test, pred)
    print(f"Bagging with {estimator.__class__.__name__} accuracy: {accuracy}")
    print(f"Classification Report:\n{report}\n")

### Boosting classifier

In [None]:
# Boosting classifier: AdaBoost with the number of boosting rounds chosen by
# 5-fold cross-validated grid search on the training set.
parameters = {'n_estimators': [50, 100, 200]}
# random_state seeds the boosted estimators so repeated runs give the same
# model and scores (consistent with the seeded bagging classifier).
clf = GridSearchCV(
    AdaBoostClassifier(random_state=0), parameters, cv=5, scoring='accuracy'
)
clf.fit(X_train, y_train)
# predict() uses the best estimator refitted on the full training set
boosting_pred = clf.predict(X_test)
boosting_accuracy = accuracy_score(y_test, boosting_pred)
print(f"Boosting accuracy: {boosting_accuracy}")
print(confusion_matrix(y_test, boosting_pred))
print(classification_report(y_test, boosting_pred))

### Dummy classifier

In [None]:
# Baseline model using the "most_frequent" strategy, for comparison with the
# real classifiers
dummy_clf = DummyClassifier(strategy="most_frequent").fit(X_train, y_train)
dummy_pred = dummy_clf.predict(X_test)
dummy_accuracy = accuracy_score(y_true=y_test, y_pred=dummy_pred)
print(f"Dummy classifier accuracy: {dummy_accuracy}")


In conclusion, the boosting classifier performed best, followed closely by the bagging classifier with SVC as the base estimator. The RandomForestClassifier-based bagging classifier also performed well, though slightly below the previous two. The GradientBoostingClassifier-based bagging classifier showed slightly lower performance but still achieved respectable results.
 

This assignment was completed with the help of Fatemeh Rakhshanifar.