In [14]:
# Assignment 8
# Name: Saurabh Isane
# Roll No: 122B1F035

In [1]:
# Imports
import pandas as pd
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, RobustScaler, PolynomialFeatures
from sklearn.feature_selection import VarianceThreshold
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score
import seaborn as sns
import matplotlib.pyplot as plt


In [2]:

# Load data
data = load_breast_cancer()
X = pd.DataFrame(data.data, columns=data.feature_names)
y = pd.Series(data.target)


In [3]:

# 1. Remove low variance features
vt = VarianceThreshold(threshold=0.01)
X = pd.DataFrame(vt.fit_transform(X), columns=X.columns[vt.get_support()])



In [4]:

# 2. Remove multicollinear features
corr_matrix = X.corr().abs()
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
to_drop = [column for column in upper.columns if any(upper[column] > 0.95)]
X.drop(columns=to_drop, inplace=True)


In [5]:

# 3. Polynomial features (only interactions to avoid explosion)
poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)
X_poly = poly.fit_transform(X)


In [6]:

# 4. Scale features using RobustScaler (better with outliers)
scaler = RobustScaler()
X_scaled = scaler.fit_transform(X_poly)


In [7]:

# 5. PCA (retain 95% variance)
pca = PCA(n_components=0.95, random_state=42)
X_reduced = pca.fit_transform(X_scaled)


In [8]:

# Split data
X_train, X_test, y_train, y_test = train_test_split(X_reduced, y, test_size=0.2, random_state=42, stratify=y)


In [9]:

# Define base models
rf = RandomForestClassifier(n_estimators=200, max_depth=10, random_state=42)
gb = GradientBoostingClassifier(n_estimators=150, learning_rate=0.1, random_state=42)
svc = SVC(probability=True, kernel='rbf', C=1.0, gamma='scale')


In [10]:

# Combine with stacking
stack = StackingClassifier(
    estimators=[('rf', rf), ('gb', gb), ('svc', svc)],
    final_estimator=LogisticRegression(),
    passthrough=True
)


In [11]:

# Train model
stack.fit(X_train, y_train)


In [12]:

# Predict
y_pred = stack.predict(X_test)
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Accuracy:", accuracy_score(y_test, y_pred))


Classification Report:
               precision    recall  f1-score   support

           0       0.97      0.93      0.95        42
           1       0.96      0.99      0.97        72

    accuracy                           0.96       114
   macro avg       0.97      0.96      0.96       114
weighted avg       0.97      0.96      0.96       114

Accuracy: 0.9649122807017544


In [13]:

# Optional: Cross-validation score
cv_score = cross_val_score(stack, X_reduced, y, cv=5).mean()
print("Cross-Validated Accuracy:", round(cv_score, 4))

Cross-Validated Accuracy: 0.9508
