In [1]:
#Imports & Data Load 
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, PowerTransformer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
import mlflow
import mlflow.sklearn


In [2]:
# Deteck Skew + Highly-Correlated Pairs 

# Load dataset
data = load_breast_cancer(as_frame=True)
df   = data.frame.copy()
df['target'] = data.target

# --- Skewness ---
skew = df.drop(columns='target').skew().sort_values(ascending=False)
skewed_feats = skew[skew.abs() > 1.0].index.tolist()      # |skew| > 1 = high skew
print("Skewed features:", skewed_feats)

# --- Correlation ---
corr = df.drop(columns='target').corr().abs()
upper = corr.where(np.triu(np.ones(corr.shape), k=1).astype(bool))
high_corr_pairs = [(col, row) for col in upper.columns
                   for row in upper.index
                   if (upper.loc[row, col] > 0.9)]
print("Highly-correlated (>0.9) pairs:", high_corr_pairs)


Skewed features: ['area error', 'concavity error', 'fractal dimension error', 'perimeter error', 'radius error', 'smoothness error', 'symmetry error', 'compactness error', 'worst area', 'worst fractal dimension', 'texture error', 'mean area', 'worst compactness', 'concave points error', 'worst symmetry', 'mean concavity', 'mean fractal dimension', 'mean compactness', 'mean concave points', 'worst concavity', 'worst perimeter', 'worst radius']
Highly-correlated (>0.9) pairs: [('mean perimeter', 'mean radius'), ('mean area', 'mean radius'), ('mean area', 'mean perimeter'), ('mean concave points', 'mean concavity'), ('perimeter error', 'radius error'), ('area error', 'radius error'), ('area error', 'perimeter error'), ('worst radius', 'mean radius'), ('worst radius', 'mean perimeter'), ('worst radius', 'mean area'), ('worst texture', 'mean texture'), ('worst perimeter', 'mean radius'), ('worst perimeter', 'mean perimeter'), ('worst perimeter', 'mean area'), ('worst perimeter', 'worst radi

In [3]:
# Define Pre-processor 

numeric = df.drop(columns='target').columns.tolist()

# Pipeline: (1) PowerTransformer for skewed columns
#           (2) StandardScaler for all numeric
pre = ColumnTransformer(
    transformers=[
        ("skew_fix", PowerTransformer(method="yeo-johnson"),
         skewed_feats),
        ("scale", StandardScaler(), numeric)
    ],
    remainder="passthrough"
)


In [4]:
# Training + MLflow Run
X = df.drop(columns='target')
y = df['target']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Model
rf = RandomForestClassifier(
    n_estimators=400, max_depth=None, random_state=42)

pipe = Pipeline(steps=[("pre", pre), ("model", rf)])

mlflow.sklearn.autolog()

with mlflow.start_run(run_name="feature_engineered_rf"):
    pipe.fit(X_train, y_train)
    proba = pipe.predict_proba(X_test)[:, 1]
    auc   = roc_auc_score(y_test, proba)
    mlflow.log_metric("roc_auc", auc)

print("New AUC:", auc)


New AUC: 0.9930555555555556


In [6]:
# Remove one column from each high-corr pair
cols_to_drop = {pair[1] for pair in high_corr_pairs}
X_reduced    = X.drop(columns=list(cols_to_drop))

X_train_r, X_test_r, y_train_r, y_test_r = train_test_split(
    X_reduced, y, test_size=0.2, random_state=42, stratify=y
)

# Adjust skewed_feats and numeric for reduced columns
skewed_feats_r = [col for col in skewed_feats if col in X_reduced.columns]
numeric_r = [col for col in numeric if col in X_reduced.columns]

pre_r = ColumnTransformer(
    transformers=[
        ("skew_fix", PowerTransformer(method="yeo-johnson"), skewed_feats_r),
        ("scale", StandardScaler(), numeric_r)
    ],
    remainder="passthrough"
)

pipe_r = Pipeline(steps=[
    ("pre", pre_r),
    ("model", rf)
])

with mlflow.start_run(run_name="reduced_features_rf"):
    pipe_r.fit(X_train_r, y_train_r)
    proba_r = pipe_r.predict_proba(X_test_r)[:, 1]
    auc_r   = roc_auc_score(y_test_r, proba_r)
    mlflow.log_metric("roc_auc", auc_r)

print("Reduced-feature AUC:", auc_r)


Reduced-feature AUC: 0.9947089947089948
