<a href="https://colab.research.google.com/github/poojavjpy/Data-Analytics/blob/main/enhance_model_performance.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Create and select features to enhance model performance for analytics tasks.**bold text**

In [1]:
"""
Feature engineering and feature selection demo (classification)
- Uses sklearn breast cancer dataset
- Creates new features (polynomial interactions, log transforms, bins)
- Demonstrates preprocessing pipeline (impute, scale)
- Compares model performance (cross-validation) for:
  * baseline (all features)
  * SelectKBest (f_classif)
  * Recursive Feature Elimination (RFE)
  * Model-based selection (RandomForest importance -> SelectFromModel)

Run: python feature_engineering_and_selection.py
"""

import numpy as np
import pandas as pd

from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, FunctionTransformer, KBinsDiscretizer, PolynomialFeatures
from sklearn.feature_selection import SelectKBest, f_classif, RFE, SelectFromModel
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier


def create_engineered_features(df):
    """Add a few engineered features to illustrate feature creation."""
    # 1. Interaction term: mean radius * mean texture
    df['radius_texture_interaction'] = df['mean radius'] * df['mean texture']

    # 2. Polynomial feature (square) of mean perimeter
    df['perimeter_sq'] = df['mean perimeter'] ** 2

    # 3. Log transform (safe) of mean area
    df['log_area'] = np.log1p(df['mean area'])

    # 4. Binned categorical from mean smoothness (3 bins)
    kb = KBinsDiscretizer(n_bins=3, encode='ordinal', strategy='quantile')
    df['smoothness_bin'] = kb.fit_transform(df[['mean smoothness']]).astype(int)

    # 5. Ratio feature (concavity / concave points) with safe-guard
    eps = 1e-6
    df['concavity_over_concave_points'] = df['mean concave points'] / (df['mean concavity'] + eps)

    return df


def build_preprocessor(numeric_features):
    numeric_transformer = Pipeline(steps=[
        ('impute', SimpleImputer(strategy='median')),
        ('scale', StandardScaler())
    ])

    preprocessor = ColumnTransformer(transformers=[
        ('num', numeric_transformer, numeric_features)
    ], remainder='drop')

    return preprocessor


def evaluate_pipeline(pipeline, X, y, cv):
    scores = cross_val_score(pipeline, X, y, cv=cv, scoring='accuracy', n_jobs=-1)
    return scores.mean(), scores.std()


def main():
    # Load dataset
    data = load_breast_cancer(as_frame=True)
    X = data.frame.drop(columns=['target'])
    y = data.target

    # Start from original features
    df = X.copy()

    # Create engineered features
    df = create_engineered_features(df)

    # Feature list
    all_features = list(df.columns)

    # Preprocessor
    preprocessor = build_preprocessor(all_features)

    # Classifier used for evaluation (stable baseline)
    clf = RandomForestClassifier(n_estimators=200, random_state=42)

    # Baseline pipeline (all features)
    pipeline_baseline = Pipeline(steps=[('pre', preprocessor), ('clf', clf)])

    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    print('Evaluating baseline (all features)...')
    baseline_mean, baseline_std = evaluate_pipeline(pipeline_baseline, df, y, cv)
    print(f'Baseline accuracy: {baseline_mean:.4f} +/- {baseline_std:.4f}\n')

    # 1) Filter method: SelectKBest (ANOVA f-test)
    k = 12
    skb = SelectKBest(score_func=f_classif, k=k)
    pipeline_skb = Pipeline(steps=[('pre', preprocessor), ('skb', skb), ('clf', clf)])
    print(f'Evaluating SelectKBest (f_classif), k={k}...')
    skb_mean, skb_std = evaluate_pipeline(pipeline_skb, df, y, cv)
    print(f'SelectKBest accuracy: {skb_mean:.4f} +/- {skb_std:.4f}\n')

    # Fit skb to get selected feature names
    preprocessed = preprocessor.fit_transform(df)
    skb.fit(preprocessed, y)
    # Note: ColumnTransformer drops column names; we'll compute scores using preprocessor on dataframe columns
    # Simpler: compute scores using raw df columns (since our preprocessor only scales)
    skb_feature_indices = skb.get_support(indices=True)
    selected_by_skb = [all_features[i] for i in skb_feature_indices]
    print('SelectKBest selected features:', selected_by_skb, '\n')

    # 2) Wrapper method: RFE with LogisticRegression
    base_lr = LogisticRegression(solver='liblinear', max_iter=200)
    rfe = RFE(estimator=base_lr, n_features_to_select=10, step=1)
    pipeline_rfe = Pipeline(steps=[('pre', preprocessor), ('rfe', rfe), ('clf', clf)])
    print('Evaluating RFE (LogisticRegression) selecting 10 features...')
    rfe_mean, rfe_std = evaluate_pipeline(pipeline_rfe, df, y, cv)
    print(f'RFE accuracy: {rfe_mean:.4f} +/- {rfe_std:.4f}\n')

    # Fit RFE on full data to get selected features
    # Because RFE expects numeric array, transform first
    preproc_X = preprocessor.fit_transform(df)
    rfe.fit(preproc_X, y)
    rfe_support = rfe.get_support()
    selected_by_rfe = [all_features[i] for i, keep in enumerate(rfe_support) if keep]
    print('RFE selected features:', selected_by_rfe, '\n')

    # 3) Embedded / model-based selection: RandomForest importance -> SelectFromModel
    selector = SelectFromModel(estimator=RandomForestClassifier(n_estimators=300, random_state=0), threshold='median')
    pipeline_sfm = Pipeline(steps=[('pre', preprocessor), ('sfm', selector), ('clf', clf)])
    print('Evaluating SelectFromModel (RandomForest importance, threshold=median) ...')
    sfm_mean, sfm_std = evaluate_pipeline(pipeline_sfm, df, y, cv)
    print(f'SelectFromModel accuracy: {sfm_mean:.4f} +/- {sfm_std:.4f}\n')

    # Fit selector to get selected features names
    selector.fit(preproc_X, y)
    sfm_support = selector.get_support()
    selected_by_sfm = [all_features[i] for i, keep in enumerate(sfm_support) if keep]
    print('SelectFromModel selected features:', selected_by_sfm, '\n')

    # Summary table
    results = pd.DataFrame({
        'method': ['baseline_all', 'select_kbest', 'rfe', 'select_from_model'],
        'mean_accuracy': [baseline_mean, skb_mean, rfe_mean, sfm_mean],
        'std_accuracy': [baseline_std, skb_std, rfe_std, sfm_std]
    })

    print('\nSummary results:')
    print(results.sort_values('mean_accuracy', ascending=False).to_string(index=False))

    # Notes for further improvements
    print('\nNotes:')
    print('- Try recursive selection + hyperparameter tuning of the estimator for best results.')
    print('- Try different feature creation ideas (domain-specific) and interaction terms.')
    print('- For high-dimensional data, consider L1-based selection or dimensionality reduction (PCA).')


if __name__ == '__main__':
    main()


Evaluating baseline (all features)...
Baseline accuracy: 0.9649 +/- 0.0055

Evaluating SelectKBest (f_classif), k=12...
SelectKBest accuracy: 0.9543 +/- 0.0151

SelectKBest selected features: ['mean radius', 'mean perimeter', 'mean area', 'mean concavity', 'mean concave points', 'worst radius', 'worst perimeter', 'worst area', 'worst concave points', 'radius_texture_interaction', 'perimeter_sq', 'log_area'] 

Evaluating RFE (LogisticRegression) selecting 10 features...
RFE accuracy: 0.9508 +/- 0.0180

RFE selected features: ['mean concave points', 'radius error', 'area error', 'compactness error', 'worst radius', 'worst texture', 'worst perimeter', 'worst area', 'worst concavity', 'worst concave points'] 

Evaluating SelectFromModel (RandomForest importance, threshold=median) ...
SelectFromModel accuracy: 0.9561 +/- 0.0200

SelectFromModel selected features: ['mean radius', 'mean perimeter', 'mean area', 'mean compactness', 'mean concavity', 'mean concave points', 'radius error', 'area