In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures, MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

Let's load the first dataset as an example:

In [None]:
from sklearn.datasets import load_iris, load_breast_cancer, load_diabetes, load_wine
from sklearn.model_selection import train_test_split

# Breast cancer is the running example; load_iris() or load_wine() are drop-in
# replacements for trying the same workflow on another classification dataset.
data = load_breast_cancer()

# Keep the dataset's own feature names as column labels so that downstream
# tables and generated feature names stay readable instead of 0..n-1.
features = pd.DataFrame(data['data'], columns=data['feature_names'])
X_train, X_test, y_train, y_test = train_test_split(features, data['target'], random_state=42)

# Preview only the first rows rather than dumping the whole training frame.
X_train.head()

Let's create an automatic feature engineering class:

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin

class AutoFeatureEngineer(BaseEstimator, TransformerMixin):
    """Dtype-driven preprocessing: impute, scale and expand features.

    Numeric columns are median-imputed, min-max scaled and expanded with
    polynomial terms; object/category columns are imputed with their most
    frequent value and one-hot encoded. Columns of any other dtype are not
    passed to either branch.

    Parameters
    ----------
    degree : int, default=2
        Degree handed to ``PolynomialFeatures`` for the numeric branch.

    Attributes
    ----------
    numerical_cols, categorical_cols
        Column labels routed to each branch; populated by ``fit``.
    preprocessor : ColumnTransformer or None
        The fitted transformer; ``None`` until ``fit`` has been called.
    """

    def __init__(self, degree=2):
        self.degree = degree
        self.numerical_cols = []
        self.categorical_cols = []
        self.preprocessor = None

    @staticmethod
    def _as_frame(X):
        """Return ``X`` unchanged if it is a DataFrame, otherwise wrap it in one."""
        return X if isinstance(X, pd.DataFrame) else pd.DataFrame(X)

    def fit(self, X, y=None):
        """Detect column types in ``X`` and fit the matching preprocessing.

        ``X`` may be a DataFrame or anything ``pd.DataFrame`` can wrap
        (e.g. a 2-D array). ``y`` is accepted for API compatibility and ignored.
        Returns ``self``.
        """
        X = self._as_frame(X)

        # Select by dtype family instead of exact width so float32/int32 (and
        # category) columns are not silently dropped. Timedeltas are excluded
        # explicitly because pandas counts them under "number".
        self.numerical_cols = X.select_dtypes(include=["number"], exclude=["timedelta"]).columns
        self.categorical_cols = X.select_dtypes(include=["object", "category"]).columns

        # Numeric branch: impute -> scale -> polynomial expansion.
        numerical_transformer = Pipeline(steps=[
            ("imputer", SimpleImputer(strategy="median")),
            ("scaler", MinMaxScaler()),
            ("poly", PolynomialFeatures(degree=self.degree)),
        ])

        # Categorical branch: impute -> one-hot; categories unseen during fit
        # are ignored at transform time rather than raising.
        categorical_transformer = Pipeline(steps=[
            ("imputer", SimpleImputer(strategy="most_frequent")),
            ("onehot", OneHotEncoder(handle_unknown="ignore")),
        ])

        # Route each group of columns to its branch and fit everything at once.
        self.preprocessor = ColumnTransformer(transformers=[
            ("num", numerical_transformer, self.numerical_cols),
            ("cat", categorical_transformer, self.categorical_cols),
        ])
        self.preprocessor.fit(X)

        return self

    def transform(self, X):
        """Apply the fitted preprocessing to ``X`` and return the result."""
        return self.preprocessor.transform(self._as_frame(X))

    def get_feature_names_out(self, input_features=None):
        """Return the output feature names reported by the fitted preprocessor."""
        return self.preprocessor.get_feature_names_out(input_features)


Let's create an automatic feature selection class:

In [None]:
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_selection import SelectKBest, chi2, SelectFromModel
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

class FeatureSelector(BaseEstimator, TransformerMixin):
    """Choose between univariate (chi2) and model-based feature selection.

    Two candidates are compared: ``SelectKBest(chi2)`` keeping roughly 90% of
    the features, and ``SelectFromModel`` driven by a single-tree random
    forest. Each candidate is scored on a held-out quarter of the data, and the
    winner is refitted on all of it.

    Parameters
    ----------
    metric : callable, default=accuracy_score
        Called as ``metric(y_true, y_pred)``; a higher value is treated as
        better.

    Notes
    -----
    ``chi2`` requires non-negative feature values. ``fit`` needs enough samples
    to form a non-empty hold-out split.
    """

    def __init__(self, metric=accuracy_score):
        self.metric = metric

    def _holdout_score(self, selector, X_fit, y_fit, X_val, y_val):
        """Fit ``selector`` and a probe forest on the fit split; score on the validation split."""
        selector.fit(X_fit, y_fit)
        probe = RandomForestClassifier(n_estimators=1, random_state=0)
        probe.fit(selector.transform(X_fit), y_fit)
        return self.metric(y_val, probe.predict(selector.transform(X_val)))

    def fit(self, X, y):
        """Pick the better-scoring selector, fit it on ``(X, y)`` and return ``self``."""
        n_features = X.shape[1]

        # Keep ~90% of the features, but never fewer than one: int(0.9 * 1)
        # would otherwise ask SelectKBest for zero features.
        skb = SelectKBest(chi2, k=max(1, int(0.9 * n_features)))
        sfm = SelectFromModel(RandomForestClassifier(n_estimators=1, random_state=0))

        # Compare the candidates on data they were NOT fitted on; scoring a
        # tree on its own training rows is optimistic and barely separates them.
        X_fit, X_val, y_fit, y_val = train_test_split(X, y, test_size=0.25, random_state=0)
        acc_skb = self._holdout_score(skb, X_fit, y_fit, X_val, y_val)
        acc_sfm = self._holdout_score(sfm, X_fit, y_fit, X_val, y_val)

        # Ties go to the simpler univariate selector.
        if acc_skb >= acc_sfm:
            print("Selecting features according to the k highest scores.")
            self.transformer = skb
        else:
            print("selecting features based on importance weights.")
            self.transformer = sfm

        # The candidates were only fitted on the sub-split above; refit the
        # winner on the full data so the final selection uses every sample.
        self.transformer.fit(X, y)
        return self

    def transform(self, X):
        """Reduce ``X`` to the features kept by the selector chosen in ``fit``."""
        return self.transformer.transform(X)

We perform automatic feature engineering on both the training and test sets using our `AutoFeatureEngineer` class (fitted on the training set only, then applied to both):

In [None]:
# Learn the preprocessing from the training split only, then apply that same
# fitted transformation to both splits.
auto_fe = AutoFeatureEngineer()
X_train_transformed = auto_fe.fit(X_train).transform(X_train)
X_test_transformed = auto_fe.transform(X_test)

# Wrap both transformed matrices in DataFrames labelled with the generated
# feature names so they can be inspected.
X_train_transformed_df, X_test_transformed_df = (
    pd.DataFrame(matrix, columns=auto_fe.preprocessor.get_feature_names_out())
    for matrix in (X_train_transformed, X_test_transformed)
)
X_train_transformed_df

Train a `RandomForestClassifier` on the baseline data, before performing any feature engineering or selection:

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Baseline: a single-tree forest trained on the raw, untransformed features.
forest = RandomForestClassifier(n_estimators=1, random_state=0).fit(X_train, y_train)
pred = forest.predict(X_test)

# Accuracy is the fraction of test predictions that match the true labels.
score = (pred == y_test).mean()
print("The acuuracy of the initial model is:", score)

Now let's train the same classifier on the training set after applying the feature engineering transformation:

In [None]:
# Train a fresh estimator with the same hyperparameters instead of refitting
# the baseline `forest` in place: the baseline model stays intact, so earlier
# cells still work when re-run out of order.
forest_fe = RandomForestClassifier(n_estimators=1, random_state=0)
forest_fe.fit(X_train_transformed, y_train)
pred_fe = forest_fe.predict(X_test_transformed)

# Accuracy is the fraction of test predictions that match the true labels.
score = (pred_fe == y_test).mean()
print("The accuracy of the model after arithmetic fe is:", score)

Let's see the performance of the classifier on the baseline data:

In [None]:
# Classification report for the baseline predictions on the test set.
report = classification_report(y_true=y_test, y_pred=pred)
print(report)

And here are the results for the transformed data after the feature engineering operations:

In [None]:
# Classification report for the predictions made on the engineered features.
report = classification_report(y_true=y_test, y_pred=pred_fe)
print(report)

In [None]:
# Confusion matrix for the initial model.
cm_arr = confusion_matrix(y_test, pred)

# confusion_matrix puts true labels on rows and predictions on columns; label
# the axes so the figure can be read without consulting the API docs.
fig, ax = plt.subplots()
sns.heatmap(cm_arr, cmap='YlGnBu', annot=True, fmt="d", ax=ax)
ax.set_title('Confusion Matrix for the initial model')
ax.set_xlabel('Predicted label')
ax.set_ylabel('True label')
plt.show()

In [None]:
# Confusion matrix for the model trained on the engineered features.
cm_arr = confusion_matrix(y_test, pred_fe)

# confusion_matrix puts true labels on rows and predictions on columns; label
# the axes so the figure can be read without consulting the API docs.
fig, ax = plt.subplots()
sns.heatmap(cm_arr, cmap='YlGnBu', annot=True, fmt="d", ax=ax)
ax.set_title('Confusion Matrix for the improved model')
ax.set_xlabel('Predicted label')
ax.set_ylabel('True label')
plt.show()

Let's use the `FeatureSelector` class to perform feature selection:

In [None]:
# Fit the selector on the engineered training features only, then apply the
# same fitted selection to both splits.
selector = FeatureSelector().fit(X_train_transformed, y_train)
X_train_selected = selector.transform(X_train_transformed)
X_test_selected = selector.transform(X_test_transformed)

In [None]:
# Train a fresh estimator with the same hyperparameters instead of refitting
# the shared `forest` object in place, so models fitted in earlier cells are
# not overwritten by this one.
forest_fs = RandomForestClassifier(n_estimators=1, random_state=0)
forest_fs.fit(X_train_selected, y_train)
pred_fe_fs = forest_fs.predict(X_test_selected)

# Accuracy is the fraction of test predictions that match the true labels.
score = (pred_fe_fs == y_test).mean()
print("The accuracy of the model feature selection:", score)