# In [1]:
# feature_engineering.py
# Feature engineering with polynomial and interaction terms

import pandas as pd
from sklearn.preprocessing import PolynomialFeatures, StandardScaler

def feature_engineering(X_train, X_test, num_cols):
    """Append scaled, per-column polynomial features to the train/test frames.

    Every name in ``num_cols`` that is a column of ``X_train`` is expanded on
    its own (one input column at a time) into two feature groups, appended in
    this order:

    * ``{col}_interact_{i}`` -- ``PolynomialFeatures(degree=3,
      include_bias=False, interaction_only=True)`` followed by
      ``StandardScaler``.
      NOTE(review): with a single input column there are no cross terms to
      build, so this group is expected to hold only the standardized column
      itself -- confirm whether cross-column interactions were intended.
    * ``{col}_poly_{i}`` -- ``PolynomialFeatures(degree=3,
      include_bias=False, interaction_only=False)`` followed by
      ``StandardScaler``.

    All transformers are fitted on ``X_train`` only and then applied to
    ``X_test``.

    Args:
        X_train: Training DataFrame; its columns decide which names in
            ``num_cols`` are processed.
        X_test: Test DataFrame; must contain every processed column.
        num_cols: Iterable of column names to expand. Names that are not
            columns of ``X_train`` are skipped.

    Returns:
        A ``(X_train, X_test)`` tuple of new DataFrames holding the original
        columns followed by the generated ones. Row order and index labels of
        the inputs are preserved.

    Raises:
        KeyError: If a processed column is present in ``X_train`` but missing
            from ``X_test``.
    """
    # (name suffix, interaction_only flag) for the two feature groups built
    # per column, in the order their columns are appended.
    feature_groups = (("interact", True), ("poly", False))

    new_train_features = []
    new_test_features = []

    for col in num_cols:
        if col not in X_train.columns:
            continue

        for suffix, interaction_only in feature_groups:
            # Fresh transformers per group: nothing fitted on one column or
            # group can leak into another.
            expander = PolynomialFeatures(
                degree=3, include_bias=False, interaction_only=interaction_only
            )
            scaler = StandardScaler()

            # Fit on train only, then apply the fitted transform to test.
            train_values = scaler.fit_transform(
                expander.fit_transform(X_train[[col]])
            )
            test_values = scaler.transform(expander.transform(X_test[[col]]))

            names = [f"{col}_{suffix}_{i}" for i in range(train_values.shape[1])]

            # Reuse the source frame's index: concat(axis=1) aligns on index
            # labels, so a default RangeIndex here would misalign rows (and
            # introduce NaN rows) whenever the inputs are not 0..n-1 indexed.
            new_train_features.append(
                pd.DataFrame(train_values, columns=names, index=X_train.index)
            )
            new_test_features.append(
                pd.DataFrame(test_values, columns=names, index=X_test.index)
            )

    # Concatenate the new features to the original datasets
    X_train = pd.concat([X_train] + new_train_features, axis=1)
    X_test = pd.concat([X_test] + new_test_features, axis=1)

    return X_train, X_test

# Sample usage:
# X_train, X_test = feature_engineering(X_train, X_test, num_cols)