In [14]:
#Load the libraries
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import FunctionTransformer
import numpy as np


# Data Loading
Load the dataset and show the first few rows.

In [15]:
# Read the tab-delimited marketing campaign file and preview the first rows
file_path = './marketing_campaign.csv'
df = pd.read_csv(
    file_path,
    delimiter='\t',
)
df.head()


Unnamed: 0,ID,Year_Birth,Education,Marital_Status,Income,Kidhome,Teenhome,Dt_Customer,Recency,MntWines,...,NumWebVisitsMonth,AcceptedCmp3,AcceptedCmp4,AcceptedCmp5,AcceptedCmp1,AcceptedCmp2,Complain,Z_CostContact,Z_Revenue,Response
0,5524,1957,Graduation,Single,58138.0,0,0,04-09-2012,58,635,...,7,0,0,0,0,0,0,3,11,1
1,2174,1954,Graduation,Single,46344.0,1,1,08-03-2014,38,11,...,5,0,0,0,0,0,0,3,11,0
2,4141,1965,Graduation,Together,71613.0,0,0,21-08-2013,26,426,...,4,0,0,0,0,0,0,3,11,0
3,6182,1984,Graduation,Together,26646.0,1,0,10-02-2014,26,11,...,6,0,0,0,0,0,0,3,11,0
4,5324,1981,PhD,Married,58293.0,1,0,19-01-2014,94,173,...,5,0,0,0,0,0,0,3,11,0


# Create a Preprocessing Pipeline
Chain the feature-engineering and data-preprocessing steps together in a single pipeline object so that every step is applied in the same order each time. Fitting the pipeline on training data only, and then reusing it to transform new data, is what prevents data leakage.

In [16]:
# Define columns for processing

# Raw columns removed once the engineered features derived from them exist
columns_to_drop = ['ID', 'Year_Birth', 'Kidhome', 'Teenhome', 'Dt_Customer', 'Marital_Status']
# Columns left out of scaling/encoding; remainder='passthrough' carries them through unchanged
columns_to_exclude = ['Recency', 'AcceptedCmp3','AcceptedCmp4','AcceptedCmp5','AcceptedCmp1','AcceptedCmp2','Complain', 'Response']
# Columns that are median-imputed and standardised (raw + engineered)
numerical_columns = ['Income','MntWines','MntFruits','MntMeatProducts','MntFishProducts','MntSweetProducts','MntGoldProds','NumDealsPurchases','NumWebPurchases','NumCatalogPurchases','NumStorePurchases','NumWebVisitsMonth','Z_CostContact','Z_Revenue', 'Age', 'Total_Children', 'Total_Household_Size', 'Total_Spending', 'Loyalty_Months', 'Campaign_Response_Rate', 'Shopping_Frequency']
# Columns that are one-hot encoded
categorical_columns = ['Education', 'Marital_Status_Grouped']

# Single fixed "as of" date for every time-based feature. Using the wall clock
# ('today') would make Loyalty_Months change with the day the notebook is run and
# drift between fit time and any later transform call.
REFERENCE_DATE = pd.Timestamp('2024-01-01')

# Raw Marital_Status value -> coarse household group
MARITAL_STATUS_GROUPS = {
    'Single': 'Single_Household',
    'Divorced': 'Single_Household',
    'Widow': 'Single_Household',
    'Alone': 'Single_Household',
    'Together': 'Couples',
    'Married': 'Couples',
    'Absurd': 'Unknown',
    'YOLO': 'Unknown',
}

SPENDING_COLUMNS = ['MntWines', 'MntFruits', 'MntMeatProducts',
                    'MntFishProducts', 'MntSweetProducts', 'MntGoldProds']
CAMPAIGN_COLUMNS = ['AcceptedCmp1', 'AcceptedCmp2', 'AcceptedCmp3', 'AcceptedCmp4', 'AcceptedCmp5']
PURCHASE_COLUMNS = ['NumDealsPurchases', 'NumWebPurchases', 'NumCatalogPurchases', 'NumStorePurchases']


def add_base_features(frame):
    """Return a copy of ``frame`` with Age, Total_Children and Marital_Status_Grouped added.

    Marital_Status values that are missing from MARITAL_STATUS_GROUPS (including
    NaN) are labelled 'Unknown' instead of being left as NaN.
    """
    return frame.assign(
        Age=REFERENCE_DATE.year - frame['Year_Birth'],
        Total_Children=frame['Kidhome'] + frame['Teenhome'],
        Marital_Status_Grouped=frame['Marital_Status'].map(MARITAL_STATUS_GROUPS).fillna('Unknown'),
    )


def add_derived_features(frame):
    """Add the second-stage features and drop the raw columns in ``columns_to_drop``.

    Expects the output of ``add_base_features`` because Total_Household_Size
    reads the Total_Children column created there.
    """
    customer_since = pd.to_datetime(frame['Dt_Customer'], format='%d-%m-%Y')
    is_couple = frame['Marital_Status'].isin(['Together', 'Married'])
    return frame.assign(
        # one adult, plus a partner for couples, plus all children
        Total_Household_Size=is_couple.astype(int) + 1 + frame['Total_Children'],
        Total_Spending=frame[SPENDING_COLUMNS].sum(axis=1),
        # whole 30-day periods between enrolment and the reference date
        Loyalty_Months=(REFERENCE_DATE - customer_since).dt.days // 30,
        # fraction of the five earlier campaigns that were accepted
        Campaign_Response_Rate=frame[CAMPAIGN_COLUMNS].sum(axis=1) / len(CAMPAIGN_COLUMNS),
        Shopping_Frequency=frame[PURCHASE_COLUMNS].sum(axis=1),
    ).drop(columns=columns_to_drop)


# Named functions are used instead of lambdas because a FunctionTransformer
# built around a lambda cannot be pickled.
pipeline = Pipeline(steps=[
    ('feature_engineering', FunctionTransformer(add_base_features, validate=False)),
    ('feature_engineering2', FunctionTransformer(add_derived_features, validate=False)),
    ('preprocessing', ColumnTransformer(
        transformers=[
            ('num', Pipeline([
                ('imputer', SimpleImputer(strategy='median')),  # Fill missing numerical values with median
                ('scaler', StandardScaler()),  # Scale numerical columns
            ]), numerical_columns),
            ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_columns),  # One hot encoding for cat columns
        ],
        remainder='passthrough',
        sparse_threshold=0,  # always return a dense array so the result can be wrapped in a DataFrame
    )),
])

pipeline


In [18]:
df_featured = pipeline.fit_transform(df)

# A ColumnTransformer may return a scipy sparse matrix; densify it before
# wrapping it in a DataFrame.
if hasattr(df_featured, 'toarray'):
    df_featured = df_featured.toarray()

# Get column names
transformer = pipeline.named_steps['preprocessing']
numerical_names = numerical_columns
categorical_encoded_names = transformer.named_transformers_['cat'].get_feature_names_out(categorical_columns)

# Passthrough columns are appended after the listed transformers, in the order
# they appear in the transformer's input. Read that order from the fitted
# transformer instead of assuming it matches a hand-written list, so labels
# cannot silently end up on the wrong columns.
transformed_inputs = set(numerical_columns) | set(categorical_columns)
passthrough_names = [
    name for name in transformer.feature_names_in_ if name not in transformed_inputs
]

# Combine all names
final_columns = list(numerical_names) + list(categorical_encoded_names) + passthrough_names

processed_df = pd.DataFrame(df_featured, columns=final_columns, index=df.index)

processed_df.to_csv('marketing_campaign_processed_with_pipeline.csv', index=False)  # Save to CSV

display(processed_df)


Unnamed: 0,Income,MntWines,MntFruits,MntMeatProducts,MntFishProducts,MntSweetProducts,MntGoldProds,NumDealsPurchases,NumWebPurchases,NumCatalogPurchases,...,Marital_Status_Grouped_Single_Household,Marital_Status_Grouped_Unknown,Recency,AcceptedCmp3,AcceptedCmp4,AcceptedCmp5,AcceptedCmp1,AcceptedCmp2,Complain,Response
0,0.235696,0.983781,1.551577,1.679702,2.462147,1.476500,0.843207,0.349414,1.409304,2.510890,...,1.0,0.0,58.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,-0.235454,-0.870479,-0.636301,-0.713225,-0.650449,-0.631503,-0.729006,-0.168236,-1.110409,-0.568720,...,1.0,0.0,38.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.773999,0.362723,0.570804,-0.177032,1.345274,-0.146905,-0.038766,-0.685887,1.409304,-0.226541,...,0.0,0.0,26.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,-1.022355,-0.870479,-0.560857,-0.651187,-0.503974,-0.583043,-0.748179,-0.168236,-0.750450,-0.910898,...,0.0,0.0,26.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.241888,-0.389085,0.419916,-0.216914,0.155164,-0.001525,-0.556446,1.384715,0.329427,0.115638,...,0.0,0.0,94.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2235,0.358936,1.203678,0.419916,0.066692,0.081926,2.203398,3.891766,-0.168236,1.769263,0.115638,...,0.0,0.0,46.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2236,0.470432,0.303291,-0.661449,-0.606873,-0.687068,-0.655733,-0.690659,2.420015,1.409304,-0.226541,...,0.0,0.0,56.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2237,0.189476,1.795020,0.545656,0.221789,-0.101168,-0.364974,-0.383886,-0.685887,-0.750450,0.115638,...,1.0,0.0,91.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2238,0.679401,0.368666,0.092992,0.208495,0.777683,0.071165,0.325527,-0.168236,0.689386,0.799996,...,0.0,0.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
