# Phase 1: Data Preprocessing & Feature Engineering

In [1]:
import os
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, TargetEncoder

In [2]:
# The kernel's working directory is taken as the notebook folder; the project
# root is its parent directory.
notebook_dir = os.getcwd()
project_root = os.path.split(notebook_dir)[0]

In [3]:
# Raw campaign export lives at <project_root>/data/raw/marketing_campaign.csv.
raw_data_parts = ('data', 'raw', 'marketing_campaign.csv')
dataset_path = os.path.join(project_root, *raw_data_parts)

In [4]:
# Read the raw campaign file. It is tab-separated, which is the default
# separator of read_table; switch to read_csv(sep=...) if the export format changes.
df = pd.read_table(dataset_path)
df.head()

Unnamed: 0,ID,Year_Birth,Education,Marital_Status,Income,Kidhome,Teenhome,Dt_Customer,Recency,MntWines,...,NumWebVisitsMonth,AcceptedCmp3,AcceptedCmp4,AcceptedCmp5,AcceptedCmp1,AcceptedCmp2,Complain,Z_CostContact,Z_Revenue,Response
0,5524,1957,Graduation,Single,58138.0,0,0,04-09-2012,58,635,...,7,0,0,0,0,0,0,3,11,1
1,2174,1954,Graduation,Single,46344.0,1,1,08-03-2014,38,11,...,5,0,0,0,0,0,0,3,11,0
2,4141,1965,Graduation,Together,71613.0,0,0,21-08-2013,26,426,...,4,0,0,0,0,0,0,3,11,0
3,6182,1984,Graduation,Together,26646.0,1,0,10-02-2014,26,11,...,6,0,0,0,0,0,0,3,11,0
4,5324,1981,PhD,Married,58293.0,1,0,19-01-2014,94,173,...,5,0,0,0,0,0,0,3,11,0


## Data Cleaning

In [5]:
# Fill gaps in Income with the column median so no row is lost to a missing value.
median_income = df['Income'].median()
df['Income'] = df['Income'].fillna(median_income)

# Treat incomes above the 99th percentile as outliers; keep rows at or below it.
income_threshold = df['Income'].quantile(0.99)
within_threshold = df['Income'].le(income_threshold)

# Z_CostContact and Z_Revenue are constant columns (3 and 11 in the preview
# above), so they are dropped together with the outlier rows.
df = df.loc[within_threshold].drop(columns=['Z_CostContact', 'Z_Revenue'])

## Feature Encoding

In [6]:
# Ordinal encoding for Education (assuming ordinality: the list runs from the
# lowest to the highest level, so the integer code grows with education).
education_order = ['Basic', '2n Cycle', 'Graduation', 'Master', 'PhD']

# set_categories() turns every label that is not in education_order into NaN,
# and cat.codes then reports it as -1. Fail fast instead of silently producing
# a bogus "-1" education level when the raw data contains an unexpected label.
# Genuinely missing values (NaN) are not treated as unexpected labels.
unknown_levels = set(df['Education'].dropna().unique()) - set(education_order)
if unknown_levels:
    raise ValueError(
        "Unexpected Education values not covered by education_order: "
        f"{sorted(map(str, unknown_levels))}"
    )

df['Education'] = df['Education'].astype('category').cat.set_categories(education_order, ordered=True)
df['Education_encoded'] = df['Education'].cat.codes

# One-hot encoding for Marital_Status: replaces the column with one
# 'Marital_<status>' indicator column per distinct status.
df = pd.get_dummies(df, columns=['Marital_Status'], prefix='Marital')

## New Feature Creation

In [None]:
# Feature engineering on the cleaned and encoded `df` built by the cells above.
#
# This cell previously re-read 'final_customer_data.csv', which threw away the
# cleaning/encoding work and relied on columns (Family_Size, Cluster) that this
# notebook never creates. It now derives the features the later cells expect
# (Customer_Tenure, Family_Size, Total_Spend) directly from `df`.

# Step 1: Check for missing values
print("Missing values before cleaning:\n", df.isnull().sum())
# Explicit copy so the column assignments below write to an independent frame.
df = df.dropna().copy()  # Drop rows with missing values (if any)
print("\nMissing values after cleaning:\n", df.isnull().sum())

# Step 2: Customer_Tenure — days between enrolment and the latest enrolment
# date in the data. Dt_Customer is day-first (e.g. '21-08-2013'), so the format
# is given explicitly. The reference date comes from the data rather than
# "today" so a re-run reproduces the same values.
enrolment_date = pd.to_datetime(df['Dt_Customer'], format='%d-%m-%Y')
reference_date = enrolment_date.max()
df['Customer_Tenure'] = (reference_date - enrolment_date).dt.days

# Step 3: Family_Size — children and teenagers at home plus the adults.
# NOTE(review): assumes 'Married'/'Together' households have two adults and
# every other status has one — confirm against the project's definition.
partner_columns = [col for col in ('Marital_Married', 'Marital_Together') if col in df.columns]
has_partner = df[partner_columns].any(axis=1)
df['Family_Size'] = df['Kidhome'] + df['Teenhome'] + 1 + has_partner.astype(int)

# Step 4: Total_Spend — sum over every product-spend column. Spend columns are
# picked up by their 'Mnt' prefix (only 'MntWines' is visible in the truncated
# preview above; any other 'Mnt*' columns in the file are included as well).
product_columns = [col for col in df.columns if col.startswith('Mnt')]
df['Total_Spend'] = df[product_columns].sum(axis=1)

# Sanity checks on the engineered features.
assert product_columns, "no 'Mnt*' spend columns found"
assert df['Family_Size'].ge(1).all()
assert df['Customer_Tenure'].ge(0).all()

# Step 5: Standardised copies of Income and Total_Spend, used only for the
# plots below; the raw columns are left untouched for the scaling cell.
scaler = StandardScaler()
scaled_values = scaler.fit_transform(df[['Income', 'Total_Spend']])
df['Income_scaled'] = scaled_values[:, 0]
df['Total_Spend_scaled'] = scaled_values[:, 1]

# Step 6: Correlation Analysis
numerical_features = ['Income_scaled', 'Total_Spend_scaled', 'Family_Size', 'Recency']
correlation_matrix = df[numerical_features].corr()

fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f', linewidths=0.5, ax=ax)
ax.set_title('Correlation Heatmap')
plt.show()

# Step 7: Distribution of the scaled Income and Total_Spend
fig, (income_ax, spend_ax) = plt.subplots(1, 2, figsize=(12, 6))
sns.histplot(df['Income_scaled'], kde=True, color='blue', ax=income_ax)
income_ax.set_title('Income Distribution (Scaled)')
sns.histplot(df['Total_Spend_scaled'], kde=True, color='green', ax=spend_ax)
spend_ax.set_title('Total Spend Distribution (Scaled)')
fig.tight_layout()
plt.show()

# Step 8: Income vs Total Spend. Points are coloured by 'Cluster' only when
# that column exists; no clustering is run in this notebook, so the plot falls
# back to a single colour instead of failing with a KeyError.
scatter_kwargs = {}
if 'Cluster' in df.columns:
    scatter_kwargs = {'hue': df['Cluster'], 'palette': 'Set1'}

fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(x=df['Income_scaled'], y=df['Total_Spend_scaled'], ax=ax, **scatter_kwargs)
ax.set_title('Segmentation: Income vs Total Spend')
ax.set_xlabel('Scaled Income')
ax.set_ylabel('Scaled Total Spend')
if scatter_kwargs:
    ax.legend(title='Cluster')
plt.show()

# Optional: Save the cleaned and processed dataset to a new CSV
df.to_csv('processed_customer_data.csv', index=False)

print("Data processing completed!")


## Scaling & Correlation

In [None]:
# Columns that are standardised in place; the raw values in these columns are
# overwritten with their scaled counterparts.
numerical_features = ['Income', 'Customer_Tenure', 'Total_Spend', 'Family_Size', 'Recency']

# Learn the per-column scaling parameters, then apply them to the same columns.
scaler = StandardScaler()
scaler.fit(df[numerical_features])
df[numerical_features] = scaler.transform(df[numerical_features])

# Summary statistics of the scaled columns as a quick confirmation.
print(df[numerical_features].describe())


## Documentation & Output 

In [None]:
# Write the preprocessed frame to disk without the index column.
df.to_csv('preprocessed_campaign_data.csv', index=False)

# Report what the notebook did; the last line lists the columns that were scaled.
summary_lines = [
    "Preprocessing Summary:",
    "- Missing values handled: Income imputed with median.",
    "- New features added: Customer_Tenure, Family_Size, Total_Spend.",
    "- Categorical features encoded: Education (ordinal), Marital_Status (one-hot).",
    f"- Numerical features scaled: {numerical_features}",
]
print("\n".join(summary_lines))