# This script incorporates insights from our EDA:

- **Lagged features for Spend and Clicks**:  
  These were included as they showed significant correlations with Revenue.

- **Efficiency metrics**:  
  Added metrics like CPC, CTR, CVR, and ROAS as separate features to capture the efficiency of each campaign.

- **Relative performance features**:  
  Created features that express each row’s CTR, CVR, and ROAS as a ratio to the mean of that metric for its source.

- **'is_high_spend_day' feature**:  
  This feature was added based on the results from the anomaly detection; it flags rows whose spend is above the 95th percentile of spend.


In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

In [3]:
# Read the combined ad data and convert the Date column to datetimes
data = pd.read_csv('../data/processed/combined_ad_data.csv').assign(
    Date=lambda frame: pd.to_datetime(frame['Date'])
)

In [4]:
def create_time_features(df):
    """Add calendar columns derived from ``df['Date']``.

    Adds ``day_of_week``, ``is_weekend`` (1 when the day-of-week value is
    5 or 6, otherwise 0), ``month`` and ``quarter``. ``df['Date']`` must be a
    datetime column. The frame is modified in place and also returned.
    """
    calendar = df['Date'].dt
    weekday = calendar.dayofweek
    derived = {
        'day_of_week': weekday,
        'is_weekend': weekday.isin([5, 6]).astype(int),
        'month': calendar.month,
        'quarter': calendar.quarter,
    }
    # Assign in a fixed order so the new columns keep their positions
    for name, values in derived.items():
        df[name] = values
    return df

def create_lagged_features(df, *, columns=('Spend', 'Clicks'), lags=(1, 7, 30),
                           group_cols='Source'):
    """Add per-group lagged copies of selected columns.

    For every column in ``columns`` and every offset in ``lags`` a column named
    ``{col}_lag_{lag}`` is added. It holds the value found ``lag`` rows earlier
    within the same group; rows without such a predecessor get NaN.

    The lag counts rows in the current order of ``df``, not calendar days, so
    the frame should already be in chronological order within each group.

    NOTE(review): if a group contains several rows per date (for example one
    row per campaign type), a lag of 1 points at the previous row rather than
    the previous day. Passing ``group_cols=['Source', 'Campaign type']`` may be
    what is intended -- confirm against the data.

    Args:
        df: Frame holding ``columns`` and the grouping column(s). Modified in
            place.
        columns: Columns to lag. Defaults to Spend and Clicks, which the
            correlation analysis singled out.
        lags: Row offsets to shift by.
        group_cols: A column name or a list of column names to group by.

    Returns:
        The same ``df`` object with the lag columns added.
    """
    keys = [group_cols] if isinstance(group_cols, str) else list(group_cols)
    for col in columns:
        # Build the grouped view once per column and reuse it for every lag
        grouped = df.groupby(keys)[col]
        for lag in lags:
            df[f'{col}_lag_{lag}'] = grouped.shift(lag)
    return df

def create_rolling_averages(df, *,
                            columns=('Spend', 'Clicks', 'Impressions',
                                     'Conversions', 'Revenue'),
                            windows=(7, 30), group_cols='Source'):
    """Add per-group trailing rolling means of selected columns.

    For every column in ``columns`` and every size in ``windows`` a column
    named ``{col}_rolling_{window}d`` is added. The window covers the current
    row and the ``window - 1`` rows before it within the same group; rows that
    do not yet have a full window get NaN.

    Windows count rows in the current order of ``df``, so the frame should
    already be in chronological order within each group.

    NOTE(review): the window includes the current row. If one of ``columns``
    (e.g. Revenue) is later used as the prediction target, its rolling mean
    leaks the target into the features -- confirm whether a shift is needed.

    Args:
        df: Frame holding ``columns`` and the grouping column(s). Modified in
            place.
        columns: Columns to average.
        windows: Window sizes, in rows.
        group_cols: A column name or a list of column names to group by.

    Returns:
        The same ``df`` object with the rolling-mean columns added.
    """
    keys = [group_cols] if isinstance(group_cols, str) else list(group_cols)
    # groupby(...).rolling(...) prepends one index level per grouping key;
    # these are the levels to strip so the result aligns with df's own index.
    key_levels = list(range(len(keys)))
    for col in columns:
        grouped = df.groupby(keys)[col]
        for window in windows:
            rolled = grouped.rolling(window=window).mean()
            df[f'{col}_rolling_{window}d'] = rolled.droplevel(key_levels)
    return df

def create_efficiency_features(df):
    """Add (or overwrite) the CPC, CTR, CVR and ROAS ratio columns.

    * ``CPC``  = Spend / Clicks
    * ``CTR``  = Clicks / Impressions
    * ``CVR``  = Conversions / Clicks
    * ``ROAS`` = Revenue / Spend

    A ratio whose denominator is zero is undefined and is stored as NaN.
    Plain division would give +/-inf, which is not removed by ``dropna()``
    and would end up in the saved feature set.

    Args:
        df: Frame with Spend, Clicks, Impressions, Conversions and Revenue
            columns. Modified in place.

    Returns:
        The same ``df`` object with the four ratio columns set.
    """
    # Mask zero denominators to NaN so the divisions below yield NaN, not inf
    clicks = df['Clicks'].where(df['Clicks'] != 0)
    impressions = df['Impressions'].where(df['Impressions'] != 0)
    spend = df['Spend'].where(df['Spend'] != 0)

    df['CPC'] = df['Spend'] / clicks
    df['CTR'] = df['Clicks'] / impressions
    df['CVR'] = df['Conversions'] / clicks
    df['ROAS'] = df['Revenue'] / spend
    return df

def create_relative_performance_features(df):
    """Add ``{metric}_vs_mean`` columns for CTR, CVR and ROAS.

    Each new column is the row's metric divided by the mean of that metric
    over all rows sharing the same ``Source``. A value of 1.0 therefore means
    "equal to the source average".

    Infinite metric values are treated as missing: they are left out of the
    source mean (a single inf would otherwise make the whole mean infinite)
    and their own ratio is NaN. A source whose mean is zero has no usable
    baseline, so its ratios are NaN as well. The input metric columns
    themselves are not modified.

    Args:
        df: Frame with ``Source``, ``CTR``, ``CVR`` and ``ROAS`` columns.
            Modified in place.

    Returns:
        The same ``df`` object with the three ``*_vs_mean`` columns added.
    """
    for metric in ['CTR', 'CVR', 'ROAS']:
        # Work on a copy with +/-inf masked; the mean below skips NaN
        values = df[metric].replace([np.inf, -np.inf], np.nan)
        source_mean = values.groupby(df['Source']).transform('mean')
        # Mask a zero mean so the division gives NaN rather than inf
        df[f'{metric}_vs_mean'] = values / source_mean.where(source_mean != 0)
    return df

def encode_categorical_features(df):
    le = LabelEncoder()
    df['Source_encoded'] = le.fit_transform(df['Source'])
    df['Campaign_type_encoded'] = le.fit_transform(df['Campaign type'])
    return df

def create_anomaly_feature(df, quantile=0.95):
    """Flag rows with unusually high spend.

    Adds ``is_high_spend_day``: 1 where ``Spend`` is strictly greater than the
    given quantile of ``Spend``, otherwise 0. The threshold is computed over
    every row of ``df`` together, not separately per source. The default of
    0.95 follows the anomaly detection results.

    Args:
        df: Frame with a ``Spend`` column. Modified in place.
        quantile: Quantile of ``Spend`` (between 0 and 1) used as threshold.

    Returns:
        The same ``df`` object with ``is_high_spend_day`` added.
    """
    threshold = df['Spend'].quantile(quantile)
    df['is_high_spend_day'] = (df['Spend'] > threshold).astype(int)
    return df

In [5]:
# Run every feature-engineering step in order, then drop incomplete rows.
# dropna() removes each row that has a NaN in any column; this includes the
# leading rows of every source whose lag / rolling windows are not yet filled.
# Infinite values are not NaN and are therefore kept.
data = (
    data
    .pipe(create_time_features)
    .pipe(create_lagged_features)
    .pipe(create_rolling_averages)
    .pipe(create_efficiency_features)
    .pipe(create_relative_performance_features)
    .pipe(encode_categorical_features)
    .pipe(create_anomaly_feature)
    .dropna()
)

# Write the engineered dataset to disk without the index column
data.to_csv('../data/processed/feature_engineered_data.csv', index=False)

# Show the resulting columns and a preview of the first rows
print(data.columns)
print(data.head())

Index(['Date', 'Campaign type', 'Impressions', 'Clicks', 'Spend',
       'Conversions', 'Revenue', 'Source', 'Reach', 'Channel', 'Sessions',
       'Website Conversions', 'CTR', 'CPC', 'CVR', 'ROAS', 'day_of_week',
       'is_weekend', 'month', 'quarter', 'Spend_lag_1', 'Spend_lag_7',
       'Spend_lag_30', 'Clicks_lag_1', 'Clicks_lag_7', 'Clicks_lag_30',
       'Spend_rolling_7d', 'Spend_rolling_30d', 'Clicks_rolling_7d',
       'Clicks_rolling_30d', 'Impressions_rolling_7d',
       'Impressions_rolling_30d', 'Conversions_rolling_7d',
       'Conversions_rolling_30d', 'Revenue_rolling_7d', 'Revenue_rolling_30d',
       'CTR_vs_mean', 'CVR_vs_mean', 'ROAS_vs_mean', 'Source_encoded',
       'Campaign_type_encoded', 'is_high_spend_day'],
      dtype='object')
         Date   Campaign type  Impressions  Clicks   Spend  Conversions  \
30 2024-01-08  Search Network       3655.0   437.0   947.8         54.6   
31 2024-01-08         YouTube      38573.0     5.0   119.6          0.0   
32 2024

- The resulting feature-engineered dataset (`feature_engineered_data.csv`) can now be used for model development.