In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, TimeSeriesSplit
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import xgboost as xgb
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam

2024-09-09 14:10:19.622678: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
# Load the preprocessed data
data = pd.read_csv('../data/processed/feature_engineered_data.csv')

In [4]:
# Define features and target
features = ['Impressions', 'Clicks', 'Spend', 'CTR', 'CPC', 'CVR', 
            'day_of_week', 'is_weekend', 'month', 'quarter', 
            'Spend_lag_1', 'Spend_lag_7', 'Spend_lag_30', 
            'Clicks_lag_1', 'Clicks_lag_7', 'Clicks_lag_30',
            'Spend_rolling_7d', 'Spend_rolling_30d', 
            'Clicks_rolling_7d', 'Clicks_rolling_30d',
            'Impressions_rolling_7d', 'Impressions_rolling_30d',
            'Conversions_rolling_7d', 'Conversions_rolling_30d',
            'Revenue_rolling_7d', 'Revenue_rolling_30d',
            'CTR_vs_mean', 'CVR_vs_mean', 'Source_encoded', 
            'Campaign_type_encoded', 'is_high_spend_day']

target = 'ROAS'

# Prepare the data
X = data[features]
y = data[target]

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size=0.5, random_state=42)

# Scale the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

# Define a function to evaluate models
def evaluate_model(y_true, y_pred, model_name):
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    mae = mean_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    print(f"{model_name} Performance:")
    print(f"RMSE: {rmse:.4f}")
    print(f"MAE: {mae:.4f}")
    print(f"R-squared: {r2:.4f}")
    print("--------------------")

# Linear Regression (baseline)
lr_model = LinearRegression()
lr_model.fit(X_train_scaled, y_train)
lr_pred = lr_model.predict(X_val_scaled)
evaluate_model(y_val, lr_pred, "Linear Regression")

# Random Forest
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train_scaled, y_train)
rf_pred = rf_model.predict(X_val_scaled)
evaluate_model(y_val, rf_pred, "Random Forest")

# XGBoost
xgb_model = xgb.XGBRegressor(objective='reg:squarederror', random_state=42)
xgb_model.fit(X_train_scaled, y_train)
xgb_pred = xgb_model.predict(X_val_scaled)
evaluate_model(y_val, xgb_pred, "XGBoost")

# Neural Network
nn_model = Sequential([
    Dense(64, activation='relu', input_shape=(X_train_scaled.shape[1],)),
    Dropout(0.2),
    Dense(32, activation='relu'),
    Dropout(0.2),
    Dense(16, activation='relu'),
    Dense(1)
])
nn_model.compile(optimizer=Adam(learning_rate=0.001), loss='mse')
nn_model.fit(X_train_scaled, y_train, epochs=100, batch_size=32, validation_data=(X_val_scaled, y_val), verbose=0)
nn_pred = nn_model.predict(X_val_scaled).flatten()
evaluate_model(y_val, nn_pred, "Neural Network")

Linear Regression Performance:
RMSE: 18.3742
MAE: 8.5390
R-squared: 0.1176
--------------------
Random Forest Performance:
RMSE: 28.3910
MAE: 5.3165
R-squared: -1.1067
--------------------
XGBoost Performance:
RMSE: 137.1607
MAE: 13.1039
R-squared: -48.1695
--------------------
Neural Network Performance:
RMSE: 14.3990
MAE: 3.5969
R-squared: 0.4581
--------------------


Based on these, we can see that the Neural Network model is performing the best, followed by Linear Regression. The Random Forest and XGBoost models are underperforming, which suggests they might be overfitting or need further tuning.

- Implement cross-validation using TimeSeriesSplit
- Perform hyperparameter tuning for the Neural Network and Linear Regression models
- Conduct feature importance analysis
- Evaluate the best model on the test set


In [10]:
import pandas as pd
import numpy as np
from sklearn.model_selection import TimeSeriesSplit, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from scipy.stats import uniform, randint

# Load the preprocessed data
data = pd.read_csv('../data/processed/feature_engineered_data.csv')
data['Date'] = pd.to_datetime(data['Date'])
data = data.sort_values('Date')

# Define features and target
features = ['Impressions', 'Clicks', 'Spend', 'CTR', 'CPC', 'CVR', 
            'day_of_week', 'is_weekend', 'month', 'quarter', 
            'Spend_lag_1', 'Spend_lag_7', 'Spend_lag_30', 
            'Clicks_lag_1', 'Clicks_lag_7', 'Clicks_lag_30',
            'Spend_rolling_7d', 'Spend_rolling_30d', 
            'Clicks_rolling_7d', 'Clicks_rolling_30d',
            'Impressions_rolling_7d', 'Impressions_rolling_30d',
            'Conversions_rolling_7d', 'Conversions_rolling_30d',
            'Revenue_rolling_7d', 'Revenue_rolling_30d',
            'CTR_vs_mean', 'CVR_vs_mean', 'Source_encoded', 
            'Campaign_type_encoded', 'is_high_spend_day']

target = 'ROAS'

X = data[features]
y = data[target]

# Scale the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# TimeSeriesSplit
tscv = TimeSeriesSplit(n_splits=5)

# Define evaluation function
def evaluate_model(y_true, y_pred, model_name):
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    mae = mean_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    print(f"{model_name} Performance:")
    print(f"RMSE: {rmse:.4f}")
    print(f"MAE: {mae:.4f}")
    print(f"R-squared: {r2:.4f}")
    print("--------------------")

# Linear Regression with cross-validation
lr_model = LinearRegression()
lr_scores = []

for train_index, val_index in tscv.split(X_scaled):
    X_train, X_val = X_scaled[train_index], X_scaled[val_index]
    y_train, y_val = y.iloc[train_index], y.iloc[val_index]
    
    lr_model.fit(X_train, y_train)
    lr_pred = lr_model.predict(X_val)
    lr_scores.append(r2_score(y_val, lr_pred))

print(f"Linear Regression Cross-Validation Scores: {lr_scores}")
print(f"Mean R-squared: {np.mean(lr_scores):.4f}")
print("--------------------")

# Neural Network with manual hyperparameter tuning
def create_model(neurons, dropout_rate, learning_rate):
    model = Sequential([
        Dense(neurons, activation='relu', input_shape=(X_scaled.shape[1],)),
        Dropout(dropout_rate),
        Dense(neurons // 2, activation='relu'),
        Dropout(dropout_rate),
        Dense(neurons // 4, activation='relu'),
        Dense(1)
    ])
    model.compile(optimizer=Adam(learning_rate=learning_rate), loss='mse')
    return model

# Hyperparameter options
param_options = {
    'neurons': [32, 64, 128],
    'dropout_rate': [0.1, 0.2, 0.3],
    'learning_rate': [0.001, 0.01, 0.1]
}

best_score = float('inf')
best_params = {}

for neurons in param_options['neurons']:
    for dropout_rate in param_options['dropout_rate']:
        for learning_rate in param_options['learning_rate']:
            nn_scores = []
            for train_index, val_index in tscv.split(X_scaled):
                X_train, X_val = X_scaled[train_index], X_scaled[val_index]
                y_train, y_val = y.iloc[train_index], y.iloc[val_index]
                
                model = create_model(neurons, dropout_rate, learning_rate)
                model.fit(X_train, y_train, epochs=100, batch_size=32, verbose=0, validation_data=(X_val, y_val))
                
                y_pred = model.predict(X_val).flatten()
                mse = mean_squared_error(y_val, y_pred)
                nn_scores.append(mse)
            
            mean_mse = np.mean(nn_scores)
            if mean_mse < best_score:
                best_score = mean_mse
                best_params = {'neurons': neurons, 'dropout_rate': dropout_rate, 'learning_rate': learning_rate}

print("Best Neural Network Parameters:")
print(best_params)
print(f"Best RMSE: {np.sqrt(best_score):.4f}")
print("--------------------")

# Feature Importance Analysis (using the best Neural Network model)
best_nn_model = create_model(**best_params)
best_nn_model.fit(X_scaled, y, epochs=100, batch_size=32, verbose=0)

feature_importance = np.abs(best_nn_model.layers[0].get_weights()[0].mean(axis=1))
feature_importance = feature_importance / np.sum(feature_importance)
feature_importance_df = pd.DataFrame({'Feature': features, 'Importance': feature_importance})
feature_importance_df = feature_importance_df.sort_values('Importance', ascending=False)

print("Top 10 Most Important Features:")
print(feature_importance_df.head(10))
print("--------------------")

# Evaluate best model on test set
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, shuffle=False)
best_nn_model.fit(X_train, y_train, epochs=100, batch_size=32, verbose=0)
y_pred = best_nn_model.predict(X_test).flatten()

evaluate_model(y_test, y_pred, "Best Neural Network (Test Set)")

Linear Regression Cross-Validation Scores: [-0.29102876560856594, -1.2168742882080883, 0.43280929426266135, 0.48054674036703116, 0.20436899359622762]
Mean R-squared: -0.0780
--------------------
Best Neural Network Parameters:
{'neurons': 128, 'dropout_rate': 0.1, 'learning_rate': 0.01}
Best RMSE: 13.3114
--------------------
Top 10 Most Important Features:
                   Feature  Importance
19      Clicks_rolling_30d    0.087474
18       Clicks_rolling_7d    0.059165
17       Spend_rolling_30d    0.058906
28          Source_encoded    0.050993
24      Revenue_rolling_7d    0.050464
20  Impressions_rolling_7d    0.049809
13            Clicks_lag_1    0.045962
16        Spend_rolling_7d    0.044213
1                   Clicks    0.041158
10             Spend_lag_1    0.040778
--------------------
Best Neural Network (Test Set) Performance:
RMSE: 10.4508
MAE: 3.6500
R-squared: 0.6219
--------------------


Key Insights:

1. The neural network outperforms linear regression, suggesting non-linear relationships in the data.
2. Rolling metrics (7-day and 30-day) are crucial for predicting ROAS, indicating the importance of recent trends.
3. The model explains about 62% of the variance in ROAS (R-squared of 0.6219), which is good but leaves room for improvement.
4. The Source_encoded feature is important, suggesting that different ad platforms have varying impacts on ROAS.

Next Steps:

1. Feature Engineering: Create more complex features, especially focusing on interactions between top features.
2. Ensemble Methods: Try combining multiple models (e.g., Random Forest, XGBoost) with the neural network.
3. Advanced Architectures: Experiment with more complex neural network architectures, possibly including LSTM layers to capture temporal dependencies better.
4. Regularization: Implement regularization techniques to prevent overfitting and potentially improve generalization.
5. Anomaly Detection: Investigate outliers or anomalous periods in the data that might be affecting model performance.
6. Segmentation: Consider creating separate models for different ad sources or campaign types if they exhibit significantly different patterns.

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import TimeSeriesSplit, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, LSTM
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import l1_l2
import xgboost as xgb

# Load and preprocess data (assuming this step is already done)
data = pd.read_csv('../data/processed/feature_engineered_data.csv')
data['Date'] = pd.to_datetime(data['Date'])
data = data.sort_values('Date')

# Define features and target
features = ['Impressions', 'Clicks', 'Spend', 'CTR', 'CPC', 'CVR', 
            'day_of_week', 'is_weekend', 'month', 'quarter', 
            'Spend_lag_1', 'Spend_lag_7', 'Spend_lag_30', 
            'Clicks_lag_1', 'Clicks_lag_7', 'Clicks_lag_30',
            'Spend_rolling_7d', 'Spend_rolling_30d', 
            'Clicks_rolling_7d', 'Clicks_rolling_30d',
            'Impressions_rolling_7d', 'Impressions_rolling_30d',
            'Conversions_rolling_7d', 'Conversions_rolling_30d',
            'Revenue_rolling_7d', 'Revenue_rolling_30d',
            'CTR_vs_mean', 'CVR_vs_mean', 'Source_encoded', 
            'Campaign_type_encoded', 'is_high_spend_day']

target = 'ROAS'

X = data[features]
y = data[target]

# Feature Engineering
X['Click_Spend_Ratio'] = X['Clicks'] / (X['Spend'] + 1)  # Adding 1 to avoid division by zero
X['Revenue_per_Click'] = data['Revenue'] / (X['Clicks'] + 1)
X['Spend_Acceleration'] = X['Spend_rolling_7d'] - X['Spend_rolling_30d'] / 30 * 7
X['Click_Acceleration'] = X['Clicks_rolling_7d'] - X['Clicks_rolling_30d'] / 30 * 7

# Scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split data
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, shuffle=False)

# Time Series Cross-Validation
tscv = TimeSeriesSplit(n_splits=5)

# Evaluation function
def evaluate_model(y_true, y_pred, model_name):
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    mae = mean_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    print(f"{model_name} Performance:")
    print(f"RMSE: {rmse:.4f}")
    print(f"MAE: {mae:.4f}")
    print(f"R-squared: {r2:.4f}")
    print("--------------------")

# Random Forest
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)
rf_pred = rf_model.predict(X_test)
evaluate_model(y_test, rf_pred, "Random Forest")

# XGBoost
xgb_model = xgb.XGBRegressor(objective='reg:squarederror', random_state=42)
xgb_model.fit(X_train, y_train)
xgb_pred = xgb_model.predict(X_test)
evaluate_model(y_test, xgb_pred, "XGBoost")

# Advanced Neural Network with LSTM and Regularization
def create_lstm_model(input_shape):
    model = Sequential([
        LSTM(64, input_shape=(input_shape[1], input_shape[2]), return_sequences=True, 
             kernel_regularizer=l1_l2(l1=1e-5, l2=1e-4)),
        Dropout(0.2),
        LSTM(32, kernel_regularizer=l1_l2(l1=1e-5, l2=1e-4)),
        Dropout(0.2),
        Dense(16, activation='relu', kernel_regularizer=l1_l2(l1=1e-5, l2=1e-4)),
        Dense(1)
    ])
    model.compile(optimizer=Adam(learning_rate=0.001), loss='mse')
    return model

# Reshape input for LSTM (samples, time steps, features)
X_train_lstm = X_train.reshape((X_train.shape[0], X_train.shape[1], 1))
X_test_lstm = X_test.reshape((X_test.shape[0], X_test.shape[1], 1))

lstm_model = create_lstm_model(X_train_lstm.shape)
lstm_model.fit(X_train_lstm, y_train, epochs=100, batch_size=32, validation_split=0.2, verbose=0)
lstm_pred = lstm_model.predict(X_test_lstm).flatten()
evaluate_model(y_test, lstm_pred, "LSTM Neural Network")

# Ensemble (Simple average of RF, XGB, and LSTM predictions)
ensemble_pred = (rf_pred + xgb_pred + lstm_pred) / 3
evaluate_model(y_test, ensemble_pred, "Ensemble Model")

# Feature Importance (using Random Forest)
feature_importance = pd.DataFrame({
    'feature': X.columns,
    'importance': rf_model.feature_importances_
}).sort_values('importance', ascending=False)

print("Top 15 Most Important Features (Random Forest):")
print(feature_importance.head(15))
print("--------------------")

# Analyze model performance across different ad sources
source_performance = pd.DataFrame({
    'Actual': y_test,
    'Predicted': ensemble_pred,
    'Source': data.loc[y_test.index, 'Source']
})

for source in source_performance['Source'].unique():
    source_data = source_performance[source_performance['Source'] == source]
    print(f"Performance for {source}:")
    evaluate_model(source_data['Actual'], source_data['Predicted'], source)


2024-09-10 10:21:05.949281: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['Click_Spend_Ratio'] = X['Clicks'] / (X['Spend'] + 1)  # Adding 1 to avoid division by zero
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['Revenue_per_Click'] = data['Revenue

Random Forest Performance:
RMSE: 6.2354
MAE: 1.9500
R-squared: 0.8654
--------------------
XGBoost Performance:
RMSE: 3.2188
MAE: 1.3627
R-squared: 0.9641
--------------------


  super().__init__(**kwargs)


[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 51ms/step
LSTM Neural Network Performance:
RMSE: 3.5239
MAE: 2.3496
R-squared: 0.9570
--------------------
Ensemble Model Performance:
RMSE: 3.1279
MAE: 1.4910
R-squared: 0.9661
--------------------
Top 15 Most Important Features (Random Forest):
                    feature  importance
5                       CVR    0.349262
27              CVR_vs_mean    0.238863
32        Revenue_per_Click    0.201282
0               Impressions    0.093412
4                       CPC    0.040365
17        Spend_rolling_30d    0.024994
2                     Spend    0.014764
31        Click_Spend_Ratio    0.007553
25      Revenue_rolling_30d    0.004006
34       Click_Acceleration    0.002602
23  Conversions_rolling_30d    0.002353
16         Spend_rolling_7d    0.002338
28           Source_encoded    0.001639
14             Clicks_lag_7    0.001581
19       Clicks_rolling_30d    0.001502
--------------------
Performance for Microsoft Ads:

Key Insights and Next Steps:

1. Model Selection: The Ensemble Model performs best, but XGBoost is nearly as good and might be preferred for its simplicity and interpretability.
2. Feature Focus: Prioritize efforts on improving CVR, understanding how campaigns compare to average CVR, and optimizing Revenue per Click.
3. Google Ads Performance: Investigate why the model performs poorly for Google Ads. Possible reasons:
   - Different dynamics or patterns in Google Ads data
   - Insufficient or unrepresentative Google Ads data in the training set
   -Presence of outliers or anomalies in Google Ads data
4. Meta Ads: While the R-squared is high, the RMSE is higher than for other platforms. This suggests more volatile or harder-to-predict ROAS for Meta Ads.
5. Feature Engineering: Consider creating more features related to CVR and revenue, as these seem to be highly predictive.
6. Hyperparameter Tuning: Fine-tune the XGBoost and Random Forest models to potentially improve performance further.
7. Time-based Analysis: Investigate if there are seasonal trends or time-based patterns that could be incorporated into the model.
8. Segmentation: Consider building separate models for each ad platform, especially given the poor performance on Google Ads.

Next steps:

1. Dive deeper into the Google Ads data to understand why the model is underperforming.
2. Experiment with platform-specific models.
3. Create additional features based on the top important features identified.
4. Implement cross-validation for more robust performance estimates.
5. Develop a strategy for real-time or periodic model updates as new data becomes available.

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import TimeSeriesSplit, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import xgboost as xgb
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam

# Load data
data = pd.read_csv('../data/processed/feature_engineered_data.csv')
data['Date'] = pd.to_datetime(data['Date'])
data = data.sort_values('Date')

# Define features and target
features = ['Impressions', 'Clicks', 'Spend', 'CTR', 'CPC', 'CVR', 
            'day_of_week', 'is_weekend', 'month', 'quarter', 
            'Spend_lag_1', 'Spend_lag_7', 'Spend_lag_30', 
            'Clicks_lag_1', 'Clicks_lag_7', 'Clicks_lag_30',
            'Spend_rolling_7d', 'Spend_rolling_30d', 
            'Clicks_rolling_7d', 'Clicks_rolling_30d',
            'Impressions_rolling_7d', 'Impressions_rolling_30d',
            'Conversions_rolling_7d', 'Conversions_rolling_30d',
            'Revenue_rolling_7d', 'Revenue_rolling_30d',
            'CTR_vs_mean', 'CVR_vs_mean', 'Source_encoded', 
            'Campaign_type_encoded', 'is_high_spend_day',
            'Click_Spend_Ratio', 'Revenue_per_Click', 
            'Spend_Acceleration', 'Click_Acceleration']

target = 'ROAS'

# Feature Engineering
data['CVR_rolling_7d'] = data['Conversions_rolling_7d'] / data['Clicks_rolling_7d']
data['CVR_rolling_30d'] = data['Conversions_rolling_30d'] / data['Clicks_rolling_30d']
data['Revenue_per_Impression'] = data['Revenue'] / data['Impressions']
features += ['CVR_rolling_7d', 'CVR_rolling_30d', 'Revenue_per_Impression']

# Separate data by platform
platforms = ['Google Ads', 'Microsoft Ads', 'Meta Ads']
platform_data = {platform: data[data['Source'] == platform] for platform in platforms}

# Evaluation function
def evaluate_model(y_true, y_pred, model_name):
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    mae = mean_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    print(f"{model_name} Performance:")
    print(f"RMSE: {rmse:.4f}")
    print(f"MAE: {mae:.4f}")
    print(f"R-squared: {r2:.4f}")
    print("--------------------")

# XGBoost model with cross-validation
def train_xgboost_with_cv(X, y, n_splits=5):
    tscv = TimeSeriesSplit(n_splits=n_splits)
    xgb_model = xgb.XGBRegressor(objective='reg:squarederror', random_state=42)
    
    cv_scores = []
    for train_index, val_index in tscv.split(X):
        X_train, X_val = X.iloc[train_index], X.iloc[val_index]
        y_train, y_val = y.iloc[train_index], y.iloc[val_index]
        
        xgb_model.fit(X_train, y_train)
        y_pred = xgb_model.predict(X_val)
        cv_scores.append(r2_score(y_val, y_pred))
    
    print(f"Cross-validation R-squared scores: {cv_scores}")
    print(f"Mean R-squared: {np.mean(cv_scores):.4f}")
    
    return xgb_model.fit(X, y)

# Train and evaluate models for each platform
for platform in platforms:
    print(f"\nAnalyzing {platform}")
    platform_df = platform_data[platform]
    X = platform_df[features]
    y = platform_df[target]
    
    # Scale features
    scaler = StandardScaler()
    X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)
    
    # Train XGBoost model with cross-validation
    xgb_model = train_xgboost_with_cv(X_scaled, y)
    
    # Feature importance
    feature_importance = pd.DataFrame({
        'feature': features,
        'importance': xgb_model.feature_importances_
    }).sort_values('importance', ascending=False)
    
    print(f"\nTop 10 Important Features for {platform}:")
    print(feature_importance.head(10))
    
    # Final evaluation on the entire dataset
    y_pred = xgb_model.predict(X_scaled)
    evaluate_model(y, y_pred, f"XGBoost for {platform}")

# Analyze Google Ads data in more detail
google_ads_data = platform_data['Google Ads']
print("\nGoogle Ads Data Analysis:")
print(google_ads_data[['ROAS', 'CVR', 'Revenue_per_Click', 'Impressions']].describe())

# Check for outliers in Google Ads ROAS
q1 = google_ads_data['ROAS'].quantile(0.25)
q3 = google_ads_data['ROAS'].quantile(0.75)
iqr = q3 - q1
lower_bound = q1 - 1.5 * iqr
upper_bound = q3 + 1.5 * iqr
outliers = google_ads_data[(google_ads_data['ROAS'] < lower_bound) | (google_ads_data['ROAS'] > upper_bound)]
print(f"\nNumber of outliers in Google Ads ROAS: {len(outliers)}")
print("Sample of outliers:")
print(outliers[['Date', 'ROAS', 'CVR', 'Revenue_per_Click', 'Impressions']].head())


Analyzing Google Ads


KeyError: "['Click_Spend_Ratio', 'Revenue_per_Click', 'Spend_Acceleration', 'Click_Acceleration'] not in index"

In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import TimeSeriesSplit, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import xgboost as xgb
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam

# Load data
data = pd.read_csv('../data/processed/feature_engineered_data.csv')
data['Date'] = pd.to_datetime(data['Date'])
data = data.sort_values('Date')

# Feature Engineering
data['Click_Spend_Ratio'] = data['Clicks'] / (data['Spend'] + 1)  # Adding 1 to avoid division by zero
data['Revenue_per_Click'] = data['Revenue'] / (data['Clicks'] + 1)
data['Spend_Acceleration'] = data['Spend_rolling_7d'] - data['Spend_rolling_30d'] / 30 * 7
data['Click_Acceleration'] = data['Clicks_rolling_7d'] - data['Clicks_rolling_30d'] / 30 * 7
data['CVR_rolling_7d'] = data['Conversions_rolling_7d'] / (data['Clicks_rolling_7d'] + 1)
data['CVR_rolling_30d'] = data['Conversions_rolling_30d'] / (data['Clicks_rolling_30d'] + 1)
data['Revenue_per_Impression'] = data['Revenue'] / (data['Impressions'] + 1)

# Define features and target
features = ['Impressions', 'Clicks', 'Spend', 'CTR', 'CPC', 'CVR', 
            'day_of_week', 'is_weekend', 'month', 'quarter', 
            'Spend_lag_1', 'Spend_lag_7', 'Spend_lag_30', 
            'Clicks_lag_1', 'Clicks_lag_7', 'Clicks_lag_30',
            'Spend_rolling_7d', 'Spend_rolling_30d', 
            'Clicks_rolling_7d', 'Clicks_rolling_30d',
            'Impressions_rolling_7d', 'Impressions_rolling_30d',
            'Conversions_rolling_7d', 'Conversions_rolling_30d',
            'Revenue_rolling_7d', 'Revenue_rolling_30d',
            'CTR_vs_mean', 'CVR_vs_mean', 'Source_encoded', 
            'Campaign_type_encoded', 'is_high_spend_day',
            'Click_Spend_Ratio', 'Revenue_per_Click', 
            'Spend_Acceleration', 'Click_Acceleration',
            'CVR_rolling_7d', 'CVR_rolling_30d', 'Revenue_per_Impression']

target = 'ROAS'

# Separate data by platform
platforms = ['Google Ads', 'Microsoft Ads', 'Meta Ads']
platform_data = {platform: data[data['Source'] == platform] for platform in platforms}

# Evaluation function
def evaluate_model(y_true, y_pred, model_name):
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    mae = mean_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    print(f"{model_name} Performance:")
    print(f"RMSE: {rmse:.4f}")
    print(f"MAE: {mae:.4f}")
    print(f"R-squared: {r2:.4f}")
    print("--------------------")

# XGBoost model with cross-validation
def train_xgboost_with_cv(X, y, n_splits=5):
    tscv = TimeSeriesSplit(n_splits=n_splits)
    xgb_model = xgb.XGBRegressor(objective='reg:squarederror', random_state=42)
    
    cv_scores = []
    for train_index, val_index in tscv.split(X):
        X_train, X_val = X.iloc[train_index], X.iloc[val_index]
        y_train, y_val = y.iloc[train_index], y.iloc[val_index]
        
        xgb_model.fit(X_train, y_train)
        y_pred = xgb_model.predict(X_val)
        cv_scores.append(r2_score(y_val, y_pred))
    
    print(f"Cross-validation R-squared scores: {cv_scores}")
    print(f"Mean R-squared: {np.mean(cv_scores):.4f}")
    
    return xgb_model.fit(X, y)

# Train and evaluate models for each platform
for platform in platforms:
    print(f"\nAnalyzing {platform}")
    platform_df = platform_data[platform]
    X = platform_df[features]
    y = platform_df[target]
    
    # Scale features
    scaler = StandardScaler()
    X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)
    
    # Train XGBoost model with cross-validation
    xgb_model = train_xgboost_with_cv(X_scaled, y)
    
    # Feature importance
    feature_importance = pd.DataFrame({
        'feature': features,
        'importance': xgb_model.feature_importances_
    }).sort_values('importance', ascending=False)
    
    print(f"\nTop 10 Important Features for {platform}:")
    print(feature_importance.head(10))
    
    # Final evaluation on the entire dataset
    y_pred = xgb_model.predict(X_scaled)
    evaluate_model(y, y_pred, f"XGBoost for {platform}")

# Analyze Google Ads data in more detail
google_ads_data = platform_data['Google Ads']
print("\nGoogle Ads Data Analysis:")
print(google_ads_data[['ROAS', 'CVR', 'Revenue_per_Click', 'Impressions']].describe())

# Check for outliers in Google Ads ROAS
q1 = google_ads_data['ROAS'].quantile(0.25)
q3 = google_ads_data['ROAS'].quantile(0.75)
iqr = q3 - q1
lower_bound = q1 - 1.5 * iqr
upper_bound = q3 + 1.5 * iqr
outliers = google_ads_data[(google_ads_data['ROAS'] < lower_bound) | (google_ads_data['ROAS'] > upper_bound)]
print(f"\nNumber of outliers in Google Ads ROAS: {len(outliers)}")
print("Sample of outliers:")
print(outliers[['Date', 'ROAS', 'CVR', 'Revenue_per_Click', 'Impressions']].head())


Analyzing Google Ads
Cross-validation R-squared scores: [0.9375084268094541, 0.9545132863021434, 0.9685334723533959, 0.8703949127935939, 0.9550621621331513]
Mean R-squared: 0.9372

Top 10 Important Features for Google Ads:
                    feature  importance
32        Revenue_per_Click    0.606541
37   Revenue_per_Impression    0.264844
4                       CPC    0.043288
8                     month    0.029092
0               Impressions    0.011762
20   Impressions_rolling_7d    0.009954
1                    Clicks    0.008162
31        Click_Spend_Ratio    0.006530
21  Impressions_rolling_30d    0.005207
16         Spend_rolling_7d    0.003772
XGBoost for Google Ads Performance:
RMSE: 0.0009
MAE: 0.0006
R-squared: 1.0000
--------------------

Analyzing Microsoft Ads
Cross-validation R-squared scores: [0.7748007025165602, 0.9620580080049659, 0.7533612091165727, 0.946754795683404, 0.5069591825311797]
Mean R-squared: 0.7888

Top 10 Important Features for Microsoft Ads:
       

Key Insights:

All platforms show signs of overfitting, with perfect R-squared scores on the entire dataset. This indicates that our models might be too complex or that we need more regularization.
Google Ads performance is much better than initially thought, with high cross-validation scores.
Revenue-related features (Revenue_per_Click, Revenue_per_Impression) are consistently important across all platforms.
The importance of features varies significantly between platforms, suggesting that platform-specific models are indeed beneficial.
Meta Ads show the most inconsistent performance across cross-validation folds, indicating potential issues with data variability or temporal patterns.
Google Ads have a few significant outliers, with ROAS values much higher than the average.

Next steps:

Address overfitting:

Implement regularization in XGBoost (adjust alpha and lambda parameters)
Reduce model complexity (decrease max_depth, min_child_weight)
Increase min_samples_leaf to force the model to generalize better


Handle outliers in Google Ads data:

Investigate the outlier data points to understand if they are errors or genuine high-performing campaigns
Consider using robust scaling or winsorization to reduce the impact of outliers


Improve Meta Ads performance:

Investigate temporal patterns in Meta Ads data
Consider using a more sophisticated time series model (e.g., ARIMA, Prophet) for Meta Ads


Feature engineering:

Create interaction features between top important features
Develop platform-specific features based on the importance rankings


Ensemble modeling:

Combine XGBoost with other algorithms (e.g., LightGBM, CatBoost) to create a more robust ensemble

In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import TimeSeriesSplit, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import xgboost as xgb
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from sklearn.preprocessing import RobustScaler
from xgboost import XGBRegressor

# Load data
data = pd.read_csv('../data/processed/feature_engineered_data.csv')
data['Date'] = pd.to_datetime(data['Date'])
data = data.sort_values('Date')

# Feature Engineering
data['Click_Spend_Ratio'] = data['Clicks'] / (data['Spend'] + 1)  # Adding 1 to avoid division by zero
data['Revenue_per_Click'] = data['Revenue'] / (data['Clicks'] + 1)
data['Spend_Acceleration'] = data['Spend_rolling_7d'] - data['Spend_rolling_30d'] / 30 * 7
data['Click_Acceleration'] = data['Clicks_rolling_7d'] - data['Clicks_rolling_30d'] / 30 * 7
data['CVR_rolling_7d'] = data['Conversions_rolling_7d'] / (data['Clicks_rolling_7d'] + 1)
data['CVR_rolling_30d'] = data['Conversions_rolling_30d'] / (data['Clicks_rolling_30d'] + 1)
data['Revenue_per_Impression'] = data['Revenue'] / (data['Impressions'] + 1)

# Define features and target
features = ['Impressions', 'Clicks', 'Spend', 'CTR', 'CPC', 'CVR', 
            'day_of_week', 'is_weekend', 'month', 'quarter', 
            'Spend_lag_1', 'Spend_lag_7', 'Spend_lag_30', 
            'Clicks_lag_1', 'Clicks_lag_7', 'Clicks_lag_30',
            'Spend_rolling_7d', 'Spend_rolling_30d', 
            'Clicks_rolling_7d', 'Clicks_rolling_30d',
            'Impressions_rolling_7d', 'Impressions_rolling_30d',
            'Conversions_rolling_7d', 'Conversions_rolling_30d',
            'Revenue_rolling_7d', 'Revenue_rolling_30d',
            'CTR_vs_mean', 'CVR_vs_mean', 'Source_encoded', 
            'Campaign_type_encoded', 'is_high_spend_day',
            'Click_Spend_Ratio', 'Revenue_per_Click', 
            'Spend_Acceleration', 'Click_Acceleration',
            'CVR_rolling_7d', 'CVR_rolling_30d', 'Revenue_per_Impression']

target = 'ROAS'

# Separate data by platform
platforms = ['Google Ads', 'Microsoft Ads', 'Meta Ads']
platform_data = {platform: data[data['Source'] == platform] for platform in platforms}

# Function to handle outliers using winsorization
def winsorize(s, limits=(0.05, 0.05)):
    return s.clip(lower=s.quantile(limits[0]), upper=s.quantile(1 - limits[1]))

# Evaluation function
def evaluate_model(y_true, y_pred, model_name):
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    mae = mean_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    print(f"{model_name} Performance:")
    print(f"RMSE: {rmse:.4f}")
    print(f"MAE: {mae:.4f}")
    print(f"R-squared: {r2:.4f}")
    print("--------------------")

# Updated XGBoost model with regularization and reduced complexity
def create_xgb_model():
    return XGBRegressor(
        n_estimators=100,
        max_depth=5,
        min_child_weight=2,
        subsample=0.8,
        colsample_bytree=0.8,
        eta=0.1,
        alpha=1,
        reg_lambda=1,
        objective='reg:squarederror',
        random_state=42
    )

#def train_xgboost_with_cv(X, y, n_splits=5):
    tscv = TimeSeriesSplit(n_splits=n_splits)
    xgb_model = create_xgb_model()
    
    cv_scores = []
    for train_index, val_index in tscv.split(X):
        X_train, X_val = X.iloc[train_index], X.iloc[val_index]
        y_train, y_val = y.iloc[train_index], y.iloc[val_index]
        
        # Apply winsorization to y_train
        y_train = winsorize(y_train)
        
        xgb_model.fit(X_train, y_train)
        y_pred = xgb_model.predict(X_val)
        cv_scores.append(r2_score(y_val, y_pred))
    
    print(f"Cross-validation R-squared scores: {cv_scores}")
    print(f"Mean R-squared: {np.mean(cv_scores):.4f}")
    
    return xgb_model.fit(X, winsorize(y))

# Train and evaluate models for each platform
for platform in platforms:
    print(f"\nAnalyzing {platform}")
    platform_df = platform_data[platform]
    X = platform_df[features]
    y = platform_df[target]
    
    # Scale features
    scaler = RobustScaler()
    X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)
    
    # Train XGBoost model with cross-validation
    xgb_model = train_xgboost_with_cv(X_scaled, y)
    
    # Feature importance
    feature_importance = pd.DataFrame({
        'feature': features,
        'importance': xgb_model.feature_importances_
    }).sort_values('importance', ascending=False)
    
    print(f"\nTop 10 Important Features for {platform}:")
    print(feature_importance.head(10))
    
    # Final evaluation on the entire dataset
    y_pred = xgb_model.predict(X_scaled)
    evaluate_model(y, y_pred, f"XGBoost for {platform}")

# Analyze Google Ads data in more detail
google_ads_data = platform_data['Google Ads']
print("\nGoogle Ads Data Analysis:")
print(google_ads_data[['ROAS', 'CVR', 'Revenue_per_Click', 'Impressions']].describe())

# Check for outliers in Google Ads ROAS
q1 = google_ads_data['ROAS'].quantile(0.25)
q3 = google_ads_data['ROAS'].quantile(0.75)
iqr = q3 - q1
lower_bound = q1 - 1.5 * iqr
upper_bound = q3 + 1.5 * iqr
outliers = google_ads_data[(google_ads_data['ROAS'] < lower_bound) | (google_ads_data['ROAS'] > upper_bound)]
print(f"\nNumber of outliers in Google Ads ROAS: {len(outliers)}")
print("Sample of outliers:")
print(outliers[['Date', 'ROAS', 'CVR', 'Revenue_per_Click', 'Impressions']].head())


Analyzing Google Ads
Cross-validation R-squared scores: [0.9375084268094541, 0.9545172258298527, 0.9685331916020812, 0.8696005397955601, 0.9550630940720943]
Mean R-squared: 0.9370

Top 10 Important Features for Google Ads:
                    feature  importance
32        Revenue_per_Click    0.606541
37   Revenue_per_Impression    0.264844
4                       CPC    0.043288
8                     month    0.029092
0               Impressions    0.011762
20   Impressions_rolling_7d    0.009954
1                    Clicks    0.008162
31        Click_Spend_Ratio    0.006530
21  Impressions_rolling_30d    0.005207
16         Spend_rolling_7d    0.003772
XGBoost for Google Ads Performance:
RMSE: 0.0009
MAE: 0.0006
R-squared: 1.0000
--------------------

Analyzing Microsoft Ads
Cross-validation R-squared scores: [0.7748412426121468, 0.9620580079788191, 0.7533622196453772, 0.946755563527272, 0.5069592052153135]
Mean R-squared: 0.7888

Top 10 Important Features for Microsoft Ads:
       