In [None]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "last"

In [44]:
# imports
import sys
sys.path.append('./utils')
from amg_utils import * 
import os
import pickle
import yaml

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from darts import TimeSeries
from darts.models import LightGBMModel, LinearRegressionModel, RegressionModel, XGBModel, CatBoostModel 
from sklearn.linear_model import BayesianRidge, Ridge 

from darts.utils import missing_values
from darts.dataprocessing.transformers import StaticCovariatesTransformer
from darts.explainability.shap_explainer import ShapExplainer
from darts.metrics import rmse, mae


from sklearn.model_selection import train_test_split
from typing import List

In [None]:
## sql 

# fb_tonic_daily_perf_query = """   
   
#    select    
#    a.eventdate,
#    b.fb_created,  


#    a.buyer_initials,
#    a.buyer_name,
#    a.account_currency,
#    a.account_id,
#    a.account_name,
#    a.ad_id, 
#    a.ad_name,
#    a.adset_id,
#    a.adset_name,
#    a.campaign_id,
#    a.campaign_name,
#    --d.keyword, 
#    a.job_type,
#    COALESCE(a.fb_clicks_all, 0) as fb_clicks_all,
#    COALESCE(a.fb_impressions, 0) as fb_impressions,
#    COALESCE(a.fb_leads, 0) as fb_leads,
#    COALESCE(a.fb_link_click, 0) as fb_link_click,
#    a.fb_spend,
#    a.rev_clicks,
#    a.gross_revenue,
#    a.net_revenue,
#    a.fb_clicks_all / COALESCE(NULLIF(a.fb_impressions, 0), 1) as buy_side_ctr,
#    a.net_revenue / COALESCE(NULLIF(a.fb_spend, 0), 1) as roas,
#    a.net_revenue / COALESCE(NULLIF(a.fb_clicks_all, 0), 1) as rpc,
#    (a.fb_spend / COALESCE(NULLIF(a.fb_impressions, 0), 1))*1000 as buy_side_cpm,
#    a.fb_spend / COALESCE(NULLIF(a.fb_clicks_all, 0), 1) as cpc,
#    a.net_revenue / COALESCE(NULLIF(a.rev_clicks, 0), 1) as rpp,
#    a.fb_spend / COALESCE(NULLIF(a.rev_clicks, 0), 1) as cpp,
#    a.net_revenue - a.fb_spend as contrib, 
   
  
#    --b.end_time, 
#    b.budget,    
#    b.budget_type,   
#    b.status, 

#    c.devices,
#    c.countries,
#    c.network,
#    targeting_json:age_max::string AS age_max,
#    targeting_json:age_min::string AS age_min,
#    targeting_json:facebook_positions::string AS facebook_positions,
#    targeting_json:locales::string AS locales,
#    c.adset_schedule,   
#    c.lifetime_budget

#    from PRODUCTION.BD_S2CINTERNAL.v_cm_fb_tonic_daily_combine a 
#    left join SEM_TOOLS.cm_fb_campaign_management.ad_set_latest b on a.adset_id = b.id  and a.campaign_id = b.campaign_id and a.adset_name = b.name
#    left join PRODUCTION.BD_S2CINTERNAL.V_FACEBOOK_ADSET_METADATA c on a.adset_id = c.id and a.eventdate = c.event_date
#   -- left join PRODUCTION.BD_S2CINTERNAL.V_CM_TONIC_KEYWORD_DATA d on d.fb_campaign_name = a.campaign_name
#    where a.job_type= 'final'    
#      and a.buyer_initials = 'GN'   

#   order by  ad_id asc , eventdate asc  
#    """
# fb_tonic_daily_perf = eq(fb_tonic_daily_perf_query)

In [None]:
# load data

# fb_tonic_daily_perf = pd.read_csv("fb_tonic_daily_perf_query.csv", index_col=0, parse_dates=['eventdate'])

# # Crate campaign duration in days variable
# fb_tonic_daily_perf.insert(2, 'campaign_duration', fb_tonic_daily_perf.groupby('campaign_id')['eventdate'].transform(lambda x: (x.max() - x.min()).days))

# # Create campaign end date variable
# fb_tonic_daily_perf.insert(2, 'campaign_end_date', fb_tonic_daily_perf.groupby('campaign_id')['eventdate'].transform("max"))

# # Filter out campaigns with less than {min_campaign_duration} days duration
min_campaign_duration = 4 
# series = fb_tonic_daily_perf[fb_tonic_daily_perf['campaign_duration'] > 4].copy()
# series.ad_id = series.ad_id.astype('str')

# static_cols = [      
# "network",
# "devices"]     

# def one_hot_encode(df, cols):
#     encoded = pd.get_dummies(df[cols])
#     df = df.drop(columns=cols, axis=1)
#     df = pd.concat([df, encoded], axis=1)
#     return df

# series =  one_hot_encode(series, static_cols)

series = pd.read_csv('series.csv', index_col=0, parse_dates=['eventdate'], dtype={'ad_id': str})

In [None]:
## stats 

dataset = "gn_fb_tonic"
print(f"Dataset: {dataset}")
# Total number of campaigns
print(f"Total number of campaigns: {series.ad_id.nunique()}")
# Ads starting from date ... 
print(f"Min eventdate: {series.eventdate.min().date()}") 
print(f"Max eventdate: {series.eventdate.max().date()}")
days_between = (series.eventdate.max().date() - series.eventdate.min().date()).days
print(f"Days between min and max eventdate: {days_between}")
print()
# Duration of campaigns 
campaign_duration_count = series.groupby(["ad_id", "campaign_duration"]).size().value_counts().sort_index().reset_index(name='count').rename(columns={'index':'campaign_duration (days)'})
print(f"Duration of campaigns (filtered to only campaigns of {min_campaign_duration} or more days): \n{campaign_duration_count}")
print()
# mean campaign duration 
print(f"Mean campaign duration: {series.campaign_duration.mean().round(1)} days")

series['fb_created_date'] = pd.to_datetime(series['fb_created']).dt.date
print()

# number of unique ad_ids per date 
campaigns_created_per_date = series.groupby('fb_created_date').ad_id.nunique().reset_index(name='count').sort_values('fb_created_date', ascending=True)
print(f"Number of campaigns created per date: \n {campaigns_created_per_date}")
positive_contrib_ads = series.groupby('ad_id')['contrib'].sum().reset_index()
print()
positive_contrib_ads = positive_contrib_ads[positive_contrib_ads['contrib'] > 0]['ad_id']
print(f"Number of campaigns with positive contribution: \n {positive_contrib_ads.nunique()}")
# print(positive_contrib_ads)
# peek(series[series['ad_id'].isin(positive_contrib_ads[positive_contrib_ads['contrib'] > 0]['ad_id'])])

# peek(series[series['ad_id'] == '120204217187110410'])


# TODO

# extended campaings 


Dataset: gn_fb_tonic
Total number of campaigns: 146
Min eventdate: 2024-01-17
Max eventdate: 2024-02-13
Days between min and max eventdate: 27

Duration of campaigns (filtered to only campaigns of 4 or more days): 
   campaign_duration (days)  count
0                         5      3
1                         6     16
2                         7     42
3                         8     71
4                         9      9
5                        10      2
6                        15      1
7                        16      1
8                        21      1

Mean campaign duration: 7.2 days

Number of campaigns created per date: 
   fb_created_date  count
0      2024-01-18      9
1      2024-01-19     13
2      2024-01-24     39
3      2024-01-25     24
4      2024-01-26     22
5      2024-01-30     39

Number of campaigns with positive contribution: 
 3


In [None]:
# train/val split 

unique_ad_ids = series['ad_id'].unique()

train_ad_ids, val_ad_ids = train_test_split(unique_ad_ids, test_size=0.3, random_state=42)

train = series[series['ad_id'].isin(train_ad_ids)]
val = series[series['ad_id'].isin(val_ad_ids)]

print(f"train: {(train['ad_id'].nunique()/series['ad_id'].nunique())}")
print(f"val: {val['ad_id'].nunique()/series['ad_id'].nunique()}")

print(f"train ads: {train.ad_id.nunique()}")
print(f"val ads: {val.ad_id.nunique()}")


train: 0.6986301369863014
val: 0.3013698630136986
train ads: 102
val ads: 44


In [None]:
# set up timeseries for darts 

time_col = 'eventdate'
group_cols = 'ad_id'
static_cols = ['age_max',
               'age_min',  
                'network_["facebook","instagram"]',   
                'network_["facebook"]',
                'devices_["mobile","desktop"]',
                'devices_["mobile"]'
                ]

# value_col = ['net_revenue', 'fb_spend']

value_col = "roas"

past_covariates = ['fb_spend', 
                   'fb_clicks_all',
                   'fb_impressions',
                   'fb_leads',
                   'rev_clicks',
                   'fb_link_click',                
                   ]

# future_covariates = ["campaign_duration"]


train_ts = TimeSeries.from_group_dataframe(
                            train,
                            time_col= time_col,
                            group_cols= group_cols,
                            static_cols= static_cols,
                            value_cols= value_col ,
                            fill_missing_dates=True,                            
                            freq='D')


val_ts = TimeSeries.from_group_dataframe(
                            val,
                            time_col= time_col,
                            group_cols= group_cols,
                            static_cols= static_cols,
                            value_cols= value_col ,
                            fill_missing_dates=True,                            
                            freq='D')


past_covariates_ts = TimeSeries.from_group_dataframe(
    train,
    time_col=time_col,    
    group_cols=group_cols,
    static_cols= static_cols,
    value_cols=past_covariates,
    freq='D'  
)


past_covariates_vs = TimeSeries.from_group_dataframe(
    val,
    time_col=time_col,
    group_cols=group_cols,
     static_cols= static_cols,
    value_cols=past_covariates,
    freq='D'  
)


# fill missing values darts 

for i, ts in enumerate(train_ts):    
    if not ts.gaps().empty:        
        train_ts[i] = missing_values.fill_missing_values(ts)

for i, ts in enumerate(val_ts):    
    if not ts.gaps().empty:        
        val_ts[i] = missing_values.fill_missing_values(ts)
        
for i, ts in enumerate(past_covariates_ts):    
    if not ts.gaps().empty:        
        past_covariates_ts[i] = missing_values.fill_missing_values(ts)

for i, ts in enumerate(past_covariates_vs):    
    if not ts.gaps().empty:        
        past_covariates_vs[i] = missing_values.fill_missing_values(ts)



transformer = StaticCovariatesTransformer()
train_ts = transformer.fit_transform(train_ts)
val_ts = transformer.fit_transform(val_ts)
past_covariates_ts = transformer.fit_transform(past_covariates_ts)       
past_covariates_vs = transformer.fit_transform(past_covariates_vs)       

In [None]:
## check missing values in timeseries

# for i, ts in enumerate(past_covariates_ts):
#     dataframe = ts.pd_dataframe()  # Convert each TimeSeries to a DataFrame
#     has_nans = dataframe.isna().values.any()  # Check for NaN values
#     print(f"TimeSeries {i} contains NaN values? {has_nans}")
# nan_series = [dataframe.isna().any().any() for dataframe in [ts.pd_dataframe() for ts in all_campaigns_ts]]


In [None]:
with open("experiment_params.yaml", 'r') as stream:
    try:
        experiment_params = yaml.safe_load(stream)
    except yaml.YAMLError as exc:
        print(exc)

In [None]:
def eval_global_model(
    train_ts: List[TimeSeries],    
    val_ts: List[TimeSeries],
    val:pd.DataFrame,
    model_cls,     
    past_covariates_ts: List[TimeSeries],
    past_covariates_vs: List[TimeSeries],    
    model_params: dict,
    forecast_params: dict,  
   

) -> pd.DataFrame:

    
    model = model_cls(**model_params)

    if model_params.get("lags_past_covariates") is not None:
        model.fit(train_ts, past_covariates_ts)
    else:
        model.fit(train_ts)        
    
    backtest = model.historical_forecasts(
                series = val_ts,
                past_covariates = past_covariates_vs,                
               **forecast_params
               )
    
    unique_ad_ids = val['ad_id'].unique()
    positions_df = pd.DataFrame({'ad_id': unique_ad_ids, 'position': range(len(unique_ad_ids))})

    df = pd.DataFrame()

    for i, series in enumerate(backtest):
        backtest_ad_id = positions_df.iloc[i]['ad_id']
        # Convert the TimeSeries object to a DataFrame
        forecast_df = series.pd_dataframe().rename(columns={'roas': 'roas_forecasted'})
        
        # Merge the actual values with the forecasted values
        temp_df = pd.merge(val[val['ad_id'] == backtest_ad_id].sort_values(by='eventdate')[['ad_id', 'eventdate', 'roas']].rename(columns={'roas': 'roas_actual'}),
                        forecast_df, left_on='eventdate', right_index=True, how='left')
        
        # Calculate cumulative sums and residuals
        temp_df['roas_act_cum'] = temp_df['roas_actual'].cumsum()
        temp_df['roas_fcst_cum'] = temp_df['roas_forecasted'].cumsum()
        temp_df['residuals'] = temp_df['roas_forecasted'] - temp_df['roas_actual']
        
        # Add model information
        if model.__class__.__name__ == 'RegressionModel':
            temp_df['model'] = model.model.__class__.__name__
        else:
            temp_df['model'] = model.__class__.__name__
        temp_df['params'] = temp_df.apply(lambda x: {'model_params': model_params, 'forecast_params': forecast_params}, axis=1)
        
        # Append the results to the main DataFrame
        df = pd.concat([df, temp_df], ignore_index=True)

    return df, backtest

In [None]:
def make_forecast_plots(val_ts, backtest):
    
    num_plots = len(val_ts)
    num_cols = 4
    num_rows = num_plots // num_cols + (num_plots % num_cols > 0)

    fig, axs = plt.subplots(nrows=num_rows, ncols=num_cols, figsize=(20, num_rows*5))
    for i, ax in enumerate(axs.flatten()):
        if i < num_plots:
            val_ts[i].plot(ax=ax, label='Actual ROAS')        
            backtest[i].plot(ax=ax, label='Forecasted ROAS')            
            ax.legend()
        else:
            ax.axis('off')
    # plt.tight_layout()
    # plt.show()
    return fig    


In [None]:
def perf_per_ad(
    forecast: pd.DataFrame):
    grouped = forecast.groupby('ad_id').agg(
        # contrib=('roas_actual', 'sum'),
        # predicted_contrib=('roas_forecasted', 'sum'),
        cumulative_roas_actual=('roas_act_cum', 'last'),
        cumulative_roas_predicted=('roas_fcst_cum', 'last'),
        MAE=('residuals', lambda x: (x.abs()).mean()),
        RMSE=('residuals', lambda x: ((x**2).mean())**0.5)
    ).reset_index()
    grouped.columns = [
        'ad_id', 
        # 'contrib', 
        # 'predicted_contrib', 
        'cumulative_roas_actual', 'cumulative_roas_predicted', 'MAE', 'RMSE']
    return grouped

In [None]:
model_list = [

    ("linear_reg", LinearRegressionModel),
    # ("lgbm", LightGBMModel),
    ("ridge",Ridge),
    ("bayes_ridge", BayesianRidge),
    # ("xgboost",XGBModel),
    # ("catboost", "CatBoostModel")
]

In [None]:
def calculate_metrics_and_create_df(val_ts, backtest, model_params):
    rmse_values = [rmse(val_ts[i], backtest[i]) for i in range(len(val_ts))]
    mae_values = [mae(val_ts[i], backtest[i]) for i in range(len(val_ts))]

    # Calculate metrics
    average_rmse = np.mean(rmse_values)
    median_rmse = np.median(rmse_values)
    std_rmse = np.std(rmse_values)
    average_mae = np.mean(mae_values)
    median_mae = np.median(mae_values)
    std_mae = np.std(mae_values)

    # Create DataFrame
    metrics_df = pd.DataFrame({
        'Model Params': [model_params],
        'MAE': [average_mae],
        'RMSE': [average_rmse],
        'Median MAE': [median_mae],
        'Median RMSE': [median_rmse],
        'Std MAE': [std_mae],
        'Std RMSE': [std_rmse]
    })

    return metrics_df

In [43]:
def run_experiments(
    model_list,    
    train_ts: List[TimeSeries],    
    val_ts: List[TimeSeries],
    val: pd.DataFrame,     
    past_covariates_ts: List[TimeSeries],
    past_covariates_vs: List[TimeSeries],    
    experiment_params: dict or list,
):
    results_forecasts_dict = {}
    metrics_df = pd.DataFrame()      

    if isinstance(experiment_params, dict):
        experiment_params = [experiment_params]
    
    results_dir = "exp_results"
    if not os.path.exists(results_dir):
        os.makedirs(results_dir)
    
    for model_name, model_class in model_list:   
        for params_set in experiment_params:
            model_params = params_set.get('model_params', {})
            forecast_params = params_set.get('forecast_params', {})

            if model_class in [Ridge, BayesianRidge, LinearRegressionModel]: 
                for key in ["categorical_static_covariates", "verbose", "likelihood", "quantiles", "random_state"]:
                    model_params.pop(key, None)
            if model_class in [Ridge, BayesianRidge]: 
                model = RegressionModel
                model_params['model'] = model_class()
            else:
                model = model_class
            
            forecast, backtest = eval_global_model(
                train_ts,    
                val_ts,
                val,
                model, 
                past_covariates_ts,
                past_covariates_vs,    
                model_params,
                forecast_params
            )
            
            params_set_name = f"params_{experiment_params.index(params_set) + 1}"
            forecast.to_csv(f"{results_dir}/{model_name}_{params_set_name}_forecast.csv")
            perf_per_ad(forecast).to_csv(f"{results_dir}/{model_name}_{params_set_name}_results_per_ad.csv")
            with open(f"{results_dir}/{model_name}_{params_set_name}_backtest.pkl", 'wb') as f:
                pickle.dump(backtest, f)

            results_forecasts_dict[f"{model_name}_{params_set_name}_forecast"] = forecast
            results_forecasts_dict[f"{model_name}_{params_set_name}_backtest"] = backtest
            
            fig = make_forecast_plots(val_ts, backtest)
            fig.savefig(f"{results_dir}/{model_name}_{params_set_name}_backtest.png")

            temp_metrics_df = calculate_metrics_and_create_df(val_ts, backtest, model_params)
            temp_metrics_df['Model Name'] = model_name
            temp_metrics_df['Params Set Name'] = params_set_name
            metrics_df = pd.concat([metrics_df, temp_metrics_df], ignore_index=True)
            plt.close(fig)

            shap_explain = ShapExplainer(model, train_ts, past_covariates_ts)
            fig=plt.gcf()
            shap_explain.summary_plot(show=False,)
            fig.savefig(f"{results_dir}/{model_name}_{params_set_name}_shap_summary.png")
            plt.close('all')

    return results_forecasts_dict, metrics_df

In [None]:
# results_forecasts_dict, metrics_df = run_experiments(
#     model_list,
#     train_ts,
#     val_ts,
#     val,
#     past_covariates_ts,
#     past_covariates_vs,
#     experiment_params,
#     )

In [None]:
# metrics_df