In [None]:
import sys
sys.path.append('..')

import pandas as pd
import pathlib
from definitions import ROOT_DIR
import matplotlib.pyplot as plt
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import QuantileRegressor

In [None]:
%history -f history.txt

In [None]:
def getScore_nips(reward, cpa, cpa_constraint, beta=2):
    """Return the reward, discounted when the CPA constraint is violated.

    Args:
        reward: Raw reward to be scored.
        cpa: Realised cost per acquisition.
        cpa_constraint: Maximum allowed cost per acquisition.
        beta: Exponent applied to the violation ratio. Defaults to 2, the
            value that used to be hard-coded in the body.

    Returns:
        ``reward`` unchanged when ``cpa <= cpa_constraint`` (or when the
        comparison is False, e.g. for NaN); otherwise
        ``reward * (cpa_constraint / (cpa + 1e-10)) ** beta``.
    """
    penalty = 1
    if cpa > cpa_constraint:
        # The 1e-10 term guards the division against a zero denominator.
        coef = cpa_constraint / (cpa + 1e-10)
        penalty = pow(coef, beta)
    return penalty * reward

In [None]:
# Load the selected delivery periods, keeping only rows with isExposed == 1.
period_list = list(range(7, 12))
df_list = []
for period in period_list:
    data_path = ROOT_DIR / "data" / "raw_traffic_parquet" / f"period-{period}.parquet"
    # A dedicated name for the per-file frame: `df` is reserved for the merged result.
    period_traffic = pd.read_parquet(data_path)
    df_list.append(period_traffic[period_traffic.isExposed == 1])
# ignore_index=True gives the merged frame a fresh unique row index; without it
# row labels coming from different files may collide.
df = pd.concat(df_list, ignore_index=True)

In [None]:
# `df` was already restricted to exposed rows when it was loaded, so no extra
# isExposed filter is applied here.
beta_hat = df.pValue

# pValue divided by its sigma, restricted to rows whose sigma exceeds 1e-4.
sigma_ok = df.pValueSigma > 1e-4
beta_hat_stoch = beta_hat[sigma_ok] / df.pValueSigma[sigma_ok]

# Two histograms of pValue: |pValue| below 0.5 first, then above 0.5.
abs_beta = np.abs(beta_hat)
for selection in (abs_beta < 0.5, abs_beta > 0.5):
    plt.hist(beta_hat[selection], bins=100)
    plt.show()

In [None]:
plt.hist(beta_hat_stoch[beta_hat_stoch > 0.5], bins=20)
plt.show()

In [None]:
# plt.hist(df.pValueSigma / df.pValue, bins=100)
plt.plot(df.pValue, df.pValueSigma, '.')
plt.show()

In [None]:
# 50 equally spaced pValue bin edges spanning the observed range.
pp = np.linspace(np.min(df.pValue), np.max(df.pValue), 50)

# Left panel: mean conversion vs. mean pValue per bin; right panel: bin counts.
fig, axs = plt.subplots(1, 2, figsize=(10, 5))
for slot in (3, 2, 1):
    slot_df = df[df.adSlot == slot]
    conversion_mean_list, pvalue_mean_list, count_list = [], [], []
    for lower, upper in zip(pp[:-1], pp[1:]):
        # Both bin edges are inclusive.
        in_bin = slot_df.pValue.between(lower, upper)
        conversion_mean_list.append(slot_df.conversionAction[in_bin].mean())
        pvalue_mean_list.append(slot_df.pValue[in_bin].mean())
        count_list.append(in_bin.sum())
    axs[0].plot(pvalue_mean_list, conversion_mean_list, label=slot)
    axs[1].plot(pvalue_mean_list, count_list, label=slot)
# Identity line in red, drawn over the bin means of the last slot plotted.
axs[0].plot(pvalue_mean_list, pvalue_mean_list, "r")
plt.legend()
plt.show()

In [None]:
# One calibration figure per delivery period. Iterate over `period_list` (the
# periods that were actually loaded into `df`) rather than a fixed 7..27 range,
# which produced an empty figure for every period missing from `df`.
for period in period_list:
    print(f"Period {period}")
    fig, axs = plt.subplots(1, 2, figsize=(10, 5))
    period_df = df[df["deliveryPeriodIndex"] == period]
    for slot in range(3, 0, -1):
        conversion_mean_list = []
        pvalue_mean_list = []
        count_list = []
        slot_df = period_df[period_df.adSlot == slot]
        # `pp` holds the pValue bin edges; both edges of a bin are inclusive.
        for l, r in zip(pp[:-1], pp[1:]):
            mask = (slot_df.pValue >= l) & (slot_df.pValue <= r)
            conversion_mean_list.append(slot_df.conversionAction[mask].mean())
            pvalue_mean_list.append(slot_df.pValue[mask].mean())
            count_list.append(mask.sum())
        axs[0].plot(pvalue_mean_list, conversion_mean_list, label=slot)
        axs[1].plot(pvalue_mean_list, count_list, label=slot)
    # Identity line in red, drawn over the bin means of the last slot plotted.
    axs[0].plot(pvalue_mean_list, pvalue_mean_list, "r")
    axs[0].set_ylim((0, 0.006))
    plt.legend()
    plt.show()

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
import numpy as np

# No outlier trimming is applied; the commented line is the optional pValue cut.
# red_df = df[df.pValue < 4e-3]
red_df = df
pv, pvs = (red_df[column].to_numpy() for column in ("pValue", "pValueSigma"))

# Feature matrix: pValue, sigma, their product, and both squares (one column each).
X = np.stack([pv, pvs, pv * pvs, pv ** 2, pvs ** 2], axis=1)
# X = red_df.pValue.to_numpy().reshape(-1, 1)
y = red_df.conversionAction.to_numpy()  # regression target

# Ordinary least squares; the logistic alternative is kept for reference.
# model = LogisticRegression(C=1)
model = LinearRegression().fit(X, y)

# # Predict the probability (beta)
# predicted_beta = model.predict_proba(X)[:, 1]  # Probability of conversion (beta)


In [None]:
coef = model.coef_
coef

In [None]:
model.predict(X)

In [None]:
plt.plot(pv, model.predict(X), ".")
plt.plot(np.linspace(min(pv), max(pv), 10), np.linspace(min(pv), max(pv), 10))
# plt.plot(pv, y, '.')

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
import numpy as np

# Balance the two classes: keep every row with conversionAction == 1 and an
# equally sized random sample (without replacement) of rows with conversionAction == 0.
success_df = df[df.conversionAction == 1].reset_index()
failure_df = df[df.conversionAction == 0].reset_index()
sampled_rows = np.random.choice(failure_df.index, len(success_df), replace=False)
failure_df = failure_df.loc[sampled_rows]
red_df = pd.concat((success_df, failure_df))

# Shuffle features and targets with one shared permutation.
shuffle_indices = np.random.permutation(len(red_df))
X = np.column_stack((red_df.pValue.to_numpy(), red_df.pValueSigma.to_numpy()))[shuffle_indices]
# X = red_df.pValue.to_numpy().reshape(-1, 1)
y = red_df.conversionAction.to_numpy()[shuffle_indices]

# Linear fit on (pValue, pValueSigma); the logistic alternative is kept for reference.
# model = LogisticRegression()
model = LinearRegression()
model.fit(X, y)

# # Predict the probability (beta)
# predicted_beta = model.predict_proba(X)[:, 1]  # Probability of conversion (beta)


In [None]:
plt.hist(success_df.pValue, bins=30, label="success", alpha=0.3)
plt.hist(failure_df.pValue, bins=30, label="failure", alpha=0.3)
plt.legend()
plt.show()

In [None]:
model.coef_

In [None]:
success_df = df[df.conversionAction == 1].reset_index()
success_df = success_df.loc[np.random.choice(success_df.index, 10000, replace=False)]
failure_df = df[df.conversionAction == 0].reset_index()
failure_df = failure_df.loc[np.random.choice(failure_df.index, 10000, replace=False)]
balanced_df = pd.concat([success_df, failure_df])

plt.hist(success_df.pValue, bins=30, alpha=0.5, label="success")
plt.hist(failure_df.pValue, bins=30, alpha=0.5, label="failure")
plt.legend()
plt.show()


In [None]:
failure_df

In [None]:
slot_df = df.groupby("adSlot").mean()
slot_df["pValueAdjusted"] = slot_df["pValue"] * slot_df["isExposed"]
slot_df

In [None]:
# Seems like all slots are more or less equally effective
slot_df.pValue / slot_df.cost

In [None]:
slot_df.conversionAction / (slot_df.cost * slot_df.isExposed)

In [None]:
slot_df.pValueAdjusted / slot_df.conversionAction

In [None]:
from scipy.stats import norm

def expected_truncated_gaussian(pvalues, pvalue_sigmas):
    """Mean of a Gaussian N(pvalue, sigma^2) truncated below at zero.

    Computes ``pvalue + sigma * pdf(alpha) / (1 - cdf(alpha))`` element-wise
    with ``alpha = -pvalue / sigma`` (standard normal pdf/cdf).

    Args:
        pvalues: Array-like of means.
        pvalue_sigmas: Array-like of standard deviations, same shape as
            ``pvalues``. Where sigma is exactly 0 the mean is returned as is.

    Returns:
        Float numpy array of expected values, same shape as the inputs.
    """
    # Work in floating point: with integer input, an integer `alpha` buffer
    # would silently truncate the ratio below.
    pvalues = np.asarray(pvalues, dtype=float)
    pvalue_sigmas = np.asarray(pvalue_sigmas, dtype=float)

    # alpha stays 0 where sigma == 0; the sigma factor then cancels the correction.
    nonzero_sigma = pvalue_sigmas != 0
    alpha = np.zeros_like(pvalues)
    alpha[nonzero_sigma] = -pvalues[nonzero_sigma] / pvalue_sigmas[nonzero_sigma]

    # pdf(alpha) / (1 - cdf(alpha)), evaluated in log space: forming 1 - cdf
    # directly rounds to 0 for large alpha and yields inf/NaN.
    hazard = np.exp(norm.logpdf(alpha) - norm.logsf(alpha))

    return pvalues + pvalue_sigmas * hazard

In [None]:
# For each ad slot, plot (mean pValue * exposed fraction) / (mean conversion)
# across the loaded delivery periods.
for ad_slot in (1, 2, 3):
    pvalue_list, conversion_list, eff_pvalue_list = [], [], []
    for period in period_list:
        period_df = df[df["deliveryPeriodIndex"] == period]
        slot_rows = period_df[period_df.adSlot == ad_slot]
        pvalues = slot_rows.pValue.to_numpy()
        pvalue_sigmas = slot_rows.pValueSigma.to_numpy()
        conversions = slot_rows.conversionAction.to_numpy()
        exposed_frac = slot_rows.isExposed.to_numpy().mean()
        pvalue_list.append(pvalues.mean())
        conversion_list.append(conversions.mean())
        eff_pvalue_list.append(pvalues.mean() * exposed_frac)
    ratio_list = [eff / conv for eff, conv in zip(eff_pvalue_list, conversion_list)]
    plt.plot(period_list, ratio_list)
        # exp_vals = expected_truncated_gaussian(pvalues, pvalue_sigmas)
        # print(f"Period {period}, exp_vals: {exp_vals.mean()}, p-values: {pvalues.mean()}, conversions: {conversions.mean()}")

In [None]:
# One figure per delivery period: for each ad slot, plot
# (mean pValue * exposed fraction) / (mean conversion) against advertiser number.
advertiser_list = list(range(0, 48))
for period in period_list:
    period_df = df[df["deliveryPeriodIndex"] == period]
    for ad_slot in (1, 2, 3):
        pvalue_list, conversion_list, eff_pvalue_list = [], [], []
        for advertiser in advertiser_list:
            advertiser_df = period_df[period_df.advertiserNumber == advertiser]
            slot_rows = advertiser_df[advertiser_df.adSlot == ad_slot]
            pvalues = slot_rows.pValue.to_numpy()
            pvalue_sigmas = slot_rows.pValueSigma.to_numpy()
            conversions = slot_rows.conversionAction.to_numpy()
            exposed_frac = slot_rows.isExposed.to_numpy().mean()
            pvalue_list.append(pvalues.mean())
            conversion_list.append(conversions.mean())
            eff_pvalue_list.append(pvalues.mean() * exposed_frac)
        ratio_list = [eff / conv for eff, conv in zip(eff_pvalue_list, conversion_list)]
        plt.plot(advertiser_list, ratio_list)
    plt.show()
        # exp_vals = expected_truncated_gaussian(pvalues, pvalue_sigmas)
        # print(f"Period {period}, exp_vals: {exp_vals.mean()}, p-values: {pvalues.mean()}, conversions: {conversions.mean()}")

In [None]:
(period_df[period_df.adSlot == 3].conversionAction == 0).all()

In [None]:
exp_vals = expected_truncated_gaussian(pvalues, pvalue_sigmas)

In [None]:
# Mean of the truncated-Gaussian expectations stored in `exp_vals`
# (the previous name `exp_als` was a typo and raised NameError).
np.mean(exp_vals)

### All periods pv / E[conv] per slot
adSlot
0.0         NaN
1.0    1.043351
2.0    1.037957
3.0    1.026732
dtype: float32

In [None]:
df.groupby("advertiserNumber").agg({
    "budget": ["mean", "min", "max"],
    "CPAConstraint": ["mean", "min", "max"],
})

In [None]:
df.groupby("advertiserNumber").agg({
    "budget": ["mean", "min", "max"],
    "CPAConstraint": ["mean", "min", "max"],
})

In [None]:
df.keys()

In [None]:
# Convert all traffic data to parquet
first_period = 7
last_period = 27
raw_csv_dir = ROOT_DIR / "data" / "raw_traffic_final"
parquet_dir = ROOT_DIR / "data" / "raw_traffic_final_parquet"
# Create the output directory once, up front, rather than on every iteration.
pathlib.Path(parquet_dir).mkdir(parents=True, exist_ok=True)
for period in range(first_period, last_period + 1):
    data_path = raw_csv_dir / f"period-{period}.csv"
    print(f"Loading {data_path}")
    # Dedicated name so this conversion does not overwrite the analysis frame `df`.
    period_csv_df = pd.read_csv(data_path, dtype="float32")
    out_path = parquet_dir / f"period-{period}.parquet"
    print(f"Saving to {out_path}")
    period_csv_df.to_parquet(out_path)
    

In [None]:
# # Merge all the traffic data into a single dataframe (converted to float32 for memory efficiency)
# df_list = []
# for period in range(7, 14):
#     data_path = ROOT_DIR / "data" / "traffic" / f"period-{period}.csv"
#     print(f"Loading {data_path}")
#     df = pd.read_csv(data_path, dtype="float32")
#     df_list.append(df)

# # Create a single dataframe
# print("Concatenating dataframes")
# df = pd.concat(df_list, ignore_index=True).reset_index()
# df.drop(columns=["index"], inplace=True)

# # Save the dataframe in an efficient format
# print("Saving dataframe")
# df.to_parquet(ROOT_DIR / "data" / "traffic" / "all_periods.parquet")

In [None]:
df = pd.read_csv(ROOT_DIR / "data" / "traffic" / "training_data_16" / "training_data_all-rlData.csv")

In [None]:
import ast


def safe_literal_eval(val):
    """Parse ``val`` as a Python literal, returning it unchanged on failure.

    Args:
        val: Value to parse, typically a string holding a Python literal.

    Returns:
        ``val`` itself when ``pd.isna(val)`` is true; the parsed literal when
        ``ast.literal_eval`` succeeds; otherwise ``val`` unchanged, after
        printing the parsing error.
    """
    if pd.isna(val):
        return val  # missing value (NaN): pass it through untouched
    try:
        return ast.literal_eval(val)
    except (ValueError, SyntaxError) as exc:
        # Report the actual error; printing the bare `ValueError` class said nothing.
        print(f"{type(exc).__name__}: {exc}")
        return val
    
safe_literal_eval(df["state"].loc[0]).__len__()

In [None]:
# Load all periods df
df = pd.read_parquet(ROOT_DIR / "data" / "traffic" / "all_periods.parquet")

In [None]:
# Print limits for budget and target cpa
budget_limits = (df.budget.min(), df.budget.max())
cpa_limits = (df.CPAConstraint.min(), df.CPAConstraint.max())
print("Budget lim:", budget_limits, "CPA lim:", cpa_limits)

In [None]:
df.head()

In [None]:
# Predicted conversion per row: pValue multiplied by isExposed.
conversion_action_pred = df["pValue"] * df["isExposed"]
df["conversionAction_pred"] = conversion_action_pred

# Aggregate to one row per (advertiserNumber, deliveryPeriodIndex).
campaign_keys = ["advertiserNumber", "deliveryPeriodIndex"]
campaign_aggs = {
    "cost": "sum",
    "conversionAction": "sum",
    "CPAConstraint": "mean",
    "budget": "mean",
    "conversionAction_pred": "sum",
    "advertiserCategoryIndex": "mean",
    "pValue": "mean",
}
cpa_df = df.groupby(campaign_keys).agg(campaign_aggs).reset_index()

# Realised CPA: total cost divided by total conversions.
cpa_df["cpa"] = cpa_df["cost"] / cpa_df["conversionAction"]
# cpa_df.set_index("advertiserNumber", inplace=True)


def _campaign_score(row):
    """Score one aggregated row with getScore_nips (reward = total conversions)."""
    return getScore_nips(row["conversionAction"], row["cpa"], row["CPAConstraint"])


cpa_df["score"] = cpa_df.apply(_campaign_score, axis=1)
cpa_df

In [None]:
# Plot score as a function of budget (the budget per advertiser is constant)
fig, ax = plt.subplots()
for advertiser in cpa_df["advertiserNumber"].unique():
    advertiser_df = cpa_df[cpa_df["advertiserNumber"] == advertiser]
    # Plot mean +- std of the score
    budget = advertiser_df["budget"].mean()
    score = advertiser_df["score"].mean()
    std = advertiser_df["score"].std()
    ax.errorbar(budget, score, yerr=std, fmt='o', label=f"Advertiser {advertiser}")
    
ax.set_xlabel("Budget")
ax.set_ylabel("Score")
    # ax.plot(advertiser_df["budget"], advertiser_df["score"], label=f"Advertiser {advertiser}")


In [None]:
filter_list = []

In [None]:
# Quantile regression (quantile 0.8) of score on log(budget).
quantile = 0.8
X = np.log(cpa_df["budget"]).values.reshape(-1, 1)
y = cpa_df["score"].values
# reg = LinearRegression(fit_intercept=True).fit(X, y)
reg = QuantileRegressor(quantile=quantile, alpha=0)
reg.fit(X, y)

# Rows whose score lies strictly above the fitted quantile line pass the filter.
score_pred = reg.predict(X)
budget_filter = cpa_df["score"] > score_pred
filter_list.append(budget_filter)
print(f"Fraction of dataset with score > regressed value: {budget_filter.mean()}")

# Scatter of score vs. budget (green = passes, blue = fails) with the fitted line.
fig, ax = plt.subplots()

# Ordering by X so the fitted line is drawn left to right.
sort_idx = np.argsort(X, axis=0).flatten()
for mask, color in ((budget_filter, "green"), (~budget_filter, "blue")):
    ax.scatter(cpa_df["budget"][mask], cpa_df["score"][mask], color=color, alpha=0.2)
X_sorted = X[sort_idx]
ax.plot(np.exp(X_sorted), reg.predict(X_sorted), color="red")
ax.set_xlabel("Budget")
ax.set_ylabel("Score")
fig.show()

In [None]:
# Score threshold per CPAConstraint value: the 0.8 quantile of the scores in
# each group (the frame keeps its historical "median" name).
median_score_df = cpa_df.groupby("CPAConstraint").agg({"score": lambda x: np.quantile(x, 0.8)})
score_threshold = median_score_df.loc[cpa_df["CPAConstraint"]].values.flatten()
cpa_filter = cpa_df["score"] > score_threshold
filter_list.append(cpa_filter)

# Score against CPAConstraint; green points pass the filter, blue ones do not.
x_val = cpa_df["CPAConstraint"]
fig, ax = plt.subplots()
for mask, color in ((cpa_filter, "green"), (~cpa_filter, "blue")):
    ax.scatter(x_val[mask], cpa_df["score"][mask], color=color, alpha=0.2)
ax.set_xlabel("CPAConstraint")
ax.set_ylabel("Score")
fig.show()

In [None]:
# Score threshold per CPAConstraint value (0.8 quantile within each group).
# The filter is computed on the untransformed CPAConstraint; only the x-axis
# of the plot below is transformed.
median_score_df = cpa_df.groupby("CPAConstraint").agg({"score": lambda x: np.quantile(x, 0.8)})
score_threshold = median_score_df.loc[cpa_df["CPAConstraint"]].values.flatten()
cpa_filter = cpa_df["score"] > score_threshold
filter_list.append(cpa_filter)

# Score against |CPAConstraint - 8|; green points pass the filter, blue ones do not.
x_val = (cpa_df["CPAConstraint"] - 8).abs()
fig, ax = plt.subplots()
for mask, color in ((cpa_filter, "green"), (~cpa_filter, "blue")):
    ax.scatter(x_val[mask], cpa_df["score"][mask], color=color, alpha=0.2)
ax.set_xlabel("CPAConstraint - transformed")
ax.set_ylabel("Score")
fig.show()

In [None]:
# Score threshold per advertiser category: the 0.9 quantile of the scores in
# each category (the frame keeps its historical "median" name).
median_score_df = cpa_df.groupby("advertiserCategoryIndex").agg({"score": lambda x: np.quantile(x, 0.9)})
score_threshold = median_score_df.loc[cpa_df["advertiserCategoryIndex"]].values.flatten()
category_filter = cpa_df["score"] > score_threshold
filter_list.append(category_filter)

# Score against advertiser category; green points pass the filter, blue ones do not.
fig, ax = plt.subplots()
for mask, color in ((category_filter, "green"), (~category_filter, "blue")):
    ax.scatter(cpa_df["advertiserCategoryIndex"][mask], cpa_df["score"][mask], color=color, alpha=0.2)
ax.set_xlabel("Advertiser Category")
ax.set_ylabel("Score")
fig.show()

In [None]:
# Score threshold per delivery period: the 0.9 quantile of the scores in each
# period (the frame keeps its historical "median" name).
median_score_df = cpa_df.groupby("deliveryPeriodIndex").agg({"score": lambda x: np.quantile(x, 0.9)})
score_threshold = median_score_df.loc[cpa_df["deliveryPeriodIndex"]].values.flatten()
period_filter = cpa_df["score"] > score_threshold
filter_list.append(period_filter)

# Score against delivery period; green points pass the filter, blue ones do not.
fig, ax = plt.subplots()
for mask, color in ((period_filter, "green"), (~period_filter, "blue")):
    ax.scatter(cpa_df["deliveryPeriodIndex"][mask], cpa_df["score"][mask], color=color, alpha=0.2)
ax.set_xlabel("Delivery period")
ax.set_ylabel("Score")
fig.show()

In [None]:
# Quantile regression (quantile 0.8) of score on the campaign's mean pValue.
X = cpa_df["pValue"].values.reshape(-1, 1)
y = cpa_df["score"].values
# reg = LinearRegression(fit_intercept=True).fit(X, y)
quantile = 0.8
reg = QuantileRegressor(quantile=quantile, alpha=0)
reg.fit(X, y)

# Rows whose score lies strictly above the fitted quantile line pass the filter.
score_pred = reg.predict(X)
pValue_filter = cpa_df["score"] > score_pred
filter_list.append(pValue_filter)

# Scatter of score vs. pValue (green = passes, blue = fails) with the fitted line.
fig, ax = plt.subplots()
sort_idx = np.argsort(X, axis=0).flatten()
for mask, color in ((pValue_filter, "green"), (~pValue_filter, "blue")):
    ax.scatter(cpa_df["pValue"][mask], cpa_df["score"][mask], color=color, alpha=0.2)
X_sorted = X[sort_idx]
ax.plot(X_sorted, reg.predict(X_sorted), color="red")
ax.set_xlabel("pValue")
ax.set_ylabel("Score")
fig.show()

In [None]:
# import pandas as pd
# from sklearn.ensemble import RandomForestRegressor
# from sklearn.model_selection import KFold
# from sklearn.metrics import mean_squared_error

# # Assuming your dataset is loaded into a DataFrame called df
# # and the features and target columns are specified as below
# features = ["deliveryPeriodIndex", "advertiserCategoryIndex", "pValue", "budget", "CPAConstraint"]
# target = "score"

# # Separate the features (X) and target (y)
# X = cpa_df[features]
# y = cpa_df[target]

# # Initialize the RandomForestRegressor
# model = RandomForestRegressor(n_estimators=100, random_state=42)

# # Initialize KFold with 10 splits
# kf = KFold(n_splits=10, shuffle=True, random_state=42)

# # To store actual and predicted values
# actual_scores = []
# predicted_scores = []

# # To store feature importances
# feature_importances = np.zeros(len(features))

# # Perform 10-fold cross-validation
# for train_index, test_index in kf.split(X):
#     # Split the data into train and test sets
#     X_train, X_test = X.iloc[train_index], X.iloc[test_index]
#     y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
#     # Train the model on the training set
#     model.fit(X_train, y_train)
    
#     # Predict the score for the test set
#     y_pred = model.predict(X_test)
    
#     # Store actual and predicted values
#     actual_scores.extend(y_test)
#     predicted_scores.extend(y_pred)

#     # Accumulate feature importances
#     feature_importances += model.feature_importances_

# # Average the feature importances over all folds
# feature_importances /= kf.get_n_splits()


# # Convert the results to a DataFrame for easier analysis
# results_df = pd.DataFrame({
#     "actual_score": actual_scores,
#     "predicted_score": predicted_scores
# })

# # Calculate the overall mean squared error
# mse = mean_squared_error(results_df["actual_score"], results_df["predicted_score"])
# print(f"Mean Squared Error: {mse}")

# # Optionally, you can now identify successful campaigns
# # For example, by checking where actual_score > predicted_score
# alpha = 1.3
# results_df["successful_campaign"] = results_df["actual_score"] > alpha * results_df["predicted_score"]

# print(results_df["successful_campaign"].mean())

# # Feature Importance
# importance_df = pd.DataFrame({
#     'feature': features,
#     'importance': feature_importances
# })

# # Sort by importance
# importance_df = importance_df.sort_values(by='importance', ascending=False)

# print("Feature Importances:")
# print(importance_df)

In [None]:
# # Assuming your dataset is loaded into a DataFrame called df
# # and the features and target columns are specified as below
# features = ["deliveryPeriodIndex", "advertiserCategoryIndex", "pValue", "budget", "CPAConstraint"]
# target = "score"

# # Separate the features (X) and target (y)
# X = cpa_df[features]
# y = cpa_df[target]

# # Initialize the QuantileRegressor
# model = QuantileRegressor(quantile=0.7, alpha=0)

# # Initialize KFold with 10 splits
# kf = KFold(n_splits=50, shuffle=True, random_state=42)

# # To store actual and predicted values
# actual_scores = []
# predicted_scores = []

# # Perform 10-fold cross-validation
# for train_index, test_index in kf.split(X):
#     # Split the data into train and test sets
#     X_train, X_test = X.iloc[train_index], X.iloc[test_index]
#     y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
#     # Train the model on the training set
#     model.fit(X_train, y_train)
    
#     # Predict the score for the test set
#     y_pred = model.predict(X_test)
    
#     # Store actual and predicted values
#     actual_scores.extend(y_test)
#     predicted_scores.extend(y_pred)


# # Convert the results to a DataFrame for easier analysis
# results_df = pd.DataFrame({
#     "actual_score": actual_scores,
#     "predicted_score": predicted_scores
# })

# # Calculate the overall mean squared error
# mse = mean_squared_error(results_df["actual_score"], results_df["predicted_score"])
# print(f"Mean Squared Error: {mse}")

# # Optionally, you can now identify successful campaigns
# # For example, by checking where actual_score > predicted_score
# alpha = 1
# results_df["successful_campaign"] = results_df["actual_score"] > alpha * results_df["predicted_score"]

# print(results_df["successful_campaign"].mean())

# # # Feature Importance
# # importance_df = pd.DataFrame({
# #     'feature': features,
# #     'importance': feature_importances
# # })

# # # Sort by importance
# # importance_df = importance_df.sort_values(by='importance', ascending=False)

# # print("Feature Importances:")
# # print(importance_df)

In [None]:
def _scatter_score_split(frame, column, xlabel, keep, trend=None):
    """Scatter ``frame[column]`` against ``frame["score"]``, split by ``keep``.

    Rows where ``keep`` is False are drawn in blue (alpha 0.2), rows where it
    is True in green (alpha 0.5). ``trend`` is an optional ``(x, y)`` pair
    drawn as a red line. Returns the ``(fig, ax)`` pair.
    """
    fig, ax = plt.subplots()
    ax.scatter(frame[column][~keep], frame["score"][~keep], color="blue", alpha=0.2)
    ax.scatter(frame[column][keep], frame["score"][keep], color="green", alpha=0.5)
    if trend is not None:
        ax.plot(trend[0], trend[1], color="red")
    ax.set_xlabel(xlabel)
    ax.set_ylabel("Score")
    fig.show()
    return fig, ax


# A campaign is kept when it passes at least one of the collected filters.
final_filter = np.any(filter_list, axis=0)
print(f"Fraction of dataset kept: {final_filter.mean()}")
# final_filter = results_df["successful_campaign"]

# Fit a linear regression to the score as a function of log budget
X = np.log(cpa_df["budget"]).values.reshape(-1, 1)
y = cpa_df["score"].values
reg = LinearRegression(fit_intercept=True).fit(X, y)

# Plot the linear regression; X is ordered so the line is drawn left to right.
sort_idx = np.argsort(X, axis=0).flatten()
fig, ax = _scatter_score_split(
    cpa_df, "budget", "Budget", final_filter,
    trend=(np.exp(X[sort_idx]), reg.predict(X[sort_idx])),
)

# Plot score as a function of CPAConstraint
fig, ax = _scatter_score_split(cpa_df, "CPAConstraint", "CPAConstraint", final_filter)

# Plot score as a function of advertiser category
fig, ax = _scatter_score_split(cpa_df, "advertiserCategoryIndex", "Advertiser Category", final_filter)

# Compute linear regression for the pValue
X = cpa_df["pValue"].values.reshape(-1, 1)
y = cpa_df["score"].values
reg = LinearRegression(fit_intercept=True).fit(X, y)

# Plot the linear regression
sort_idx = np.argsort(X, axis=0).flatten()
fig, ax = _scatter_score_split(
    cpa_df, "pValue", "pValue", final_filter,
    trend=(X[sort_idx], reg.predict(X[sort_idx])),
)

In [None]:
print(df.shape)

In [None]:
# Filter the aggregated campaigns according to the final filter
filtered_cpa_df = cpa_df[final_filter]

# Select in the original df the rows of every campaign, i.e. every
# (advertiserNumber, deliveryPeriodIndex) pair, present in filtered_cpa_df.
# Group `df` once instead of scanning the whole frame for each selected campaign.
campaign_groups = df.groupby(["advertiserNumber", "deliveryPeriodIndex"], sort=False)
df_list = [
    campaign_groups.get_group((adn, dp))
    for adn, dp in zip(filtered_cpa_df["advertiserNumber"], filtered_cpa_df["deliveryPeriodIndex"])
]
# pd.concat raises on an empty list; fall back to an empty frame with df's columns.
top_campaign_df = pd.concat(df_list) if df_list else df.iloc[0:0]
# df_filter = df[["advertiserNumber", "deliveryPeriodIndex"]].apply(lambda x: (x["advertiserNumber"], x["deliveryPeriodIndex"]) in zip(filtered_cpa_df["advertiserNumber"], filtered_cpa_df["deliveryPeriodIndex"]), axis=1)
# top_campaign_df = df[df_filter]

# One summary line instead of one printed line per campaign.
print(f"Selected {len(df_list)} campaigns")
print(top_campaign_df.shape)


In [None]:
# Save the filtered dataframe
out_path = ROOT_DIR / "data" / "traffic" / "top_campaigns_quantile.parquet"
top_campaign_df.to_parquet(out_path)