In [1]:
using CSV, DataFrames, CategoricalArrays, Plots, Statistics

seed = 42

df_train = CSV.read("data/train_tfidf.csv", DataFrame)
df_test = CSV.read("data/test_tfidf.csv", DataFrame);

In [4]:
num_bids = length(findall(!iszero, df_train[!,"Avg. CPC"])) + length(findall(!iszero, df_test[!,"Avg. CPC"]))
print("Number of successful bids: " * string(num_bids))

Number of successful bids: 14229

In [5]:
total_cpc = sum(df_train[!,"Avg. CPC"]) + sum(df_test[!,"Avg. CPC"])
print("Total CPC: " * string(total_cpc))

Total CPC: 68096.51

In [7]:
profit = sum(df_train[!,"Conv. value"] .- df_train[!,"Avg. CPC"] .* df_train[!,"Clicks"])
        + sum(df_test[!,"Conv. value"] .- df_test[!,"Avg. CPC"] .* df_test[!,"Clicks"])

199070.58000000002

## Predict Conversion Value

In [2]:
function get_X(df)
    features = [
        "Match type",
        "Region",
        "day_of_week",
        "is_weekend",
        "month",
        "is_public_holiday",
#         "avg_bid",
        "Avg. CPC"
    ]

    # append tfidf_0 ... tfidf_49
    append!(features, ["tfidf_$(i)" for i in 0:49])
    
    return df[!,features]
end

function get_y_conv_value(df)
    return df."Conv. value"
end

function get_y_clicks(df)
    return df.Clicks
end

function get_mse(grid, X, y)
    mse = IAI.score(grid, X, y, criterion=:mse)

    println("MSE: ", round(mse, digits=4))
    
    return mse
end

get_mse (generic function with 1 method)

In [3]:
target = "conversion"
X_train = get_X(df_train)
y_train = get_y_conv_value(df_train)

X_test = get_X(df_test)
y_test = get_y_conv_value(df_test);

In [7]:
target = "clicks"
X_train = get_X(df_train)
y_train = get_y_clicks(df_train)

X_test = get_X(df_test)
y_test = get_y_clicks(df_test);

In [8]:
grid_lr = IAI.GridSearch(
    IAI.OptimalFeatureSelectionRegressor(
        random_seed = seed
    ),
    sparsity = [10,15,20,25]
)

IAI.fit_cv!(grid_lr, X_train, y_train, validation_criterion=:mse, n_folds=5)
get_mse(grid_lr, X_test, y_test)

lnr = IAI.get_learner(grid_lr)
IAI.write_json("models/lr_" * target * ".json", lnr)

MSE: 0.2044


27729

In [9]:
lnr

Fitted OptimalFeatureSelectionRegressor:
  Constant: 1.14021
  Weights:
    Avg. CPC:                0.142413
    Match type=Broad match:  5.41894
    Match type=Exact match: -0.513683
    Region=A:               -0.65995
    Region=B:               -0.504572
    Region=C:                1.88284
    month:                  -0.0359214
    tfidf_18:               -1.39294
    tfidf_2:                -1.06114
    tfidf_21:                2.72231
    tfidf_23:                11.6855
    tfidf_26:               -6.91408
    tfidf_27:               -9.20287
    tfidf_28:               -3.98733
    tfidf_29:               -1.95558
    tfidf_31:                6.26156
    tfidf_32:               -4.24741
    tfidf_33:               -2.95707
    tfidf_35:                4.26234
    tfidf_37:                3.38764
    tfidf_38:               -1.93194
    tfidf_4:                 1.3971
    tfidf_40:                5.45512
    tfidf_41:                4.81418
    tfidf_7:                -2.19986

In [53]:
grid_oct =  IAI.GridSearch(
    IAI.OptimalTreeRegressor(
        random_seed = seed,
    ), 
    max_depth = [2,4,6,8],
    minbucket = [0.01, 0.02, 0.05],    
)

IAI.fit_cv!(grid_oct, X_train, y_train, validation_criterion=:mse, n_folds=5)
mse = get_mse(grid_oct, X_test, y_test)

lnr = IAI.get_learner(grid_oct)
IAI.write_json("models/oct_" * target * ".json", lnr)

MSE: 0.3459


10059259

In [54]:
grid_rf = IAI.GridSearch(
    IAI.RandomForestRegressor(
        random_seed=seed,
    ),
    max_depth=[2,4,6,8],
    minbucket = [0.01, 0.02, 0.05],
    num_trees = [20,25,50,100],
)

IAI.fit_cv!(grid_rf, X_train, y_train, validation_criterion=:mse, n_folds=5)
get_mse(grid_rf, X_test, y_test)

lnr = IAI.get_learner(grid_rf)
IAI.write_json("models/rf.json", lnr)

MSE: 0.2304


5566740

In [55]:
grid_xgb = IAI.GridSearch(
    IAI.XGBoostRegressor(
        random_seed=seed,
    ),
    max_depth=[2,4,6,8],
    minbucket = [0.01, 0.02, 0.05],
    num_estimators=[20,25,50,100],
)

IAI.fit_cv!(grid_xgb, X_train, y_train, validation_criterion=:mse, n_folds=5)
get_mse(grid_xgb, X_test, y_test)

lnr = IAI.get_learner(grid_xgb)
IAI.write_json("models/xgb.json", lnr)

MSE: 0.7932


831879

In [46]:
# for conversion value

get_mse(grid_lr, X_test, y_test)
get_mse(grid_oct, X_test, y_test)
get_mse(grid_rf, X_test, y_test)
get_mse(grid_xgb, X_test, y_test)

MSE: 0.0051
MSE: 0.0157
MSE: 0.0211
MSE: 0.0337


0.03372427106533249

In [56]:
# for clicks

get_mse(grid_lr, X_test, y_test)
get_mse(grid_oct, X_test, y_test)
get_mse(grid_rf, X_test, y_test)
get_mse(grid_xgb, X_test, y_test)

MSE: 0.187
MSE: 0.3459
MSE: 0.2304
MSE: 0.7932


0.7932226429863849