# In [None]:
# Feature values describing the hypothetical bid that the strategy
# recommendation is based on. The keys mirror the model's feature columns.
new_bid = dict(
    # --- Impression measurements ---
    # Impressions measured for this particular set of dimensions.
    total_impressions=16,
    # Impressions on the site that were viewable out of all measurable
    # impressions. A display ad counts as viewable if at least 50% of its area
    # was displayed on screen for at least one second.
    viewable_impressions=2,
    # Impressions that were measurable by Active View out of all eligible
    # impressions; this should generally be close to 100% of them.
    measurable_impressions=16,
    # Fraction of the revenue that actually goes to the publisher.
    revenue_share_percent=1,

    # --- Other, less important characteristics ---
    # Each ID is a different website in the auction; this example pretends we
    # want ads for site 400.
    site_id=400,
    # Type of ad (video, image, ...); image (10) is used for now.
    ad_type_id=10,
    # Each ID is a different country; 187 is the English-speaking anglosphere
    # (UK, Canada and USA).
    geo_id=187,
    # Each ID represents a device; 2 is mobile.
    device_category_id=2,
    # Each ID is a different bidder in the auction; any value can be used here.
    advertiser_id=1234,
    # Not very useful for now.
    order_id=3473,
    # Also not very useful for now.
    line_item_type_id=19,
    # Each ID is an operating system; set to the most common one for mobile
    # (Android).
    os_id=60,
    integration_type_id=1,
    monetization_channel_id=4,
    ad_unit_id=5174,
)


# Human-readable descriptions of the Google bidding strategies, one
# "NAME: description" string per strategy. The list is indexed by position,
# so the order of the entries matters.
google_strategies = [
    f"{name}: {description}"
    for name, description in (
        ("MANUAL_CPC", "Manual click based bidding where user pays per click."),
        ("MANUAL_CPM", "Manual impression based bidding where user pays per thousand impressions. This can only be used for Display Network only campaigns."),
        ("TARGET_CPA", "Target cost per acquisition based bidding that automatically optimizes conversions per dollar."),
        ("TARGET_SPEND", "Bidding strategy that automatically optimizes clicks per dollar."),
        ("TARGET_ROAS", "Bidding strategy that automatically maximizes revenue while averaging a specific target Return On Average Spend (ROAS)."),
        ("MAXIMIZE_CONVERSIONS", "Bidding strategy that automatically maximizes number of conversions given a daily budget."),
        ("MAXIMIZE_CONVERSION_VALUE", "Bidding strategy that automatically maximizes the total conversion value of your campaign within a specified budget."),
        ("TARGET_IMPRESSION_SHARE", "Bidding strategy that automatically sets bids with the goal of showing your ad on the absolute top of the page, on the top of the page, or anywhere on the page of Google search results."),
    )
]

import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score, KFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
import os 
import seaborn as sns
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _, files in os.walk('/kaggle/input'):
    for file_name in files:
        file_path = os.path.join(folder, file_name)
        print(file_path)
        
# Load the auction dataset from the Kaggle input directory.
df = pd.read_csv('/kaggle/input/real-time-advertisers-auction/Dataset.csv')
# Replace every date with the integer `.value` of its pandas Timestamp so the
# column can be compared numerically against a cutoff later on. One pass is
# enough: re-wrapping the resulting integers in pd.Timestamp and taking
# `.value` again returns the same numbers, so repeating the conversion only
# costs another row-wise apply.
df['date'] = df['date'].apply(lambda raw_date: pd.Timestamp(raw_date).value)
def not_zero_division(n, d):
    """Return ``n / d``, or ``0`` when the denominator ``d`` is falsy (e.g. zero)."""
    if not d:
        return 0
    return n / d
# Derive the CPM column, i.e. the value the advertisers bid (the winning bid
# value) for the month of June.
# Formula as documented:
#   CPM = (((publisher revenue * 100) / revenue_share_percent) / measurable_impressions) * 1000
# NOTE(review): the expression below does not divide by revenue_share_percent,
# unlike the documented formula -- confirm which of the two is intended.
# Rows whose measurable_impressions is zero get a CPM of 0.
df['CPM'] = df.apply(
    lambda row: not_zero_division(
        row['total_revenue'] * 100,
        row['measurable_impressions'],
    ) * 1000,
    axis=1,
)
# total_revenue is dropped once CPM has been derived from it.
del df['total_revenue']
# Pick the index of the strategy to recommend from the bid's impression counts:
# (impressions that were not measurable) floor-divided by the viewable ones.
unmeasured_impressions = new_bid['total_impressions'] - new_bid['measurable_impressions']
viewable_impressions = new_bid['viewable_impressions']
# A bid with zero viewable impressions would raise ZeroDivisionError on `//`;
# fall back to 0 in that case, the same convention not_zero_division() uses.
if viewable_impressions:
    # int() keeps the result usable as a list index even for float inputs.
    get_strategy = int(unmeasured_impressions // viewable_impressions)
else:
    get_strategy = 0
# Clamp to the valid index range of google_strategies (0..7 for the 8 entries).
get_strategy = min(max(0, get_strategy), len(google_strategies) - 1)
# total_impressions carries much the same information as measurable_impressions,
# so add the viewable/measurable ratio as a separate column to see whether it
# is correlated with revenue. A zero denominator yields a ratio of 0.
df['View/measurable'] = df.apply(
    lambda row: not_zero_division(
        row['viewable_impressions'],
        row['measurable_impressions'],
    ),
    axis=1,
)

# Remove outliers: first drop negative CPM values, then drop everything at or
# above the 95th percentile of the CPM values that remain.
df = df.loc[df['CPM'] >= 0]
cpm_cutoff = df['CPM'].quantile(.95)
df = df.loc[df['CPM'] < cpm_cutoff]
# Renumber the rows; the previous index is kept as an 'index' column.
df = df.reset_index()


# Feature columns fed to the model, in the order they are passed to it.
X_cols = [
    'total_impressions',
    'viewable_impressions',
    'measurable_impressions',
    'revenue_share_percent',
    'site_id',
    'ad_type_id',
    'geo_id',
    'device_category_id',
    'advertiser_id',
    'order_id',
    'line_item_type_id',
    'os_id',
    'integration_type_id',
    'monetization_channel_id',
    'ad_unit_id',
]
# Target column(s) the model predicts.
y_cols = ['CPM']

# Chronological split: rows dated before 22 June 2019 are used for training,
# rows from that date onwards for testing. The cutoff is converted to its
# Timestamp `.value` so it is comparable with the `date` column.
split_point = pd.Timestamp('06-22-2019').value
is_train_row = df['date'] < split_point
is_test_row = df['date'] >= split_point

# Missing feature values are replaced by 0; the targets are left as they are.
X_train = df.loc[is_train_row, X_cols].fillna(0)
y_train = df.loc[is_train_row, y_cols]
X_test = df.loc[is_test_row, X_cols].fillna(0)
y_test = df.loc[is_test_row, y_cols]
# Number of folds the training data is split into.
n_fold = 5
# Unshuffled folds. random_state is deliberately not passed: it only has an
# effect together with shuffle=True, and current scikit-learn releases raise a
# ValueError when it is combined with shuffle=False.
folds = KFold(n_splits=n_fold, shuffle=False)
# Running sum of the test-set predictions, one slot per test row.
prediction_xgb_cv = np.zeros(X_test.shape[0])

# XGBoost regressor used for every fold.
# Alternative values noted by the author: n_estimators=100, learning_rate=0.08.
xgbr = xgb.XGBRegressor(
    n_estimators=230,
    learning_rate=0.06,
    gamma=0,
    subsample=0.75,
    colsample_bytree=0.6,
    max_depth=9,
    random_state=1,
)

# For every fold: fit the regressor on that fold's training rows, predict the
# fixed test set and add the predictions to the running sum.
# NOTE: the rows each fold holds out are never used for scoring (every model is
# scored on the test set instead), so they are not sliced out of X_train/y_train.
for fold_n, (train_index, _held_out_index) in enumerate(folds.split(X_train)):
    print('Fold:', fold_n)
    X_traincv = X_train.iloc[train_index]
    Y_traincv = y_train.iloc[train_index]

    xgbr.fit(X_traincv.values, Y_traincv.values)

    y_pred = xgbr.predict(X_test.values)
    prediction_xgb_cv += y_pred

    # Mean squared error of this fold's model on the test set.
    print(mean_squared_error(y_test, y_pred))
    
# Turn the summed per-fold test predictions into their average.
prediction_xgb_cv /= n_fold
print('--------------')
# The scores are printed explicitly: a bare expression is only echoed when it
# is the last statement of a notebook cell, and never when run as a script.
print('MSE of averaged predictions:', mean_squared_error(y_test, prediction_xgb_cv))
# Negative CPM rows were filtered out of the data, so clip negative
# predictions to 0 and score again.
prediction_xgb_cv[prediction_xgb_cv < 0] = 0
print('MSE after clipping negatives to 0:', mean_squared_error(y_test, prediction_xgb_cv))
print("Reccomended strategy:")
print(google_strategies[get_strategy])