# 1. Importing Libraries

In [None]:
# Add required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns 

#from sklearn.model_selection import train_test_split
import xgboost as xgb
from xgboost import plot_importance
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error,accuracy_score

import optuna
seed =42 # for repeatability
%matplotlib inline


# 2. Getting Data

In [None]:
# Load the data set from the Kaggle input directory
path = "../input/real-time-advertisers-auction/Dataset.csv"
df = pd.read_csv(path)
# Keep an independent snapshot of the raw data. A plain `Xact = df` would only
# alias the same object, so in-place column edits made on df before it is
# rebound would silently show up in the snapshot as well.
Xact = df.copy()

# 3. Understanding Data

In [None]:
df.head() 

In [None]:
df.shape

In [None]:
# Convert the 'date' column to datetime in place; values that cannot be parsed
# become NaT (errors='coerce') instead of raising.
df['date'] = pd.to_datetime(df['date'],errors='coerce')

# 4. Data cleaning

In [None]:
 df.isnull().sum() # Check the missing elements

# 5. Data analysis

In [None]:
df.info() # Check the data type of the variables

In [None]:
# Report the row count and how many rows are exact duplicates, then drop them
n_rows = df.shape[0]
n_duplicates = int(df.duplicated().sum())
print(n_rows)
print(f'Number of duplicated rows: {n_duplicates}')
print('dropping duplicates')
df = df.drop_duplicates()

In [None]:
df.shape

In [None]:
# Add variables
# Day of the week (Monday=0 ... Sunday=6), taken through the vectorised .dt
# accessor instead of calling a Python lambda once per row.
df['weekday'] = df['date'].dt.weekday
# Share of measurable impressions that were viewable; rows without measurable
# impressions get 0 rather than the result of a division by zero.
df['View_perc'] = np.where(df['measurable_impressions']!=0,
                           df['viewable_impressions']/df['measurable_impressions'],0)

In [None]:
df.nunique() # Count Distinct Values

# 6. Features selection

In [None]:
# integration_type_id and revenue_share_percent each hold a single unique
# value (see the nunique() output above), so they carry no information - drop them
constant_cols = ['integration_type_id', 'revenue_share_percent']
df = df.drop(columns=constant_cols)

In [None]:
# The problem description asks for these two columns to be dropped
excluded_cols = ['order_id', 'line_item_type_id']
df = df.drop(columns=excluded_cols)

In [None]:
# List any columns still stored with the generic 'object' dtype
# (i.e. categorical/string columns that would need encoding)
object_cols = df.select_dtypes(include=['object']).columns
print(object_cols)

Above analysis explained:
1. order_id and line_item_type_id are dropped as mentioned in problem formulation

2. integration_type_id and revenue_share_percent are dropped since both have only one unique category

3. Categorical variables (those with fewer than 12 unique values):

    site_id, ad_type_id, device_category_id, line_item_type_id, os_id, integration_type_id, monetization_channel_id, revenue_share_percent.

4. Data types in the data set are 'Object', 'integer' and 'float'

5. No missing values in the given features 

In [None]:
df.shape

CPM (cost per thousand impressions) calculation
+ CPM = (total_revenue × 100 / measurable_impressions) × 1000, set to 0 when there are no measurable impressions

In [None]:
# CPM = (total_revenue * 100 / measurable_impressions) * 1000,
# defined as 0 for rows that have no measurable impressions
has_measurable = df['measurable_impressions'] != 0
cpm_raw = (df['total_revenue'] * 100 / df['measurable_impressions']) * 1000
df['CPM'] = np.where(has_measurable, cpm_raw, 0)
df_corr = df  # same object as df at this point; kept for the correlation analysis

In [None]:
df['CPM'].value_counts()

In [None]:
# 'CPM' was computed from 'total_revenue' and 'measurable_impressions', so one
# of those two inputs has to be removed from the feature set;
# total_revenue is the one dropped
df = df.drop(columns=['total_revenue'])
df_fortest = df

In [None]:
df.shape

# 7. Modelling

7.1 Approach 1 : ML model - XGBoost
    * Modelling without removing outliers

In [None]:
# Modelling - Iteration 1
# ML model

# Features are every column except the raw date and the target itself
target_col = 'CPM'
X = df.drop(columns=['date', target_col])
y = df[target_col]


Training and testing data selection
    * Selection based on date 06-22-2019

In [None]:
# Chronological hold-out: rows dated before 22 June 2019 are used for training,
# rows dated on or after it for validation
split_date = pd.Timestamp('06-22-2019')
rows_before = df.date < split_date
rows_from = df.date >= split_date
train_X, val_X = X.loc[rows_before], X.loc[rows_from]
train_y, val_y = y.loc[rows_before], y.loc[rows_from]

In [None]:
train_X.shape, train_y.shape, val_X.shape, val_y.shape

XGBoost model fitting

In [None]:
# Baseline: XGBoost regressor with the library's default hyper-parameters
model_xgb1 = xgb.XGBRegressor().fit(train_X, train_y)
xgb_preds = model_xgb1.predict(val_X)

Error metric evaluation

In [None]:
# Validation error of the baseline model
for label, metric in (("MAE:", mean_absolute_error),
                      ("MSE:", mean_squared_error),
                      ("R2:", r2_score)):
    print(label, metric(val_y, xgb_preds))

As the metrics above show, the model performs poorly, so we need to explore further to understand what might be going wrong.
1. Let's look at the distribution of the 'CPM' variable

In [None]:
# Distribution of CPM
# sns.distplot is deprecated; histplot with stat="density" and kde=True draws
# the equivalent density-normalised histogram with a KDE curve on top.
sns.histplot(df['CPM'], bins=1000, stat="density", kde=True)
plt.ylabel("Density")
# Zoom the y-axis far in so the sparse tail is visible next to the spike of small values
plt.ylim((0,0.000001))

In [None]:
df['CPM'].value_counts()

As can be seen from the distribution plot and value counts above, there are plenty of zeros as well as outliers reaching as far as ~3k. Let us first remove the outliers from the data and see if the metrics improve

7.2 Approach 2 : ML model - XGBoost\
    * Model after removing outliers

Outlier removal

In [None]:
# Remove outliers: keep rows whose CPM is non-negative and below the
# 95th percentile of CPM
cpm_p95 = df.CPM.quantile(.95)
keep_rows = (df.CPM < cpm_p95) & (df.CPM >= 0)
df_1 = df[keep_rows]

In [None]:
# sns.distplot is deprecated; draw the density-normalised histogram with
# histplot and overlay the KDE curve with kdeplot so the curve keeps its own
# colour, width and label.
ax = sns.histplot(df_1['CPM'], stat="density")
sns.kdeplot(df_1['CPM'], color="k", lw=3, label="KDE", ax=ax)
plt.ylabel("Density")

In [None]:
# Feature matrix and target for the outlier-free data
target_col = 'CPM'
X = df_1.drop(columns=['date', target_col])
y = df_1[target_col]

Training and testing points
    * Selection based on date 06-22-2019

In [None]:
# Chronological hold-out on the outlier-free data: before 22 June 2019 -> train,
# on or after that date -> validation
split_date = pd.Timestamp('06-22-2019')
rows_before_1 = df_1.date < split_date
rows_from_1 = df_1.date >= split_date
train_X, val_X = X.loc[rows_before_1], X.loc[rows_from_1]
train_y, val_y_o = y.loc[rows_before_1], y.loc[rows_from_1]

XGBoost model fitting

In [None]:
# Default XGBoost regressor fitted on the outlier-free training rows
model_xgb2 = xgb.XGBRegressor().fit(train_X, train_y)
xgb_preds_noout = model_xgb2.predict(val_X)

Error metric evaluation

In [None]:
# Validation error after outlier removal
for label, metric in (("MAE:", mean_absolute_error),
                      ("MSE:", mean_squared_error),
                      ("R2:", r2_score)):
    print(label, metric(val_y_o, xgb_preds_noout))

Model performance improved a bit. It seems we need to deal with the zeros first, since the large number of zero CPM values may be biasing the model towards them
1. We are going to use two staged approach here
2. Stage 1: Build model to classify if the CPM is zero or not
3. Stage 2: regression model to regress for predicted non zero class
4. Ensemble this to get final predictions

7.3 Approach 3 : ML model - XGBoost\
    * Modelling after removing outliers\
    * Stage 1: Build model to classify if the CPM is zero or not\
    * Stage 2: regression model to regress for predicted non zero class\
    * Ensemble this to get final predictions\

Classifying CPM data

In [None]:
# Binary label: 1 when CPM is strictly positive, 0 otherwise
df['CPM_cat'] = (df["CPM"] > 0).astype(int)

In [None]:
df['CPM_cat'].value_counts()

Accuracy is a reasonable metric here because, as the value counts above show, the two classes are nearly balanced

In [None]:
# Classifier target is the zero/non-zero flag; features exclude the date,
# the CPM value and the flag itself
non_feature_cols = ['date', 'CPM', 'CPM_cat']
X_clf = df.drop(columns=non_feature_cols)
y_clf = df['CPM_cat']

Training and testing points
    * Selection based on date 06-22-2019

In [None]:
# Chronological hold-out for the classifier: before 22 June 2019 -> train,
# on or after that date -> validation
split_date = pd.Timestamp('06-22-2019')
clf_rows_before = df.date < split_date
clf_rows_from = df.date >= split_date
train_X_clf, val_X_clf = X_clf.loc[clf_rows_before], X_clf.loc[clf_rows_from]
train_y_clf, val_y_clf = y_clf.loc[clf_rows_before], y_clf.loc[clf_rows_from]

XGBoost model fitting

In [None]:
# Baseline classifier with default hyper-parameters, scored on the hold-out rows
model_xgb_clf = xgb.XGBClassifier().fit(train_X_clf, train_y_clf)
xgb_preds_clf = model_xgb_clf.predict(val_X_clf)
clf_accuracy = accuracy_score(val_y_clf, xgb_preds_clf)
print('acc', clf_accuracy)

Hyper parameter optimization
     for classified CPM data

In [None]:
# Function used in hyper parameter optimization with the parameters

def objective(trial):
    """Optuna objective: hold-out accuracy of the zero / non-zero CPM classifier.

    Reads the notebook-level ``df`` (which must already contain the 'date',
    'CPM' and 'CPM_cat' columns), fits an ``xgb.XGBClassifier`` with the
    hyper-parameters sampled for this trial on the rows dated before
    22 June 2019, and scores it on the rows dated on or after that day.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the hyper-parameters.

    Returns
    -------
    float
        Accuracy on the validation rows (the study is expected to maximise it).
    """
    param = {
        # `silent` is an obsolete XGBoost parameter; `verbosity` is its
        # supported replacement (0 = no training messages).
        "verbosity": 0,
        "objective": "binary:logistic",
        "booster": trial.suggest_categorical("booster", ["gbtree", "gblinear", "dart"]),
        # suggest_float(..., log=True) replaces the deprecated suggest_loguniform
        "lambda": trial.suggest_float("lambda", 1e-8, 1.0, log=True),
        "alpha": trial.suggest_float("alpha", 1e-8, 1.0, log=True),
        "max_depth": trial.suggest_int("max_depth", 4, 10),
        "colsample_bytree": trial.suggest_float("colsample_bytree", .6, 1.),
        "subsample": trial.suggest_float("subsample", .7, 1.),
    }

    y_clf = df['CPM_cat']
    X_clf = df.drop(columns=['date', "CPM", 'CPM_cat'])  # For modelling

    # Chronological hold-out, evaluated once per trial
    split_date = pd.Timestamp('06-22-2019')
    is_train = df.date < split_date
    is_val = df.date >= split_date
    train_X_clf, val_X_clf = X_clf.loc[is_train], X_clf.loc[is_val]
    train_y_clf, val_y_clf = y_clf.loc[is_train], y_clf.loc[is_val]

    model_xgb_clf = xgb.XGBClassifier(**param)
    model_xgb_clf.fit(train_X_clf, train_y_clf)
    xgb_preds_clf = model_xgb_clf.predict(val_X_clf)

    # Score once and reuse the value for both the log line and the return
    accuracy = accuracy_score(val_y_clf, xgb_preds_clf)
    print('acc', accuracy)

    return accuracy

In [None]:
# Hyper-parameter search for the classifier: maximise the hold-out accuracy
# returned by objective(). The sampler is seeded with the notebook-level `seed`
# so that repeated runs propose the same sequence of trials.
study = optuna.create_study(direction='maximize',
                            sampler=optuna.samplers.TPESampler(seed=seed))
study.optimize(objective, n_trials = 5)

Fit model for classifier

In [None]:
# Refit the classifier with the best hyper-parameters found by the study and
# score it on the hold-out rows
best_clf_params = study.best_params
model_xgb_clf = xgb.XGBClassifier(**best_clf_params).fit(train_X_clf, train_y_clf)
xgb_preds_clf = model_xgb_clf.predict(val_X_clf)
print('acc', accuracy_score(val_y_clf, xgb_preds_clf))

In [None]:
X.shape,df.shape

Outlier removal

In [None]:
# Keep rows with 0 <= CPM < 95th percentile of CPM. The explicit .copy() makes
# the result an independent frame, so adding columns to it afterwards does not
# raise pandas' SettingWithCopyWarning.
cpm_cap = np.percentile(df['CPM'],95)
df = df[(df['CPM'] < cpm_cap) & (df['CPM'] >= 0)].copy()

In [None]:
df.shape

In [None]:
# Predicted zero/non-zero class for every remaining row. The feature columns
# come from X_clf (the frame the classifier was trained on) rather than from
# the approach-2 matrix X, and assign() returns a new frame instead of writing
# into a filtered slice of df.
df = df.assign(CPM_cat_pred=model_xgb_clf.predict(df[X_clf.columns]))

In [None]:
df['CPM_cat_pred'].shape

Model fitting after hyper parameter optimization and outlier removal

In [None]:
# Regression target is CPM; the features drop the date, the target and the true
# class flag, which leaves the predicted class (CPM_cat_pred) in as a feature
non_feature_cols = ['date', 'CPM', 'CPM_cat']
X_reg = df.drop(columns=non_feature_cols)
y_reg = df['CPM']

Training and testing points
    * Selection based on date 06-22-2019

In [None]:
# Chronological hold-out for the regressor: before 22 June 2019 -> train,
# on or after that date -> validation
split_date = pd.Timestamp('06-22-2019')
reg_rows_before = df.date < split_date
reg_rows_from = df.date >= split_date
train_X_reg, val_X_reg = X_reg.loc[reg_rows_before], X_reg.loc[reg_rows_from]
train_y_reg, val_y_reg = y_reg.loc[reg_rows_before], y_reg.loc[reg_rows_from]

Training and testing points\
    * Selection based on date 06-22-2019

XGBoost model fitting

In [None]:
# Default XGBoost regressor on the outlier-free data with the predicted class
# as an extra feature
model_xgb_reg = xgb.XGBRegressor().fit(train_X_reg, train_y_reg)
xgb_preds_reg = model_xgb_reg.predict(val_X_reg)

# Validation error of the approach-3 regressor
for label, metric in (("MAE:", mean_absolute_error),
                      ("MSE:", mean_squared_error),
                      ("R2:", r2_score)):
    print(label, metric(val_y_reg, xgb_preds_reg))

7.4 Approach 4\
     * Feature selection based on correlation analysis 

Correlation analysis - using heatmap 

In [None]:
# Correlation between the variables
df_fortest = df_corr.drop(columns=['total_revenue'])
plt.figure(figsize=(16,8))
# Restrict the matrix to numeric columns: the datetime 'date' column cannot take
# part in a Pearson correlation, and whether pandas silently skips it or raises
# depends on the installed version.
sns.heatmap(df_fortest.select_dtypes(include='number').corr(),annot = True,cmap="YlGnBu",fmt='.1g')

# Inference
# measurable_impressions and viewable_impressions are highly correlated (>0.85)
# with total_impressions, so those two columns are dropped from the data set

df_fortest = df_fortest.drop(columns=['measurable_impressions','viewable_impressions'])
df_fortest.shape

Model selection

In [None]:
# Target and features for the correlation-pruned data set
target_col = 'CPM'
X_reg_cor = df_fortest.drop(columns=['date', target_col])
y_reg_cor = df_fortest[target_col]

Training and testing points
    * Selection based on date 06-22-2019

In [None]:
# Chronological hold-out on the correlation-pruned data: before 22 June 2019 ->
# train, on or after that date -> validation
split_date = pd.Timestamp('06-22-2019')
cor_rows_before = df_fortest.date < split_date
cor_rows_from = df_fortest.date >= split_date
train_X_reg_cor, val_X_reg_cor = X_reg_cor.loc[cor_rows_before], X_reg_cor.loc[cor_rows_from]
train_y_reg_cor, val_y_reg_cor = y_reg_cor.loc[cor_rows_before], y_reg_cor.loc[cor_rows_from]

XGBoost model fitting

In [None]:
# Default XGBoost regressor on the correlation-pruned feature set
model_xgb_reg_cor = xgb.XGBRegressor().fit(train_X_reg_cor, train_y_reg_cor)
xgb_preds_reg_cor = model_xgb_reg_cor.predict(val_X_reg_cor)

# Validation error of the approach-4 regressor
for label, metric in (("MAE:", mean_absolute_error),
                      ("MSE:", mean_squared_error),
                      ("R2:", r2_score)):
    print(label, metric(val_y_reg_cor, xgb_preds_reg_cor))

No further study of this approach is carried out, i.e. the steps from approach 2 (removal of outliers) and approach 3 (ensembling regression and classification after hyper-parameter optimization) are not repeated here.\
This approach gives error metrics similar to approach 1, so upgrading it in the same way would be expected to give outcomes similar to approach 2 and approach 3. The further study is therefore skipped.

# Model selection

In [None]:
# R2 comparison across the four approaches
r2_inputs = (
    ("R2_1:", val_y, xgb_preds),
    ("R2_2:", val_y_o, xgb_preds_noout),
    ("R2_3:", val_y_reg, xgb_preds_reg),
    ("R2_4:", val_y_reg_cor, xgb_preds_reg_cor),
)
for label, actual, predicted in r2_inputs:
    print(label, r2_score(actual, predicted))

In [None]:
# MSE comparison across the four approaches
mse_inputs = (
    ("MSE_1:", val_y, xgb_preds),
    ("MSE_2:", val_y_o, xgb_preds_noout),
    ("MSE_3:", val_y_reg, xgb_preds_reg),
    ("MSE_4:", val_y_reg_cor, xgb_preds_reg_cor),
)
for label, actual, predicted in mse_inputs:
    print(label, mean_squared_error(actual, predicted))

In [None]:
# MAE comparison across the four approaches
mae_inputs = (
    ("MAE_1:", val_y, xgb_preds),
    ("MAE_2:", val_y_o, xgb_preds_noout),
    ("MAE_3:", val_y_reg, xgb_preds_reg),
    ("MAE_4:", val_y_reg_cor, xgb_preds_reg_cor),
)
for label, actual, predicted in mae_inputs:
    print(label, mean_absolute_error(actual, predicted))

Based on the error metrics, the model from approach 3 is chosen as the best fit to the data.
For example, the MAE is reduced by around 41%.\
Approach 3 consists of: \
    * Model built with the XGBoost ML algorithm\
    * Outliers in CPM are excluded\
    * Hyper-parameter optimization is carried out to identify the best parameter combination for XGBoost\
    * Error metric comparison with approach 1

In [None]:
# Collect the approach-3 hold-out targets, keeping their original row index
best_df = pd.DataFrame({'CPM_actual': val_y_reg})

In [None]:
# Attach the approach-3 predictions (positional: one prediction per validation row)
best_df['Pred_CPM'] = xgb_preds_reg

Revenue calculation

In [None]:
# Back out revenue from the predicted CPM. CPM was built as
#   total_revenue * 100 / measurable_impressions * 1000,
# so the inverse has to multiply by measurable_impressions (not
# total_impressions) and divide by 1000 * 100.
revenue_df = pd.DataFrame({'Actual_Impressions': val_X_reg['measurable_impressions'].values,  'CPM_actual': val_y_reg,
                           'Pred_CPM': best_df['Pred_CPM'].values})

revenue_df['Pred_Rev'] = revenue_df['Pred_CPM'] * revenue_df['Actual_Impressions'] / (1000 * 100)
# The regressor can output slightly negative CPM values; revenue cannot be negative
revenue_df['Pred_Rev'] = revenue_df['Pred_Rev'].clip(lower=0)
revenue_df.sample(n=5)

# Questions

1. What is the potential revenue range our publisher can make in July?

In [None]:
# Mean per-row revenue in the loaded data next to the mean predicted per-row
# revenue on the hold-out rows
june_mean_revenue = np.round(Xact["total_revenue"].mean(), 2)
print('Average revenue of june month:', june_mean_revenue)
P = np.round(revenue_df["Pred_Rev"].mean(), 2)
print('Predicted revenue range of july month:', P)

2. What reserve price can he/she set?

In [None]:
# Largest actual and largest predicted CPM on the hold-out rows
max_actual_cpm = np.round(best_df["CPM_actual"].max(), 2)
max_pred_cpm = np.round(best_df["Pred_CPM"].max(), 2)
print('Reserve price of june month:', max_actual_cpm)
print('Predicted reserve price of july month:', max_pred_cpm)