In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere under the Kaggle input directory.
input_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the advertisers' auction dataset from the Kaggle input directory into a DataFrame.
df = pd.read_csv('/kaggle/input/real-time-advertisers-auction/Dataset.csv')

In [None]:
# Preview the first rows of the loaded data.
df.head()

# Calculation and analysis of CPM

In [None]:
def weird_division(n, d):
    """Return ``n / d``, or 0 when the divisor ``d`` is falsy (e.g. zero or None)."""
    if not d:
        return 0
    return n / d

# CPM = (total_revenue * 100 / measurable_impressions) * 1000.
# Computed column-wise rather than with a per-row `apply`, which is very slow
# on large frames. Semantics match `weird_division`: a zero impression count
# yields a CPM of 0, while a missing (NaN) count still propagates as NaN.
_revenue_scaled = df['total_revenue'] * 100
_impressions = df['measurable_impressions']
_has_impressions = _impressions != 0
df['CPM'] = np.where(
    _has_impressions,
    # Zeros are masked to NaN before dividing so no inf / divide-by-zero
    # values are produced; those rows are replaced by 0 by np.where anyway.
    _revenue_scaled / _impressions.where(_has_impressions),
    0,
) * 1000

In [None]:
# Average CPM over all rows of the dataset.
df['CPM'].mean()

In [None]:
import matplotlib.pyplot as plt

# Plot CPM against the 'date' column.
df.plot(x='date', y='CPM')

We will remove the column 'total_revenue', since CPM is computed directly from it and keeping it would leak the target into the features. We also don't need the 'date' column as a feature.

In [None]:
# Columns excluded from the feature matrix: CPM is computed from
# 'total_revenue', and 'date' is only used for the chronological split below.
columns_to_remove = ['date', 'total_revenue']

# Parse the dates before comparing them. Comparing the raw strings is fragile:
# if the column carries a time part (e.g. '2019-06-21 00:00:00'), that string
# sorts *after* '2019-06-21' and the whole cut-off day would silently end up
# in the test set.
split_date = pd.Timestamp('2019-06-21')
row_day = pd.to_datetime(df['date']).dt.normalize()

# Training period: everything up to and including the cut-off day. Keep only
# rows whose CPM is non-negative and not above this subset's 95th percentile.
train = df.loc[row_day <= split_date].drop(columns=columns_to_remove)
train = train[train['CPM'].between(0, train['CPM'].quantile(.95))]
y_train = train['CPM']
X_train = train.drop(columns=['CPM'])  # new frame instead of an in-place drop on a slice

# Test period: everything after the cut-off day, filtered by the same rule
# (using the test subset's own 95th percentile).
test = df.loc[row_day > split_date].drop(columns=columns_to_remove)
test = test[test['CPM'].between(0, test['CPM'].quantile(.95))]
y_test = test['CPM']
X_test = test.drop(columns=['CPM'])

In [None]:
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error

# Baseline model on all remaining features. `eval_metric` is set on the
# estimator: passing it to `fit()` was deprecated and is rejected by recent
# XGBoost releases.
xgb_start = XGBRegressor(random_state=0, eval_metric='rmse')
xgb_start.fit(X_train, y_train)
pred = xgb_start.predict(X_test)
# Mean squared error on the test period (lower is better).
mean_squared_error(y_test, pred)

In [None]:
def get_feature_importance(clsf, ftrs):
    """Tabulate a fitted model's feature importances, highest score first.

    Parameters
    ----------
    clsf : object exposing ``feature_importances_`` (something with ``.tolist()``).
    ftrs : sequence of feature names, one per importance value.

    Returns
    -------
    pandas.DataFrame with columns 'feat' and 'score', sorted by 'score'
    in descending order (the original positional index is kept).
    """
    scores = clsf.feature_importances_.tolist()
    ranking = pd.DataFrame({'feat': ftrs, 'score': scores})
    return ranking.sort_values(by=['score'], ascending=False)

# Rank the training columns by the baseline model's feature importances.
get_feature_importance(xgb_start, X_train.columns)

Apparently, the columns 'measurable_impressions', 'order_id' and 'line_item_type_id' leak information about the target, so the author of the initial solution removed them. We will remove them too.

The columns 'integration_type_id' and 'revenue_share_percent' have no effect on the result, but the author of the initial solution removed them, so they are dropped here as well.

In [None]:
# Columns excluded from the feature matrix: the split/target helpers ('date',
# 'total_revenue') plus the columns the notes above decide to drop.
columns_to_remove = ['date', 'total_revenue', 'measurable_impressions', 'order_id' , 'line_item_type_id', 'integration_type_id' , 'revenue_share_percent']

# Parse the dates before comparing them. Comparing the raw strings is fragile:
# if the column carries a time part (e.g. '2019-06-21 00:00:00'), that string
# sorts *after* '2019-06-21' and the whole cut-off day would silently end up
# in the test set.
split_date = pd.Timestamp('2019-06-21')
row_day = pd.to_datetime(df['date']).dt.normalize()

# Training period: everything up to and including the cut-off day. Keep only
# rows whose CPM is non-negative and not above this subset's 95th percentile.
train = df.loc[row_day <= split_date].drop(columns=columns_to_remove)
train = train[train['CPM'].between(0, train['CPM'].quantile(.95))]
y_train = train['CPM']
X_train = train.drop(columns=['CPM'])  # new frame instead of an in-place drop on a slice

# Test period: everything after the cut-off day, filtered by the same rule
# (using the test subset's own 95th percentile).
test = df.loc[row_day > split_date].drop(columns=columns_to_remove)
test = test[test['CPM'].between(0, test['CPM'].quantile(.95))]
y_test = test['CPM']
X_test = test.drop(columns=['CPM'])

In [None]:
# Same default XGBoost model, now trained on the reduced feature set.
# `eval_metric` is set on the estimator: passing it to `fit()` was deprecated
# and is rejected by recent XGBoost releases.
xgb = XGBRegressor(random_state=0, eval_metric='rmse')
xgb.fit(X_train, y_train)
pred = xgb.predict(X_test)
# Mean squared error on the test period (lower is better).
mean_squared_error(y_test, pred)

In [None]:
# Rank the remaining training columns by the refitted model's feature importances.
get_feature_importance(xgb, X_train.columns)

# Find best models

In [None]:
from sklearn.linear_model import LinearRegression, LogisticRegression, Ridge, Lasso, ElasticNet 
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor 
from lightgbm import LGBMRegressor

In [None]:
# Candidate regressors to compare, each created with random_state=0 and
# otherwise default settings. The order matches the order of the report.
candidate_classes = (
    GradientBoostingRegressor,
    XGBRegressor,
    RandomForestRegressor,
    LGBMRegressor,
    Ridge,
)
models = [regressor_cls(random_state=0) for regressor_cls in candidate_classes]

In [None]:
# Fit every candidate on the training period and report its test-set error.
for model in models:
    # `eval_metric` is not passed to `fit()`: without an evaluation set it only
    # names a monitoring metric and does not affect the fitted model, and the
    # previous bare `except:` fallback hid every error (including interrupts)
    # behind a silent second fit.
    model.fit(X_train, y_train)
    pred = model.predict(X_test)
    model_name = type(model).__name__
    # The score is a mean squared error, so it is labelled MSE (it used to be
    # printed as "MAE").
    print(f'{model_name} - MSE {mean_squared_error(y_test, pred)}')

# Prepare Ensemble of best models

In [None]:
# Equal-weight ensemble of three of the candidate regressors.
models = [
    XGBRegressor(random_state=0),
    RandomForestRegressor(random_state=0),
    LGBMRegressor(random_state=0),
]
for model in models:
    # Plain fit for every model: no evaluation set is supplied, so there is no
    # need for a per-library `eval_metric` argument or a catch-all `except:`.
    model.fit(X_train, y_train)

# Average the per-model predictions. np.mean divides by the actual number of
# models (no hard-coded 3) and builds a fresh array, rather than accumulating
# in place into the first model's prediction array and its dtype.
pred = np.mean([model.predict(X_test) for model in models], axis=0)

print(mean_squared_error(y_test, pred))

# Search for the best parameters for the models

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# 5-fold grid search over tree depth and number of trees for XGBoost,
# scored with negated mean squared error.
xg_reg = XGBRegressor(random_state=0)
params = dict(max_depth=[5, 10, 15], n_estimators=[50, 75, 100])

grid_search = GridSearchCV(
    estimator=xg_reg,
    param_grid=params,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
    verbose=2,
)

grid_search.fit(X_train, y_train)

In [None]:
# Show the parameter combination selected by the XGBoost grid search.
grid_search.best_params_

In [None]:
# XGBoost with fixed max_depth / n_estimators values.
# `eval_metric` is set on the estimator: passing it to `fit()` was deprecated
# and is rejected by recent XGBoost releases.
xgb = XGBRegressor(random_state=0, max_depth=10, n_estimators=75, eval_metric='rmse')
xgb.fit(X_train, y_train)
pred = xgb.predict(X_test)
# Mean squared error on the test period (lower is better).
mean_squared_error(y_test, pred)

In [None]:
# 5-fold grid search over tree depth and forest size for the random forest,
# scored with negated mean squared error.
rf_reg = RandomForestRegressor(random_state=0)
params = dict(max_depth=[15, 30, 25], n_estimators=[150, 200])

grid_search_rf = GridSearchCV(
    estimator=rf_reg,
    param_grid=params,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
    verbose=2,
)

grid_search_rf.fit(X_train, y_train)

In [None]:
# Random forest with fixed max_depth / n_estimators values, scored by mean
# squared error on the test period.
rf = RandomForestRegressor(n_estimators=200, max_depth=25, random_state=0).fit(
    X_train, y_train
)
pred = rf.predict(X_test)
mean_squared_error(y_test, pred)

In [None]:
# 5-fold grid search over tree depth and number of trees for LightGBM,
# scored with negated mean squared error.
lgbm_reg = LGBMRegressor(random_state=0)
params = dict(max_depth=[15, 50, 100], n_estimators=[200, 500, 1000])

grid_search_lgbm = GridSearchCV(
    estimator=lgbm_reg,
    param_grid=params,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
    verbose=2,
)

grid_search_lgbm.fit(X_train, y_train)

In [None]:
# LightGBM with fixed max_depth / n_estimators values, scored by mean squared
# error on the test period.
lgbm = LGBMRegressor(n_estimators=1000, max_depth=50, random_state=0).fit(
    X_train, y_train, eval_metric='rmse'
)
pred = lgbm.predict(X_test)
mean_squared_error(y_test, pred)

# Voting

In [None]:
from sklearn.ensemble import VotingRegressor
# Combine the three tuned regressors in a single voting ensemble and fit it
# on the training period.
named_estimators = [
    ('rf', rf),
    ('xgb', xgb),
    ('lgbm', lgbm),
]
VotReg = VotingRegressor(estimators=named_estimators)
VotReg.fit(X_train, y_train)

In [None]:
# Predict the test period with the voting ensemble and report its mean squared error.
pred = VotReg.predict(X_test)
mean_squared_error(y_test, pred)

Of course, a more detailed investigation would likely improve the result.