In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere under the Kaggle input directory.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
!pip install TPOT

In [None]:
from tpot import TPOTRegressor

In [None]:
# Load the auction dataset CSV from the Kaggle input directory into a DataFrame.
df = pd.read_csv('/kaggle/input/real-time-advertisers-auction/Dataset.csv')

In [None]:
# Cell output: pandas summary statistics for the freshly loaded frame.
df.describe()

In [None]:
def weird_division(n, d):
    """Divide ``n`` by ``d`` without failing on a zero denominator.

    Parameters
    ----------
    n : number
        Numerator.
    d : number
        Denominator.

    Returns
    -------
    number
        ``n / d`` when ``d`` is truthy; the integer ``0`` when ``d`` is falsy
        (e.g. ``0`` or ``0.0``). A NaN denominator is truthy, so the result
        in that case is NaN.
    """
    if not d:
        return 0
    return n / d

# CPM = (total_revenue * 100) / measurable_impressions * 1000, with rows that have
# zero measurable impressions set to 0 (the same rule weird_division applies).
# Computed column-wise rather than with a row-by-row df.apply(axis=1), which runs a
# Python call per row and is much slower on large frames.
_has_impressions = df['measurable_impressions'] != 0
df['CPM'] = (
    (df['total_revenue'] * 100)
    # Zero denominators are masked to NaN so no division by zero is ever evaluated.
    / df['measurable_impressions'].where(_has_impressions)
).where(_has_impressions, 0) * 1000

In [None]:
# Keep rows with a non-negative CPM (the comparison is False for NaN, so NaN rows are
# dropped as well), then discard rows at or above the 95th percentile of CPM.
# NOTE(review): both filters rebind `df`, so re-running this cell without reloading the
# data trims a further 5% each time.
df = df[df.CPM >= 0].reset_index(drop=True)
df = df[df['CPM'] < df['CPM'].quantile(0.95)].reset_index(drop=True)

# Time-based split: rows dated on or after the cutoff form the test set, earlier rows
# the training set. The date column is parsed once and reused for both masks.
split_date = pd.to_datetime('2019-06-22')
row_dates = pd.to_datetime(df.date)
test = df[row_dates >= split_date].reset_index(drop=True)
train = df[row_dates < split_date].reset_index(drop=True)

In [None]:
# Columns removed from both splits before modelling. total_revenue and
# measurable_impressions are the inputs of the CPM target, so keeping them would leak it;
# date was only needed for the train/test split.
# NOTE(review): the remaining columns are presumably dropped for similar leakage
# reasons - confirm.
cols_to_drop = [
    'date',
    'total_revenue',
    'measurable_impressions',
    'viewable_impressions',
    'revenue_share_percent',
    'total_impressions',
]
train = train.drop(columns=cols_to_drop)
test = test.drop(columns=cols_to_drop)

In [None]:
# Separate each split into the CPM target vector and the remaining feature columns,
# converting both to NumPy arrays for the estimators fitted later.
train_y = np.array(train['CPM'])
train_X = np.array(train.drop(columns='CPM'))

test_y = np.array(test['CPM'])
test_X = np.array(test.drop(columns='CPM'))

# Cell output: shapes of the training and test feature matrices.
train_X.shape, test_X.shape

## Linear Model

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer, mean_squared_error 
from sklearn import linear_model

# Baseline: Lasso regression with default settings, scored by 10-fold cross-validated
# MSE on the training arrays. make_scorer is used with its defaults, so the reported
# values are plain (positive) MSEs, one per fold.
lasso_model = linear_model.Lasso()
mse_scorer = make_scorer(mean_squared_error)
cv_res = cross_val_score(
    lasso_model,
    train_X,
    train_y,
    scoring=mse_scorer,
    cv=10,
)
cv_res

In [None]:
# Fit the Lasso model on the full training arrays so it can be used for the test-set
# predictions (cross_val_score works on clones and leaves lasso_model itself unfitted).
lasso_model.fit(train_X, train_y)

In [None]:
# Evaluate the fitted Lasso model on the test split (rows dated 2019-06-22 or later).
predictions_lasso = lasso_model.predict(test_X)
# scikit-learn metrics expect (y_true, y_pred). MSE is symmetric, so the printed value
# is unchanged, but the conventional order keeps the call correct if the metric is swapped.
print("MSE equal to: ", mean_squared_error(test_y, predictions_lasso))

## Boosting

In [None]:
from catboost import CatBoostRegressor
# Hyperparameters of the CatBoost regressor, gathered in one mapping and unpacked
# into the constructor.
catboost_params = {
    'iterations': 1000,
    'learning_rate': 0.5,
    'depth': 6,
    'l2_leaf_reg': 3,
    'loss_function': 'RMSE',
}
catboost_model = CatBoostRegressor(**catboost_params)

In [None]:
# Train the CatBoost regressor on the training arrays.
# NOTE(review): no verbosity option is set on the model, so this cell presumably prints
# a log line per boosting iteration - confirm and silence if the output is too long.
catboost_model.fit(train_X, train_y)

In [None]:
# Evaluate the fitted CatBoost model on the test split.
predictions = catboost_model.predict(test_X)
# scikit-learn metrics expect (y_true, y_pred). MSE is symmetric, so the printed value
# is unchanged, but the conventional order keeps the call correct if the metric is swapped.
print("MSE equal to: ", mean_squared_error(test_y, predictions))

## AutoML (TPOT)

In [None]:
from sklearn.model_selection import train_test_split, KFold, cross_val_score # to split the data
from sklearn.metrics import explained_variance_score, median_absolute_error, r2_score, mean_squared_error #To evaluate our model
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, fbeta_score #To evaluate our model
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import SGDRegressor
from sklearn.model_selection import KFold, cross_val_score, train_test_split # Model evaluation
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler # Preprocessing
from sklearn.linear_model import Lasso, Ridge, ElasticNet, RANSACRegressor, SGDRegressor, HuberRegressor, BayesianRidge # Linear models
from sklearn.ensemble import RandomForestRegressor, BaggingRegressor, AdaBoostRegressor, GradientBoostingRegressor, ExtraTreesRegressor  # Ensemble methods
from xgboost import XGBRegressor, plot_importance # XGBoost
from sklearn.svm import SVR, SVC, LinearSVC  # Support Vector Regression
from sklearn.tree import DecisionTreeRegressor # Decision Tree Regression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import Pipeline # Streaming pipelines
from sklearn.decomposition import KernelPCA, PCA # Dimensionality reduction
from sklearn.feature_selection import SelectFromModel # Dimensionality reduction
from sklearn.model_selection import learning_curve, validation_curve, GridSearchCV # Model evaluation
from sklearn.base import clone # Clone estimator
from sklearn.metrics import mean_squared_error as MSE

In [None]:
# Custom TPOT search space: each key is the import path of an estimator and each value
# is a dict mapping a hyperparameter name to the list of candidate values to explore.
# NOTE(review): the TPOTRegressor in this notebook is built with config_dict='TPOT light',
# so this dict is not used unless it is passed as config_dict=tpot_config.
tpot_config = {
    # An empty dict keeps the estimator's default hyperparameters
    # (the previous value, {''}, was a set literal rather than a parameter dict).
    'sklearn.ensemble.GradientBoostingRegressor': {},
    'xgboost.XGBRegressor': {
        # L1 regularisation strength; 'fit_prior' was removed because it is not an
        # XGBRegressor parameter.
        'reg_alpha': [1e-3, 1e-2, 1e-1, 1., 10., 100.],
    },
    # MultinomialNB is a classifier and cannot serve a regression search; Ridge is a
    # regressor that takes the same kind of `alpha` regularisation grid.
    'sklearn.linear_model.Ridge': {
        'alpha': [1e-3, 1e-2, 1e-1, 1., 10., 100.],
    },
}

In [None]:
# AutoML search set-up, using TPOT's built-in 'TPOT light' configuration.
tpot = TPOTRegressor(
    config_dict='TPOT light',
    scoring='neg_mean_squared_error',  # negated MSE, so higher is better
    cv=3,
    generations=6,
    population_size=50,
    early_stop=5,
    random_state=3,  # fixed seed for the search
    n_jobs=-1,
    verbosity=2,
)

In [None]:
# Run the TPOT pipeline search on the training arrays; afterwards `tpot` can be used
# for prediction.
tpot.fit(train_X, train_y)

In [None]:
# Evaluate the pipeline selected by TPOT on the test split.
predictions_tpot = tpot.predict(test_X)
# scikit-learn metrics expect (y_true, y_pred). MSE is symmetric, so the printed value
# is unchanged, but the conventional order keeps the call correct if the metric is swapped.
print("MSE equal to: ", mean_squared_error(test_y, predictions_tpot))

# Final results

In [None]:
# Build the comparison table from the test-set predictions computed in the cells above,
# so the chart always reflects the current run instead of hand-copied numbers.
# Values previously hard-coded here (presumably from an earlier run), kept for reference:
#   Lasso 7902.471763714017, TPOT 3659.890475848824, CatBoost 3185.3504729663064
res = pd.DataFrame({
    'model': ['Lasso', 'TPOT', 'CatBoost'],
    'Value': [
        mean_squared_error(test_y, predictions_lasso),
        mean_squared_error(test_y, predictions_tpot),
        mean_squared_error(test_y, predictions),  # `predictions` holds the CatBoost output
    ],
})
ax = res.plot.bar(x='model', y='Value', rot=0)
# Label the figure so it can be read on its own; the trailing `;` hides the repr.
ax.set(title='Test-set MSE by model', xlabel='Model', ylabel='MSE');