In [None]:
import pandas as pd
from sklearn.pipeline import Pipeline,FeatureUnion
from sklearn.base import BaseEstimator
from sklearn.base import TransformerMixin
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn import model_selection
from sklearn import metrics
import xgboost as xgb
from matplotlib import pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from datetime import datetime
# Directory prefix of the input data; it is string-concatenated with the CSV
# file name when the dataset is loaded, so the trailing slash is required.
PATH = "/kaggle/input/real-time-advertisers-auction/"

In [None]:
# Cut-off date (21 June 2019) used for the date-based train/test split.
TRAIN_DATA = datetime(2019,6,21)

In [None]:
# Read the raw dataset, parsing the 'date' column into datetimes so it can be
# compared against datetime cut-offs later on.
data = pd.read_csv(
    PATH + 'Dataset.csv',
    parse_dates=['date'],
)

In [None]:
# Preview the first rows of the loaded frame.
data.head()

In [None]:


def weird_division(n, d):
    """Divide ``n`` by ``d``, returning 0 when ``d`` is falsy (e.g. zero)."""
    if not d:
        return 0
    return n / d

# CPM = total_revenue * 100 / measurable_impressions * 1000, computed on whole
# columns instead of a Python-level row-by-row ``apply`` (same arithmetic order,
# far less overhead). Rows with zero measurable impressions get a CPM of 0,
# which is what weird_division returns for a zero denominator.
data['CPM'] = (
    (data['total_revenue'] * 100 / data['measurable_impressions'])
    .where(data['measurable_impressions'] != 0, 0)
    * 1000
)

In [None]:
# Discard rows with a negative CPM.
data = data[data['CPM'].ge(0)]

In [None]:
# Summary statistics of the current frame.
data.describe()

In [None]:
# Trim the upper tail: keep rows whose CPM lies strictly below the 95th
# percentile of the CPM column.
data = data[data['CPM'].lt(data['CPM'].quantile(0.95))]

In [None]:
# Time-based split around TRAIN_DATA. The model is fitted on rows dated up to
# and including the cut-off and evaluated on strictly later rows, so it is
# never scored on dates that precede its own training data (the previous
# version trained on the later period and tested on the earlier one).
data_train = data.loc[data['date'] <= TRAIN_DATA]
data_test = data.loc[data['date'] > TRAIN_DATA]

# Separate the target (CPM) from the remaining columns for each partition.
X_train = data_train.drop(columns=['CPM'])
y_train = data_train['CPM']
X_test = data_test.drop(columns=['CPM'])
y_test = data_test['CPM']

In [None]:
class FeatureSelector(BaseEstimator, TransformerMixin):
    """Stateless pipeline step that keeps only a chosen set of columns.

    Parameters
    ----------
    feature_names : list
        Column labels that ``transform`` selects from its input.
    """

    def __init__(self, feature_names):
        # Stored under exactly the constructor-argument name: BaseEstimator's
        # get_params() (used by clone(), repr and the model-selection tools)
        # looks parameters up with getattr(self, "<argument name>") and fails
        # if the attribute is stored under a different name.
        self.feature_names = feature_names

    def fit(self, X, y=None):
        """Nothing is learned from the data; return ``self`` unchanged."""
        return self

    def transform(self, X, y=None):
        """Return ``X`` indexed by the configured labels.

        ``X`` must support label-based indexing with a list (the notebook
        passes pandas DataFrames).
        """
        return X[self.feature_names]

In [None]:
# Columns passed to the models, in the order they are selected.
features = [
    'total_impressions',
    'viewable_impressions',
    'measurable_impressions',
    'revenue_share_percent',
    'site_id',
    'ad_type_id',
    'geo_id',
    'device_category_id',
    'advertiser_id',
    'order_id',
    'line_item_type_id',
    'os_id',
    'integration_type_id',
    'monetization_channel_id',
    'ad_unit_id',
]

In [None]:
# XGBoost pipeline: select the feature columns, standardise them, then fit a
# gradient-boosted regressor with a fixed seed.
pipeline = Pipeline(
    steps=[
        ('num_selector', FeatureSelector(features)),
        ('std_scaler', StandardScaler()),
        (
            'model',
            xgb.XGBRegressor(
                learning_rate=0.1,
                max_depth=20,
                n_estimators=20,
                random_state=42,
            ),
        ),
    ]
)

# Fit on the training partition and report the mean squared error of the
# predictions on the test partition (displayed as the cell output).
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)
metrics.mean_squared_error(y_test, y_pred)

In [None]:
# Random-forest pipeline with the same selection and scaling steps as the
# XGBoost pipeline.
pipeline = Pipeline(
    steps=[
        ('num_selector', FeatureSelector(features)),
        ('std_scaler', StandardScaler()),
        # Fixed seed so that re-running the notebook rebuilds the same forest
        # and reports the same error; 42 matches the seed given to the
        # XGBoost regressor.
        ('model', RandomForestRegressor(random_state=42)),
    ]
)

In [None]:
# Fit the current pipeline on the training partition, predict the test
# partition, and display the mean squared error as the cell output.
y_pred = pipeline.fit(X_train, y_train).predict(X_test)
metrics.mean_squared_error(y_true=y_test, y_pred=y_pred)