In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.layers.experimental import preprocessing
from tensorflow.keras import regularizers
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from datetime import datetime
import matplotlib.pyplot as plt

import os
# Print the full path of every file found below the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

In [None]:
# Read the raw dataset; the cells below build on this frame.
DATASET_PATH = '/kaggle/input/real-time-advertisers-auction/Dataset.csv'
df = pd.read_csv(DATASET_PATH)

In [None]:
def weird_division(n, d):
    """Return ``n / d``, or ``0`` when the denominator ``d`` is falsy (e.g. 0)."""
    if not d:
        return 0
    return n / d

# CPM = total_revenue * 100 / measurable_impressions * 1000.
# Computed column-wise instead of with a row-by-row ``apply``: the arithmetic
# and its order are the same, but the work is done once per column.
impressions = df['measurable_impressions']
cpm = (df['total_revenue'] * 100) / impressions * 1000
# Rows with zero measurable impressions get CPM = 0 (rather than inf/NaN from
# the division); rows whose impressions are NaN keep their NaN result.
df['CPM'] = cpm.where(impressions != 0, 0)

In [None]:
# Remove total_revenue, since it was used to calculate the target metric (CPM).
# NOTE(review): measurable_impressions also went into the CPM formula but is
# NOT dropped here, so it stays in the frame as a feature -- confirm that this
# is intended.
df = df.drop(columns=['total_revenue'])

In [None]:
# Report the frame's dimensions and the number of fully duplicated rows.
row_count, column_count = df.shape
duplicate_count = df.duplicated().sum()
print("The dataset has {} rows and {} columns.".format(row_count, column_count))
print("It contains {} duplicates.".format(duplicate_count))

In [None]:
# Summary statistics of the target column.
df['CPM'].describe()

## Process categorical features

In [None]:
# Number of distinct values in each column.
df.nunique(axis=0)

In [None]:
# Columns that are cast to the pandas 'category' dtype and one-hot encoded below.
categorial_features = [
    'site_id',
    'ad_type_id',
    'geo_id',
    'device_category_id',
    'advertiser_id',
    'os_id',
    'integration_type_id',
    'monetization_channel_id',
    'ad_unit_id',
]

In [None]:
# Cast every listed column to the 'category' dtype in a single astype call.
df = df.astype({name: 'category' for name in categorial_features})

In [None]:
# Columns of dtype float64, int64, bool or object are carried over unchanged.
passthrough_dtypes = ['float64', 'int64', 'bool', 'object']
num_feats = df.select_dtypes(include=passthrough_dtypes).copy()

# One-hot encode the 'category' columns, then keep only those indicator
# columns whose column sum exceeds 5000 (i.e. levels seen in more than 5000 rows).
cat_feats = pd.get_dummies(df.select_dtypes(include=['category']).copy())
is_frequent = (cat_feats.sum() > 5000).values
cat_feats = cat_feats.loc[:, is_frequent]

In [None]:
# Place the pass-through columns and the one-hot indicators side by side.
features_recoded = pd.concat(objs=[num_feats, cat_feats], axis='columns')

In [None]:
# Preview the first five rows of the recoded feature frame.
features_recoded.head(n=5)

In [None]:
# Report the dimensions of the recoded feature frame.
recoded_rows, recoded_columns = features_recoded.shape
print("The dataset has {} rows and {} columns.".format(recoded_rows, recoded_columns))

## Split into train and test datasets (test set will be used in the end)

In [None]:
# Store parsed datetimes in the 'date' column so rows can be compared against
# a cut-off date.
parsed_dates = pd.to_datetime(df['date'])
features_recoded['date'] = parsed_dates

Date cut-off:

In [None]:
# Test period: every row dated on or after 2019-06-22.
in_test_period = features_recoded['date'] >= pd.to_datetime('2019-06-22')
test_df = features_recoded[in_test_period]

Apply a non-negative-CPM condition and a 95th-percentile cut-off to the test set

In [None]:
# Discard negative CPM values first, then drop everything at or above the
# 95th percentile of the remaining test rows.
test_df = test_df.loc[test_df['CPM'] >= 0]
test_cpm_cap = test_df['CPM'].quantile(0.95)
test_df = test_df.loc[test_df['CPM'] < test_cpm_cap]

## Set aside test_X and test_y dataframes for later use (to get final results after CV)

In [None]:
# Target first, then the feature matrix (everything except the date and the target).
test_y = test_df['CPM']
test_X = test_df.drop(columns=['date', 'CPM'])

## Now get train data set

In [None]:
# Training period: every row dated strictly before 2019-06-22.
in_train_period = features_recoded['date'] < pd.to_datetime('2019-06-22')
train_df = features_recoded[in_train_period]

Clean train set

In [None]:
# Discard negative CPM values first, then drop everything at or above the
# 95th percentile of the remaining training rows.
train_df = train_df.loc[train_df['CPM'] >= 0]
train_cpm_cap = train_df['CPM'].quantile(0.95)
train_df = train_df.loc[train_df['CPM'] < train_cpm_cap]

In [None]:
# Target first, then the feature matrix (everything except the date and the target).
train_target = train_df['CPM']
train_df_features = train_df.drop(columns=['date', 'CPM'])

# CV (hold-out validation split)

In [None]:
# import train_test_split function
from sklearn.model_selection import train_test_split
# import metrics
from sklearn.metrics import mean_squared_error, r2_score

# Hold out 30% of the training period for validation.
# random_state pins the shuffle: without it the rows are partitioned
# differently on every execution, so the reported score is not reproducible.
train_X_train, train_X_test, train_y_train, train_y_test = train_test_split(
    train_df_features, train_target, test_size=0.3, random_state=15)

In [None]:
# scale data
sc = StandardScaler()
X_train = sc.fit_transform(train_X_train)
X_test  = sc.transform(train_X_test)

In [None]:
# Fix random seeds for reproducibility. NumPy's seed alone is not enough:
# Keras weight initialisation and dropout draw from TensorFlow's generator,
# so TensorFlow is seeded as well.
seed = 15
np.random.seed(seed)
tf.random.set_seed(seed)

print(tf.__version__)

# Two hidden ReLU layers with L2 weight penalties and 50% dropout, followed by
# a single linear output unit for the regression target.
linear_model = tf.keras.Sequential([
    layers.Dense(units=700, activation='relu', kernel_regularizer=regularizers.l2(5)),
    layers.Dropout(0.5),
    layers.Dense(units=150, activation='relu', kernel_regularizer=regularizers.l2(1)),
    layers.Dropout(0.5),
    layers.Dense(units=1, activation='linear')
])

# NOTE(review): this changes the Keras backend epsilon globally -- confirm it is intended.
keras.backend.set_epsilon(0.01)
linear_model.compile(
    optimizer=tf.optimizers.Adam(),
    loss='mean_squared_error')

# use_multiprocessing is not passed: per the Keras docs it applies only to
# generator / Sequence inputs, and the inputs here are in-memory arrays.
history = linear_model.fit(
    X_train, train_y_train,
    epochs=150,
    # one summary line per epoch instead of a progress bar
    verbose=2,
    # calculate validation results on 20% of the data passed to fit()
    validation_split=0.2,
    batch_size=2048
)

y_hat = linear_model.predict(X_test).flatten()
# scikit-learn's signature is mean_squared_error(y_true, y_pred)
print("Score on cross-validation is ", mean_squared_error(train_y_test, y_hat))

In [None]:
def plot_loss(model_history):
    """Plot the training loss and, when available, the validation loss per epoch.

    Parameters
    ----------
    model_history : dict
        Mapping of metric name to a per-epoch sequence of values (e.g. the
        ``history`` attribute of the object returned by ``fit``). Must contain
        ``'loss'``; ``'val_loss'`` is drawn as well when present.

    Raises
    ------
    KeyError
        If ``model_history`` has no ``'loss'`` entry.
    """
    # Look the curves up by name rather than by position among the keys that
    # contain "loss": extra loss-like metrics can no longer be mislabelled and
    # a fit without validation data no longer raises IndexError.
    train_loss = model_history['loss']
    valid_loss = model_history.get('val_loss')

    fig, ax1 = plt.subplots()
    color = 'tab:blue'
    ax1.set_xlabel('Epoch')
    ax1.set_ylabel('Loss', color=color)
    ax1.plot(train_loss, '--', color=color, label='Train Loss')
    if valid_loss is not None:
        ax1.plot(valid_loss, color=color, label='Valid Loss')
    ax1.tick_params(axis='y', labelcolor=color)
    ax1.legend(loc='upper left')
    ax1.set_title('Model Loss')
    plt.show()
    
# Draw the loss curves recorded during the fit above.
plot_loss(model_history=history.history)

In [None]:
# Score the fitted model on the rows it was trained on, so that the number
# printed matches its "Train set" label. (Previously this printed the MSE on
# the held-out rows, i.e. the same value already reported right after fitting.)
train_pred = linear_model.predict(X_train).flatten()
print("Train set score is ", mean_squared_error(train_y_train, train_pred))

## Get predictions and compute MSE on deferred test

In [None]:
# Refit the scaler on the complete training period and apply it to the
# deferred test set. Note: this rebinds sc, X_train and X_test.
sc = StandardScaler()
sc.fit(train_df_features)
X_train = sc.transform(train_df_features)
X_test = sc.transform(test_X)

In [None]:
# Fix random seeds for reproducibility. NumPy's seed alone is not enough:
# Keras weight initialisation and dropout draw from TensorFlow's generator,
# so TensorFlow is seeded as well.
seed = 15
np.random.seed(seed)
tf.random.set_seed(seed)

print(tf.__version__)

# Two hidden ReLU layers with L2 weight penalties and 50% dropout, followed by
# a single linear output unit for the regression target.
linear_model = tf.keras.Sequential([
    layers.Dense(units=700, activation='relu', kernel_regularizer=regularizers.l2(5)),
    layers.Dropout(0.5),
    layers.Dense(units=150, activation='relu', kernel_regularizer=regularizers.l2(1)),
    layers.Dropout(0.5),
    layers.Dense(units=1, activation='linear')
])

# NOTE(review): this changes the Keras backend epsilon globally -- confirm it is intended.
keras.backend.set_epsilon(0.01)
linear_model.compile(
    optimizer=tf.optimizers.Adam(),
    loss='mean_squared_error')


# use_multiprocessing is not passed: per the Keras docs it applies only to
# generator / Sequence inputs, and the inputs here are in-memory arrays.
history = linear_model.fit(
    X_train, train_target,
    epochs=150,
    # one summary line per epoch instead of a progress bar
    verbose=2,
    # calculate validation results on 20% of the data passed to fit()
    validation_split=0.2,
    batch_size=2048
)

In [None]:
# Predict on the deferred test set and report its mean squared error.
y_hat = linear_model.predict(X_test).flatten()
# scikit-learn's signature is mean_squared_error(y_true, y_pred); MSE is
# symmetric, so the value is unchanged, but the call now follows the API.
print("Test Set MSE is ", mean_squared_error(test_y, y_hat))