In [None]:
import os
from warnings import filterwarnings

from matplotlib.pyplot import figure
from pandas import read_csv
from seaborn import heatmap
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from catboost import CatBoostRegressor
from tqdm.notebook import tqdm


# Hide every warning so that cell outputs stay readable.
filterwarnings("ignore")
# Register the tqdm-backed `progress_apply` methods on pandas objects.
tqdm.pandas()

# Print the full path of each file found below the Kaggle input directory.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# *Support class and functions*

In [None]:
class FreqEncoder:
    """Encode categorical columns by the relative frequency of their values.

    For every fitted column the encoder stores ``value -> count / number of rows``
    and replaces each value by that fraction when transforming. Values that were
    not seen while fitting (and missing values) are encoded as NaN.
    """

    def __init__(self):
        # Dict ``column name -> Series(value -> relative frequency)``; None until fitted.
        self._freqs = None

    def _freq_encoding(self, data, column):
        """Compute and store the relative frequency of each value of ``column``."""
        self._freqs[column] = data.groupby(column).size() / len(data)

    def fit(self, data):
        """Learn the value frequencies of every column of ``data``.

        Any previously learned frequencies are discarded.

        Returns:
            The encoder itself, so calls can be chained.
        """
        self._freqs = {}
        for column in data.columns:
            self._freq_encoding(data, column)
        return self

    def transform(self, data):
        """Return a copy of ``data`` with every column replaced by learned frequencies.

        The input frame is left untouched.

        Raises:
            ValueError: if the encoder holds no learned frequencies.
            KeyError: if ``data`` has a column that was not present when fitting.
        """
        if not self._freqs:
            raise ValueError("Encoder didn't fit.\n")
        # Work on a copy so the caller's frame (often a slice) is never modified.
        encoded = data.copy()
        for column in data.columns:
            encoded[column] = data[column].map(self._freqs[column])
        return encoded

    def fit_transform(self, data):
        """Learn the frequencies from ``data`` and return its encoded copy."""
        return self.fit(data).transform(data)
    

In [None]:
def weird_division(num, divisor):
    """Return ``num / divisor``, or 0 when ``divisor`` is falsy (for example zero)."""
    if not divisor:
        return 0
    return num / divisor


def _ratio_or_zero(numerator, denominator):
    """Element-wise ``numerator / denominator`` that yields 0 where the denominator is 0."""
    return (numerator / denominator).where(denominator != 0, 0)


def prepcoress_data(data, drop_columns):
    """Create the features/target and drop the columns that are no longer needed.

    Two columns are added:

    * ``"viewable/measurable"``: ``viewable_impressions / measurable_impressions``;
    * ``"CPM"`` (the target): ``revenue * 100 / impressions * 1000``, where the
      revenue column is ``drop_columns[-2]`` and the impressions column is
      ``drop_columns[-1]``.

    Both ratios are 0 where the denominator is 0. Rows whose CPM is negative
    (or NaN) are removed, then ``drop_columns`` are dropped. "CPM" ends up as
    the last column of the result.

    The computation is vectorised and the input frame is not modified; a new
    frame is returned.

    Args:
        data: frame containing ``viewable_impressions``,
            ``measurable_impressions`` and every name in ``drop_columns``.
        drop_columns: column names to remove; its last two entries must be the
            revenue column and the impressions column, in that order.

    Returns:
        A new DataFrame with the added columns, filtered rows and
        ``drop_columns`` removed.
    """
    # The CPM inputs are identified by position at the end of `drop_columns`.
    revenue_column, impressions_column = drop_columns[-2], drop_columns[-1]
    viewable_share = _ratio_or_zero(
        data["viewable_impressions"], data["measurable_impressions"]
    )
    cpm = _ratio_or_zero(data[revenue_column] * 100, data[impressions_column]) * 1000
    # `assign` builds a new frame, so the caller's `data` keeps its original columns.
    prepared = data.assign(**{"viewable/measurable": viewable_share, "CPM": cpm})
    prepared = prepared[prepared["CPM"] >= 0]
    # Drop the unnecessary columns without mutating the filtered slice in place.
    return prepared.drop(columns=drop_columns)


def freq_encoding(dataset, column):
    """Replace the values of ``column`` by their relative frequency in ``dataset``.

    The frequency of a value is ``number of rows holding it / len(dataset)``.
    The column is overwritten in ``dataset`` itself, and that same frame is
    returned for convenience.

    Args:
        dataset: DataFrame holding ``column``.
        column: name of the column to encode.

    Returns:
        ``dataset`` with ``column`` frequency-encoded.
    """
    freq = dataset.groupby(column).size() / len(dataset)
    # Operate on the argument, not on a notebook-level global frame.
    dataset[column] = dataset[column].map(freq)
    return dataset


def train_test(dataset, mask, train_cut=(.025, .975)):
    """Split ``dataset`` into train/test parts and trim both by "CPM" quantiles.

    Rows where ``mask`` is true form the train part, the others the test part.
    Train keeps the rows whose "CPM" lies between the two ``train_cut``
    quantiles (bounds included); test keeps the rows whose "CPM" is strictly
    below its own 0.95 quantile.
    """
    train_part, test_part = dataset[mask], dataset[~mask]

    # Trim the train target to the requested quantile band (2.5%..97.5% by default).
    train_cpm = train_part["CPM"]
    lower_bound = train_cpm.quantile(train_cut[0])
    upper_bound = train_cpm.quantile(train_cut[1])
    train_part = train_part[train_cpm.between(lower_bound, upper_bound)]

    # Trim the test target to everything below its 95th percentile.
    test_cpm = test_part["CPM"]
    test_part = test_part[test_cpm < test_cpm.quantile(.95)]
    return train_part, test_part


# *Read dataset*

In [None]:
# Load the auction dataset from the Kaggle input directory and preview its first rows.
data = read_csv(
    filepath_or_buffer="/kaggle/input/real-time-advertisers-auction/Dataset.csv"
)
data.head()

# *Preprocess dataset (create target, drop unnecessary columns)*

In [None]:
# Columns removed once the target exists. The order matters: prepcoress_data reads
# the last two entries (revenue, then measurable impressions) to build "CPM".
drop_columns = [
    "integration_type_id",
    "revenue_share_percent",
    "viewable_impressions",
    "total_revenue",
    "measurable_impressions",
]
data = prepcoress_data(data=data, drop_columns=drop_columns)
data.head()

In [None]:
# `DataFrame.count()` reports the number of non-missing values per column.
# Calling `.count()` on the boolean result of `notna()` would instead return the
# total number of rows for every column, because True/False are never missing.
print(f"Count not NaN:\n{data.count()}")

# *Split data into train and test. Cut test at the 95th percentile; the train quantiles are configurable*

In [None]:
# Rows dated before 2019-06-22 go to train, the remaining rows go to test.
mask = data["date"] < "2019-06-22 00:00:00"
# Keep only the 6%..94% quantile band of the train target.
train, test = train_test(dataset=data, mask=mask, train_cut=(.06, 0.94))
train.shape, test.shape

# *Compute rank correlation*

In [None]:
# Every column whose name contains "id" is treated as a categorical identifier.
columns = train.columns
cat_columns = list(filter(lambda name: "id" in name, columns))
# Pairwise Spearman rank correlation between the identifier columns.
cat_corr = train.loc[:, cat_columns].corr(method="spearman")
figure(figsize=(14, 8))
heatmap(data=cat_corr, vmin=0, vmax=1, cmap="seismic", annot=True, square=True);

# *Drop columns with high correlation*

In [None]:
# Remove "site_id" from the training frame. Reassigning (instead of dropping in
# place) and ignoring an already-missing column keeps this cell safe to re-run;
# the in-place version raised KeyError on a second execution.
train = train.drop(columns=["site_id"], errors="ignore")
columns = train.columns
cat_columns = [column for column in columns if "id" in column]
# Recompute the Spearman rank correlation for the remaining identifier columns.
cat_corr = train[cat_columns].corr(method="spearman")
figure(figsize=(14, 8))
heatmap(cat_corr, square=True, annot=True, cmap="seismic", vmin=0, vmax=1);

# *Apply frequency encoding to categorical columns*

In [None]:
# Learn the value frequencies on the train rows and reuse them for the test rows.
encoder = FreqEncoder()
columns = train.columns
cat_columns = list(filter(lambda name: "id" in name, columns))
train[cat_columns] = encoder.fit_transform(train[cat_columns])
test[cat_columns] = encoder.transform(test[cat_columns])
# "date" is removed from both frames before modelling.
train.drop(columns=["date"], inplace=True)
test.drop(columns=["date"], inplace=True)

In [None]:
# Spearman rank correlation of the identifier columns after frequency encoding.
corr = train.loc[:, cat_columns].corr(method="spearman")
figure(figsize=(14, 8))
heatmap(data=corr, vmin=0, vmax=1, cmap="seismic", annot=True, square=True);

# *Split train dataset into split_train and split_validate*

In [None]:
columns = train.columns
# Hold out 10% of the train rows for validation (shuffled, fixed seed).
split_train, split_validate = train_test_split(
    train, test_size=0.1, random_state=42, shuffle=True
)
# Features are all columns but the last one; this relies on "CPM" being the final column.
split_x_train = split_train[columns[: -1]]
split_y_train = split_train["CPM"]
split_x_validate = split_validate[columns[: -1]]
split_y_validate = split_validate["CPM"]
x_test = test[columns[: -1]]
y_test = test["CPM"]
print(
    "\n".join([
        f"Split x_tran: {split_x_train.shape}",
        f"Split y_tran: {split_y_train.shape}",
        f"Split x_validate: {split_x_validate.shape}",
        f"Split y_validate: {split_y_validate.shape}",
        f"Split x_test: {x_test.shape}",
        f"Split y_test: {y_test.shape}",
    ])
)

# *Train with validate dataset*

In [None]:
# Up to 3000 boosting iterations; training stops once the validation metric has not
# improved for 20 rounds, and the best iteration is kept.
model = CatBoostRegressor(iterations=3000, random_state=42, verbose=200)
model.fit(
    X=split_x_train,
    y=split_y_train,
    eval_set=(split_x_validate, split_y_validate),
    use_best_model=True,
    early_stopping_rounds=20,
)

In [None]:
# List each feature with its importance, rounded to four decimals.
print("Feature importances:")
for feature, score in zip(model.feature_names_, model.feature_importances_):
    print(f"  -{feature}: {round(score, 4)}")

# *Predict with the model that was fitted with the validation dataset*

In [None]:
# Score the current model on the test rows with mean squared error.
y_pred = model.predict(data=x_test)
print(f"MSE: {mean_squared_error(y_true=y_test, y_pred=y_pred)}")

# *Train without validate dataset*

In [None]:
# Refit on the whole train frame: no validation set, so all 3000 iterations are run.
model = CatBoostRegressor(iterations=3000, random_state=42, verbose=200)
# Features are all columns but the last one; "CPM" is the target.
x_train = train[columns[: -1]]
y_train = train["CPM"]
model.fit(X=x_train, y=y_train)

In [None]:
# List each feature with its importance, rounded to four decimals.
print("Feature_importances")
for feature, score in zip(model.feature_names_, model.feature_importances_):
    print(f"  -{feature}: {round(score, 4)}")

# *Predict with the model that was fitted on the full train dataset*

In [None]:
# Score the current model on the test rows with mean squared error.
y_pred = model.predict(data=x_test)
print(f"MSE: {mean_squared_error(y_true=y_test, y_pred=y_pred)}")