## Import Libraries and Data

In [None]:
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Collect (and echo) the full path of every file found under the Kaggle input directory.
files = []
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        path = os.path.join(root, name)
        files.append(path)
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import datetime as dt
import lightgbm as lgb

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings("ignore")

In [None]:
# Load the dataset; files[-1] is the last path collected by the directory walk.
df = pd.read_csv(files[-1])

# Parse the "date" column a single time and derive every date feature from that
# one datetime series, instead of re-parsing the column for each feature.
parsed_dates = pd.to_datetime(df["date"])
df["date"] = parsed_dates.dt.date  # plain datetime.date objects (time of day dropped)
df["day_from_date"] = parsed_dates.dt.day
df["month_from_date"] = parsed_dates.dt.month

print(df.shape)
df.head(1)

In [None]:
# Print a summary of the frame: column names, dtypes and non-null counts.
df.info()

In [None]:
# Descriptive statistics for the columns pandas summarises by default.
df.describe()

In [None]:
%%time
# Target variable: CPM = (total_revenue * 100) / measurable_impressions * 1000.
# Rows with zero measurable impressions get a CPM of 0 instead of a division result.
has_impressions = df["measurable_impressions"] != 0
cpm_when_measured = (df["total_revenue"] * 100) / (df['measurable_impressions']) * 1000
df["CPM"] = np.where(has_impressions, cpm_when_measured, 0)

# Distribution of the target.
sns.distplot(df["CPM"])

In [None]:
# Heatmap of pairwise feature correlations (the target "CPM" is excluded).
# Only numeric/bool columns are correlated: "date" holds datetime.date objects,
# and DataFrame.corr() raises on such a column under pandas >= 2.0, where
# numeric_only defaults to False.
sns.heatmap(df.drop("CPM", axis=1).select_dtypes(include=["number", "bool"]).corr())

In [None]:
## Drop highly correlated features
# Correlate numeric/bool columns only: the object-typed "date" column cannot be
# correlated, and DataFrame.corr() raises on it under pandas >= 2.0.
numeric_features = df.drop("CPM", axis=1).select_dtypes(include=["number", "bool"])
corr_matrix = numeric_features.corr().abs()

# Keep the strict upper triangle so every pair is inspected once and the diagonal
# (self-correlation of 1.0) is ignored. The builtin `bool` replaces `np.bool`,
# which was removed in NumPy 1.24.
upper_mask = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
upper = corr_matrix.where(upper_mask)

# A column is dropped when it correlates above 0.95 with any column listed before it.
to_drop = [column for column in upper.columns if (upper[column] > 0.95).any()]
print("before remove:", df.shape)
df.drop(to_drop, axis=1, inplace=True)
print("after remove:", df.shape)

In [None]:
# Re-inspect the remaining columns after the highly correlated features were removed.
df.info()

## Features

In [None]:
# Mean CPM for each calendar date, shown as a bar chart.
mean_cpm_by_date = df[["date", "CPM"]].groupby(["date"]).agg({"CPM": "mean"})
mean_cpm_by_date.unstack().plot.bar()

In [None]:
# Mean CPM for each site, shown as a bar chart.
mean_cpm_by_site = df[["site_id", "CPM"]].groupby(["site_id"]).agg({"CPM": "mean"})
mean_cpm_by_site.unstack().plot.bar()

In [None]:
# Median (not mean) CPM for each ad type, shown as a bar chart.
median_cpm_by_ad_type = df[["ad_type_id", "CPM"]].groupby(["ad_type_id"]).agg({"CPM": "median"})
median_cpm_by_ad_type.unstack().plot.bar()

In [None]:
# Mean CPM for each device category, shown as a bar chart.
mean_cpm_by_device = df[["device_category_id", "CPM"]].groupby(["device_category_id"]).agg({"CPM": "mean"})
mean_cpm_by_device.unstack().plot.bar()

In [None]:
# Mean CPM for each operating system id, shown as a bar chart.
mean_cpm_by_os = df[["os_id", "CPM"]].groupby(["os_id"]).agg({"CPM": "mean"})
mean_cpm_by_os.unstack().plot.bar()

In [None]:
# Mean CPM for each monetization channel, shown as a bar chart.
mean_cpm_by_channel = df[["monetization_channel_id", "CPM"]].groupby(["monetization_channel_id"]).agg({"CPM": "mean"})
mean_cpm_by_channel.unstack().plot.bar()

In [None]:
# Mean CPM for each day-of-month value, shown as a bar chart.
mean_cpm_by_day = df[["day_from_date", "CPM"]].groupby(["day_from_date"]).agg({"CPM": "mean"})
mean_cpm_by_day.unstack().plot.bar()

## Preprocessing

In [None]:
# Number of distinct values in every column (column-wise is the default axis).
df.nunique()

In [None]:
# Columns removed before modeling. "total_revenue" is the numerator of the CPM
# formula, so keeping it as a feature would leak the target.
unused_columns = [
    "integration_type_id",
    "revenue_share_percent",
    "total_revenue",
    "ad_type_id",
]
df.drop(columns=unused_columns, inplace=True)

In [None]:
# Trim the right tail: keep rows whose CPM is non-negative and strictly below
# the 95th percentile of CPM.
cpm_quan_95 = df["CPM"].quantile(0.95)
is_non_negative = df['CPM'] >= 0
is_below_cutoff = df['CPM'] < cpm_quan_95
df = df[is_non_negative & is_below_cutoff]

In [None]:
# Chronological hold-out: rows dated before the cut-off form the training set,
# rows dated on or after it form the test set.
mid_date = dt.date(2019, 6, 22)

# Use a non-mutating drop so each split is an independent frame; dropping in
# place on a slice of df triggers pandas' SettingWithCopyWarning.
# "date" is removed because it only served to define the split.
train = df.loc[df["date"] < mid_date, :].drop(columns="date")
test = df.loc[df["date"] >= mid_date, :].drop(columns="date")

print(train.shape, test.shape)

## Modeling

In [None]:
# Separate the training frame into the CPM target and the feature matrix.
y_train = train["CPM"]
X_train = train.drop(columns=["CPM"])

In [None]:
# Separate the held-out frame into the CPM target and the feature matrix.
y_test = test["CPM"]
X_test = test.drop(columns=["CPM"])

In [None]:
# LightGBM configuration: gradient-boosted trees regressing CPM, evaluated with MSE.
params = {
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': 'mse',
    'max_depth': 10, 
    'learning_rate': 0.1,
    'verbose': 0, 
    'early_stopping_round': 50}

n_estimators = 1000  # upper bound on boosting rounds passed to lgb.train
n_iters = 10         # number of repeated random train/validation splits
mse_errors = []

# The full training matrix does not change between repetitions, so build it once.
X_train = train.drop("CPM", axis=1)
y_train = train["CPM"]

for i in range(n_iters): 
    # A different seed per repetition gives a different 80/20 split. The fitted
    # target gets its own name (y_fit) so the full-length y_train is not
    # overwritten and stays aligned with X_train after the loop.
    x_train, x_valid, y_fit, y_valid = train_test_split(X_train, y_train, test_size=0.20, random_state=i)
    d_train = lgb.Dataset(x_train, label=y_fit)
    d_valid = lgb.Dataset(x_valid, label=y_valid)
    watchlist = [d_valid]
    # NOTE(review): `verbose_eval` appears to have been removed from lgb.train in
    # LightGBM 4.0 — confirm the installed version; newer releases expect
    # callbacks=[lgb.log_evaluation(0)] instead.
    model = lgb.train(params, d_train, n_estimators, watchlist, verbose_eval=0)

    # Validation error of this repetition.
    preds = model.predict(x_valid)
    err = mean_squared_error(y_valid, preds)
    mse_errors.append(err)
    print('MSE = ' + str(err))

# Spread of the validation error across repetitions.
# `model` is left bound to the booster from the final repetition.
print(f"Mean MSE = {np.mean(mse_errors)} +/- {np.std(mse_errors)}")

## Answer

In [None]:
# Score the booster left over from the last validation repetition on the
# held-out test period.
pred = model.predict(X_test)
test_mse = mean_squared_error(pred, y_test)
print("MSE: ", round(test_mse, 4))