In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import shap
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsRegressor

from xgboost import XGBRegressor, XGBRFRegressor
from lightgbm import LGBMRegressor

import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
# Note: once the loop ends, `dirname` and `filename` remain bound at notebook
# scope to the last values the loop assigned.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Locate the dataset explicitly instead of gluing together the `dirname` and
# `filename` variables left over from the file-listing loop. Those two names
# come from different loop levels, so they only form a valid path when the last
# directory visited also holds the last file seen.
csv_paths = sorted(
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
    if name.lower().endswith(('.csv', '.csv.gz', '.csv.zip'))
)
if not csv_paths:
    raise FileNotFoundError('No CSV file found under /kaggle/input')
# With a single CSV in the input directory this is the same file as before;
# with several, the choice is deterministic (last path in sorted order).
df = pd.read_csv(csv_paths[-1])

In [None]:
# Summary statistics of the freshly loaded frame.
df.describe()

In [None]:
def weird_division(n, d):
    """Divide ``n`` by ``d``, returning ``0`` when the denominator is falsy (e.g. zero)."""
    if not d:
        return 0
    return n / d

# Derive CPM for every row in one vectorized pass instead of a Python-level
# row-by-row `apply`: total_revenue * 100 / measurable_impressions * 1000.
# Rows whose measurable_impressions is 0 get CPM 0, the same result the
# zero-safe division helper produced for them.
impressions = df['measurable_impressions']
df['CPM'] = (df['total_revenue'] * 100 / impressions * 1000).where(impressions != 0, 0)

In [None]:
# Discard three columns before any further processing.
unused_columns = ['ad_type_id', 'revenue_share_percent', 'integration_type_id']
df = df.drop(columns=unused_columns)

In [None]:
# Keep rows with 0 <= CPM < 95th percentile of CPM.
# Note: the percentile is computed on the whole frame, i.e. before the
# date-based train/test split, so test rows influence the cut-off.
df = df.loc[((df.CPM < df.CPM.quantile(0.95))&(df.CPM >= 0))]

In [None]:
# Correlation heatmap over the numeric columns only. The frame still holds the
# `date` column here (presumably strings, since it is later compared against a
# date string), and recent pandas versions raise on non-numeric data in
# `DataFrame.corr()` instead of silently dropping it.
numeric_df = df.select_dtypes(include=['number', 'bool'])
plt.figure(figsize=(16, 16))
sns.heatmap(numeric_df.corr(), cmap='RdYlGn', square=True, annot=True)

In [None]:
# Chronological split: rows dated up to and including the cut-off go to
# training, everything after it is held out for testing.
split_date = '2019-06-21'
X_train = df[df['date'] <= split_date].reset_index(drop=True)
X_test = df[df['date'] > split_date].reset_index(drop=True)

In [None]:
# Shorter column names used throughout the rest of the notebook.
clmns = dict(
    site_id='site',
    geo_id='geo',
    device_category_id='device',
    advertiser_id='advertist',
    os_id='OS',
    line_item_type_id='line',
    monetization_channel_id='monetization',
    ad_unit_id='unit',
    total_impressions='TI',
    viewable_impressions='VI',
)
X_train, X_test = (frame.rename(columns=clmns) for frame in (X_train, X_test))

In [None]:
# Pull the three candidate targets (CPM and the two quantities it is derived
# from) out of each split, then remove them from the feature matrices.
target_cols = ['CPM', 'total_revenue', 'measurable_impressions']

y_train, y_train_tr, y_train_mi = (X_train[col] for col in target_cols)
X_train = X_train.drop(columns=target_cols).reset_index(drop=True)

y_test, y_test_tr, y_test_mi = (X_test[col] for col in target_cols)
X_test = X_test.drop(columns=target_cols).reset_index(drop=True)

In [None]:
# The date was only needed for the chronological split; remove it from the features.
X_train, X_test = (frame.drop(columns=['date']) for frame in (X_train, X_test))

In [None]:
# Baseline random forest on the CPM target. The seed is fixed, as for every
# other model in this notebook, so the reported score is reproducible.
rf = RandomForestRegressor(n_jobs=-1, random_state=42)
rf.fit(X_train, y_train)

In [None]:
# Score the held-out rows with the fitted random forest.
predict_rf = rf.predict(X_test)

In [None]:
# Test-set MSE of the random forest's CPM predictions.
mean_squared_error(y_test, predict_rf)

In [None]:
# Baseline XGBoost regressor: only the seed and parallelism are set explicitly.
xgbr = XGBRegressor(n_jobs=-1, random_state=42).fit(X_train, y_train)
predict_xgbr = xgbr.predict(X_test)

In [None]:
# Test-set MSE of the baseline XGBoost regressor.
mean_squared_error(y_test, predict_xgbr)

In [None]:
# Baseline XGBoost random-forest regressor: only seed and parallelism are set.
xgbrf = XGBRFRegressor(n_jobs=-1, random_state=42).fit(X_train, y_train)
predict_xgbrf = xgbrf.predict(X_test)

In [None]:
# Test-set MSE of the baseline XGBoost random-forest regressor.
mean_squared_error(y_test, predict_xgbrf)

In [None]:
# Baseline LightGBM regressor: only seed and parallelism are set explicitly.
lgbmr = LGBMRegressor(random_state=42, n_jobs=-1).fit(X_train, y_train)
predict_lgbmr = lgbmr.predict(X_test)

In [None]:
# Test-set MSE of the baseline LightGBM regressor.
mean_squared_error(y_test, predict_lgbmr)

In [None]:
%%time
# Tuned XGBoost run with early stopping. The first `n_valid` training rows are
# held out as the validation set; the model is fitted on the remainder.
# `verbosity=0` replaces the deprecated `silent=True` flag.
# NOTE(review): newer XGBoost releases expect `eval_metric` and
# `early_stopping_rounds` on the constructor rather than in `fit` — confirm
# against the installed version before upgrading.
n_valid = 50000
xgbr = XGBRegressor(learning_rate=0.1, max_depth=6, n_estimators=1000, random_state=42, verbosity=0, n_jobs=-1)
xgbr.fit(
    X_train[n_valid:], y_train[n_valid:],
    eval_set=[(X_train[:n_valid], y_train[:n_valid])],
    eval_metric='rmse',
    early_stopping_rounds=50,
    verbose=False,
)
predict_xgbr = xgbr.predict(X_test)

In [None]:
# Test-set MSE of the early-stopped XGBoost model.
mean_squared_error(y_test, predict_xgbr)

In [None]:
# MSE over all of X_train, which includes the rows that served as the
# early-stopping validation set.
mean_squared_error(y_train, xgbr.predict(X_train))

In [None]:
# Early-stopping diagnostics. The validation metric is RMSE, so squaring
# best_score puts it on the same MSE scale as the other scores in this notebook.
print(xgbr.best_score**2)
print(xgbr.best_iteration)
# NOTE(review): `best_ntree_limit` may not exist in newer XGBoost releases — confirm.
print(xgbr.best_ntree_limit)

In [None]:
# Feature importances of the XGBoost model, labelled by column and sorted ascending.
pd.Series(xgbr.feature_importances_, index=X_train.columns).sort_values()

In [None]:
%%time
# Build a SHAP tree explainer for the XGBoost model. `TreeExplainer` takes no
# `algorithm` argument (that keyword belongs to the generic `shap.Explainer`),
# so it is dropped here.
shap_test = shap.TreeExplainer(xgbr)

In [None]:
%%time
# Compute SHAP values for the first 25000 training rows.
# Note: `shap_test` is rebound from the explainer object to its output here, so
# this cell presumably cannot be re-run without first re-running the cell that
# builds the explainer.
shap_test = shap_test.shap_values(X_train[:25000], tree_limit=700, check_additivity = False)

In [None]:
%%time
# SHAP summary plot for the same 25000 rows the SHAP values were computed on.
shap.summary_plot(shap_test, X_train[:25000],
                       max_display=25, plot_size=(36, 16), axis_color='white')

In [None]:
# Histogram of every training feature, 25 bins each.
X_train.hist(figsize=(24,16), bins=25)

In [None]:
# Frequency of each `line` value (renamed from line_item_type_id) in the training split.
X_train.line.value_counts()

In [None]:
# Frequency of each `line` value in the test split, for comparison with training.
X_test.line.value_counts()

In [None]:
%%time
# LightGBM on the CPM target with hand-set hyperparameters.
lgbm_params = dict(learning_rate=0.1, max_depth=13, n_estimators=1000, random_state=42, n_jobs=-1)
lgbr = LGBMRegressor(**lgbm_params).fit(X_train, y_train)
predict_lgbr = lgbr.predict(X_test)

In [None]:
# Test-set MSE of the LightGBM CPM model.
mean_squared_error(y_test, predict_lgbr)

In [None]:
# Training-set MSE of the same model, to compare against the test score.
mean_squared_error(y_train, lgbr.predict(X_train))

In [None]:
# Feature importances of the LightGBM model, labelled by column and sorted ascending.
pd.Series(lgbr.feature_importances_, index=X_train.columns).sort_values()

In [None]:
%%time
# Build a SHAP tree explainer for the LightGBM model. `TreeExplainer` takes no
# `algorithm` argument (that keyword belongs to the generic `shap.Explainer`),
# so it is dropped here.
shap_test = shap.TreeExplainer(lgbr)

In [None]:
%%time
# Compute SHAP values for the first 25000 training rows.
# Note: `shap_test` is rebound from the explainer object to its output here, so
# this cell presumably cannot be re-run without first re-running the cell that
# builds the explainer.
shap_test = shap_test.shap_values(X_train[:25000], tree_limit=700, check_additivity = False)

In [None]:
%%time
# SHAP summary plot for the same 25000 rows the SHAP values were computed on.
shap.summary_plot(shap_test, X_train[:25000],
                       max_display=25, plot_size=(36, 16), axis_color='white')

In [None]:
%%time
# Same LightGBM configuration, this time predicting total_revenue.
lgbm_params = dict(learning_rate=0.1, max_depth=13, n_estimators=1000, random_state=42, n_jobs=-1)
lgbr_tr = LGBMRegressor(**lgbm_params).fit(X_train, y_train_tr)
predict_lgbr_tr = lgbr_tr.predict(X_test)

In [None]:
# Test-set MSE of the total_revenue model.
mean_squared_error(y_test_tr, predict_lgbr_tr)

In [None]:
# Training-set MSE of the total_revenue model.
mean_squared_error(y_train_tr, lgbr_tr.predict(X_train))

In [None]:
%%time
# Same LightGBM configuration, this time predicting measurable_impressions.
lgbr_mi = LGBMRegressor(learning_rate=0.1, max_depth=13, n_estimators=1000, random_state=42, n_jobs=-1)
lgbr_mi.fit(X_train, y_train_mi)
# Impressions are counts, so truncate the predictions to integers. `astype(int)`
# truncates toward zero like `int()` does, without a Python call per element.
predict_lgbr_mi = lgbr_mi.predict(X_test).astype(int)

In [None]:
# Test-set MSE of the measurable_impressions model (integer-truncated predictions).
mean_squared_error(y_test_mi, predict_lgbr_mi)

In [None]:
# Training-set MSE of the measurable_impressions model. Unlike the test score,
# this uses the model's raw predictions, not integer-truncated ones.
mean_squared_error(y_train_mi, lgbr_mi.predict(X_train))

In [None]:
# Rebuild CPM from the two separately predicted components, using the same
# formula that defined the CPM column.
recovered_cpm = [
    weird_division(revenue * 100, impressions) * 1000
    for revenue, impressions in zip(predict_lgbr_tr, predict_lgbr_mi)
]

In [None]:
# Test-set MSE of the CPM rebuilt from predicted revenue and impressions.
mean_squared_error(y_test, recovered_cpm)

In [None]:
# Add the total_revenue model's prediction as an extra feature.
# The model must see exactly the columns it was fitted on, so any stacked
# feature columns already present are removed before predicting; this keeps the
# cell re-runnable after the columns have been added.
# NOTE(review): the training-split values are in-sample predictions of a model
# fitted on those same rows — confirm this is intended.
stacked_cols = ['recoveret_tr', 'recoveret_mi']
X_train['recoveret_tr'] = lgbr_tr.predict(X_train.drop(columns=stacked_cols, errors='ignore'))
X_test['recoveret_tr'] = lgbr_tr.predict(X_test.drop(columns=stacked_cols, errors='ignore'))

In [None]:
%%time
# Refit the CPM model on the feature matrix as it stands now (including any
# stacked feature columns added so far).
lgbm_params = dict(learning_rate=0.1, max_depth=13, n_estimators=1000, random_state=42, n_jobs=-1)
lgbr = LGBMRegressor(**lgbm_params).fit(X_train, y_train)
predict_lgbr = lgbr.predict(X_test)

In [None]:
# Test-set MSE of the refitted CPM model.
mean_squared_error(y_test, predict_lgbr)

In [None]:
# Feature importances of the refitted model, labelled by column and sorted ascending.
pd.Series(lgbr.feature_importances_, index=X_train.columns).sort_values()

In [None]:
# Add the measurable_impressions model's integer-truncated prediction as an
# extra feature. The stacked columns are removed by name before predicting
# (rather than slicing off "the last column" by position), so the model always
# sees the columns it was fitted on and the cell stays re-runnable.
stacked_cols = ['recoveret_tr', 'recoveret_mi']
X_train['recoveret_mi'] = lgbr_mi.predict(X_train.drop(columns=stacked_cols, errors='ignore')).astype(int)
X_test['recoveret_mi'] = lgbr_mi.predict(X_test.drop(columns=stacked_cols, errors='ignore')).astype(int)

In [None]:
%%time
# Refit the CPM model on the feature matrix as it stands now (including any
# stacked feature columns added so far).
lgbm_params = dict(learning_rate=0.1, max_depth=13, n_estimators=1000, random_state=42, n_jobs=-1)
lgbr = LGBMRegressor(**lgbm_params).fit(X_train, y_train)
predict_lgbr = lgbr.predict(X_test)

In [None]:
# Test-set MSE of the refitted CPM model.
mean_squared_error(y_test, predict_lgbr)

In [None]:
# Feature importances of the refitted model, labelled by column and sorted ascending.
pd.Series(lgbr.feature_importances_, index=X_train.columns).sort_values()

## Best score: MSE = 2608, achieved by the simple LGBMRegressor