In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_error

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the auction dataset; the 'date' column is parsed as datetime so it can
# be used for the date-based plots and the date-based train/test split later on.
df = pd.read_csv('../input/real-time-advertisers-auction/Dataset.csv', parse_dates=['date'])

In [None]:
# Preview the first rows of the freshly loaded frame.
df.head()

In [None]:
# Count missing values per column.
df.isnull().sum()

In [None]:
# Column dtypes, non-null counts and memory usage.
df.info()

In [None]:
# Summary statistics of the columns.
df.describe()

In [None]:
# Correlation matrix of the numeric columns only.
# The frame still contains the datetime 'date' column; recent pandas versions no
# longer drop non-numeric columns implicitly in corr(), so select them explicitly.
corr = df.select_dtypes(include='number').corr()

plt.figure(figsize=(12,8))
# NOTE(review): vmin=0 maps every negative correlation to the lowest colour —
# confirm that hiding the sign is intended.
sns.heatmap(data=corr,vmin=0, vmax=1, cmap="YlGnBu", square=True)
plt.title('Correlation of numeric columns (raw data)')
plt.show()

In [None]:
# Same correlation matrix as a larger, annotated heatmap.
# Only numeric columns are correlated: the frame still contains the datetime
# 'date' column, which recent pandas versions no longer drop implicitly in corr().
corr = df.select_dtypes(include='number').corr()

plt.figure(figsize=(14,10))
# NOTE(review): vmin=0 maps every negative correlation to the lowest colour;
# the annotations still show the signed value.
sns.heatmap(data=corr,vmin=0, vmax=1, cmap="YlGnBu",  square=True, annot=True)
plt.title('Correlation of numeric columns (raw data, annotated)')
plt.show()

In [None]:
def plots_hist(df, variable):
    """
    Plot a histogram and a normal Q-Q plot of one column, side by side.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains the column to inspect.
    variable : str
        Name of the column in ``df`` to plot.

    Returns
    -------
    None
        The figure is rendered with ``plt.show()``.
    """
    # Drop missing values once so both panels describe the same sample;
    # probplot has no NaN handling of its own, so NaNs would propagate into
    # the fitted reference line.
    values = df[variable].dropna()

    _, (ax_hist, ax_qq) = plt.subplots(1, 2, figsize=(15, 6))

    # Left panel: distribution of the values.
    values.hist(ax=ax_hist)
    ax_hist.set_title(f"Histogram of {variable}")
    ax_hist.set_xlabel(variable)
    ax_hist.set_ylabel("count")

    # Right panel: quantiles of the sample against a normal distribution.
    stats.probplot(values, dist="norm", plot=ax_qq)

    plt.show()

In [None]:
# Histogram and normal Q-Q plot of the revenue column.
plots_hist(df, 'total_revenue')

In [None]:
# Scatter of revenue against date to eyeball how it is spread over time.
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(x=df['date'], y=df['total_revenue'])
ax.set(xlabel='date', ylabel='total_revenue')
plt.show()

In [None]:
# Sum, mean and value frequencies of the revenue column.
revenue = df['total_revenue']
print("Total income", revenue.sum())
print("Average total income", revenue.mean())
revenue.value_counts()

In [None]:
# Calculating CPM:
# the value that the advertisers bid (the winning bid value) for the month of June.
# CPM = ((publisher revenue * 100) / revenue_share_percent) / measurable_impressions * 1000

def weird_division(n, d):
    """Return ``n / d``, or 0 when the divisor ``d`` is falsy (zero, None, ...)."""
    if not d:
        return 0
    return n / d

# Winning-bid CPM, per the formula stated for this dataset:
#   CPM = ((total_revenue * 100) / revenue_share_percent) / measurable_impressions * 1000
# The division by revenue_share_percent is part of that formula and has to happen
# here, while the column is still present in df.
# Rows with no measurable impressions (or no revenue share) have no defined CPM
# and are set to 0.
has_cpm = (df['measurable_impressions'] > 0) & (df['revenue_share_percent'] > 0)
gross_bid = df['total_revenue'] * 100 / df['revenue_share_percent']
df['CPM'] = np.where(has_cpm, gross_bid / df['measurable_impressions'] * 1000, 0)

In [None]:
# Remove columns that are no longer needed as features:
#   - integration_type_id and revenue_share_percent: per the author's note each
#     holds a single value (revenue_share_percent belongs to the CPM formula);
#   - total_revenue and measurable_impressions: used to derive CPM, the target;
#   - total_impressions: dropped as well (no reason was recorded for this one).
# A new frame is bound instead of mutating in place, and errors='ignore' lets the
# cell be re-run without a KeyError once the columns are already gone.
df = df.drop(columns=['integration_type_id', 'revenue_share_percent',
                      'total_impressions', 'measurable_impressions',
                      'total_revenue'],
             errors='ignore')

In [None]:
# Preview the first rows again to check the current set of columns.
df.head()

In [None]:
# Histogram of the CPM column.
df['CPM'].hist()

In [None]:
# List the column labels of the frame.
df.columns

In [None]:
# Keep only rows whose CPM is zero or positive.
df = df.loc[df['CPM'].ge(0)]

In [None]:
# Time-based split: rows before SPLIT_DATE are used for training, rows on or
# after it are held out for testing.
SPLIT_DATE = "2019-06-22"
# Rows whose CPM is at or above this quantile of the training CPM are treated as outliers.
CPM_OUTLIER_QUANTILE = 0.95

X_train = df[df['date'] < SPLIT_DATE]
X_test = df[df['date'] >= SPLIT_DATE]

# The outlier threshold is learned from the training data only and then applied
# to both parts, so no statistic of the hold-out target shapes the evaluation set.
cpm_cutoff = X_train['CPM'].quantile(CPM_OUTLIER_QUANTILE)
X_train = X_train[X_train['CPM'] < cpm_cutoff]
X_test = X_test[X_test['CPM'] < cpm_cutoff]

In [None]:
# Correlation of the numeric columns in the training part.
# X_train still contains the datetime 'date' column here; recent pandas versions
# no longer drop non-numeric columns implicitly in corr(), so select them explicitly.
corr = X_train.select_dtypes(include='number').corr()

plt.figure(figsize=(14,10))
# NOTE(review): vmin=0 maps every negative correlation to the lowest colour;
# the annotations still show the signed value.
sns.heatmap(data=corr,vmin=0, vmax=1, cmap="YlGnBu",  square=True, annot=True)
plt.title('Correlation of numeric columns (train)')
plt.show()

In [None]:
# Correlation of the numeric columns in the test part.
# X_test still contains the datetime 'date' column here; recent pandas versions
# no longer drop non-numeric columns implicitly in corr(), so select them explicitly.
corr = X_test.select_dtypes(include='number').corr()

plt.figure(figsize=(14,10))
# NOTE(review): vmin=0 maps every negative correlation to the lowest colour;
# the annotations still show the signed value.
sns.heatmap(data=corr,vmin=0, vmax=1, cmap="YlGnBu",  square=True, annot=True)
plt.title('Correlation of numeric columns (test)')
plt.show()

In [None]:
# Separate the target (CPM) from the features; 'date' is removed from the
# features together with the target column.
# Training part
y_train = X_train.loc[:, ['CPM']]
X_train = X_train.drop(columns=['date', 'CPM'])
# Test part
y_test = X_test.loc[:, ['CPM']]
X_test = X_test.drop(columns=['date', 'CPM'])

In [None]:
# Gradient-boosted regressor optimised and evaluated on RMSE.
# NOTE(review): early_stopping_rounds needs a validation set to act on; the fit
# call in this notebook passes no eval_set — confirm whether one was intended.
model_cat = CatBoostRegressor(
    # objective / metric
    loss_function='RMSE',
    eval_metric='RMSE',
    # boosting schedule
    iterations=1200,
    learning_rate=0.05,
    early_stopping_rounds=100,
    # tree shape and regularisation
    depth=10,
    reg_lambda=6,
    border_count=128,
    bootstrap_type='Bayesian',
    # reproducibility / logging
    random_seed=123455,
    logging_level='Silent',
)

In [None]:
# Train the model, declaring every feature column as categorical.
# cat_features is passed as a plain list of column names (the documented input
# type) rather than a pandas Index.
# NOTE(review): this marks *all* remaining columns as categorical — confirm that
# no continuous feature is left in X_train.
model_cat.fit(X_train, y_train, cat_features=list(X_train.columns))

In [None]:
# Score the hold-out set: mean squared error between true and predicted CPM.
print("MSE = ", mean_squared_error(y_true=y_test, y_pred=model_cat.predict(X_test)))

In [None]:
# Predicted vs. true target on the hold-out set.
y_pred = model_cat.predict(X_test)
# Flatten the target to 1-D so it lines up with the predictions.
y_true = np.asarray(y_test).ravel()

fig, ax = plt.subplots()
ax.scatter(y_true, y_pred)
ax.set_xlabel('Y Test')
ax.set_ylabel('Y Predict')

# Perfect predictions lie on y = x; two end points are enough to draw that line
# (plotting every unsorted sample against itself redraws the same segment N times).
if y_true.size:
    lo, hi = y_true.min(), y_true.max()
    ax.plot([lo, hi], [lo, hi], 'r')

# Explicit show() so the cell does not end on a matplotlib object repr.
plt.show()