In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
# , cross_val_score
from lightgbm import LGBMRegressor

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
# NOTE(review): `dirname` and `filename` remain bound to the last file visited
# once the loop ends, and the data-loading cell reads exactly that path. If the
# walk yields no files those names are undefined; with several files the one
# that gets loaded depends on traversal order.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the dataset from the last path produced by the directory walk in the
# first cell (`dirname` / `filename` are leftover loop variables).
# NOTE(review): presumably the input directory holds a single CSV — confirm.
df = pd.read_csv(os.path.join(dirname, filename))

In [None]:
# Report the row count of the freshly loaded frame, then preview its first rows.
print("Initial dataframe len = ",len(df))
df.head()

In [None]:
#calculating CPM
def weird_division(n, d):
    """Divide ``n`` by ``d``, returning 0 when the denominator is falsy.

    Args:
        n: Numerator.
        d: Denominator. Any falsy value (0, 0.0, None, ...) short-circuits
            the division.

    Returns:
        ``n / d`` when ``d`` is truthy, otherwise the integer 0.
    """
    if not d:
        return 0
    return n / d

In [None]:
# CPM = total_revenue * 100 / measurable_impressions * 1000, evaluated on whole
# columns at once instead of one Python-level call per row.
# Rows with zero measurable impressions get a CPM of 0 (not inf/NaN), which is
# the same zero-denominator rule weird_division applies.
df['CPM'] = (
    (df['total_revenue'] * 100 / df['measurable_impressions'] * 1000)
    .where(df['measurable_impressions'] != 0, 0)
)

In [None]:
# Preview the frame again, now that the CPM column has been added.
df.head()

In [None]:
# Histogram of CPM before any filtering is applied.
df.hist(column="CPM", grid=True)

In [None]:
# Show the extreme CPM values (50000 and above).
df.CPM[df.CPM >= 50000]

### Filter values

In [None]:
# Keep rows with a non-negative CPM (this comparison also drops rows whose CPM
# is NaN), then trim the upper tail: only rows strictly below the 95th
# percentile of the remaining CPM values are kept.
# NOTE(review): this cell overwrites `df` with its own filtered result, so
# running it again trims a further 5% each time.
df = df[df['CPM'] >= 0]
cpm_cutoff = df['CPM'].quantile(.95)
df = df[df['CPM'] < cpm_cutoff].reset_index(drop=True)
print("Dataframe len after deleting negative CPM vals and 0.95 cut-off = ",len(df))

In [None]:
# Histogram of CPM after the negative-value and 95th-percentile filtering.
df.hist(column="CPM", grid=True)

### Select features

In [None]:
import datetime
def convert_date_to_feature(df):
    """Normalise the ``date`` column and derive calendar features from it.

    The ``date`` column is parsed once; the parsed values are then used to
    produce all three output columns (the string form is not re-parsed).

    Args:
        df: DataFrame with a ``date`` column that ``pd.to_datetime`` can parse.
            The frame is modified in place.

    Returns:
        The same DataFrame object, with
        * ``date`` rewritten as ``'YYYY-MM-DD'`` strings,
        * ``day`` holding the day of the month,
        * ``weekday`` holding the day of the week (Monday = 0).
    """
    parsed = pd.to_datetime(df['date'])
    df['date'] = parsed.dt.strftime('%Y-%m-%d')
    df['day'] = parsed.dt.day
    df['weekday'] = parsed.dt.weekday
    return df

In [None]:
# Rewrite 'date' as a 'YYYY-MM-DD' string and add the 'day' / 'weekday' columns.
df = convert_date_to_feature(df)

In [None]:
# List the columns available for feature selection.
df.columns

In [None]:
# Per-column summary: number of missing values and number of distinct values
# (a missing value counts as one distinct value).
for col in df.columns:
    column = df[col]
    n_null = column.isna().sum()
    n_unique = column.nunique(dropna=False)
    print(col)
    print("   null vals: ", n_null, "; unique vals:", n_unique,)
    print()

In [None]:
# integration_type and revenue_share_percent each contain only one unique value, so they carry no signal and are left out of the selected features.

In [None]:
# selected features
# Columns handled as categorical: they are cast to the pandas 'category' dtype
# and passed to the model as categorical features further down.
cat_cols = ['geo_id', 'monetization_channel_id','day','weekday','site_id','device_category_id','advertiser_id','ad_unit_id', 'os_id', 'line_item_type_id','order_id',]
# Numeric columns. NOTE: 'CPM' is the prediction target, not a predictor — it is
# carried along here and split off again inside train_and_test.
num_cols = ['CPM','total_impressions', 'viewable_impressions',]
# Every column kept in the train/test frames (target included).
features = cat_cols + num_cols

In [None]:
# Cast every categorical feature column to the pandas 'category' dtype in a
# single call (done on the full frame, before it is split into train and test).
df = df.astype({name: 'category' for name in cat_cols})

### Train model

In [None]:
# Chronological hold-out: rows dated before split_date train the model, rows
# dated on or after it form the test set.
# The comparison is made on parsed datetimes. df['date'] holds 'YYYY-MM-DD'
# strings, and comparing those directly with this longer timestamp string is
# lexicographic: '2019-06-22' sorts *before* '2019-06-22 00:00:00', which would
# put the split day itself into the training set.
split_date ='2019-06-22 00:00:00'
row_dates = pd.to_datetime(df['date'])
split_ts = pd.Timestamp(split_date)
train_df = df.loc[row_dates < split_ts].reset_index(drop=True)
test_df = df.loc[row_dates >= split_ts].reset_index(drop=True)

In [None]:
# Size of the training partition, plus its last row as a check on the split boundary.
print("Train df len = ",len(train_df))
train_df.tail(1)

In [None]:
# Size of the test partition, plus its first row as a check on the split boundary.
print("Test df len = ",len(test_df))
test_df.head(1)

In [None]:
# Reduce both partitions to the selected columns (categorical + numeric,
# which includes the CPM target).
train_df, test_df = (part[features] for part in (train_df, test_df))

In [None]:
def train_and_test(transformed_df, model, cat_cols, test_size=0.2, target_col='CPM'):
    """Fit ``model`` on the leading rows of a frame and validate on the trailing rows.

    The frame is split in row order (no shuffling): the last ``test_size``
    fraction of rows becomes the validation set. After fitting, the validation
    MSE is printed and a predicted-vs-actual scatter plot is shown.

    Args:
        transformed_df: DataFrame holding the predictor columns and the target
            column.
        model: Estimator whose ``fit`` accepts a ``categorical_feature``
            keyword (as LGBMRegressor does) and which provides ``predict``.
        cat_cols: Names of the predictor columns to treat as categorical.
        test_size: Fraction of rows held out for validation.
        target_col: Name of the target column; defaults to ``'CPM'``.

    Returns:
        The fitted ``model`` (the same object that was passed in).
    """
    target = transformed_df[target_col]
    # drop() already returns a new frame, so no explicit copy is needed.
    predictors = transformed_df.drop(columns=target_col)

    # shuffle=False keeps the original row order, so the split is deterministic
    # and no random_state is required.
    X_train, X_val, y_train, y_val = train_test_split(
        predictors, target, test_size=test_size, shuffle=False)

    model.fit(X_train, y_train, categorical_feature=cat_cols)
    y_pred = model.predict(X_val)
    print('MSE val: %.3f' % (mean_squared_error(y_val, y_pred)))

    plt.scatter(y_val, y_pred)
    plt.title(f'Predicted vs. Actual {target_col}', fontsize=18, fontweight='bold')
    plt.xlabel(f'Actual {target_col}')
    plt.ylabel(f'Predicted {target_col}')
    plt.show()

    return model

In [None]:
# Hyper-parameters forwarded to LGBMRegressor.
params ={
    "n_estimators": 600,
    "max_depth": 6,
    "learning_rate":0.1,
    # "criterion" is a scikit-learn tree option, not a LightGBM parameter, so it
    # had no effect on training. LightGBM selects the loss through "objective";
    # "regression" is its L2 (mean squared error) objective.
    "objective": "regression",
}

In [None]:
# Build the regressor from `params`, fit it on the training partition and print
# the validation MSE; train_and_test returns the same, now fitted, model object.
model = LGBMRegressor(**params) 
trained_model = train_and_test(train_df, model, cat_cols)

### MSE on test data

In [None]:
# Score the held-out test partition.
y_pred = trained_model.predict(test_df.drop('CPM', axis=1))
# The target was filtered to CPM >= 0, so negative predictions are clamped to 0.
# (Taking abs() instead would mirror a prediction of -5 to +5, moving it further
# from any plausible value.) Predictions are then rounded to whole numbers.
# NOTE(review): the validation MSE printed by train_and_test uses unrounded
# predictions, so the two figures are not strictly comparable.
pred = np.round(np.clip(y_pred, 0, None), 0)
actual = test_df['CPM'].to_numpy()
mean_squared_error(actual, pred)