In [None]:
import numpy as np
import pandas as pd
from scipy.stats import randint, uniform, loguniform
from tqdm import tqdm

import category_encoders as ce
import lightgbm as lgb
from catboost import CatBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss
from sklearn.metrics import make_scorer, auc, log_loss, roc_auc_score
from sklearn.metrics import get_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.svm import l1_min_c
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer

In [None]:
# Load the raw data from the gzip-compressed CSV; column names are inferred from the header row.
df = pd.read_csv(
    'train.gz',
    header='infer',
    compression='gzip',
)

# 1. Explore data

In [None]:
# Target distribution: non-clicks per click, and the overall click-through rate.

n_no_click = len(df[df['click']==0])
n_click = len(df[df['click']==1])
print("Imbalance ratio: {}".format(float(n_no_click)/n_click))

total_clicks = df['click'].sum()
print("Click-through rate is {}%".format(100.0*total_clicks/len(df)))

In [None]:
# For every column, print its distinct values and how many of them there are.

for column in df.columns:
    values = df[str(column)]
    print(f"{column}: {values.unique()}; number of levels: {values.nunique()}")
    

# 2. Clean data

In [None]:
# Remove the row identifier and the app_* columns from df (in place).
# NOTE(review): the original note describes these as columns with a single level
# (identical for every observation) - presumably that refers to the app_* columns
# in this extract; confirm, since 'id' is dropped alongside them.

df.drop(columns=['id', 'app_id', 'app_domain', 'app_category'], inplace=True)

# 3. Obtain training and test sets via time split

In [None]:
# Hold-out window: hours 16-23 on day_of_week 1, plus hours 0-1 on day_of_week 2.
# NOTE(review): this needs `hour` to hold the hour of day and a `day_of_week` column to
# exist in df already - confirm both are derived before this cell runs.
late_hours_day_1 = df['hour'].isin([16,17,18,19,20,21,22,23]) & (df['day_of_week'] == 1)
early_hours_day_2 = df['hour'].isin([0,1]) & (df['day_of_week'] == 2)
test = df[late_hours_day_1 | early_hours_day_2]


In [None]:
# Everything that precedes the first hold-out row becomes the training set.
# test.index[0] is an index *label*; translate it to a row position before slicing with
# iloc (label and position only coincide for a default 0..n-1 index). The copy makes
# `train` an independent frame, so the feature columns added to it later are not
# written into a slice of `df`.
first_test_position = df.index.get_loc(test.index[0])
train = df.iloc[:first_test_position].copy()

# 4. Feature Engineering

## 4.1. consumer

In [None]:
def get_consumer(row):
    """Return the identifier used to represent the consumer behind a row.

    The row's ``device_id`` is returned unchanged unless its string form is
    ``"a99f214a"``; in that case the identifier is the string concatenation
    of the row's ``device_ip`` and ``device_model``.
    """
    device_id = row['device_id']
    if str(device_id) == "a99f214a":
        return str(row["device_ip"]) + str(row["device_model"])
    return device_id
    
# Derive the consumer key for every training row. The helper defined in this cell is
# get_consumer; no get_user exists, so calling it raised a NameError.
train['consumer'] = train.apply(get_consumer, axis=1)

## 4.2. click_history
This feature records the clicks made by the same consumer before the current impression, encoded as a string of the previous click outcomes (for example `'010'`; the string is empty for a consumer's first impression).

In [None]:
# Build the history from `train` and its 'consumer' column: no 'user' column is created
# anywhere in the visible notebook, and the result is stored on `train`.
# A separate string view of the target is used so the label column keeps its dtype.
click_str = train['click'].astype('str')

# Row i of a consumer receives that consumer's clicks from rows 0..i-1 joined into one
# string, so a consumer's first row gets ''. Assumes each click renders as one character.
click_history = pd.Series('', index=train.index, dtype=object)

# One pass over the groups instead of filtering the whole frame once per consumer.
for consumer_id, clicks in tqdm(click_str.groupby(train['consumer'], sort=False)):
    click_string = ''.join(clicks)
    click_history.loc[clicks.index] = [click_string[:i] for i in range(len(click_string))]

train['click_history'] = click_history

In [None]:
def convert_empty_string(value):
    """Map an empty string to the placeholder 'first string'; return any other value unchanged."""
    return 'first string' if value == '' else value

# Rows whose click_history is the empty string get an explicit placeholder value.
train['click_history_converted'] = train['click_history'].apply(convert_empty_string)

## 4.3. hour_of_day, day_of_week

In [None]:
# Parse the raw YYMMDDHH stamp once, then derive both calendar features from it.
# pd.to_datetime replaces datetime.strptime because `datetime` is not imported in the
# visible import cell, and it parses the whole column in one vectorised call.
# NOTE: the cell overwrites train['hour'], so it must be run only once per fresh `train`.
timestamp = pd.to_datetime(train['hour'].astype(str), format="%y%m%d%H")
train['day_of_week'] = timestamp.dt.weekday # Monday is 0, so 1: tues, 2: wed
train['hour'] = timestamp.dt.hour

## 4.4. count features

In [None]:
# Rows whose count is exactly 1 take the value stored under the 'Unknown' key of the
# matching lookup table instead.
# NOTE(review): this cell reads device_ip_count / device_id_count, which are only created
# in the cells that follow it, so it has to run after them - confirm the intended order.
for count_col, lookup_df in (("device_ip_count", device_ip_df), ("device_id_count", device_id_df)):
    unknown_count = lookup_df.loc['Unknown'][0]
    train[count_col] = np.where(train[count_col] == 1, unknown_count, train[count_col])


In [None]:
def get_device_ip_count(device_ip):
    """Look up `device_ip` in device_ip_df and return element [0] of the matching row.

    NOTE(review): presumably that element is the number of rows seen for the
    IP - confirm against the cell that builds device_ip_df.
    """
    matching_row = device_ip_df.loc[device_ip]
    return matching_row[0]

train['device_ip_count'] = train['device_ip'].apply(get_device_ip_count)

In [None]:
def get_device_id_count(device_id):
    """Look up `device_id` in device_id_df and return element [0] of the matching row.

    NOTE(review): presumably that element is the number of rows seen for the
    device - confirm against the cell that builds device_id_df.
    """
    matching_row = device_id_df.loc[device_id]
    return matching_row[0]

train['device_id_count'] = train['device_id'].apply(get_device_id_count)

In [None]:
def get_hour_count(hour):
    """Look up `hour` in hourly_impression_df and return element [0] of the matching row.

    NOTE(review): presumably that element is the number of impressions in the
    hour - confirm against the cell that builds hourly_impression_df.
    """
    matching_row = hourly_impression_df.loc[hour]
    return matching_row[0]

train['hour_count'] = train['hour'].apply(get_hour_count)

In [None]:
# Row counts per (hour, consumer): rows are hours, columns are consumers, and
# combinations that never occur are NaN after unstack().
hourly_consumer_df = pd.DataFrame(train.groupby(["hour", "consumer"]).size().unstack())

def get_hourly_consumer_count(row):
    """Return the number of training rows for the row's consumer in the row's hour.

    Reads hourly_consumer_df at (row['hour'], row['consumer']) and returns 0
    when that cell is missing (NaN).
    """
    desired = hourly_consumer_df.loc[row['hour'], row['consumer']]

    # NaN never compares equal to anything, itself included, so `desired == np.nan`
    # could never be true and missing cells were returned as NaN. pd.isna is the
    # reliable missing-value test.
    if pd.isna(desired):
        return 0
    return desired

train['hourly_consumer_count'] = train.apply(get_hourly_consumer_count, axis=1)

# 5. Feature cleaning of rare features
For categorical features, we clean up rare feature values, defined as levels of the feature that appear only once, by grouping all such levels together into a single "Rare" category.

In [None]:
def clean_rare_features(col):
    """Replace every level of train[col] that occurs exactly once with "Rare".

    Operates on the notebook-level `train` frame and overwrites the column in place.
    """
    level_counts = train.groupby(col).size()
    rare_levels = level_counts[level_counts == 1].index
    train[col] = np.where(train[col].isin(rare_levels), "Rare", train[col])

# Apply the grouping to each of the identifier-like columns.
rare_cleanup_columns = ['site_id', 'site_domain', 'device_id', 'device_ip', 'device_model']
for col in rare_cleanup_columns:
    clean_rare_features(col)


# 6. Encode Categorical Features

## 6.1. Hash Encoding: For features with high cardinality

In [None]:
encoder = ce.HashingEncoder()

def hash_encode(self):
    """Replace every object-dtype column of `self` with hashed feature columns.

    Each object column ``<col>`` is passed to the hashing encoder as a
    one-column DataFrame (the encoder works on 2-D input, not on individual
    cell values) and is replaced by the encoder's output columns, renamed
    ``<col>_int_0``, ``<col>_int_1``, ...  The frame is modified in place and
    is also returned.
    """
    new_col_suffix = '_int'
    # Snapshot the object columns first: the loop below adds and drops columns.
    object_columns = [name for name, dtype in self.dtypes.items() if dtype == object]
    for name in object_columns:
        hashed = encoder.fit_transform(self[[name]])
        for position, hashed_name in enumerate(hashed.columns):
            # to_numpy() sidesteps index alignment; rows stay in their original order.
            self[f"{name}{new_col_suffix}_{position}"] = hashed[hashed_name].to_numpy()
        self.drop([name], inplace=True, axis=1)
    return self

train = hash_encode(train)



# 7. Train & tune CatBoost

In [None]:
# Separate the label column from the predictor columns.
y_train = train["click"]
X_train = train.drop(columns=["click"])

In [None]:
# Columns handed to CatBoost as categorical features.
# The engineered identifier column is created as train['consumer'] (and grouped on under
# that name), so it is listed as 'consumer'; no 'user' column is created in this notebook.
categorical_f = ['hour', 'C1', 'banner_pos', 'site_id', 'site_domain','site_category','device_id','device_ip','device_model',
                 'device_type','device_conn_type', 'C14','C15','C16','C17','C18', 'C19','C20','C21',
                 'day_of_week', 'consumer', 'click_history']

In [None]:
catboost = CatBoostClassifier(loss_function='Logloss', cat_features=categorical_f, data_partition='FeatureParallel',
                              bootstrap_type='Bernoulli', verbose=10)

# Search space for the Bayesian optimisation (skopt dimensions).
param = {
    'iterations':Integer(100, 250), # on the low side to speed up computation (learning rate will adjust accordingly)
    'depth':Integer(1, 10),
    'random_strength':Real(1e-9, 10), # amount of randomness to use for scoring splits (used to prevent overfitting)
    #'bagging_temperature':Real(0.0, 1.0),
    'l2_leaf_reg':Real(0.001, 10000), # coefficient at the L2 regularization term (lambda)
    'scale_pos_weight':Real(1, 50), # weight for class 1 in binary classification
    'subsample':Real(0.5, 1),
    'colsample_bylevel':Real(0.5,1),
    'model_size_reg':Real(0.01, 1000), # model size regularization coefficient
    'leaf_estimation_iterations':[1,5] # how many steps are done in every tree when calculating leaf values (values recommended in documentation)
}

# Built-in negated log-loss scorer (probability based, higher is better). It replaces
# make_scorer(log_loss, greater_is_better=False, needs_proba=True), whose `needs_proba`
# argument is deprecated in newer scikit-learn releases.
LogLoss = get_scorer('neg_log_loss')

# Explicit seed: the previous `random_state=_` picked up whatever the IPython
# last-output variable `_` happened to hold, which is not reproducible on a fresh kernel.
# 0 matches the seed used for the LightGBM search.
opt = BayesSearchCV(catboost, param, scoring=LogLoss, n_iter=16, cv=3, random_state=0, verbose=1)

# executes bayesian optimization
opt.fit(X_train, y_train)

In [None]:
# Display the best hyper-parameter combination found by the search currently held in `opt`.
opt.best_params_

In [None]:
# Score the hold-out rows with the tuned CatBoost search.
# Pass exactly the training feature columns (same names, same order): `test` is cut
# straight from df and still carries the `click` label, which must not reach the model.
# NOTE(review): `test` must have gone through the same feature engineering as `train`
# for these columns to exist - confirm.
catboost_pred = opt.predict_proba(test[X_train.columns])

# 8. Train & tune LightGBM

In [None]:
lgb_model = lgb.LGBMClassifier(boosting_type='gbdt', objective='binary', metric='binary_logloss')

# Search space for the Bayesian optimisation (skopt dimensions).
param = {
    'max_depth': Integer(3, 10),
    'learning_rate': Real(0.01, 0.3),
    'feature_fraction': Real(0.2, 0.9, 'uniform'),
    'bagging_fraction': Real(0.2, 0.9),
    'max_bin': Integer(20, 255, 'uniform'),
    'n_estimators': Integer(100, 1000, 'uniform'),
    'num_leaves': Integer(24, 80, 'uniform'),
    'min_sum_hessian_in_leaf':Integer(0,100, 'uniform'),
    'min_data_in_leaf': Integer(20, 100, 'uniform'),
    'min_split_gain': Real(0.001, 0.1),
    'lambda_l1': Real(1e-8, 10.0),
    'lambda_l2': Real(1e-8, 10.0),
    'bagging_freq': Integer(1,7, 'uniform')
}

# Built-in negated log-loss scorer (probability based, higher is better). It replaces
# make_scorer(log_loss, greater_is_better=False, needs_proba=True), whose `needs_proba`
# argument is deprecated in newer scikit-learn releases.
LogLoss = get_scorer('neg_log_loss')

# input your random_state
opt = BayesSearchCV(
    lgb_model,
    param,
    scoring = LogLoss,
    n_iter=32,
    cv=5,
    random_state=0
)

opt.fit(X_train, y_train)


In [None]:
# Display the best hyper-parameter combination found by the search currently held in `opt`.
opt.best_params_

In [None]:
# Score the hold-out rows with the tuned LightGBM search.
# Pass exactly the training feature columns (same names, same order): `test` is cut
# straight from df and still carries the `click` label, which must not reach the model.
# NOTE(review): `test` must have gone through the same feature engineering as `train`
# for these columns to exist - confirm.
lightgbm_pred = opt.predict_proba(test[X_train.columns])

# 9. Ensemble models by Stacking, with L1-regularized Logistic Regression as meta-model

In [None]:
# Built-in negated log-loss scorer (probability based, higher is better). It replaces
# make_scorer(log_loss, greater_is_better=False, needs_proba=True), whose `needs_proba`
# argument is deprecated in newer scikit-learn releases.
LogLoss = get_scorer('neg_log_loss')

# Candidate C values: start at the smallest C for which the L1-penalised model is not
# guaranteed to be empty (l1_min_c) and scale up over ten orders of magnitude.
cs = l1_min_c(prob_df, y_test, loss='log') * np.logspace(0, 10, 20)
clf = LogisticRegression(penalty='l1', solver='liblinear')

param_grid = {'C': list(cs)}
grid = GridSearchCV(clf, param_grid, verbose=10,cv=3, scoring=LogLoss)
grid.fit(prob_df, y_test)

In [None]:
# Display the meta-model configuration selected by the grid search.
grid.best_estimator_

In [None]:
# Fit the meta-model with the C selected by the grid search rather than a value copied
# by hand from an earlier run's output, so a re-run stays consistent with the search.
clf_opt = LogisticRegression(C=grid.best_params_['C'], penalty='l1', solver='liblinear')
clf_opt.fit(prob_df, y_test)

In [None]:
# Rescale the meta-model coefficients so that they sum to 1 and can be used as blend weights.
meta_coefficients = clf_opt.coef_[0]
wts = meta_coefficients / meta_coefficients.sum()

In [None]:
def _as_prediction_vector(pred):
    """Return `pred` itself when it is already one-dimensional, otherwise pred[0].

    Eleven of the twelve terms were written as ``pred[0]`` and one
    (proba_lgb_li_48128) without the index; this accessor treats both
    container shapes consistently.
    """
    if getattr(pred, "ndim", None) == 1:
        return pred
    return pred[0]

# Base-model predictions, listed in the order of their weights in `wts`.
base_predictions = [
    proba_lgb_li,
    proba_lgb_li_8,
    proba_lgb_li_28,
    proba_lgb_li_4812,
    proba_lgb_li_19,
    proba_lgb_li_48128,
    proba_xgb_li,
    proba_cat_li_2,
    proba_cat_li_240,
    proba_cat_li_101,
    proba_cat_li_16,
    proba_cat_li_24,
]

# Weighted sum of the base-model predictions.
final_proba = np.zeros((len(prob_df), ))
for weight, pred in zip(wts, base_predictions):
    final_proba += _as_prediction_vector(pred) * weight
log_loss(y_test, final_proba)