In [None]:
import gc
import numpy as np
import pandas as pd

import skopt
from skopt.callbacks import CheckpointSaver
import mlflow
from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

In [None]:
def _smallest_fitting_int(c_min, c_max, candidates):
    """Return the first dtype in ``candidates`` whose range holds [c_min, c_max].

    Returns None when no candidate fits (this also covers an empty column,
    whose min/max are NaN and therefore fail every comparison).
    """
    for candidate in candidates:
        limits = np.iinfo(candidate)
        # Inclusive bounds: a column whose max is exactly 127 still fits int8.
        if limits.min <= c_min and c_max <= limits.max:
            return candidate
    return None


def reduce_mem_usage(df):
    """Downcast the columns of ``df`` in place to reduce memory usage.

    Per column dtype:
      * signed integers   -> smallest of int8/int16/int32/int64 that holds the values
      * unsigned integers -> smallest of uint8/uint16/uint32/uint64 that holds the values
      * floats            -> float16, float32 or float64 depending on the value range
      * object / string   -> pandas ``category``
      * anything else (bool, datetime, timedelta, category, other extension
        dtypes) is left untouched.

    Prints the memory footprint before and after, and the percentage saved.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for call chaining.

    Notes
    -----
    float16 keeps only about three significant decimal digits, so the float
    downcast is lossy even when the values are inside the float16 range.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    signed_types = (np.int8, np.int16, np.int32, np.int64)
    unsigned_types = (np.uint8, np.uint16, np.uint32, np.uint64)

    for col in df.columns:
        col_type = df[col].dtype

        # Only plain NumPy numeric dtypes are downcast. Checking `kind` (rather
        # than the dtype name prefix) keeps unsigned ints and bools out of the
        # float branch, where they used to be silently turned into floats.
        if isinstance(col_type, np.dtype) and col_type.kind in 'iuf':
            c_min = df[col].min()
            c_max = df[col].max()
            if col_type.kind == 'f':
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
            else:
                candidates = signed_types if col_type.kind == 'i' else unsigned_types
                target = _smallest_fitting_int(c_min, c_max, candidates)
                if target is not None:
                    df[col] = df[col].astype(target)
        elif col_type == object or isinstance(col_type, pd.StringDtype):
            df[col] = df[col].astype('category')

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    # Guard the ratio so a frame reporting zero bytes cannot divide by zero.
    saved_pct = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print('Decreased by {:.1f}%'.format(saved_pct))
    return df

# Read the data

In [None]:
if __name__ == "__main__":
    # Load the label-encoded feature tables and the target table from disk.
    train = pd.read_csv("data/train_le_reduced.csv")
    test = pd.read_csv("data/test_le_reduced.csv")
    y = pd.read_csv("data/y.csv")
    # Shrink both feature tables; each call prints its before/after memory use.
    train, test = reduce_mem_usage(train), reduce_mem_usage(test)

# LGBM

In [None]:
# Train one LightGBM model per (n_estimators, learning_rate) pair and write a
# submission file for each. The (1000, 0.01) pair used to be trained twice,
# overwriting the same file; it is listed once here.
lgbm_grid = [
    (1000, 0.1),
    (1000, 0.01),
    (1000, 0.03),
    (2000, 0.1),
    (2000, 0.03),
    (2000, 0.01),
]

for n_estimators, learning_rate in lgbm_grid:
    lgbm = LGBMClassifier(n_estimators=n_estimators, learning_rate=learning_rate)
    lgbm.fit(train, y)
    preds = lgbm.predict_proba(test)

    sub = pd.read_csv("data/sampleSubmission.csv")
    # predict_proba returns one column per class; a single DataFrame column
    # needs a 1-D vector, so take the second column (equal to 1 - first column).
    # assumes a binary target with labels 0/1, so this is P(click = 1) - TODO confirm
    sub["click"] = preds[:, 1]

    # 0.1 -> "01", 0.03 -> "003", 0.01 -> "001": same file names as before.
    lr_tag = str(learning_rate).replace(".", "")
    out_name = "lgbm_n{}_lr{}_other_thresh0.csv".format(n_estimators, lr_tag)
    sub[["id", "click"]].to_csv(out_name, index=False)

    # Free the fitted model before training the next one.
    del lgbm
    gc.collect()

# XGBoost

In [None]:
from xgboost import XGBClassifier

# Train one histogram-based XGBoost model per (n_estimators, learning_rate)
# pair and write a submission file for each.
xgb_grid = [
    (1000, 0.1),
    (1000, 0.03),
    (1000, 0.01),
    (2000, 0.1),
    (2000, 0.03),
    (2000, 0.01),
]

for n_estimators, learning_rate in xgb_grid:
    xgb = XGBClassifier(n_estimators=n_estimators, tree_method="hist", learning_rate=learning_rate)
    xgb.fit(train, y)
    preds = xgb.predict_proba(test)

    sub = pd.read_csv("data/sampleSubmission.csv")
    # predict_proba returns one column per class; a single DataFrame column
    # needs a 1-D vector, so take the second column (equal to 1 - first column).
    # assumes a binary target with labels 0/1, so this is P(click = 1) - TODO confirm
    sub["click"] = preds[:, 1]

    # 0.1 -> "01", 0.03 -> "003", 0.01 -> "001": same file names as before.
    lr_tag = str(learning_rate).replace(".", "")
    out_name = "xgb_n{}_lr{}_other_thresh0.csv".format(n_estimators, lr_tag)
    sub[["id", "click"]].to_csv(out_name, index=False)

    # Free the fitted model before training the next one.
    del xgb
    gc.collect()

# CatBoost

In [None]:
if __name__ == "__main__":
    # Re-read the feature tables and target from disk; the thresholding step
    # that follows overwrites values in `train` and `test`.
    train = pd.read_csv("data/train_le_reduced.csv")
    test = pd.read_csv("data/test_le_reduced.csv")
    y = pd.read_csv("data/y.csv")
    # Shrink both feature tables; each call prints its before/after memory use.
    train, test = reduce_mem_usage(train), reduce_mem_usage(test)

# Rare-value collapsing for train and test
#
# Every value that occurs fewer than `thresh` times in a column is replaced by
# -1 ("other"). Counts are taken separately on each frame, so a value can be
# kept in train yet collapsed in test (or the reverse).
#
# NOTE(review): earlier comments here listed per-variable thresholds
#   10000: site_id, site_domain, site_category, app_id, app_domain,
#          app_category, C14, C17, C19, C20
#   5000:  device_id, device_ip, device_model
#   0:     C1, device_type, device_conn_type, C15, C16, C18, C21
# but the code has always applied a single cut-off of 100 to both lists below
# (including device_type and device_conn_type). Confirm which is intended.
def collapse_rare_values(df, columns, min_count, other_value=-1):
    """Replace infrequent values with ``other_value``, in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify.
    columns : iterable of str
        Columns to process; each one is handled independently.
    min_count : int
        Values occurring fewer than ``min_count`` times in a column are replaced.
    other_value : scalar, default -1
        Replacement written over the rare values.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object.
    """
    for var in columns:
        value_counts = df[var].value_counts()
        rare_values = value_counts.index[value_counts < min_count]
        df.loc[df[var].isin(rare_values), var] = other_value
    return df


thresh_10000_vars = ["site_id", "site_domain", "site_category", "app_id", "app_domain", "app_category", "C14", "C17", "C19", "C20"]
thresh_5000_vars = ["device_id", "device_ip", "device_model", "device_type", "device_conn_type"]
# Minimum occurrence count actually applied; matches the "thresh100" suffix of
# the CatBoost submission file names.
thresh = 100

if __name__ == "__main__":
    for frame in (train, test):
        collapse_rare_values(frame, thresh_10000_vars + thresh_5000_vars, thresh)

In [None]:
from catboost import CatBoostClassifier

# Train one CatBoost model per (iterations, learning_rate) pair and write a
# submission file for each.
cat_grid = [
    (1000, 0.3),
    (1000, 0.1),
    (2000, 0.3),
    (2000, 0.1),
]

for iterations, learning_rate in cat_grid:
    cat = CatBoostClassifier(iterations=iterations, learning_rate=learning_rate)
    cat.fit(train, y)
    preds = cat.predict_proba(test)

    sub = pd.read_csv("data/sampleSubmission.csv")
    # predict_proba returns one column per class; a single DataFrame column
    # needs a 1-D vector, so take the second column (equal to 1 - first column).
    # assumes a binary target with labels 0/1, so this is P(click = 1) - TODO confirm
    sub["click"] = preds[:, 1]

    # 0.3 -> "03", 0.1 -> "01": same file names as before.
    lr_tag = str(learning_rate).replace(".", "")
    out_name = "cat_n{}_lr{}_other_thresh100.csv".format(iterations, lr_tag)
    sub[["id", "click"]].to_csv(out_name, index=False)

    # Free the fitted model before training the next one.
    del cat
    gc.collect()

# Abhishek's method
Taken from: https://www.kaggle.com/competitions/avazu-ctr-prediction/discussion/10927

In [None]:
from datetime import datetime
from math import log, exp, sqrt


# TL; DR
# the main learning process starts in the "training and testing" section below


# parameters #################################################################

# NOTE(review): `train` and `test` are rebound to file paths here; earlier
# cells use the same names for DataFrames.
train = 'data/train.csv'  # path to training file
test = 'data/test.csv'  # path to testing file

D = 2 ** 20  # number of weights (hash buckets) in the model's weight vector
alpha = .1   # learning rate for sgd optimization


# function, generator definitions ############################################

# A. x, y generator
# INPUT:
#     path: path to train.csv or test.csv (first line is a header and is skipped)
#     traindata: True when the second column of the file is the label
# YIELDS:
#     ID: id of the instance (first column, parsed as int)
#     x: a list of 27 indices in [0, D) whose value is 1; slots that receive
#        no feature stay 0
#     y: (only if traindata) a one-element list holding the label as a float
# NOTE:
#     the same list object `x` is mutated and yielded again for every row, so
#     copy it if a row has to be kept after advancing the generator
def data(path, traindata=False):
    # `with` closes the file when the generator is exhausted or closed early
    with open(path) as infile:
        for t, line in enumerate(infile):
            # initialize our generator
            if t == 0:
                # create a static x,
                # so we don't have to construct a new x for every instance
                x = [0] * 27
                continue
            # parse x
            for m, feat in enumerate(line.rstrip().split(',')):
                if m == 0:
                    ID = int(feat)
                elif traindata and m == 1:
                    y = [float(feat)]
                else:
                    # one-hot encode everything with hash trick
                    # categorical: one-hotted
                    # boolean: ONE-HOTTED
                    # numerical: ONE-HOTTED!
                    # note, the built-in hash(), although fast, is not stable:
                    #       str hashes are salted per interpreter process, so
                    #       train and test must be hashed in the same run
                    #       (or with PYTHONHASHSEED fixed)
                    if traindata:
                        x[m] = abs(hash(str(m) + '_' + feat)) % D
                    else:
                        # test files have no label column, so shift by one to
                        # land on the same slot (and hash key) as in training
                        x[m+1] = abs(hash(str(m+1) + '_' + feat)) % D

            yield (ID, x, y) if traindata else (ID, x)

# B. Bounded logloss
# INPUT:
#     p: predicted probability
#     y: true label
# OUTPUT
#     logarithmic loss of p given y, with p first clipped to
#     [10e-15, 1 - 10e-15] so the logarithm stays finite
def logloss(p, y):
    eps = 10e-15
    clipped = min(p, 1. - eps)
    clipped = max(clipped, eps)
    if y == 1.:
        return -log(clipped)
    return -log(1. - clipped)


# C. Get probability estimation on x
# INPUT:
#     x: features, given as a list of weight indices (each counts as value 1)
#     w: weights
# OUTPUT:
#     probability of p(y = 1 | x; w)
def predict(x, w):
    score = 0.
    for idx in x:
        # w[idx] * x[idx], where every listed index has x[idx] = 1.
        score += w[idx] * 1.
    # clamp the linear score to [-20, 20] before applying the sigmoid
    bounded = max(min(score, 20.), -20.)
    return 1. / (1. + exp(-bounded))


# D. Update given model
# INPUT:
# alpha: learning rate
#     w: weights
#     n: per-weight running sum of absolute gradients,
#        used to scale the learning rate adaptively
#     x: feature, a list of indices
#     p: prediction of our model
#     y: answer
# MODIFIES:
#     w: weights
#     n: sum of past absolute gradients
def update(alpha, w, n, x, p, y):
    # (p - y) * x[i] is the gradient; x[i] = 1. for every index listed in x
    grad = p - y
    for idx in x:
        # accumulate first, so the step below is scaled by alpha / sqrt(n)
        # with the current gradient already included in n
        n[idx] += abs(grad)
        w[idx] -= grad * 1. * alpha / sqrt(n[idx])


# training and testing #######################################################
# Single pass of online learning over the training file, followed by writing
# predictions for the test file to submission.csv.
start = datetime.now()

# indices of the models to train; there is a single model, index 0
K = [0]

# one weight vector and one gradient accumulator per model, each of length D
w = [[0.] * D]
n = [[0.] * D]

# running sum of per-row log losses
loss = 0.

# number of training rows seen, counted from 1
tt = 1
for ID, x, y in data(train, traindata = True):

    # get predictions and train on all labels
    for k in K:
        # p is computed before the update, so the logged loss is measured on
        # rows the model has not been trained on yet
        p = predict(x, w[k])
        update(alpha, w[k], n[k], x, p, y[k])
        loss += logloss(p, y[k])  # for progressive validation

    # print out progress, so that we know everything is working
    if tt % 100000 == 0:
        print('%s\tencountered: %d\tcurrent logloss: %f' % (
                datetime.now(), tt, (loss * 1./tt)))
    tt += 1

# score the test file with the final weights, one "id,click" line per row
with open('submission.csv', 'w') as outfile:
    outfile.write('id,click\n')
    for ID, x in data(test):
        for k in K:
            p = predict(x, w[k])
            outfile.write('%s,%s\n' % (ID, str(p)))

print('Done, elapsed time: %s' % str(datetime.now() - start))


# Boosters ensemble

In [None]:
# Load one submission per booster family for blending.
# NOTE(review): the previous paths were "lgbm_n1000_other_thresh0.csv", which
# no cell in this notebook writes, and an XGBoost file loaded into `cat`.
# They now point at files the training cells above produce; confirm that
# these are the runs meant to be blended.
lgbm = pd.read_csv("lgbm_n1000_lr01_other_thresh0.csv")
xgb = pd.read_csv("xgb_n1000_lr01_other_thresh0.csv")
cat = pd.read_csv("cat_n2000_lr01_other_thresh100.csv")

In [None]:
# Equal-weight blend of the three booster submissions.
# The *_thresh names hold blend weights, not thresholds.
lgbm_thresh = xgb_thresh = cat_thresh = 0.333333333
# Rows are combined by DataFrame index.
# assumes all three files list the ids in the same order - TODO confirm
blended_click = (
    lgbm["click"] * lgbm_thresh
    + xgb["click"] * xgb_thresh
    + cat["click"] * cat_thresh
)
ensemble = lgbm.assign(click=blended_click)
ensemble.to_csv("ensemble-lgbm033_xgb033_cat033.csv", index=False)

In [2]:
# Blend the booster ensemble with the online-learning submission.
booster_ensemble = pd.read_csv("ensemble-lgbm033_xgb033_cat033.csv")
abhi = pd.read_csv("submission.csv")
# The *_thresh names hold blend weights, not thresholds.
# NOTE(review): the output file name says 03/07 but the weights are 0.5/0.5;
# confirm which is intended.
booster_ensemble_thresh = abhi_thresh = 0.5
# Rows are combined by DataFrame index.
# assumes both files list the ids in the same order - TODO confirm
blended_click = (
    booster_ensemble["click"] * booster_ensemble_thresh
    + abhi["click"] * abhi_thresh
)
ensemble = booster_ensemble.assign(click=blended_click)
ensemble.to_csv("ensemble-boosterensemble03_abhi07.csv", index=False)