In [None]:
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)
import lightgbm as lgb
import gc
import os
import time
import matplotlib.pyplot as plt

In [None]:
def log(content, *args):
    """Print a timestamped progress message tagged with the current run mode.

    The tag is 'TRAINING' when the module-level ``TRAIN`` flag is truthy and
    'PREDICTING' otherwise.

    Args:
        content: First value to print after the tag.
        *args: Any further values, forwarded to ``print`` unchanged.
    """
    stamp = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    if TRAIN:
        mode = 'TRAINING'
    else:
        mode = 'PREDICTING'
    print(stamp, mode, ': ', content, *args)

In [None]:
def load_data(data_type):
    """Read one slice of the click data from ``../input/`` into a DataFrame.

    Args:
        data_type: Which slice to load.
            * 'train' - 1,000,000 rows of train.csv, after skipping the first
              131,886,952 data rows.
            * 'val1'  - the first 1,000,000 rows of train.csv.
            * 'val2'  - 1,000,000 rows of train.csv, after skipping the first
              79,999,999 data rows.
            * 'test'  - all of test.csv, with the ``click_id`` column dropped.

    Returns:
        The loaded DataFrame. ip/app/device/os/channel are read as
        'category', is_attributed as 'uint8', and the time columns are parsed
        as dates.

    Raises:
        ValueError: If ``data_type`` is not one of the values listed above.
    """
    log('#1 load data')

    dtypes = {
        'ip': 'category',
        'app': 'category',
        'device': 'category',
        'os': 'category',
        'channel': 'category',
        'is_attributed': 'uint8'
    }

    # Row windows taken from train.csv. skiprows starts at 1 so that the
    # header line (row 0) is kept.
    # Original author's note on 'train': the file holds over 180 million
    # rows, and the intent is to take the data of the last day.
    train_windows = {
        'train': {'skiprows': range(1, 131886953), 'nrows': 1000000},
        'val1': {'nrows': 1000000},
        'val2': {'skiprows': range(1, 80000000), 'nrows': 1000000},
    }

    if data_type in train_windows:
        return pd.read_csv('../input/train.csv', parse_dates=['click_time', 'attributed_time'],
                           dtype=dtypes, **train_windows[data_type])

    if data_type == 'test':
        df = pd.read_csv('../input/test.csv', parse_dates=['click_time'], dtype=dtypes)
        df.drop(columns='click_id', inplace=True)
        return df

    # Previously an unknown value fell through to ``return df`` with ``df``
    # unbound, which surfaced as an UnboundLocalError.
    raise ValueError(
        "unknown data_type %r; expected one of 'train', 'val1', 'val2', 'test'" % (data_type,))

In [None]:

def feature_eng(df):
    """Add per-user aggregate features to ``df`` in place and return it.

    A "user" is approximated by the (ip, device, os) triple, stored as the
    space-joined string column ``ip_device_os``. Columns added, in order:

        ip_device_os, click_count, channel_count, app_count, click_hour,
        hour_count, click_count_in_hour, click_count_for_app

    All counts are computed within the frame that is passed in.

    Args:
        df: Frame with ip, device, os, app, channel and click_time columns.
            The ``.str`` accessor is used on ip, so it is assumed to hold
            string values (or categories of strings) - TODO confirm.

    Returns:
        The same DataFrame object, with the new columns attached.
    """
    log('#2 feature_eng')

    user_key = 'ip_device_os'

    def _add_group_stat(target, source, key=user_key, distinct=False):
        # Aggregate `source` per `key` (row count, or number of distinct
        # values), then broadcast the per-group result back onto the rows.
        grouped = df[[key, source]].groupby(key)[source]
        per_key = grouped.nunique() if distinct else grouped.count()
        df[target] = df[key].map(per_key)

    def _add_combo_click_count(target, temp_key, extra_col):
        # Count clicks per (user, extra_col) pair through a temporary
        # composite key column that is removed again afterwards.
        df[temp_key] = df[user_key].str.cat(df[extra_col].map(str), sep=' ')
        _add_group_stat(target, 'click_time', key=temp_key)
        df.drop(columns=temp_key, inplace=True)

    # Assume that ip + device + os together identify one user.
    df[user_key] = df['ip'].str.cat(df['device'], sep=' ').str.cat(df['os'], sep=' ')

    # Number of clicks per user.
    _add_group_stat('click_count', 'click_time')

    # Number of distinct channels per user.
    _add_group_stat('channel_count', 'channel', distinct=True)

    # Number of distinct apps per user.
    _add_group_stat('app_count', 'app', distinct=True)

    # Hour of day of the click, and the number of distinct hours per user.
    df['click_hour'] = df['click_time'].dt.hour
    _add_group_stat('hour_count', 'click_hour', distinct=True)

    # Further feature ideas listed by the original author:
    #   1. per user, clicks within each hour of each day
    #   2. per user, clicks on a given app
    #   3. per user, clicks through a given channel
    #   4. per user, clicks per app and per channel
    #   5. per user within a given hour, clicks on a given app

    # 1. Clicks by this user within the same hour of day.
    _add_combo_click_count('click_count_in_hour', 'ip_device_os_hour', 'click_hour')

    # 2. Clicks by this user on the same app.
    _add_combo_click_count('click_count_for_app', 'ip_device_os_app', 'app')

    # 3. Clicks per user per channel ('click_count_for_channel') was tried by
    #    the original author and left disabled: it gave no improvement.

    gc.collect()

    return df

In [None]:
def feature_selection(df):
    """Drop the raw identifier and timestamp columns that are not model inputs.

    Removes ip_device_os, ip, device, os and click_time in place, plus
    attributed_time whenever the frame has that column.

    The attributed_time decision is made from the data rather than from the
    global ``TRAIN`` flag. Keying it on the flag raised a KeyError when a
    frame without that column was processed while ``TRAIN`` was True, and
    left the datetime column in place when a frame that has it was processed
    while ``TRAIN`` was False.

    Args:
        df: Frame produced by ``feature_eng``.

    Returns:
        The same DataFrame object with the columns removed.
    """
    log('#3 feature selection')

    drops = ['ip_device_os', 'ip', 'device', 'os', 'click_time']
    if 'attributed_time' in df.columns:
        drops.append('attributed_time')

    df.drop(columns=drops, inplace=True)
    return df

In [None]:
def feature_wrangle(df):
    """Convert the model columns to compact integer dtypes, in place.

    * is_attributed (when the column exists): missing values become 0, then
      the column is cast to uint8.
    * channel_count, app_count, click_count, hour_count: downcast to the
      smallest unsigned integer dtype that can hold every value.
    * app, channel: cast to uint16; click_hour: cast to uint8.

    Args:
        df: Frame produced by ``feature_selection``.

    Returns:
        The same DataFrame object with the converted columns.
    """
    log('#4 feature wrangle')

    # Decide from the data rather than the global TRAIN flag, so that a frame
    # without the label column never raises a KeyError.
    if 'is_attributed' in df.columns:
        df['is_attributed'] = df['is_attributed'].fillna(0).astype('uint8')

    # A fixed astype('uint8') / astype('uint16') wraps silently when a count
    # exceeds the dtype's range (e.g. 70000 -> 4464 as uint16). Downcasting
    # keeps the memory saving and widens only when the values need it.
    for count_col in ('channel_count', 'app_count', 'click_count', 'hour_count'):
        df[count_col] = pd.to_numeric(df[count_col], downcast='unsigned')

    df['app'] = df['app'].astype('uint16')
    df['channel'] = df['channel'].astype('uint16')
    df['click_hour'] = df['click_hour'].astype('uint8')  # hour of day fits in uint8
    return df

In [None]:
def get_data(data_type):
    """Load one data slice and run it through the whole preparation pipeline.

    The stages run in this order: ``load_data`` -> ``feature_eng`` ->
    ``feature_selection`` -> ``feature_wrangle``. A garbage collection is
    forced after every stage.

    Args:
        data_type: Slice name, forwarded to ``load_data``.

    Returns:
        The DataFrame returned by the last stage.
    """
    log('GET DATA BY ', data_type)

    # 1. load the raw rows
    df = load_data(data_type)
    gc.collect()

    # 2. feature engineering, 3. column selection, 4. dtype wrangling
    for stage in (feature_eng, feature_selection, feature_wrangle):
        df = stage(df)
        gc.collect()

    return df

In [None]:
def train_without_cv(has_multi_val):
    """Train a LightGBM binary classifier and save it to 'lgbmodel.txt'.

    Training uses the 'train' slice; the 'val1' slice (and the 'val2' slice
    when ``has_multi_val`` is true) is used for validation. Training runs for
    at most 200 boosting rounds with early stopping after 30 rounds, reporting
    AUC every 5 rounds. The model is saved at its best iteration.

    Args:
        has_multi_val: When truthy, also evaluate on the 'val2' slice.
    """
    log('train procedure')

    label_col = 'is_attributed'

    # 1. fetch the fully prepared frames (same order as the slices are logged)
    df_train = get_data('train')
    df_val = get_data('val1')
    df_val2 = get_data('val2') if has_multi_val else None

    gc.collect()

    # 2. wrap them as LightGBM datasets; the validation sets reference the
    #    training set
    train_data = lgb.Dataset(df_train.drop(columns=label_col), label=df_train[label_col],
                             categorical_feature=['app', 'channel', 'click_hour'])

    eval_sets = [train_data,
                 lgb.Dataset(df_val.drop(columns=label_col), label=df_val[label_col],
                             reference=train_data)]
    eval_names = ['train', 'val']

    if has_multi_val:
        eval_sets.append(lgb.Dataset(df_val2.drop(columns=label_col), label=df_val2[label_col],
                                     reference=train_data))
        eval_names.append('val2')

    # Release the local frame references once the datasets are built.
    del df_train, df_val, df_val2
    gc.collect()

    log('start training')

    # Knobs left disabled by the original author: min_data_in_leaf=500,
    # min_child_weight=0, subsample_for_bin=1000000, min_split_gain=0,
    # max_bin=200, min_data_in_bin=3.
    param = {
        "boosting_type": "gbdt",
        "objective": "binary",
        "metric": "auc",
        "learning_rate": 0.07,
        "num_leaves": 90,
        "max_depth": 7,
        "subsample": 0.8,
        "subsample_freq": 1,
        "feature_fraction": 0.8,
        "reg_lambda": 1,
        'scale_pos_weight': 99.8
    }

    bst = lgb.train(param, train_data, num_boost_round=200, valid_sets=eval_sets,
                    valid_names=eval_names, verbose_eval=5, early_stopping_rounds=30)

    log('at last: best iteration: ', bst.best_iteration, ', best score:', bst.best_score)
    bst.save_model('lgbmodel.txt', bst.best_iteration)
    gc.collect()

In [None]:
def predict():
    """Score the test slice with the saved model and write './preds.csv'.

    The model is loaded from 'lgbmodel.txt'. The output has a ``click_id``
    column numbered 0..n-1 in row order and an ``is_attributed`` column with
    the predicted values.

    NOTE(review): click_id is regenerated from the row position instead of
    being read from test.csv, so this assumes the file's ids are 0..n-1 in
    order - confirm against the data.
    """
    log('predict procedure')

    # 1. prepared test features
    features = get_data('test')
    gc.collect()

    log('start predicting')

    # 2. load the trained model and score every row
    model = lgb.Booster(model_file='lgbmodel.txt')
    y_pred = model.predict(data=features)

    log('predict done, start writing')

    # 3. write the two-column prediction file
    submission = pd.DataFrame({'click_id': range(len(y_pred)), 'is_attributed': y_pred})
    submission = submission.set_index('click_id')
    submission.to_csv('./preds.csv')

In [None]:
def plot_importance():
    """Show a feature-importance chart for the model saved in 'lgbmodel.txt'."""
    axes = plt.subplot(111)
    # Switch matplotlib's interactive mode off before drawing.
    plt.interactive(False)

    model = lgb.Booster(model_file='lgbmodel.txt')
    lgb.plot_importance(model, ax=axes)
    plt.show()

In [None]:
# Run-mode switches for the driver cell.
# TRAIN: module-level flag read by helper functions such as log(); when True
# the driver cell trains a model before anything else.
TRAIN = True
# HAS_MULTI_VALIDATE: passed to train_without_cv(); when True a second
# validation slice ('val2') is used alongside 'val1'.
HAS_MULTI_VALIDATE = False
# RELEASE: when True the driver cell runs predict() after training.
RELEASE = True

In [None]:
# Log which files are available in the input directory.
log(os.listdir("../input/"))

# Training phase: fit and save the model, then plot its feature importances.
if TRAIN:
    train_without_cv(HAS_MULTI_VALIDATE)
    plot_importance()

# Prediction phase: score the test slice with the saved model.
if RELEASE:
    # Helper functions read the global TRAIN flag, so it is switched off
    # before predicting.
    # NOTE: TRAIN stays False after this cell has run; re-run the config cell
    # before re-running this one if training should happen again.
    TRAIN = False
    predict()