In [1]:
import itertools
import numpy as np
import pandas as pd
import gc

In [2]:
def count_agg(df, group_cols):
    """Attach the size of each row's group as a new column.

    Rows of ``df`` are grouped by ``group_cols``, the number of rows in each
    group is computed, and that number is left-joined back onto ``df`` using
    the same key columns.

    Args:
        df: DataFrame containing every column named in ``group_cols``.
        group_cols: list of column names that define a group.

    Returns:
        The merged DataFrame (a new object) with one extra column named
        ``"<group_cols joined by '_'>_count"``.
    """
    feature = '{}_count'.format("_".join(group_cols))
    group_sizes = df.groupby(group_cols).size()
    size_table = group_sizes.reset_index(name=feature)
    del group_sizes
    merged = df.merge(size_table, how='left', on=group_cols)
    # The lookup table is no longer needed once it has been joined.
    del size_table
    gc.collect()
    return merged

def count_cum(df, group_cols):
    """Add a running within-group row counter to ``df`` in place.

    Each row receives the number of earlier rows (in the frame's current row
    order) that belong to the same ``group_cols`` group, so the first row of
    every group gets 0.

    Args:
        df: DataFrame containing every column named in ``group_cols``;
            it is modified in place.
        group_cols: list of column names that define a group.

    Returns:
        The same ``df`` object, now carrying a column named
        ``"<group_cols joined by '_'>_countAccum"``.
    """
    running_index = df.groupby(group_cols).cumcount()
    df["_".join(group_cols) + '_countAccum'] = running_index
    del running_index
    gc.collect()
    return df

def count_uniq(df, group_cols, uniq_col):
    """Attach the number of distinct ``uniq_col`` values per group.

    Rows of ``df`` are grouped by ``group_cols``; for each group the number
    of distinct values in ``uniq_col`` is computed and left-joined back onto
    ``df`` using the group columns as keys.

    Args:
        df: DataFrame containing ``group_cols`` and ``uniq_col``.
        group_cols: list of column names that define a group.
        uniq_col: name of the column whose distinct values are counted.

    Returns:
        The merged DataFrame (a new object) with one extra column named
        ``"<group_cols joined by '_'>_uniq_<uniq_col>_countUniq"``.
    """
    prefix = "_".join(group_cols)
    feature = prefix + '_uniq_' + uniq_col + '_countUniq'
    distinct_per_group = df.groupby(group_cols)[uniq_col].nunique()
    lookup = distinct_per_group.reset_index(name=feature)
    del distinct_per_group
    merged = df.merge(lookup, how='left', on=group_cols)
    # The lookup table is no longer needed once it has been joined.
    del lookup
    gc.collect()
    return merged

def next_click(df, group_cols, time_col='click_time'):
    """Add the gap between each row and the next row of the same group.

    For every row, the value of ``time_col`` is subtracted from the value of
    ``time_col`` on the following row (in the frame's current row order) that
    belongs to the same ``group_cols`` group. The last row of each group has
    no successor and receives NaN.

    The result is only a "time until next click" if the rows of each group
    are already in chronological order; this function does not sort.

    Args:
        df: DataFrame containing ``group_cols`` and ``time_col``; it is
            modified in place.
        group_cols: list of column names that define a group.
        time_col: name of the numeric time column. Defaults to
            ``'click_time'``, the column the original implementation
            hard-coded.

    Returns:
        The same ``df`` object, now carrying a float32 column named
        ``"<group_cols joined by '_'>_nextClick"``.
    """
    col_name = "_".join(group_cols) + '_nextClick'
    # shift(-1) within each group pulls the following row's timestamp up.
    following = df.groupby(group_cols)[time_col].shift(-1)
    df[col_name] = (following - df[time_col]).astype(np.float32)
    del following
    gc.collect()
    return df

In [None]:
# Explicit dtypes for the columns that are read; click_time is declared as
# object here and is also listed in parse_dates below.
dtype = dict(
    ip=np.int32,
    app=np.int16,
    device=np.int16,
    os=np.int16,
    channel=np.int16,
    click_time=object,
    is_attributed=np.int16,
)

# Only the columns named in `dtype` are loaded.
df = pd.read_csv(
    'train.csv',
    usecols=list(dtype),
    dtype=dtype,
    parse_dates=['click_time'],
)

In [None]:
# Time features: derive `day` and `hour` from click_time, then replace
# click_time itself with an int32 value and keep the row index as a column.
# NOTE(review): this cell is order-dependent and not safely re-runnable on the
# same `df` — the last click_time assignment stores integers, which a second
# run would pass to pd.to_datetime again.
print('generating time features...')
df['click_time']= pd.to_datetime(df['click_time'])
# Day of month and hour of day, stored as uint8.
df['day'] = df['click_time'].dt.day.astype('uint8')
df['hour'] = df['click_time'].dt.hour.astype('uint8')
# int64 view of the datetimes, floor-divided by 10**9 and narrowed to int32.
# NOTE(review): presumably the int64 values are nanoseconds since the epoch,
# making the result whole seconds — confirm the datetime unit.
df['click_time'] = (df['click_time'].astype(np.int64) // 10 ** 9).astype(np.int32)
# Preserve the current row index as a regular column; later cells select it
# as 'index' when writing the feature files.
df['index'] = df.index
print('done')

In [None]:
# Rows whose `day` equals 9 form the validation frame; every other row goes
# to the training frame.
valid = df[df['day'].eq(9)]
train = df[df['day'].ne(9)]

In [None]:
# Give both splits a fresh 0..n-1 index before writing them out (the frames
# are reset one after the other to avoid holding extra copies).
train = train.reset_index(drop=True)
valid = valid.reset_index(drop=True)
print('dumping')
for frame, target in ((train, 'train.ftr'), (valid, 'valid.ftr')):
    frame.to_feather(target)
print('done')

# Feature Engineering

## count

In [None]:
print('loading...')
df = pd.read_feather('valid.ftr')
print('Done')

# Column groups to count rows over; each one adds a "<cols>_count" column.
count_combinations = [
    ['app'], ['ip'],
    ['app', 'channel'], ['ip', 'device'], ['ip', 'day'],
    ['app', 'channel', 'hour'], ['app', 'channel', 'day'],
    ['app', 'channel', 'day', 'hour'],
]

for i, cols in enumerate(count_combinations):
    print(i, cols)
    df = count_agg(df, cols)

In [None]:
# Keep the row id plus the generated "<cols>_count" columns and write them out.
feats = ['index'] + [
    prefix + '_count'
    for prefix in (
        'app',
        'ip',
        'app_channel',
        'ip_device',
        'ip_day',
        'app_channel_hour',
        'app_channel_day',
        'app_channel_day_hour',
    )
]
df = df.loc[:, feats]
df.to_feather('valid_count.ftr')

## accumulated count

In [None]:
print('loading...')
df = pd.read_feather('valid.ftr')
print('Done')
# accumulate count agg features
countAccum_combinations = [
    ['ip'],
    ['channel'],
    ['app'],
    ['device'],
    ['app', 'channel'],
    ['app', 'channel', 'day'],
    ['channel', 'day', 'hour'],
    ['device', 'channel', 'day', 'hour'],
    ['app', 'channel', 'day', 'hour'],
    ['app', 'device', 'channel', 'day', 'hour'],
    ['ip', 'day'],
    ['ip', 'device']
]

# count_cum numbers rows in their current order, so the frame is put into
# chronological order first. click_time values can repeat, and the default
# sort algorithm is not stable; a stable mergesort keeps rows that share a
# timestamp in their stored order, which makes the counters reproducible.
df = df.sort_values(by=['click_time'], kind='mergesort')
for i, cols in enumerate(countAccum_combinations):
    print(i, cols)
    df = count_cum(df, cols)

In [None]:
# Keep the row id plus the generated "<cols>_countAccum" columns, give the
# frame a fresh 0..n-1 index, and write it out.
feats = ['index'] + [
    prefix + '_countAccum'
    for prefix in (
        'ip',
        'channel',
        'app',
        'device',
        'app_channel',
        'app_channel_day',
        'channel_day_hour',
        'device_channel_day_hour',
        'app_channel_day_hour',
        'app_device_channel_day_hour',
        'ip_day',
        'ip_device',
    )
]
df = df.loc[:, feats]
df = df.reset_index(drop=True)
df.to_feather('valid_accum.ftr')

## unique

In [None]:
print('loading...')
df = pd.read_feather('valid.ftr')
print('Done')

# Each entry is [group columns, column whose distinct values are counted].
countUniq_combinations = [
    # distinct ips per app-based group
    [['app'], 'ip'],
    [['app', 'day'], 'ip'],
    [['app', 'device', 'channel'], 'ip'],
    [['app', 'hour', 'channel'], 'ip'],
    # distinct values seen per ip
    [['ip'], 'channel'],
    [['ip'], 'app'],
    [['ip'], 'hour'],
    [['ip'], 'os'],
    # distinct os per app/channel time bucket
    [['app', 'channel', 'hour'], 'os'],
    [['app', 'channel', 'day', 'hour'], 'os'],
]

for i, cols in enumerate(countUniq_combinations):
    print(i, cols)
    group_cols, uniq_col = cols
    df = count_uniq(df, group_cols, uniq_col)

In [None]:
# Keep the row id plus the generated "..._countUniq" columns, give the frame
# a fresh 0..n-1 index, and write it out.
feats = ['index'] + [
    prefix + '_countUniq'
    for prefix in (
        'app_uniq_ip',
        'app_day_uniq_ip',
        'app_device_channel_uniq_ip',
        'app_hour_channel_uniq_ip',
        'ip_uniq_channel',
        'ip_uniq_app',
        'ip_uniq_hour',
        'ip_uniq_os',
        'app_channel_hour_uniq_os',
        'app_channel_day_hour_uniq_os',
    )
]
df = df.loc[:, feats]
df = df.reset_index(drop=True)
df.to_feather('valid_uniq.ftr')

In [3]:
print('loading...')
df = pd.read_feather('valid.ftr')
print('Done')
# next click features
next_click_combinations = [
    ['ip'],
    ['channel'],
    ['ip', 'device'],
    ['channel', 'day'],
    ['app', 'channel'],
    ['ip', 'app'],
    ['ip', 'app', 'os'],
    ['ip', 'app', 'os', 'device'],
    ['ip', 'app', 'os', 'device', 'channel'],
]

# next_click subtracts each row's click_time from the following row of the
# same group, so rows must be in chronological order for the gap to be
# meaningful. The sort is stable (mergesort): if the stored rows are already
# ordered by click_time it leaves their order unchanged, and rows that share
# a timestamp keep their stored order.
df = df.sort_values(by=['click_time'], kind='mergesort')
for i, cols in enumerate(next_click_combinations):
    print(i, cols)
    df = next_click(df, cols)

loading...
Done
0 ['ip']
1 ['channel']
2 ['ip', 'device']
3 ['channel', 'day']
4 ['app', 'channel']
5 ['ip', 'app']
6 ['ip', 'app', 'os']
7 ['ip', 'app', 'os', 'device']
8 ['ip', 'app', 'os', 'device', 'channel']


In [4]:
# Keep the row id plus the generated "<cols>_nextClick" columns, give the
# frame a fresh 0..n-1 index, and write it out.
feats = ['index'] + [
    prefix + '_nextClick'
    for prefix in (
        'ip',
        'channel',
        'ip_device',
        'channel_day',
        'app_channel',
        'ip_app',
        'ip_app_os',
        'ip_app_os_device',
        'ip_app_os_device_channel',
    )
]
df = df.loc[:, feats]
df = df.reset_index(drop=True)
df.to_feather('valid_nextClick.ftr')