In [1]:
!pip install keras pydot graphviz

Collecting keras
  Downloading https://files.pythonhosted.org/packages/54/e8/eaff7a09349ae9bd40d3ebaf028b49f5e2392c771f294910f75bb608b241/Keras-2.1.6-py2.py3-none-any.whl (339kB)
[K    100% |████████████████████████████████| 348kB 1.4MB/s ta 0:00:01
[?25hCollecting pydot
Collecting graphviz
  Using cached https://files.pythonhosted.org/packages/05/e4/8fcc76823534d47f079c0ff1b3d8b57784e8fba63ceb1ded32c9f4dd993c/graphviz-0.8.2-py2.py3-none-any.whl
Installing collected packages: keras, pydot, graphviz
Successfully installed graphviz-0.8.2 keras-2.1.6 pydot-1.2.4
[33mYou are using pip version 9.0.1, however version 10.0.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [2]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import time
import os
import gc
import subprocess
import multiprocessing
from datetime import datetime
from pathlib import Path
from tensorflow.python.lib.io import file_io
import tensorflow as tf
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

  from ._conv import register_converters as _register_converters


In [3]:
%%time
# Folder that holds train.csv and test.csv.
main_path = Path('../input')
# Any results you write to the current directory are saved as output.

# Compact unsigned dtypes handed to read_csv for both files.
dtypes = dict(
    ip='uint32',
    app='uint16',
    device='uint16',
    os='uint16',
    channel='uint16',
    is_attributed='uint8',
    click_id='uint32',
)

total_train_rows = 184903890
train_rows = 40000000

# Columns read from both files; each file contributes one extra column.
shared_columns = ['ip', 'app', 'device', 'os', 'channel', 'click_time']

print('Loading train data...')
# Row 0 is not skipped (read_csv uses it as the header by default); rows
# 1 .. total_train_rows - train_rows - 1 are skipped and at most train_rows
# rows are read after that.
train_data = pd.read_csv(
    main_path / 'train.csv',
    dtype=dtypes,
    skiprows=range(1, total_train_rows - train_rows),
    nrows=train_rows,
    parse_dates=['click_time'],
    usecols=shared_columns + ['is_attributed'],
)

print('Loading test data...')
test_data = pd.read_csv(
    main_path / 'test.csv',
    dtype=dtypes,
    parse_dates=['click_time'],
    usecols=shared_columns + ['click_id'],
)


Loading train data...
Loading test data...
CPU times: user 2min 34s, sys: 11.7 s, total: 2min 45s
Wall time: 3min 52s


In [4]:
# Remember where the training rows end so the combined frame can be sliced
# back into train and test rows after feature engineering.
length_of_train = len(train_data)
# pd.concat replaces DataFrame.append, which is deprecated and no longer
# exists in pandas 2.x. Row order (train rows first, then test rows) is the same.
# NOTE: re-running this cell stacks the test rows onto train_data again.
train_data = pd.concat([train_data, test_data])

# Feature Engineering

In [5]:
print('Extract new time features...')
# Pull the hour and day components out of click_time, each stored as uint8.
for part in ('hour', 'day'):
    train_data[part] = getattr(train_data['click_time'].dt, part).astype('uint8')
gc.collect()

Extract new time features...


21

In [32]:
  def do_cumcount(data, group_cols, target_col, new_col_name, col_type):
      """Add a running per-group row counter to ``data`` as ``new_col_name``.

      Within each ``group_cols`` group the rows are numbered 0, 1, 2, ... in
      their existing order (``GroupBy.cumcount``).

      Parameters
      ----------
      data : pandas.DataFrame
          Frame to extend; it is modified in place.
      group_cols : list of str
          Columns that define the groups. The list is not modified.
      target_col : str
          Column selected from each group before counting; it must exist in
          ``data``, but its values do not affect the counter.
      new_col_name : str
          Name of the column that receives the counter.
      col_type : str or dtype
          dtype the counter is cast to.

      Returns
      -------
      pandas.DataFrame
          The same ``data`` object, with the new column added.
      """
      # Log text kept as-is; note that the operation is a cumulative count.
      print('[INFO] Count unique {} with group by {} combination...'.format(target_col, '-'.join(group_cols)))
      counter = data.groupby(by=group_cols)[target_col].cumcount()
      # cumcount() comes back in the original row order, so assign by position.
      # The previous reset_index()/rename() chain produced a two-column frame
      # with string labels that could not be stored in a single column.
      data[new_col_name] = counter.values.astype(col_type)
      del counter
      gc.collect()
      return data

In [None]:
# Per-ip cumulative counter produced by do_cumcount, stored as 'uni_ip_with_ch'.
train_data = do_cumcount(
    data=train_data,
    group_cols=['ip'],
    target_col='channel',
    new_col_name='uni_ip_with_ch',
    col_type='uint32',
)


In [None]:
# Preview the first rows of the combined frame.
train_data.head()

In [None]:
def add_group_stat(data, group_cols, target_col, agg, new_col_name):
    """Return ``data`` with a per-group aggregate of ``target_col`` merged in.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to aggregate and extend.
    group_cols : list of str
        Columns that define the groups and serve as the merge keys.
    target_col : str
        Column that is aggregated within each group.
    agg : str
        Name of the aggregate, e.g. 'count', 'var' or 'mean'.
    new_col_name : str
        Name given to the aggregated column in the result.

    Returns
    -------
    pandas.DataFrame
        New frame produced by a left merge of ``data`` with the aggregate.
    """
    stat = (
        data[group_cols + [target_col]]
        .groupby(by=group_cols)[[target_col]]
        .agg(agg)
        .reset_index()
        .rename(columns={target_col: new_col_name})
    )
    merged = data.merge(stat, on=group_cols, how='left')
    del stat
    gc.collect()
    return merged


# One row per engineered feature:
# (log prefix, group-by columns, aggregated column, aggregate, new column name)
GROUP_STATS = [
    # number of rows per ip / day / hour
    ('Count channel', ['ip', 'day', 'hour'], 'channel', 'count', 'ip_day_hour_count'),
    # number of rows per ip / app
    ('Count channel', ['ip', 'app'], 'channel', 'count', 'ip_app_count'),
    # number of rows per ip / app / os
    ('Count channel', ['ip', 'app', 'os'], 'channel', 'count', 'ip_app_os_count'),
    # variance of hour within each (ip, day, channel) group
    ('Compute var of hour', ['ip', 'day', 'channel'], 'hour', 'var', 'ip_day_channel_var'),
    # variance of hour within each (ip, app, os) group
    ('Compute var of hour', ['ip', 'app', 'os'], 'hour', 'var', 'ip_app_os_var'),
    # variance of day within each (ip, app, channel) group
    ('Compute var of day', ['ip', 'app', 'channel'], 'day', 'var', 'ip_app_channel_var_day'),
    # mean of hour within each (ip, app, channel) group
    ('Compute mean of hour', ['ip', 'app', 'channel'], 'hour', 'mean', 'ip_app_channel_mean_hour'),
]

# NOTE: running this cell twice merges the same columns again (pandas then adds
# _x/_y suffixes), so rebuild train_data before re-running it.
for label, group_cols, target_col, agg, new_col_name in GROUP_STATS:
    print('{} with group by {} combination...'.format(label, '-'.join(group_cols)))
    train_data = add_group_stat(train_data, group_cols, target_col, agg, new_col_name)

Count channel with group by ip-day-hour combination...
Count channel with group by ip-app combination...
Count channel with group by ip-app-os combination...
Compute var of hour with group by ip-day-channel combination...
Compute var of hour with group by ip-app-os combination...
Compute var of day with group by ip-app-channel combination...


In [17]:
# List the columns of the combined frame after feature engineering.
train_data.columns

Index(['app', 'channel', 'click_id', 'click_time', 'device', 'ip',
       'is_attributed', 'os', 'hour', 'day', 'ip_day_hour_count',
       'ip_app_count', 'ip_app_os_count', 'ip_day_channel_var',
       'ip_app_os_var', 'ip_app_channel_var_day', 'ip_app_channel_mean_hour'],
      dtype='object')

In [16]:
split_probability = 0.1  # share of the training rows held out for validation
seed = 202109

# Rows from length_of_train onwards are the appended test rows.
test_df = train_data[length_of_train:]
# Random split of the rows before length_of_train. An earlier variant, now
# disabled, instead held out the last `validation_boundary` of those rows.
train_df, validation_df = train_test_split(
    train_data[:length_of_train],
    test_size=split_probability,
    random_state=seed,
)

for label, subset in (("train size: ", train_df),
                      ("valid size: ", validation_df),
                      ("test size : ", test_df)):
    print(label, len(subset))

train size:  36000000
valid size:  4000000
test size :  18790469


In [19]:
target = 'is_attributed'
# Categorical columns, kept in the order they appear in train_df.
categorical_features = [col for col in train_df.columns if col in ['app', 'device', 'os', 'channel', 'hour', 'day']]
# Index.get_values() is deprecated and removed in later pandas releases;
# list() over the Index yields the same column names as a fresh list.
predictors = list(train_df.columns)
# Every column except the label and the submission id.
predictors.remove(target)
predictors.remove('click_id')


In [20]:
# Label array for the training split. It has to be taken while train_df is
# still a DataFrame: a later cell rebinds train_df to a dict of arrays.
y_train = train_df[target].values

In [21]:
print ('neural network....')
from keras.layers import Input, Embedding, Dense, Flatten, Dropout, concatenate
from keras.layers import BatchNormalization, SpatialDropout1D
from keras.callbacks import Callback
from keras.models import Model
from keras.optimizers import Adam

neural network....


Using TensorFlow backend.


In [25]:
# List the columns available on the test slice.
test_df.columns

Index(['app', 'channel', 'click_id', 'click_time', 'device', 'ip',
       'is_attributed', 'os', 'hour', 'day', 'ip_day_hour_count',
       'ip_app_count', 'ip_app_os_count', 'ip_day_channel_var',
       'ip_app_os_var', 'ip_app_channel_var_day', 'ip_app_channel_mean_hour'],
      dtype='object')

In [30]:
def _vocab_size(column):
    """Return 1 + the largest value of ``column`` over the train, validation and test splits.

    The validation split is included so that a value occurring only there
    still falls inside the embedding range sized from this number.
    """
    largest = max(split[column].max() for split in (train_df, validation_df, test_df))
    return int(largest) + 1


# Embedding vocabulary sizes (largest observed value + 1), as plain ints.
max_app = _vocab_size('app')
max_ch = _vocab_size('channel')
max_dev = _vocab_size('device')
max_os = _vocab_size('os')
max_h = _vocab_size('hour')
max_d = _vocab_size('day')
max_c1 = _vocab_size('ip_app_count')
max_c2 = _vocab_size('ip_app_os_count')
def get_keras_data(dataset):
    """Build the model's input dict from ``dataset``.

    Each key is a model input name ('app', 'ch', 'dev', 'os', 'h', 'd', 'c1',
    'c2') and each value is a NumPy array made from the matching attribute of
    ``dataset`` (app, channel, device, os, hour, day, ip_app_count,
    ip_app_os_count).
    """
    # (model input name, attribute read from dataset), in output order.
    input_to_column = (
        ('app', 'app'),
        ('ch', 'channel'),
        ('dev', 'device'),
        ('os', 'os'),
        ('h', 'hour'),
        ('d', 'day'),
        ('c1', 'ip_app_count'),
        ('c2', 'ip_app_os_count'),
    )
    return {
        input_name: np.array(getattr(dataset, column))
        for input_name, column in input_to_column
    }


In [31]:
# NOTE: this rebinds train_df from a DataFrame to the dict returned by
# get_keras_data, so the cell fails if run a second time (a dict has no
# .app attribute) unless train_df is recreated first.
train_df = get_keras_data(train_df)

In [32]:
# Display the dict of input arrays built by get_keras_data.
train_df

{'app': array([ 3, 26,  2, ...,  2, 12,  9], dtype=uint16),
 'c1': array([ 790,   55, 2938, ...,   89, 2682,   58]),
 'c2': array([59,  1, 45, ...,  4, 19, 14]),
 'ch': array([137, 121, 205, ..., 401, 259, 232], dtype=uint16),
 'd': array([9, 9, 9, ..., 9, 9, 9], dtype=uint8),
 'dev': array([1, 1, 1, ..., 1, 1, 1], dtype=uint16),
 'h': array([ 5,  4, 13, ...,  4, 13,  7], dtype=uint8),
 'os': array([22, 10, 23, ..., 53, 28, 15], dtype=uint16)}

In [37]:
emb_n = 50      # output width of every Embedding layer
dense_n = 1000


def _embedded_input(name, vocab_size):
    """Create a shape-[1] Input called ``name`` and apply an emb_n-wide Embedding to it.

    Returns the pair (input tensor, embedding output).
    """
    source = Input(shape=[1], name=name)
    return source, Embedding(vocab_size, emb_n)(source)


# One (input, embedding) pair per feature, created in the same order as before.
input_app, embending_app = _embedded_input('app', max_app)
input_ch, embending_ch = _embedded_input('ch', max_ch)
input_dev, embending_dev = _embedded_input('dev', max_dev)
input_os, embending_os = _embedded_input('os', max_os)
input_h, embending_h = _embedded_input('h', max_h)
input_d, embending_d = _embedded_input('d', max_d)
input_c1, embending_c1 = _embedded_input('c1', max_c1)
input_c2, embending_c2 = _embedded_input('c2', max_c2)


In [38]:
# Concatenate the eight embedding outputs, then apply SpatialDropout1D at rate 0.2.
embedded_features = [
    embending_app,
    embending_ch,
    embending_dev,
    embending_os,
    embending_h,
    embending_d,
    embending_c1,
    embending_c2,
]
fe = concatenate(embedded_features)
s_drop_out = SpatialDropout1D(0.2)(fe)

In [39]:
# Flatten the dropped-out embeddings, then two dense_n-unit ReLU layers, each
# followed by dropout at rate 0.2.
x = Flatten()(s_drop_out)
x = Dropout(0.2)(Dense(dense_n, activation='relu')(x))
x = Dropout(0.2)(Dense(dense_n, activation='relu')(x))
# Single sigmoid unit as the model output.
output_result = Dense(1, activation='sigmoid')(x)
# `inputs=` / `outputs=` are the Keras 2 keyword names; the legacy
# `input=` / `output=` spellings only work through a deprecation shim.
model = Model(
    inputs=[input_app, input_ch, input_dev, input_os, input_h, input_d, input_c1, input_c2],
    outputs=output_result,
)



In [41]:
# Print the layer-by-layer summary of the assembled model.
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
app (InputLayer)                (None, 1)            0                                            
__________________________________________________________________________________________________
ch (InputLayer)                 (None, 1)            0                                            
__________________________________________________________________________________________________
dev (InputLayer)                (None, 1)            0                                            
__________________________________________________________________________________________________
os (InputLayer)                 (None, 1)            0                                            
__________________________________________________________________________________________________
h (InputLa