In [1]:
import keras
from keras.preprocessing.text import Tokenizer
import string
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Dropout, Embedding, SpatialDropout1D, LSTM,Input,Conv1D,GlobalMaxPooling1D,BatchNormalization,MaxPooling1D,Flatten
from keras.layers.wrappers import Bidirectional 
from keras.callbacks import ModelCheckpoint,EarlyStopping
import os
from sklearn.metrics import roc_auc_score 
import matplotlib.pyplot as plt 
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from keras.models import Model
from keras.callbacks import EarlyStopping
import re
%matplotlib inline
import numpy as np
import pandas as pd
from keras.utils import to_categorical
import h5py

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


#### load pre processed data sets

In [2]:
fit_data = pd.read_pickle('training_set.pkl')

In [3]:
fit_data.shape

(3197922, 27)

In [4]:
fit_data.columns

Index(['ip', 'app', 'device', 'os', 'channel', 'hour', 'min', 'day',
       'app_count', 'device_count', 'os_count', 'channel_count', 'click_count',
       'apps', 'devices', 'oss', 'channels', 'adoc', 'app_count_min',
       'device_count_min', 'os_count_min', 'channel_count_min',
       'click_count_min', 'adoc_min', 'click_count_cumsum',
       'click_count_cumsum_min', 'is_attributed'],
      dtype='object')

In [5]:
def convert_to_cat(df):
    df['device'] = df.device.astype('object')
    df['os'] = df.os.astype('object')
    df['app'] = df.app.astype('object')
    df['channel'] = df.channel.astype('object')
    df['hour'] = df.hour.astype('object')
    df['ip'] = df.ip.astype('object')
    return df

In [6]:
fit_data = convert_to_cat(fit_data)

#### convert categorical into text for word embedding

In [7]:
fit_data['cat'] = fit_data['app'].map(str)+'a '+fit_data['channel'].map(str)+'c '+ fit_data['device'].map(str)+'d '+\
fit_data['os'].map(str)+['os ']

In [8]:
fit_data.head(1)

Unnamed: 0,ip,app,device,os,channel,hour,min,day,app_count,device_count,...,app_count_min,device_count_min,os_count_min,channel_count_min,click_count_min,adoc_min,click_count_cumsum,click_count_cumsum_min,is_attributed,cat
975869,109434,2,1,19,477,16,21,6,1.386294,0.0,...,0.0,0.0,0.0,0.0,0.693147,5.814131,10,0,0,2a 477c 1d 19os


In [9]:
# output directory name:
output_dir = 'model_output/parallel/t50/'
n_unique_words = 10000 
max_text_length = 4 
pad_type = trunc_type = 'pre'

In [10]:
# instantiate tokenizer
t = Tokenizer(num_words = n_unique_words)
# index words on corpus
t.fit_on_texts(fit_data.cat.astype(str))

#### save tokenizer to use on test data later

In [46]:
import pickle

# saving
with open('tokenizer.pickle', 'wb') as handle:
    pickle.dump(t, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [11]:
fit_data['token_vec'] = t.texts_to_sequences(fit_data.cat.astype(str))

In [12]:
def pad_seq(sequence):
    return pad_sequences(sequence,maxlen=max_text_length,dtype='int32',padding='pre',truncating='pre',value=0.)

In [13]:
train,validation = train_test_split(fit_data,test_size=0.3, random_state = 10)

In [14]:
print(train.size,validation.size)

64917805 27821933


In [15]:
validation[validation.is_attributed==1].shape[0]/validation.shape[0]

0.14325129745657858

In [16]:
train[train.is_attributed==1].shape[0]/train.shape[0]

0.14268821935676967

In [17]:
df1 = train[train.is_attributed==1].copy()

In [18]:
frames = [train,df1]
train = pd.concat(frames)

In [19]:
train[train.is_attributed==1].shape[0]/train.shape[0]

0.24974129765176065

In [20]:
train =train.sample(frac=1.0)

In [21]:
del fit_data

#### categorical features

In [22]:
train_cat = pad_seq(train.token_vec)
validation_cat = pad_seq(validation.token_vec)

In [23]:
train.columns

Index(['ip', 'app', 'device', 'os', 'channel', 'hour', 'min', 'day',
       'app_count', 'device_count', 'os_count', 'channel_count', 'click_count',
       'apps', 'devices', 'oss', 'channels', 'adoc', 'app_count_min',
       'device_count_min', 'os_count_min', 'channel_count_min',
       'click_count_min', 'adoc_min', 'click_count_cumsum',
       'click_count_cumsum_min', 'is_attributed', 'cat', 'token_vec'],
      dtype='object')

#### labels

In [24]:
label_train = train.is_attributed.values
label_valid = validation.is_attributed.values

#### numeric features

In [25]:
train_feat = train[['ip','app_count','device_count','os_count','channel_count','click_count','apps',
                 'devices','oss','channels','adoc','app_count_min','device_count_min','os_count_min',
                  'channel_count_min','click_count_min','adoc_min','click_count_cumsum','click_count_cumsum_min'
                 ]].values
validation_feat =validation[['ip','app_count','device_count','os_count','channel_count','click_count','apps',
                 'devices','oss','channels','adoc','app_count_min','device_count_min','os_count_min',
                  'channel_count_min','click_count_min','adoc_min','click_count_cumsum','click_count_cumsum_min'
                 ]].values

In [26]:
del train
del validation

In [27]:
print(train_feat.shape,validation_feat.shape)

(2557959, 19) (959377, 19)


In [28]:
print(train_cat.shape,validation_cat.shape)

(2557959, 4) (959377, 4)


#### nn design

In [29]:
input_cat = Input(shape = (max_text_length,), name = 'input_cat')
# cat ------
cat = Embedding(n_unique_words,4)(input_cat)
cat = SpatialDropout1D(0.1)(cat)
cat = Conv1D(16, 2, activation='relu')(cat)
#cat = Conv1D(128, 3, activation='relu')(cat)
#cat = Conv1D(256, 3, activation='relu')(cat)
#cat = Conv1D(512, 2, activation='relu')(cat)
cat = GlobalMaxPooling1D()(cat)
#output_cat = Dense(2,activation='sigmoid',name='output_cat')(cat)
# num --------------
input_num = Input(shape = (19,),name='input_num')
num = BatchNormalization()(input_num)
num = Dense(64, activation='relu')(num)
#num = BatchNormalization()(num)
#num = Dense(128, activation='relu')(num)
#num = Dense(256, activation='relu')(num)
num = Dropout(0.1)(num)
#output_num = Dense(2,activation='sigmoid',name='output_num')(num)
# comb ------------
x = keras.layers.concatenate([cat, num])
x = BatchNormalization()(x)
x = Dense(64, activation='relu')(x)
x = Dense(128, activation='relu')(x)
x = BatchNormalization()(x)
#x = Dense(256, activation='relu')(x)
#x = Dense(512, activation='relu')(x)
#x = Dense(1024, activation='relu')(x)
x = Dropout(0.2)(x)
output_main = Dense(1,activation='sigmoid',name='output_main')(x)

In [30]:
model = Model(inputs=[input_cat, input_num], outputs=[output_main])
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_cat (InputLayer)          (None, 4)            0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 4, 4)         40000       input_cat[0][0]                  
__________________________________________________________________________________________________
input_num (InputLayer)          (None, 19)           0                                            
__________________________________________________________________________________________________
spatial_dropout1d_1 (SpatialDro (None, 4, 4)         0           embedding_1[0][0]                
__________________________________________________________________________________________________
batch_norm

In [31]:
model.compile(#loss='categorical_crossentropy',
               loss = 'binary_crossentropy',
               #optimizer='RMSprop',
               optimizer='adam',
              #optimizer = sgd,
              metrics=['accuracy'])

In [32]:
output_dir

'model_output/parallel/t50/'

In [33]:
earlystop = EarlyStopping(patience=10)
modelsave = ModelCheckpoint(
    filepath=output_dir+"best_model.hdf5", save_best_only=True, verbose=1)
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

In [34]:
history=model.fit({'input_cat':train_cat, 'input_num':train_feat},
                  {'output_main':label_train},
                  batch_size=4096*8,
                  epochs=100,
                  verbose=1,
                  validation_split=0.2,
                  callbacks=[earlystop, modelsave]
                 )

Train on 2046367 samples, validate on 511592 samples
Epoch 1/100

Epoch 00001: val_loss improved from inf to 0.19445, saving model to model_output/parallel/t50/best_model.hdf5
Epoch 2/100

Epoch 00002: val_loss improved from 0.19445 to 0.16165, saving model to model_output/parallel/t50/best_model.hdf5
Epoch 3/100

Epoch 00003: val_loss improved from 0.16165 to 0.15958, saving model to model_output/parallel/t50/best_model.hdf5
Epoch 4/100

Epoch 00004: val_loss improved from 0.15958 to 0.15684, saving model to model_output/parallel/t50/best_model.hdf5
Epoch 5/100

Epoch 00005: val_loss improved from 0.15684 to 0.15096, saving model to model_output/parallel/t50/best_model.hdf5
Epoch 6/100

Epoch 00006: val_loss improved from 0.15096 to 0.14984, saving model to model_output/parallel/t50/best_model.hdf5
Epoch 7/100

Epoch 00007: val_loss improved from 0.14984 to 0.14861, saving model to model_output/parallel/t50/best_model.hdf5
Epoch 8/100

Epoch 00008: val_loss improved from 0.14861 to 0.


Epoch 00036: val_loss improved from 0.14367 to 0.14367, saving model to model_output/parallel/t50/best_model.hdf5
Epoch 37/100

Epoch 00037: val_loss did not improve
Epoch 38/100

Epoch 00038: val_loss did not improve
Epoch 39/100

Epoch 00039: val_loss did not improve
Epoch 40/100

Epoch 00040: val_loss did not improve
Epoch 41/100

Epoch 00041: val_loss improved from 0.14367 to 0.14348, saving model to model_output/parallel/t50/best_model.hdf5
Epoch 42/100

Epoch 00042: val_loss did not improve
Epoch 43/100

Epoch 00043: val_loss improved from 0.14348 to 0.14305, saving model to model_output/parallel/t50/best_model.hdf5
Epoch 44/100

Epoch 00044: val_loss did not improve
Epoch 45/100

Epoch 00045: val_loss did not improve
Epoch 46/100

Epoch 00046: val_loss improved from 0.14305 to 0.14289, saving model to model_output/parallel/t50/best_model.hdf5
Epoch 47/100

Epoch 00047: val_loss improved from 0.14289 to 0.14234, saving model to model_output/parallel/t50/best_model.hdf5
Epoch 48/


Epoch 00076: val_loss improved from 0.14047 to 0.14032, saving model to model_output/parallel/t50/best_model.hdf5
Epoch 77/100

Epoch 00077: val_loss did not improve
Epoch 78/100

Epoch 00078: val_loss did not improve
Epoch 79/100

Epoch 00079: val_loss did not improve
Epoch 80/100

Epoch 00080: val_loss did not improve
Epoch 81/100

Epoch 00081: val_loss did not improve
Epoch 82/100

Epoch 00082: val_loss did not improve
Epoch 83/100

Epoch 00083: val_loss did not improve
Epoch 84/100

Epoch 00084: val_loss improved from 0.14032 to 0.14012, saving model to model_output/parallel/t50/best_model.hdf5
Epoch 85/100

Epoch 00085: val_loss did not improve
Epoch 86/100

Epoch 00086: val_loss did not improve
Epoch 87/100

Epoch 00087: val_loss did not improve
Epoch 88/100

Epoch 00088: val_loss did not improve
Epoch 89/100

Epoch 00089: val_loss improved from 0.14012 to 0.14011, saving model to model_output/parallel/t50/best_model.hdf5
Epoch 90/100

Epoch 00090: val_loss did not improve
Epoch

In [45]:
np.save("./train_cat", train_cat,allow_pickle=True)
np.save("./train_feat", train_feat,allow_pickle=True)
np.save("./validation_cat", validation_cat,allow_pickle=True)
np.save("./validation_feat", validation_feat,allow_pickle=True)
np.save("./label_train", label_train,allow_pickle=True)
np.save("./label_valid", label_valid,allow_pickle=True)

#### evaluate on validation set

In [35]:
from keras.models import load_model
model=load_model(output_dir+"best_model.hdf5")

In [36]:
y_p = model.predict({'input_cat':validation_cat, 'input_num':validation_feat})

In [93]:
#y_p = np.argmax(y_p, axis=1)

In [37]:
output_dir

'model_output/parallel/t50/'

In [40]:
y_true = label_valid

In [41]:
from sklearn.metrics import roc_auc_score
auc_train = roc_auc_score(y_true, y_p)

In [97]:
auc_train

0.97274782732077569

In [42]:
auc_train # train4

0.9749821339421685

#### testing set 

In [100]:
100*np.argmax(y,axis=1).sum()/len(y)

0.0

In [101]:
#y_c = np.argmax(y, axis=1)
y = pd.DataFrame(y)
y.columns = ['is_attributed']
y.head()

Unnamed: 0,is_attributed
0,0.013694
1,0.011774
2,0.004334
3,0.003711
4,0.003187


In [102]:
#y_c = np.argmax(y, axis=1)
#y = pd.DataFrame(y_c)
#y.columns = ['is_attributed']
#y.head()

In [103]:
y.shape

(18790469, 1)

In [104]:
results = test[['click_id']].join(y)
results.head()

Unnamed: 0,click_id,is_attributed
0,0,0.013694
1,1,0.011774
2,2,0.004334
3,3,0.003711
4,4,0.003187


In [105]:
results.to_csv(output_dir+'results.csv',index=False)

In [106]:
output_dir

'model_output/parallel/t50/'