In [None]:
import dask
import dask.dataframe as dd
import numpy as np
import pandas as pd
import tensorflow as tf
import random

# Seed every random number generator the notebook relies on so that sampling,
# shuffling and weight initialisation are repeatable from run to run.
seed = 53
random.seed(seed)
np.random.seed(seed)  # pandas `.sample()` without random_state draws from NumPy's global generator
tf.random.set_seed(seed)

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# Record the TensorFlow version used for this run.
print(tf.__version__)

In [None]:
# Column dtypes for the training CSV: the IP is an unsigned 32-bit integer,
# the id columns and the timestamp are read as strings, the label as a byte.
dtypes = {'ip': 'uint32'}
dtypes.update({col: 'object' for col in ('app', 'device', 'os', 'channel', 'click_time')})
dtypes['is_attributed'] = 'uint8'

In [None]:
# Open the training CSV as a dask DataFrame, parsing only the listed columns.
train_dd = dd.read_csv(
    '/kaggle/input/talkingdata-adtracking-fraud-detection/train.csv',
    usecols=['ip', 'app', 'device', 'os', 'channel', 'click_time', 'is_attributed'],
    dtype=dtypes,
)

In [None]:
# Summarise the columns and dtypes of the dask training frame.
train_dd.info()

In [None]:
# Peek at the first rows to check that the CSV was parsed as expected.
train_dd.head()

In [None]:
# Hold out 10% of the rows for validation; the split is seeded with `seed`.
# NOTE: `train_dd` is rebound here, so re-running this cell splits the
# already-reduced training part a second time.
train_dd, val_df = train_dd.random_split([0.9, 0.1], random_state=seed, shuffle=True)
# Row counts of the training and validation parts, one per line.
print(len(train_dd), len(val_df), sep='\n')

In [None]:
# Keep every training row whose label is 1.
df = train_dd[train_dd['is_attributed'] == 1]

In [None]:
# Show the type of `df` before `.compute()` is called on it.
type(df)

In [None]:
# Evaluate the filtered dask frame and rebind `df` to the computed result.
df = df.compute()

In [None]:
# Show the type of `df` again, now that it has been computed.
type(df)

In [None]:
# Number of rows with is_attributed == 1 in the training split.
len(df)

In [None]:
# (rows, columns) of the positive-label frame.
df.shape

In [None]:
# Keep every training row whose label is 0.
df2 = train_dd[train_dd['is_attributed'] == 0]

In [None]:
# Down-sample the label-0 rows to 0.25%. The draw is seeded so that the
# training set is the same on every run.
df2 = df2.sample(frac=0.0025, random_state=seed)

In [None]:
# Evaluate the sampled dask frame and rebind `df2` to the computed result.
df2 = df2.compute()

In [None]:
# (rows, columns) of the down-sampled label-0 frame.
df2.shape

In [None]:
# Display 10 randomly chosen label-0 rows (no random_state is passed, so the rows shown vary).
df2.sample(10)

In [None]:
# Training frame: the label-1 rows followed by the down-sampled label-0 rows.
parts = [df, df2]
train_df = pd.concat(parts)
# Drop the reference to the dask training frame.
train_dd = None

In [None]:
# Label counts in the assembled training frame.
train_df.is_attributed.value_counts()

In [None]:
# Column, dtype and memory summary of the training frame.
train_df.info()

In [None]:
# Evaluate the validation split and rebind `val_df` to the computed result.
# NOTE: the name is rebound, so this cell is not meant to be run twice.
val_df = val_df.compute()

In [None]:
# Label counts in the validation frame.
val_df.is_attributed.value_counts()

In [None]:
# Share of label 1 in the validation set
print(46127 / (46127 + 18445145))
# Share of label 1 in the training set (prior to down-sampling)
print(410719 / 166412618)

In [None]:
# Convert the raw `click_time` strings to datetimes in both frames.
# NOTE(review): `infer_datetime_format` is deprecated from pandas 2.0 — confirm the installed version.
parse_opts = {'infer_datetime_format': True}
train_df['click_time'] = pd.to_datetime(train_df['click_time'], **parse_opts)
val_df['click_time'] = pd.to_datetime(val_df['click_time'], **parse_opts)

In [None]:
def _add_time_parts(frame):
    """Add 'day', 'hour', 'minute' and 'second' string columns derived from `click_time`, in place."""
    for part in ('day', 'hour', 'minute', 'second'):
        frame[part] = getattr(frame['click_time'].dt, part).astype(str)


_add_time_parts(train_df)
_add_time_parts(val_df)

In [None]:
# Remove the `click_time` column from both frames, in place.
train_df.drop(columns=['click_time'], inplace=True)
val_df.drop(columns=['click_time'], inplace=True)

In [None]:
# Re-check the training frame's columns and dtypes after the preceding cells.
train_df.info()

In [None]:
# Display 10 randomly chosen training rows.
train_df.sample(10)

In [None]:
def df_to_dataset(dataframe, shuffle=True, batch_size=32, labels=True, label_col='is_attributed'):
    """Convert a pandas DataFrame into a batched `tf.data.Dataset`.

    Args:
        dataframe: Frame whose columns become the feature dict. It is copied,
            so the caller's frame is left untouched.
        shuffle: When true, shuffle with a buffer as large as the frame.
        batch_size: Number of rows per batch.
        labels: When true, `label_col` is split off and the dataset yields
            `(features, label)` pairs; otherwise it yields the feature dict only.
        label_col: Name of the target column used when `labels` is true.

    Returns:
        The batched `tf.data.Dataset`.
    """
    features = dataframe.copy()  # pop() below must not remove the column from the caller's frame
    if labels:
        # Use a separate name for the target values instead of rebinding the boolean `labels` flag.
        targets = features.pop(label_col)
        ds = tf.data.Dataset.from_tensor_slices((dict(features), targets))
    else:
        ds = tf.data.Dataset.from_tensor_slices(dict(features))
    if shuffle and len(features) > 0:
        # Skip shuffling for an empty frame: a zero-sized shuffle buffer is rejected.
        ds = ds.shuffle(buffer_size=len(features))
    return ds.batch(batch_size)

In [None]:
# Build the input pipelines: training data is shuffled (the function default),
# validation data keeps its order. Both use batches of 128 and prefetching.
AUTOTUNE = tf.data.experimental.AUTOTUNE
train_ds = df_to_dataset(train_df, batch_size=128)
train_ds = train_ds.prefetch(AUTOTUNE)
val_ds = df_to_dataset(val_df, shuffle=False, batch_size=128)
val_ds = val_ds.prefetch(AUTOTUNE)

In [None]:
from tensorflow import feature_column

# Numeric columns: only the raw IP is passed through as a number.
feature_columns = [feature_column.numeric_column(col) for col in ['ip']]

# Embedding columns: app, device, os, channel and the time parts are treated as
# categorical. Each maps to the embedding size it gets; the vocabulary is the
# set of values present in `train_df`.
embedding_dims = {
    'app': 64,
    'os': 32,
    'device': 32,
    'channel': 32,
    'day': 8,
    'hour': 8,
    'minute': 8,
    'second': 8,
}

# Built in a loop rather than as `app = ...`, `os = ...`, so the `os` module
# imported in the first cell is no longer shadowed by a feature column.
for name, dim in embedding_dims.items():
    vocab_column = feature_column.categorical_column_with_vocabulary_list(
        name, train_df[name].unique())
    feature_columns.append(feature_column.embedding_column(vocab_column, dimension=dim))

In [None]:
# Drop the references to both pandas frames.
train_df = val_df = None

In [None]:
# Keras layer built from the feature columns; it is used as the first layer of the model.
feature_layer = tf.keras.layers.DenseFeatures(feature_columns)

In [None]:
tf.keras.backend.clear_session()
from tensorflow.keras.layers import Dense

# Feature layer -> two 128-unit ReLU layers -> one sigmoid output unit.
model = tf.keras.Sequential()
model.add(feature_layer)
for units in (128, 128):
    model.add(Dense(units, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

# Binary cross-entropy loss, Adam optimiser, AUC reported under the name 'auc'.
model.compile(
    loss='bce',
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),
    metrics=[tf.keras.metrics.AUC(name='auc')],
)

In [None]:
# Early stopping with a patience of 10 epochs; the best weights are restored when it triggers.
es = tf.keras.callbacks.EarlyStopping(
    patience=10,
    restore_best_weights=True,
    verbose=1,
)

# Train for at most 20 epochs, evaluating on the validation set after each one.
history = model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=20,
    callbacks=[es],
    verbose=2,
)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Read the per-epoch metrics without rebinding `history`. Overwriting it with
# the dict made this cell fail with AttributeError when run a second time.
metrics_by_epoch = history.history

fig, (ax1, ax2) = plt.subplots(2, 1, sharex='col', figsize=(20, 14))

# Top panel: loss. Bottom panel: AUC. Each shows the training and validation curves.
for ax, metric, title in ((ax1, 'loss', 'Loss'), (ax2, 'auc', 'AUC')):
    ax.plot(metrics_by_epoch[metric], label='Training')
    ax.plot(metrics_by_epoch['val_' + metric], label='Validation')
    ax.legend(loc='best')
    ax.set_title(title)

ax2.set_xlabel('Epochs')  # the x axis is shared, so only the bottom panel is labelled
sns.despine()
plt.show()

In [None]:
# Column dtypes for the test CSV (same as for training, minus the label).
# NOTE: this rebinds the `dtypes` name defined for the training CSV.
dtypes = {'ip': 'uint32'}
dtypes.update({col: 'object' for col in ('app', 'device', 'os', 'channel', 'click_time')})

In [None]:
# Load the feature columns of the test CSV into pandas and summarise them.
test_df = pd.read_csv(
    '/kaggle/input/talkingdata-adtracking-fraud-detection/test.csv',
    usecols=['ip', 'app', 'device', 'os', 'channel', 'click_time'],
    dtype=dtypes,
)
test_df.info()

Partition the test set into three chunks so that each one fits in memory.

In [None]:
print(test_df.shape)
total = test_df.shape[0]
h = total // 3  # rows in each of the first two chunks
# Take an explicit copy: the following cell adds and drops columns on this
# chunk, which on a bare slice of `test_df` triggers SettingWithCopyWarning.
test_df1 = test_df.iloc[0:h].copy()

In [None]:
# Parse the timestamps of the current test chunk and expand them into string
# 'day', 'hour', 'minute' and 'second' columns, then drop the raw timestamp.
test_df1['click_time'] = pd.to_datetime(test_df1['click_time'], infer_datetime_format=True)
for part in ('day', 'hour', 'minute', 'second'):
    test_df1[part] = getattr(test_df1['click_time'].dt, part).astype(str)

test_df1.drop(columns=['click_time'], inplace=True)

In [None]:
# Unlabelled, unshuffled dataset for the first test chunk (batches of 64).
test_ds = df_to_dataset(test_df1, shuffle=False, batch_size=64, labels=False)
test_ds = test_ds.prefetch(tf.data.experimental.AUTOTUNE)

In [None]:
# Model outputs for the first test chunk.
predictions1 = model.predict(test_ds, verbose=1)

In [None]:
# Second test chunk. Take an explicit copy so that the column assignments and
# the in-place drop below act on an independent frame, not on a slice of
# `test_df` (which triggers SettingWithCopyWarning).
test_df1 = test_df.iloc[h:2*h].copy()

test_df1['click_time'] = pd.to_datetime(test_df1['click_time'], infer_datetime_format=True)
test_df1['day'] = test_df1['click_time'].dt.day.astype(str)
test_df1['hour'] = test_df1['click_time'].dt.hour.astype(str)
test_df1['minute'] = test_df1['click_time'].dt.minute.astype(str)
test_df1['second'] = test_df1['click_time'].dt.second.astype(str)

test_df1.drop(['click_time'], axis='columns', inplace=True)

In [None]:
# Unlabelled, unshuffled dataset for the second test chunk (batches of 128).
test_ds = df_to_dataset(test_df1, shuffle=False, batch_size=128, labels=False)
test_ds = test_ds.prefetch(tf.data.experimental.AUTOTUNE)

In [None]:
# Model outputs for the second test chunk.
predictions2 = model.predict(test_ds, verbose=1)

In [None]:
# Third test chunk: every row from 2*h onwards. Take an explicit copy so that
# the column assignments and the in-place drop below act on an independent
# frame, not on a slice of `test_df` (which triggers SettingWithCopyWarning).
test_df1 = test_df.iloc[2*h:].copy()

test_df1['click_time'] = pd.to_datetime(test_df1['click_time'], infer_datetime_format=True)
test_df1['day'] = test_df1['click_time'].dt.day.astype(str)
test_df1['hour'] = test_df1['click_time'].dt.hour.astype(str)
test_df1['minute'] = test_df1['click_time'].dt.minute.astype(str)
test_df1['second'] = test_df1['click_time'].dt.second.astype(str)

test_df1.drop(['click_time'], axis='columns', inplace=True)

In [None]:
# Unlabelled, unshuffled dataset for the third test chunk (batches of 128).
test_ds = df_to_dataset(test_df1, shuffle=False, batch_size=128, labels=False)
test_ds = test_ds.prefetch(tf.data.experimental.AUTOTUNE)

In [None]:
# Model outputs for the third test chunk.
predictions3 = model.predict(test_ds, verbose=1)

In [None]:
# Flatten the three per-chunk outputs and join them, in chunk order, into one 1-D array.
predictions = np.concatenate((predictions1, predictions2, predictions3), axis=None)

In [None]:
# Print the test frame's shape and the prediction array's shape, one per line.
print(test_df.shape, predictions.shape, sep='\n')

In [None]:
# Fill the sample submission's `is_attributed` column with the predictions and
# write the result to `submission.csv` without the index.
sample_path = '/kaggle/input/talkingdata-adtracking-fraud-detection/sample_submission.csv'
submission_df = pd.read_csv(sample_path)
submission_df['is_attributed'] = predictions
submission_df.to_csv('submission.csv', index=False)

In [None]:
# Show the first rows of the submission frame.
submission_df.head()