In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for folder, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Explore the sample dataset with pandas

In [None]:
# Load the sample of the training data and take a first look at it.
df = pd.read_csv('/kaggle/input/talkingdata-adtracking-fraud-detection/train_sample.csv')
print(df.shape)
# DataFrame.info() prints its report itself and returns None, so it must not be
# wrapped in print() (that would emit a stray "None" line after the report).
df.info()
df.head()

In [None]:
# Parse the click timestamp once, then expose its calendar parts as columns.
click_ts = pd.to_datetime(df['click_time'])
df['click_time_parsed'] = click_ts
for part in ('year', 'month', 'day', 'hour'):
    df[part] = getattr(click_ts.dt, part)
df.head()

In [None]:
# Descriptive statistics of the frame, shown as the cell output
# (pandas chooses which columns to summarise based on their dtype).
df.describe()

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Columns used as model inputs; "day" and "hour" are the parts derived from click_time.
cols = ["ip", "app", "device", "os", "channel", "day", "hour"]

# The scaler is fitted only to obtain the per-column minima and maxima, which
# are kept in X_min / X_max and applied by hand in the tf.data preprocessing.
scaler = MinMaxScaler().fit(df[cols])
X_min, X_max = scaler.data_min_, scaler.data_max_

print(X_min, X_max, sep='\n')

# Train - Valid split

In [None]:
# Source CSV to split. Swap in the commented-out path to use the full training set.
filepath = "/kaggle/input/talkingdata-adtracking-fraud-detection/train_sample.csv"
#filepath = "/kaggle/input/talkingdata-adtracking-fraud-detection/train.csv"

# Output files of the split (relative to the current working directory).
train_path, valid_path = 'train.csv', 'valid.csv'

# Delete outputs left over from an earlier run.
for stale_path in (train_path, valid_path):
    if os.path.exists(stale_path):
        os.remove(stale_path)

In [None]:
import csv
# Number of data rows (counted after the header) that go to the validation
# file; every later row goes to the training file. A falsy value skips the split.
split = 10000

if split:
    # newline='' is what the csv module asks for on files it reads or writes.
    with open(filepath, newline='') as source_file:
        reader = csv.reader(source_file, delimiter=',')
        header = next(reader, None)

        # A completely empty source has no header row; nothing is written then.
        if header is not None:
            # Open each output once and keep it open for the whole pass instead
            # of reopening a file for every single row.
            with open(train_path, 'wt', encoding='utf-8', newline='') as train_file, \
                 open(valid_path, 'wt', encoding='utf-8', newline='') as valid_file:
                train_writer = csv.writer(train_file, delimiter=',')
                valid_writer = csv.writer(valid_file, delimiter=',')

                # Both outputs start with the source header.
                train_writer.writerow(header)
                valid_writer.writerow(header)

                for count, row in enumerate(reader, start=1):
                    target = valid_writer if count <= split else train_writer
                    target.writerow(row)

# TensorFlow Data API (tf.data)

In [None]:
import tensorflow as tf

# Stream the training file line by line, dropping the header row.
header_rows = 1
dataset = tf.data.TextLineDataset(train_path).skip(header_rows)

# Print the first five raw records.
preview = dataset.take(5)
for record in preview:
    print(record.numpy())

In [None]:
from datetime import datetime

def preprocess(line):
    """Turn one raw CSV line of the training data into a (features, label) pair.

    The line must hold eight comma-separated fields: five integer columns,
    two text columns (the first of which is the click timestamp) and a
    mandatory final label column.

    Args:
        line: scalar string tensor (or bytes) containing one CSV record.

    Returns:
        A tuple ``(x, y)``. ``x`` is a float64 tensor holding the five integer
        columns followed by day and hour, min-max scaled with the notebook's
        ``X_min`` / ``X_max``. ``y`` is a boolean tensor that is True when the
        label field equals ``'1'``.
    """
    # Five int columns, two string columns defaulting to "", and one string
    # column without a default (i.e. required) for the label.
    defs = [0]*5 + [""]*2 + [tf.constant([], dtype=tf.string)]
    fields = tf.io.decode_csv(line, record_defaults=defs)

    # parse date: in a timestamp such as "2017-11-06 14:32:21" the day starts
    # at offset 8 and the hour at offset 11.
    click_time = fields[5]
    day = tf.strings.to_number(tf.strings.substr(click_time, 8, len=2), tf.int32)
    hour = tf.strings.to_number(tf.strings.substr(click_time, 11, len=2), tf.int32)

    # features: the five integer columns followed by day and hour
    features = fields[:5] + [day, hour]

    # Do the scaling in float64 explicitly rather than subtracting float
    # statistics from an int32 tensor and relying on implicit dtype conversion.
    x = tf.cast(tf.stack(features), tf.float64)
    y = tf.equal(fields[-1], '1')

    # A column whose minimum equals its maximum would divide by zero; dividing
    # by 1 instead maps such a column to 0.
    value_range = X_max - X_min
    value_range = np.where(value_range == 0, 1.0, value_range)
    return (x - X_min) / value_range, y

# Quick eager check on a single hand-written record.
test = b'83230,3,1,13,379,2017-11-06 14:32:21,,0'
preprocess(test)

In [None]:
# Same pipeline on the source file: drop the header, then parse every line.
dataset = tf.data.TextLineDataset(filepath).skip(1).map(preprocess)

# Show one parsed (features, label) example.
for example in dataset.take(1):
    print(example)

In [None]:
def csv_reader_dataset(filepath, shuffle_buffer_size=10000, batch_size=256, n_parse_threads=5):
    """Build a shuffled, batched tf.data pipeline from a CSV file with a header row.

    Args:
        filepath: path of the CSV file to read.
        shuffle_buffer_size: size of the shuffle buffer.
        batch_size: number of parsed records per batch.
        n_parse_threads: value passed as ``num_parallel_calls`` when mapping
            ``preprocess`` over the lines.

    Returns:
        A dataset of batches produced by ``preprocess``, with a prefetch of 1.
    """
    return (
        tf.data.TextLineDataset(filepath)
        .skip(1)  # header row
        .map(preprocess, num_parallel_calls=n_parse_threads)
        .shuffle(shuffle_buffer_size)
        .batch(batch_size)
        .prefetch(1)
    )

# Deep Learning

In [None]:
# Reset Keras state and seed TensorFlow and NumPy before anything is built.
tf.keras.backend.clear_session()
for seed_fn in (tf.random.set_seed, np.random.seed):
    seed_fn(42)

# Input pipelines over the two files produced by the split.
train_set = csv_reader_dataset(train_path)
valid_set = csv_reader_dataset(valid_path)

# Small fully-connected binary classifier on the 7 scaled features.
model = tf.keras.models.Sequential()
model.add(tf.keras.layers.Dense(50, activation='relu', input_shape=[7]))
model.add(tf.keras.layers.Dense(30, activation='relu'))
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=[tf.keras.metrics.AUC()],
)

In [None]:
# Stop once the monitored validation quantity has not improved for 5 epochs,
# and roll the model back to the weights of its best epoch so that later
# predictions are not made with the weights of the final, worse epochs.
early_stopping_cb = tf.keras.callbacks.EarlyStopping(patience=5, restore_best_weights=True)

# Keep the History object instead of letting the cell echo its repr.
history = model.fit(train_set, epochs=100, validation_data=valid_set, callbacks=[early_stopping_cb])

# Make Predictions

In [None]:
# Location of the competition test file.
test_path = '/kaggle/input/talkingdata-adtracking-fraud-detection/test.csv'

# Print the first five raw records after the header row.
dataset = tf.data.TextLineDataset(test_path).skip(1)
preview = dataset.take(5)
for record in preview:
    print(record.numpy())

In [None]:
def preprocess_forecast(line):
    """Turn one raw CSV line of the test data into a scaled feature vector.

    The line must hold seven comma-separated fields: six integer columns
    followed by the click timestamp. The first integer column is not used as
    a feature (presumably the click id; verify against the test file header).

    Args:
        line: scalar string tensor (or bytes) containing one CSV record.

    Returns:
        A float64 tensor holding integer columns 2-6 followed by day and hour,
        min-max scaled with the notebook's ``X_min`` / ``X_max``.
    """
    # Six int columns and one string column (the timestamp) defaulting to "".
    defs = [0]*6 + [""]
    fields = tf.io.decode_csv(line, record_defaults=defs)

    # parse date: day starts at offset 8 and hour at offset 11 of the timestamp
    click_time = fields[-1]
    day = tf.strings.to_number(tf.strings.substr(click_time, 8, len=2), tf.int32)
    hour = tf.strings.to_number(tf.strings.substr(click_time, 11, len=2), tf.int32)

    # features: everything between the first column and the timestamp, then day and hour
    features = fields[1:-1] + [day, hour]

    # Do the scaling in float64 explicitly rather than subtracting float
    # statistics from an int32 tensor and relying on implicit dtype conversion.
    x = tf.cast(tf.stack(features), tf.float64)

    # A column whose minimum equals its maximum would divide by zero; dividing
    # by 1 instead maps such a column to 0.
    value_range = X_max - X_min
    value_range = np.where(value_range == 0, 1.0, value_range)
    return (x - X_min) / value_range


# Parse the test file and show one resulting feature vector.
dataset = tf.data.TextLineDataset(test_path).skip(1).map(preprocess_forecast)

for example in dataset.take(1):
    print(example)

In [None]:
def model_forecast(model, filepath, batch_size=256, n_parse_threads=5):
    """Run the model over every record of a CSV file that has a header row.

    Args:
        model: object exposing ``predict(dataset)``.
        filepath: path of the CSV file to score.
        batch_size: number of records per prediction batch (default 256).
        n_parse_threads: value passed as ``num_parallel_calls`` when mapping
            ``preprocess_forecast`` over the lines (default 5).

    Returns:
        Whatever ``model.predict`` returns for the batched dataset.
    """
    dataset = (
        tf.data.TextLineDataset(filepath)
        .skip(1)  # header row
        .map(preprocess_forecast, num_parallel_calls=n_parse_threads)
        .batch(batch_size)
        .prefetch(1)
    )
    return model.predict(dataset)

y_pred = model_forecast(model, test_path)
y_pred.shape

In [None]:
# Attach the predictions to the test rows and write the two submission columns.
df = pd.read_csv(test_path)
df['is_attributed'] = y_pred
submission_cols = ['click_id', 'is_attributed']
df.loc[:, submission_cols].to_csv('submission.csv', index=False)