In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import json
import tensorflow as tf
import os
from sklearn.model_selection import train_test_split
print(tf.__version__)

1.11.0


Task: predict whether an ad was clicked on, based on 39 anonymized factors

 - Over 45 million training examples, ~5GB

Features: 13 integer columns, followed by 26 columns of 32 bit hashed values

- Labels: Just a 0 or 1 :)

In [2]:
# Names of the DataFrame columns used as model inputs.
TRAINING_VARS = [
    'City',
    'State',
    'Country',
    'Age',
    'bookTitle',
    'bookAuthor',
    'yearOfPublication',
]
# Name of the target column, kept as a one-element list so that indexing a
# DataFrame with it yields a single-column DataFrame rather than a Series.
TARGET_VAR = ['label']

In [3]:
# Read the BookCrossing CSV into a DataFrame.
data = pd.read_csv("./BookCrossing/data.csv")
# Show the (rows, columns) shape followed by the fraction of missing values
# in each column.
print(data.shape)
print(data.isna().mean())

# (661724, 8)
# City                 0.002543
# State                0.018819
# Country              0.012921
# Age                  0.000000
# bookTitle            0.000000
# bookAuthor           0.000000
# yearOfPublication    0.000000
# label                0.000000
# dtype: float64

(540629, 8)
City                 0.000723
State                0.010460
Country              0.009515
Age                  0.000000
bookTitle            0.000000
bookAuthor           0.000000
yearOfPublication    0.000000
label                0.000000
dtype: float64


In [4]:
# Preview the first rows of the freshly loaded frame.
data.head()

Unnamed: 0,City,State,Country,Age,bookTitle,bookAuthor,yearOfPublication,label
0,kln,nordrheinwestfalen,germany,27.0,along came a spider alex cross novels,james patterson,1993.0,0.0
1,kln,nordrheinwestfalen,germany,27.0,schlafes bruder,robert schneider,1994.0,0.0
2,kln,nordrheinwestfalen,germany,27.0,der stein der kelten,michael phillips,2001.0,1.0
3,kln,nordrheinwestfalen,germany,27.0,nordermoor,arnaldur indridason,2003.0,1.0
4,kln,nordrheinwestfalen,germany,27.0,nur der tod ist ohne makel,ann granger,2002.0,1.0


In [5]:
# For each categorical column, print the number of distinct values and the
# fourth root of that count (reported as a candidate embedding size).
# Series.unique() keeps NaN as a distinct value, so columns that still contain
# missing entries at this point count it as one extra category.
for display_name, column_name in [
    ("Cities", "City"),
    ("State", "State"),
    ("Country", "Country"),
    ("bookTitle", "bookTitle"),
    ("bookAuthor", "bookAuthor"),
]:
    # Compute the distinct count once and reuse it for both printed values.
    n_unique = len(data[column_name].unique())
    print("Unique {0}: {1} Embedding size: {2}".format(display_name, n_unique, n_unique ** 0.25))

Unique Cities: 1465 Embedding size: 6.1867049726012695
Unique State: 294 Embedding size: 4.140824579655874
Unique Country: 58 Embedding size: 2.7596690210718946
Unique bookTitle: 167967 Embedding size: 20.24445041939468
Unique bookAuthor: 69785 Embedding size: 16.253261421281003


In [6]:
# Discard every row that has at least one missing value, then verify the
# result: print the new shape and the per-column count of remaining nulls.
data = data.dropna(axis=0, how='any')
print(data.shape)
print(data.isna().sum())

(533040, 8)
City                 0
State                0
Country              0
Age                  0
bookTitle            0
bookAuthor           0
yearOfPublication    0
label                0
dtype: int64


In [7]:
# Age and yearOfPublication are loaded as floats (e.g. 27.0, 1993.0); cast both
# to int64. A single astype() call with a per-column mapping returns a new
# frame instead of assigning into the frame produced by dropna(), which avoids
# pandas' SettingWithCopyWarning and keeps the column order unchanged.
data = data.astype({'yearOfPublication': 'int64', 'Age': 'int64'})

In [8]:
# Preview the frame again to check that Age and yearOfPublication are now integers.
data.head()

Unnamed: 0,City,State,Country,Age,bookTitle,bookAuthor,yearOfPublication,label
0,kln,nordrheinwestfalen,germany,27,along came a spider alex cross novels,james patterson,1993,0.0
1,kln,nordrheinwestfalen,germany,27,schlafes bruder,robert schneider,1994,0.0
2,kln,nordrheinwestfalen,germany,27,der stein der kelten,michael phillips,2001,1.0
3,kln,nordrheinwestfalen,germany,27,nordermoor,arnaldur indridason,2003,1.0
4,kln,nordrheinwestfalen,germany,27,nur der tod ist ohne makel,ann granger,2002,1.0


In [9]:
# Hold out 10% of the rows as a test set; the fixed random_state makes the
# split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    data[TRAINING_VARS],
    data[TARGET_VAR],
    test_size=0.10,
    random_state=42,
)

In [10]:
# Check that the training features and labels have the same number of rows.
print(X_train.shape)
print(y_train.shape)

(479736, 7)
(479736, 1)


In [11]:
# Hash each string feature into a fixed number of buckets. Distinct values
# that hash to the same bucket share one id, so the bucket size bounds the
# vocabulary the model can tell apart for that feature.
city_categorical = tf.feature_column.categorical_column_with_hash_bucket(
    key="City",
    hash_bucket_size=500,
)
state_categorical = tf.feature_column.categorical_column_with_hash_bucket(
    key="State",
    hash_bucket_size=200,
)
country_categorical = tf.feature_column.categorical_column_with_hash_bucket(
    key="Country",
    hash_bucket_size=100,
)
bookTitle_categorical = tf.feature_column.categorical_column_with_hash_bucket(
    key="bookTitle",
    hash_bucket_size=1000,
)
bookAuthor_categorical = tf.feature_column.categorical_column_with_hash_bucket(
    key="bookAuthor",
    hash_bucket_size=800,
)

In [12]:
# Raw numeric feature columns; they serve as the source columns for the
# bucketized columns built from them.
# Defaults to a tf.float32 scalar.
yearOfPublication_numeric = tf.feature_column.numeric_column(key="yearOfPublication")
age_numeric = tf.feature_column.numeric_column(key="Age")

In [13]:
# Bucketize the numeric features into ranges.
# Index 0: publication year, index 1: age. Code that refers to
# categorical_columns[1] relies on this order.
categorical_columns = [
    tf.feature_column.bucketized_column(
        source_column=yearOfPublication_numeric,
        boundaries=[1980, 1990, 1995, 2000, 2005],
    ),
    tf.feature_column.bucketized_column(
        source_column=age_numeric,
        boundaries=[15, 20, 25, 30, 35, 40, 45, 50, 55],
    ),
]

In [14]:
# Dense embeddings of the hashed categorical features, in the order
# city, state, country, book title, book author.
embedding_columns = [
    tf.feature_column.embedding_column(categorical_column=city_categorical, dimension=8),
    tf.feature_column.embedding_column(categorical_column=state_categorical, dimension=8),
    tf.feature_column.embedding_column(categorical_column=country_categorical, dimension=8),
    tf.feature_column.embedding_column(categorical_column=bookTitle_categorical, dimension=20),
    tf.feature_column.embedding_column(categorical_column=bookAuthor_categorical, dimension=16),
]

In [15]:
## Feature crossing
# Each cross hashes the combination of its inputs into 10,000 buckets.
# categorical_columns[1] is the bucketized Age column.
crossed_features = [
    # Age bucket AND book title, e.g. Age="16" AND bookTitle="Harry Potter"
    tf.feature_column.crossed_column(
        [categorical_columns[1], "bookTitle"],
        hash_bucket_size=int(1e4),
    ),
    # Age bucket AND book author, e.g. Age="16" AND bookAuthor="JK Rowling"
    tf.feature_column.crossed_column(
        [categorical_columns[1], "bookAuthor"],
        hash_bucket_size=int(1e4),
    ),
    # Country AND book title, e.g. Country="United States" AND bookTitle="Homo Deus"
    tf.feature_column.crossed_column(
        ["Country", "bookTitle"],
        hash_bucket_size=int(1e4),
    ),
]

In [16]:
# Also feed the bucketized (wide) columns to the deep part as 8-dimensional
# embeddings. Note: this extends embedding_columns in place, so re-running the
# cell adds the same columns a second time.
embedding_columns.extend(
    tf.feature_column.embedding_column(col, dimension=8)
    for col in categorical_columns
)

In [17]:
def train_input_fn(features, labels, batch_size=512, shuffle_buffer_size=100, num_epochs=100):
    """Build the shuffled, repeated and batched training ``tf.data.Dataset``.

    Args:
        features: Column-oriented mapping of feature name to values (for
            example a pandas DataFrame). It is converted with ``dict(features)``
            so that every column becomes one entry of the feature dictionary.
        labels: Labels aligned row by row with ``features``.
        batch_size: Number of examples per emitted batch.
        shuffle_buffer_size: Number of examples held in the shuffle buffer.
            A buffer smaller than the dataset only shuffles locally; pass a
            value of at least the dataset size for a uniform shuffle.
        num_epochs: Number of passes over the data before the dataset is
            exhausted.

    Returns:
        A ``tf.data.Dataset`` yielding ``(feature_dict, labels)`` batches.
    """
    # dict(features) maps each column name to its values, e.g.
    #   {"City": [...], "Age": [...], ...}
    dataset = tf.data.Dataset.from_tensor_slices((dict(features), labels))
    # Shuffle before repeating so each epoch is shuffled separately, then batch.
    dataset = dataset.shuffle(buffer_size=shuffle_buffer_size)
    dataset = dataset.repeat(num_epochs)
    dataset = dataset.batch(batch_size)
    return dataset

In [18]:
# train_input_fn = tf.estimator.inputs.pandas_input_fn(
#             x=X_train,
#             y=y_train['clicked'],
#             batch_size=64,
#             num_epochs=2,
#             shuffle=True,
#             queue_capacity=1000,
#             num_threads=1,
#             target_column='clicked')
# #map(column_name => [Tensor of values]) , [Tensor of labels])

In [19]:
# Data transformations
# crossed_1 = tf.contrib.layers.crossed_column([wide_columns[0], wide_columns[1]], 
#                                                          hash_bucket_size=int(1e4))
# wide_columns.append(crossed_1)

In [20]:
# Use the current process id as the run identifier, so checkpoints written by
# this kernel process go to their own directory under ./checkpoints/.
pid = os.getpid()
# Wide & deep classifier: a linear model over the bucketized and crossed
# columns combined with a DNN over the embedding columns.
estimator = tf.estimator.DNNLinearCombinedClassifier(
    model_dir="./checkpoints/" + str(pid),
    # wide settings
    linear_feature_columns=categorical_columns + crossed_features,
    linear_optimizer=tf.train.FtrlOptimizer(learning_rate=0.01),
    # deep settings
    dnn_feature_columns=embedding_columns,
    dnn_hidden_units=[512, 256, 512, 128],
    dnn_optimizer=tf.train.ProximalAdagradOptimizer(learning_rate=0.01),
    dnn_activation_fn=tf.nn.relu)


# estimator = tf.estimator.DNNClassifier(
#     model_dir="./checkpoints/" + str(pid),
#     feature_columns=categorical_columns + embedding_columns,
#     hidden_units=[1024, 512, 1024, 256],
#     n_classes=2,
#     optimizer=tf.train.ProximalAdagradOptimizer(
#       learning_rate=0.1,
#     ))

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': './checkpoints/31358', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f12cdccd748>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


In [21]:
# LOCAL. Update these paths as appropriate
# train_file = "./criteo/train_medium.csv"
# eval_file  = "./criteo/eval_small.csv"

In [None]:
# Train on the training split. No `steps`/`max_steps` is passed, so training
# is bounded only by the input function running out of data.
# The lambda defers building the dataset until the estimator calls it.
print("Starting experiment:", pid)
estimator.train(input_fn=lambda: train_input_fn(X_train, y_train, batch_size=256))

Starting experiment: 31358
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 0 into ./checkpoints/31358/model.ckpt.
INFO:tensorflow:loss = 175.19525, step = 1
INFO:tensorflow:global_step/sec: 66.0993
INFO:tensorflow:loss = 152.44824, step = 101 (1.514 sec)
INFO:tensorflow:global_step/sec: 120.505
INFO:tensorflow:loss = 144.86249, step = 201 (0.830 sec)
INFO:tensorflow:global_step/sec: 120.485
INFO:tensorflow:loss = 147.47821, step = 301 (0.830 sec)
INFO:tensorflow:global_step/sec: 120.475
INFO:tensorflow:loss = 133.95413, step = 401 (0.830 sec)
INFO:tensorflow:global_step/sec: 122.298
INFO:tensorflow:loss = 130.34578, step = 501 (0.818 sec)
INFO:tensorflow:global_step/sec: 124.674
INFO:tensorflow:loss = 134.41974, step = 601 (0.802 sec)
INFO:tensorflow:global_step

INFO:tensorflow:global_step/sec: 119.785
INFO:tensorflow:loss = 119.480225, step = 8001 (0.835 sec)
INFO:tensorflow:global_step/sec: 116.561
INFO:tensorflow:loss = 122.79916, step = 8101 (0.858 sec)
INFO:tensorflow:global_step/sec: 117.034
INFO:tensorflow:loss = 117.308685, step = 8201 (0.855 sec)
INFO:tensorflow:global_step/sec: 125.113
INFO:tensorflow:loss = 119.538025, step = 8301 (0.799 sec)
INFO:tensorflow:global_step/sec: 108.472
INFO:tensorflow:loss = 102.12915, step = 8401 (0.922 sec)
INFO:tensorflow:global_step/sec: 118.349
INFO:tensorflow:loss = 118.35342, step = 8501 (0.845 sec)
INFO:tensorflow:global_step/sec: 121.875
INFO:tensorflow:loss = 112.38831, step = 8601 (0.821 sec)
INFO:tensorflow:global_step/sec: 125.444
INFO:tensorflow:loss = 120.17789, step = 8701 (0.797 sec)
INFO:tensorflow:global_step/sec: 121.616
INFO:tensorflow:loss = 110.793915, step = 8801 (0.822 sec)
INFO:tensorflow:global_step/sec: 119.294
INFO:tensorflow:loss = 112.47542, step = 8901 (0.838 sec)
INFO:t

INFO:tensorflow:loss = 84.68527, step = 16201 (0.900 sec)
INFO:tensorflow:global_step/sec: 123.568
INFO:tensorflow:loss = 101.447105, step = 16301 (0.809 sec)
INFO:tensorflow:global_step/sec: 123.815
INFO:tensorflow:loss = 106.971, step = 16401 (0.808 sec)
INFO:tensorflow:global_step/sec: 124.46
INFO:tensorflow:loss = 92.37135, step = 16501 (0.804 sec)
INFO:tensorflow:global_step/sec: 125.892
INFO:tensorflow:loss = 92.25672, step = 16601 (0.794 sec)
INFO:tensorflow:global_step/sec: 125.919
INFO:tensorflow:loss = 91.34308, step = 16701 (0.794 sec)
INFO:tensorflow:global_step/sec: 124.962
INFO:tensorflow:loss = 85.60351, step = 16801 (0.800 sec)
INFO:tensorflow:global_step/sec: 125.532
INFO:tensorflow:loss = 94.85093, step = 16901 (0.798 sec)
INFO:tensorflow:global_step/sec: 125.629
INFO:tensorflow:loss = 109.121124, step = 17001 (0.795 sec)
INFO:tensorflow:global_step/sec: 124.06
INFO:tensorflow:loss = 113.59602, step = 17101 (0.806 sec)
INFO:tensorflow:global_step/sec: 125.886
INFO:ten

INFO:tensorflow:global_step/sec: 124.825
INFO:tensorflow:loss = 95.25693, step = 24501 (0.801 sec)
INFO:tensorflow:global_step/sec: 125.266
INFO:tensorflow:loss = 94.054436, step = 24601 (0.798 sec)
INFO:tensorflow:global_step/sec: 124.839
INFO:tensorflow:loss = 84.18171, step = 24701 (0.801 sec)
INFO:tensorflow:global_step/sec: 124.33
INFO:tensorflow:loss = 94.773544, step = 24801 (0.804 sec)
INFO:tensorflow:global_step/sec: 126.495
INFO:tensorflow:loss = 83.34619, step = 24901 (0.792 sec)
INFO:tensorflow:global_step/sec: 127.068
INFO:tensorflow:loss = 90.676575, step = 25001 (0.786 sec)
INFO:tensorflow:global_step/sec: 125.365
INFO:tensorflow:loss = 98.491776, step = 25101 (0.798 sec)
INFO:tensorflow:global_step/sec: 124.962
INFO:tensorflow:loss = 108.23117, step = 25201 (0.800 sec)
INFO:tensorflow:global_step/sec: 123.13
INFO:tensorflow:loss = 86.31128, step = 25301 (0.812 sec)
INFO:tensorflow:global_step/sec: 125.474
INFO:tensorflow:loss = 80.87108, step = 25401 (0.797 sec)
INFO:te

INFO:tensorflow:global_step/sec: 106.055
INFO:tensorflow:loss = 72.64976, step = 32801 (0.944 sec)
INFO:tensorflow:global_step/sec: 107.6
INFO:tensorflow:loss = 76.09609, step = 32901 (0.928 sec)
INFO:tensorflow:global_step/sec: 104.373
INFO:tensorflow:loss = 78.681564, step = 33001 (0.960 sec)
INFO:tensorflow:global_step/sec: 100.768
INFO:tensorflow:loss = 83.80344, step = 33101 (0.991 sec)
INFO:tensorflow:global_step/sec: 108.118
INFO:tensorflow:loss = 79.48906, step = 33201 (0.926 sec)
INFO:tensorflow:global_step/sec: 110.387
INFO:tensorflow:loss = 81.38893, step = 33301 (0.905 sec)
INFO:tensorflow:global_step/sec: 110.35
INFO:tensorflow:loss = 72.837906, step = 33401 (0.906 sec)
INFO:tensorflow:global_step/sec: 106.892
INFO:tensorflow:loss = 80.84316, step = 33501 (0.936 sec)
INFO:tensorflow:global_step/sec: 110.26
INFO:tensorflow:loss = 78.02931, step = 33601 (0.907 sec)
INFO:tensorflow:global_step/sec: 108.042
INFO:tensorflow:loss = 74.8795, step = 33701 (0.925 sec)
INFO:tensorfl

## EVAL model accuracy

In [None]:
# Show the size of the held-out test split (features, then labels).
print("Test/input data shape:",X_test.shape)
print("Test/labels data shape:",y_test.shape)

In [None]:
# The dataset is unbalanced: there are many more records with label 0 than with label 1.
# Plot the distribution of the test labels to visualise this.
y_test.hist()

In [None]:
# Input function shared by evaluation and prediction on the held-out test set.
# It makes a single, unshuffled pass (num_epochs=1, shuffle=False, one reader
# thread), so every test row is visited exactly once and in order. The number
# and order of predictions then match y_test row for row; repeating the data
# for several epochs would only duplicate work and emit duplicate predictions.
eval_input_fn = tf.estimator.inputs.pandas_input_fn(
    x=X_test,
    y=y_test['label'],
    batch_size=400,
    num_epochs=1,
    shuffle=False,
    queue_capacity=1000,
    num_threads=1,
    target_column='label')

In [None]:
# Compute the evaluation metrics of the trained model on the test input function.
estimator.evaluate(input_fn=eval_input_fn)

In [None]:
# Request predictions for the test inputs; the result is consumed by iterating over it.
predictions = estimator.predict(eval_input_fn)

In [None]:
# Print each predicted class id next to the true label.
# zip() stops at the shorter of the two sequences, so the loop cannot index
# past the end of y_test even when the input function yields more predictions
# than there are test rows (e.g. when it iterates for several epochs).
for pred, actual in zip(predictions, y_test.values):
    print(pred['class_ids'], actual)