In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import json
import tensorflow as tf
import os
from sklearn.model_selection import train_test_split
print(tf.__version__)

1.11.0


Task: predict whether an ad was clicked on, based on 39 anonymized factors

 - Over 45 million training examples, ~5GB

Features: 13 integer columns, followed by 26 columns of 32 bit hashed values

- Labels: Just a 0 or 1 :)

In [2]:
# Names of the input feature columns handed to the model.
TRAINING_VARS = [
    'City',
    'State',
    'Country',
    'Age',
    'bookTitle',
    'bookAuthor',
    'yearOfPublication',
]
# Kept as a one-element list so that data[TARGET_VAR] is a single-column DataFrame.
TARGET_VAR = ['label']

In [3]:
# Read the BookCrossing CSV, then report its dimensions and the fraction of
# missing values in every column.
csv_path = "./BookCrossing/data.csv"
data = pd.read_csv(csv_path)
frame_shape = data.shape
missing_fraction = data.isnull().mean()
print(frame_shape)
print(missing_fraction)

# (661724, 8)
# City                 0.002543
# State                0.018819
# Country              0.012921
# Age                  0.000000
# bookTitle            0.000000
# bookAuthor           0.000000
# yearOfPublication    0.000000
# label                0.000000
# dtype: float64

(540629, 8)
City                 0.000723
State                0.010460
Country              0.009515
Age                  0.000000
bookTitle            0.000000
bookAuthor           0.000000
yearOfPublication    0.000000
label                0.000000
dtype: float64


In [4]:
# Preview the first rows of the frame as loaded (before rows with NaN are dropped).
data.head()

Unnamed: 0,City,State,Country,Age,bookTitle,bookAuthor,yearOfPublication,label
0,kln,nordrheinwestfalen,germany,27.0,along came a spider alex cross novels,james patterson,1993.0,0.0
1,kln,nordrheinwestfalen,germany,27.0,schlafes bruder,robert schneider,1994.0,0.0
2,kln,nordrheinwestfalen,germany,27.0,der stein der kelten,michael phillips,2001.0,1.0
3,kln,nordrheinwestfalen,germany,27.0,nordermoor,arnaldur indridason,2003.0,1.0
4,kln,nordrheinwestfalen,germany,27.0,nur der tod ist ohne makel,ann granger,2002.0,1.0


In [5]:
# For every categorical column, print the number of distinct values and the
# fourth root of that number, which is shown as a candidate embedding size.
# Each pair is (text shown after "Unique ", DataFrame column name).
for shown_name, column in [("Cities", "City"),
                           ("State", "State"),
                           ("Country", "Country"),
                           ("bookTitle", "bookTitle"),
                           ("bookAuthor", "bookAuthor")]:
    # len(unique()) counts NaN as a value of its own (nunique() would not),
    # and is computed once per column instead of twice.
    n_unique = len(data[column].unique())
    print("Unique {0}: {1} Embedding size: {2}".format(shown_name, n_unique, n_unique ** 0.25))

Unique Cities: 1465 Embedding size: 6.1867049726012695
Unique State: 294 Embedding size: 4.140824579655874
Unique Country: 58 Embedding size: 2.7596690210718946
Unique bookTitle: 167967 Embedding size: 20.24445041939468
Unique bookAuthor: 69785 Embedding size: 16.253261421281003


In [6]:
# Remove every row containing at least one missing value (these are the
# dropna defaults, spelled out), then verify that no NaN is left.
data = data.dropna(axis=0, how='any')
remaining_nulls = data.isnull().sum()
print(data.shape)
print(remaining_nulls)

(533040, 8)
City                 0
State                0
Country              0
Age                  0
bookTitle            0
bookAuthor           0
yearOfPublication    0
label                0
dtype: int64


In [7]:
# Year and age are loaded as floats (e.g. 1993.0, 27.0); cast both to int64.
# A single astype() call returns a new frame instead of assigning columns
# through attribute access on a frame derived from dropna(), so the cell is
# safe to re-run and cannot trigger chained-assignment warnings.
data = data.astype({'yearOfPublication': 'int64', 'Age': 'int64'})

In [8]:
# Preview again to confirm Age and yearOfPublication are now integers.
data.head()

Unnamed: 0,City,State,Country,Age,bookTitle,bookAuthor,yearOfPublication,label
0,kln,nordrheinwestfalen,germany,27,along came a spider alex cross novels,james patterson,1993,0.0
1,kln,nordrheinwestfalen,germany,27,schlafes bruder,robert schneider,1994,0.0
2,kln,nordrheinwestfalen,germany,27,der stein der kelten,michael phillips,2001,1.0
3,kln,nordrheinwestfalen,germany,27,nordermoor,arnaldur indridason,2003,1.0
4,kln,nordrheinwestfalen,germany,27,nur der tod ist ohne makel,ann granger,2002,1.0


In [9]:
# Hold out 10% of the rows as a test set; the fixed random_state makes the
# split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    data[TRAINING_VARS],
    data[TARGET_VAR],
    test_size=0.10,
    random_state=42,
)

In [10]:
# Show the dimensions of the training features, then of the training labels.
for split in (X_train, y_train):
    print(split.shape)

(479736, 7)
(479736, 1)


In [11]:
# Country is encoded against an explicit vocabulary made of every distinct
# value present in the cleaned frame.
country_vocabulary = data.Country.unique()
country_categorical = tf.feature_column.categorical_column_with_vocabulary_list(
    key="Country",
    vocabulary_list=country_vocabulary,
)

In [12]:
# High-cardinality string features are mapped to ids by hashing into a fixed
# number of buckets.
# NOTE(review): the bucket counts are well below the unique-value counts printed
# earlier in this notebook (e.g. ~168k titles into 1000 buckets), so many
# distinct values share a bucket — confirm this is intended.
hashed_column = tf.feature_column.categorical_column_with_hash_bucket
city_categorical = hashed_column(key="City", hash_bucket_size=400)
state_categorical = hashed_column(key="State", hash_bucket_size=200)
bookTitle_categorical = hashed_column(key="bookTitle", hash_bucket_size=1000)
bookAuthor_categorical = hashed_column(key="bookAuthor", hash_bucket_size=800)

In [13]:
# Plain numeric inputs; numeric_column defaults to a tf.float32 scalar.
numeric_column = tf.feature_column.numeric_column
yearOfPublication_numeric = numeric_column(key="yearOfPublication")
age_numeric = numeric_column(key="Age")

In [14]:
# Bucketize the two numeric inputs into ranges.
# Order matters: index 0 is the publication year, index 1 is the age
# (the feature-crossing cell refers to categorical_columns[1]).
categorical_columns = [
    tf.feature_column.bucketized_column(
        source_column=yearOfPublication_numeric,
        boundaries=[1980, 1990, 1995, 2000, 2005]),
    tf.feature_column.bucketized_column(
        source_column=age_numeric,
        boundaries=[15, 20, 25, 30, 35, 40, 45, 50, 55]),
]

In [15]:
# Dense embeddings for the sparse categorical inputs, listed as
# (categorical column, embedding dimension) pairs.
embedding_columns = [
    tf.feature_column.embedding_column(categorical_column=column, dimension=size)
    for column, size in [
        (city_categorical, 8),
        (state_categorical, 8),
        (country_categorical, 8),
        (bookTitle_categorical, 20),
        (bookAuthor_categorical, 16),
    ]
]

In [16]:
## Feature crossing
# Each entry lists the keys combined into one crossed feature; every cross is
# hashed into int(1e4) buckets.
age_buckets = categorical_columns[1]  # bucketized Age
crossed_features = [
    tf.feature_column.crossed_column(keys, hash_bucket_size=int(1e4))
    for keys in (
        [age_buckets, "bookTitle"],   # e.g. Age="16" AND bookTitle="Harry Potter"
        [age_buckets, "bookAuthor"],  # e.g. Age="16" AND bookAuthor="JK Rowling"
        ["Country", "bookTitle"],     # e.g. Country="United States" AND bookTitle="Homo Deus"
    )
]

In [17]:
# Embed the bucketized (wide) columns as well, so the deep part of the model
# receives them in addition to the linear part.
# This cell mutates a list built in another cell, so guard on the column name:
# re-running it must not append the same embedding a second time.
already_embedded = {column.name for column in embedding_columns}
for col in categorical_columns:
    embedded = tf.feature_column.embedding_column(col, dimension=8)
    if embedded.name not in already_embedded:
        embedding_columns.append(embedded)
        already_embedded.add(embedded.name)

In [18]:
def train_input_fn(features, labels, batch_size=512, shuffle_buffer_size=100, num_epochs=200):
    """Build the tf.data pipeline that feeds (features, labels) batches to the estimator.

    Args:
        features: column-oriented table (e.g. a pandas DataFrame); ``dict(features)``
            must yield a ``{column name: column values}`` mapping such as
            ``{"City": [...], "Age": [...], ...}``.
        labels: label values aligned row by row with ``features``.
        batch_size: number of examples per emitted batch.
        shuffle_buffer_size: size of the shuffle buffer. Elements are only mixed
            within this window, so a buffer much smaller than the dataset gives
            a local rather than a full shuffle.
        num_epochs: how many times the data is repeated; ``None`` repeats
            indefinitely.

    Returns:
        A ``tf.data.Dataset`` yielding ``(feature dict, labels)`` batches.
    """
    dataset = tf.data.Dataset.from_tensor_slices((dict(features), labels))
    # Shuffle before repeat so that every epoch is completed before the next starts.
    dataset = dataset.shuffle(buffer_size=shuffle_buffer_size)
    dataset = dataset.repeat(num_epochs)
    dataset = dataset.batch(batch_size)
    return dataset

In [19]:
# train_input_fn = tf.estimator.inputs.pandas_input_fn(
#             x=X_train,
#             y=y_train['clicked'],
#             batch_size=64,
#             num_epochs=2,
#             shuffle=True,
#             queue_capacity=1000,
#             num_threads=1,
#             target_column='clicked')
# #map(column_name => [Tensor of values]) , [Tensor of labels])

In [20]:
# Data transformations
# crossed_1 = tf.contrib.layers.crossed_column([wide_columns[0], wide_columns[1]], 
#                                                          hash_bucket_size=int(1e4))
# wide_columns.append(crossed_1)

In [21]:
# Checkpoints go to a directory named after the current process id, so each
# kernel session writes to its own directory.
pid = os.getpid()
checkpoint_dir = "./checkpoints/{}".format(pid)

# Wide (linear) part: bucketized and crossed columns, trained with FTRL.
wide_columns = categorical_columns + crossed_features
wide_optimizer = tf.train.FtrlOptimizer(learning_rate=0.01)

# Deep (DNN) part: embedding columns, trained with proximal Adagrad.
deep_optimizer = tf.train.ProximalAdagradOptimizer(learning_rate=0.01)

estimator = tf.estimator.DNNLinearCombinedClassifier(
    model_dir=checkpoint_dir,
    linear_feature_columns=wide_columns,
    linear_optimizer=wide_optimizer,
    dnn_feature_columns=embedding_columns,
    dnn_hidden_units=[512, 256, 512, 128],
    dnn_dropout=0.2,
    dnn_optimizer=deep_optimizer,
    dnn_activation_fn=tf.nn.relu,
)


# estimator = tf.estimator.DNNClassifier(
#     model_dir="./checkpoints/" + str(pid),
#     feature_columns=categorical_columns + embedding_columns,
#     hidden_units=[1024, 512, 1024, 256],
#     n_classes=2,
#     optimizer=tf.train.ProximalAdagradOptimizer(
#       learning_rate=0.1,
#     ))

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': './checkpoints/12703', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f181a1d27b8>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


In [22]:
# LOCAL. Update these paths as appropriate
# train_file = "./criteo/train_medium.csv"
# eval_file  = "./criteo/eval_small.csv"

In [None]:
print("Starting experiment:", pid)


def _training_batches():
    """Zero-argument input function: training split in batches of 384."""
    return train_input_fn(X_train, y_train, batch_size=384)


# Trains until the input pipeline is exhausted.
estimator.train(input_fn=_training_batches)

Starting experiment: 12703
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 0 into ./checkpoints/12703/model.ckpt.
INFO:tensorflow:loss = 264.64722, step = 1
INFO:tensorflow:global_step/sec: 41.6589
INFO:tensorflow:loss = 223.91472, step = 101 (2.401 sec)
INFO:tensorflow:global_step/sec: 61.5054
INFO:tensorflow:loss = 216.45961, step = 201 (1.626 sec)
INFO:tensorflow:global_step/sec: 63.7504
INFO:tensorflow:loss = 214.17639, step = 301 (1.570 sec)
INFO:tensorflow:global_step/sec: 71.0608
INFO:tensorflow:loss = 209.90642, step = 401 (1.406 sec)
INFO:tensorflow:global_step/sec: 65.0546
INFO:tensorflow:loss = 204.24985, step = 501 (1.537 sec)
INFO:tensorflow:global_step/sec: 61.2759
INFO:tensorflow:loss = 222.559, step = 601 (1.632 sec)
INFO:tensorflow:global_step/s

INFO:tensorflow:loss = 181.42781, step = 8001 (1.480 sec)
INFO:tensorflow:global_step/sec: 66.0482
INFO:tensorflow:loss = 177.85394, step = 8101 (1.515 sec)
INFO:tensorflow:global_step/sec: 67.9424
INFO:tensorflow:loss = 181.42221, step = 8201 (1.471 sec)
INFO:tensorflow:global_step/sec: 70.9895
INFO:tensorflow:loss = 192.68484, step = 8301 (1.409 sec)
INFO:tensorflow:global_step/sec: 69.6098
INFO:tensorflow:loss = 201.64249, step = 8401 (1.437 sec)
INFO:tensorflow:global_step/sec: 73.3552
INFO:tensorflow:loss = 176.45496, step = 8501 (1.363 sec)
INFO:tensorflow:global_step/sec: 72.3691
INFO:tensorflow:loss = 171.27686, step = 8601 (1.383 sec)
INFO:tensorflow:global_step/sec: 73.2134
INFO:tensorflow:loss = 197.34444, step = 8701 (1.365 sec)
INFO:tensorflow:global_step/sec: 71.9036
INFO:tensorflow:loss = 173.5412, step = 8801 (1.391 sec)
INFO:tensorflow:global_step/sec: 73.139
INFO:tensorflow:loss = 155.49841, step = 8901 (1.367 sec)
INFO:tensorflow:global_step/sec: 74.1216
INFO:tensorf

INFO:tensorflow:global_step/sec: 73.6175
INFO:tensorflow:loss = 190.02824, step = 16301 (1.358 sec)
INFO:tensorflow:global_step/sec: 74.1528
INFO:tensorflow:loss = 173.3304, step = 16401 (1.349 sec)
INFO:tensorflow:global_step/sec: 73.0923
INFO:tensorflow:loss = 154.19324, step = 16501 (1.368 sec)
INFO:tensorflow:global_step/sec: 74.4712
INFO:tensorflow:loss = 174.89001, step = 16601 (1.343 sec)
INFO:tensorflow:global_step/sec: 72.1427
INFO:tensorflow:loss = 174.10477, step = 16701 (1.388 sec)
INFO:tensorflow:global_step/sec: 74.0236
INFO:tensorflow:loss = 184.85428, step = 16801 (1.349 sec)
INFO:tensorflow:global_step/sec: 72.7095
INFO:tensorflow:loss = 167.66049, step = 16901 (1.377 sec)
INFO:tensorflow:global_step/sec: 71.4232
INFO:tensorflow:loss = 167.4489, step = 17001 (1.401 sec)
INFO:tensorflow:global_step/sec: 67.6383
INFO:tensorflow:loss = 185.21239, step = 17101 (1.477 sec)
INFO:tensorflow:global_step/sec: 68.1877
INFO:tensorflow:loss = 167.1486, step = 17201 (1.468 sec)
INF

INFO:tensorflow:loss = 188.23676, step = 24501 (1.880 sec)
INFO:tensorflow:global_step/sec: 59.6268
INFO:tensorflow:loss = 157.28645, step = 24601 (1.679 sec)
INFO:tensorflow:global_step/sec: 56.3313
INFO:tensorflow:loss = 153.76877, step = 24701 (1.774 sec)
INFO:tensorflow:global_step/sec: 66.2658
INFO:tensorflow:loss = 183.72702, step = 24801 (1.509 sec)
INFO:tensorflow:global_step/sec: 73.2377
INFO:tensorflow:loss = 170.2673, step = 24901 (1.365 sec)
INFO:tensorflow:global_step/sec: 74.4015
INFO:tensorflow:loss = 157.79291, step = 25001 (1.346 sec)
INFO:tensorflow:global_step/sec: 73.2376
INFO:tensorflow:loss = 158.50476, step = 25101 (1.364 sec)
INFO:tensorflow:global_step/sec: 73.6694
INFO:tensorflow:loss = 161.6756, step = 25201 (1.358 sec)
INFO:tensorflow:global_step/sec: 73.7054
INFO:tensorflow:loss = 147.34143, step = 25301 (1.357 sec)
INFO:tensorflow:global_step/sec: 74.06
INFO:tensorflow:loss = 156.42407, step = 25401 (1.350 sec)
INFO:tensorflow:global_step/sec: 74.577
INFO:

INFO:tensorflow:global_step/sec: 53.4639
INFO:tensorflow:loss = 165.84575, step = 32801 (1.869 sec)
INFO:tensorflow:global_step/sec: 53.1188
INFO:tensorflow:loss = 168.64703, step = 32901 (1.883 sec)
INFO:tensorflow:global_step/sec: 53.5895
INFO:tensorflow:loss = 180.40106, step = 33001 (1.866 sec)
INFO:tensorflow:global_step/sec: 53.3477
INFO:tensorflow:loss = 163.4885, step = 33101 (1.875 sec)
INFO:tensorflow:global_step/sec: 53.8089
INFO:tensorflow:loss = 161.42368, step = 33201 (1.858 sec)
INFO:tensorflow:global_step/sec: 53.7111
INFO:tensorflow:loss = 161.82278, step = 33301 (1.862 sec)
INFO:tensorflow:global_step/sec: 47.7456
INFO:tensorflow:loss = 152.55136, step = 33401 (2.095 sec)
INFO:tensorflow:global_step/sec: 48.528
INFO:tensorflow:loss = 163.70418, step = 33501 (2.060 sec)
INFO:tensorflow:global_step/sec: 51.3381
INFO:tensorflow:loss = 158.18079, step = 33601 (1.948 sec)
INFO:tensorflow:global_step/sec: 49.9208
INFO:tensorflow:loss = 168.22638, step = 33701 (2.005 sec)
IN

## EVAL model accuracy

In [None]:
# Report the dimensions of the held-out features and labels.
for caption, frame in (("Test/input data shape:", X_test),
                       ("Test/labels data shape:", y_test)):
    print(caption, frame.shape)

In [None]:
# Class-balance check: histogram of the test labels.
# The original note here states that label 0 is far more frequent than label 1
# — confirm against the rendered plot.
y_test.hist()

In [None]:
# Input function over the held-out test split, used for evaluation and prediction.
# It makes exactly one pass in the original row order: shuffling or repeating
# the data would break the positional match between predictions and
# y_test['label'], and repeating only multiplies the work.
eval_input_fn = tf.estimator.inputs.pandas_input_fn(
            x=X_test,
            y=y_test['label'],
            batch_size=400,
            num_epochs=1,
            shuffle=False,
            queue_capacity=1000,
            num_threads=1,
            target_column='label')

In [None]:
# Measure accuracy on the held-out test split. Evaluating on a slice of the
# training data through the shuffling, multi-epoch training input function
# would report training fit rather than generalization, at many times the cost.
estimator.evaluate(input_fn=eval_input_fn)

In [None]:
# Predictions for the examples produced by eval_input_fn; the result is
# consumed by iterating over it.
predictions = estimator.predict(input_fn=eval_input_fn)

In [None]:
# Print the predicted class next to the true *test* label for a small preview.
# The predictions come from the test split, so they must be compared with
# y_test, not y_train. The pairing is positional: it is only meaningful when the
# input function behind `predictions` walks X_test exactly once and in order
# (shuffle=False, num_epochs=1).
N_PREVIEW = 20  # raise to inspect more rows; printing every row floods the output
test_labels = y_test['label'].values
# zip() stops at the shorter sequence, so running past the labels cannot raise.
for pred, true_label in zip(predictions, test_labels[:N_PREVIEW]):
    print(pred['class_ids'], true_label)