# Neural Network Model

The aim of this notebook is to demo an end-to-end pipeline for ads prediction in TensorFlow.

In [1]:
! ./setup.sh

Collecting package metadata (current_repodata.json): done
Solving environment: done

# All requested packages already installed.



In [2]:
import tensorflow as tf
import tensorflow_docs as tfdocs
import tensorflow_docs.modeling
import sys
from pathlib import Path
import datetime
import time
import numpy as np
import pandas as pd
from pprint import pprint
from typing import Dict, Any, Union, List
from functools import partial
import re
import string
from sklearn.preprocessing import MultiLabelBinarizer
from math import ceil
from collections import namedtuple

print(f"Using Tensorflow, {tf.__version__} on Python interpreter, {sys.version_info}")

Using Tensorflow, 2.1.0 on Python interpreter, sys.version_info(major=3, minor=7, micro=5, releaselevel='final', serial=0)


In [3]:
# Seed from the current wall-clock time, so every run uses a different seed.
RANDOM_SEED = int(time.time())

# Apply the same seed to both the TensorFlow and the NumPy random generators.
tf.random.set_seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

# Print the seed so that a run can be repeated by hard-coding this value above.
print(f"Using random seed, {RANDOM_SEED}")

Using random seed, 1588306442


## Load Data

Dataset credits:
```
@inproceedings{roffo2016personality,
  title={Personality in computational advertising: A benchmark},
  author={Roffo, Giorgio and Vinciarelli, Alessandro},
  booktitle={4 th Workshop on Emotions and Personality in Personalized Systems (EMPIRE) 2016},
  pages={18},
  year={2016}
}
```

In [4]:
# Relative path of the folder that holds the dataset files
DATA_FOLDER = Path("../../dataset/")
BATCH_SIZE = 4096 # the bigger the batch, the faster the training, but the more RAM is needed
# Name of the column that is used as the prediction target
TARGET_COL = "Rating"

# data file paths are relative to DATA_FOLDER
users_ads_rating_csv = DATA_FOLDER/"users-ads-without-gcp-ratings_OHE_MLB.csv"

In [5]:
USER_ID = "UserId"
AD_ID = "AdId"
AGE = "Age"
ZIP_CODE = "CapZipCode"
COUNTRIES_VISITED = "Countriesvisited"
FAVE_SPORTS = "FaveSports"
GENDER = "Gender"
HOME_COUNTRY = "Homecountry"
HOME_TOWN = "Hometown"
INCOME = "Income"
LAST_NAME = "LastName"
MOST_LISTENED_MUSICS = "Mostlistenedmusics"
MOST_READ_BOOKS = "Mostreadbooks"
MOST_VISITED_WEBSITES = "Mostvisitedwebsites"
MOST_WATCHED_MOVIES = "Mostwatchedmovies"
MOST_WATCHED_TV_PROGRAMMES = "Mostwatchedtvprogrammes"
NAME = "Name"
PAYPAL = "Paypal"
TIMEPASS = "Timepass"
TYPE_OF_JOB = "TypeofJob"
WEEKLY_WORKING_HOURS = "Weeklyworkinghours"
FAVE1 = "fave1"
FAVE10 = "fave10"
FAVE2 = "fave2"
FAVE3 = "fave3"
FAVE4 = "fave4"
FAVE5 = "fave5"
FAVE6 = "fave6"
FAVE7 = "fave7"
FAVE8 = "fave8"
FAVE9 = "fave9"
UNFAVE1 = "unfave1"
UNFAVE2 = "unfave2"
UNFAVE3 = "unfave3"
UNFAVE4 = "unfave4"
UNFAVE5 = "unfave5"
UNFAVE6 = "unfave6"
ADFILEPATH = "AdFilePath"
GENDER_F = "Gender_F"
GENDER_M = "Gender_M"
RATING = "Rating"
AD_NUM_FACES = "ad_num_faces"



# Read all columns as strings to avoid any errors
COL_DEFAULTS = {
    USER_ID: "**",
    AD_ID: "**",
    AGE: "**",
    ZIP_CODE: "**",
    COUNTRIES_VISITED: "**",
    FAVE_SPORTS: "**",
    GENDER: "**",
    HOME_COUNTRY: "**",
    HOME_TOWN: "**",
    INCOME: "**",
    LAST_NAME: "**",
    MOST_LISTENED_MUSICS: "**",
    MOST_READ_BOOKS: "**",
    MOST_VISITED_WEBSITES: "**",
    MOST_WATCHED_MOVIES: "**",
    MOST_WATCHED_TV_PROGRAMMES: "**",
    NAME: "**",
    PAYPAL: "**",
    TIMEPASS: "**",
    TYPE_OF_JOB: "**",
    WEEKLY_WORKING_HOURS: "**",
    FAVE1: "**",
    FAVE10: "**",
    FAVE2: "**",
    FAVE3: "**",
    FAVE4: "**",
    FAVE5: "**",
    FAVE6: "**",
    FAVE7: "**",
    FAVE8: "**",
    FAVE9: "**",
    UNFAVE1: "**",
    UNFAVE2: "**",
    UNFAVE3: "**",
    UNFAVE4: "**",
    UNFAVE5: "**",
    UNFAVE6: "**",
    ADFILEPATH: "**",
    GENDER_F: "**",
    GENDER_M: "**",
    RATING: "**",
    AD_NUM_FACES: "**"
}

# SELECTED_COLS = [AGE, ZIP_CODE, FAVE_SPORTS, GENDER, HOME_COUNTRY, HOME_TOWN, INCOME, MOST_LISTENED_MUSICS, MOST_READ_BOOKS, 
#                  MOST_VISITED_WEBSITES, MOST_WATCHED_MOVIES, MOST_WATCHED_TV_PROGRAMMES, TIMEPASS, TYPE_OF_JOB, WEEKLY_WORKING_HOURS, 
#                  FAVE1, FAVE2, FAVE3, FAVE4, FAVE5, FAVE6, FAVE7, FAVE8, FAVE9, FAVE10, UNFAVE1, UNFAVE2, UNFAVE3, UNFAVE4, UNFAVE5, 
#                  UNFAVE6, RATING]

AD_FACE_COLS = [AD_NUM_FACES]
AD_LABEL_COLS = []
AD_OBJECT_COLS = []
AD_SAFE_SEARCH_COLS = []


SELECTED_AD_COLS = AD_FACE_COLS + AD_LABEL_COLS + AD_OBJECT_COLS + AD_SAFE_SEARCH_COLS


SELECTED_INP_COLS = [AGE, ZIP_CODE, FAVE_SPORTS, GENDER_F, GENDER_M] + SELECTED_AD_COLS
SELECTED_COLS = SELECTED_INP_COLS + [TARGET_COL]

SELECTED_COLS

['Age',
 'CapZipCode',
 'FaveSports',
 'Gender_F',
 'Gender_M',
 'ad_num_faces',
 'Rating']

In [6]:
def ad_dataset_pd():
    """Read the users-ads ratings CSV, keeping only `SELECTED_COLS`, with every value loaded as a string."""
    read_options = dict(usecols=SELECTED_COLS, dtype=str)
    return pd.read_csv(users_ads_rating_csv, **read_options)

In [7]:
ad_dataset_pd().sample(10)

Unnamed: 0,Age,CapZipCode,FaveSports,Rating,Gender_F,Gender_M,ad_num_faces
25018,27,CM20 1AY,"Indoor sports, Olympic sports‎",5.0,1,0,0
21370,52,LE9 6RG,"Individual sports‎ (Tennis, Archery, ...)",3.0,1,0,0
35172,18,94401-2727,I do not like Sports,4.0,1,0,9
5649,55,98684-9440,"Individual sports‎ (Tennis, Archery, ...)",3.0,0,1,0
19245,49,H4N 2Z1,"Team sports (Footbal, Baseball, Rugby, ...)",2.0,0,1,0
20877,31,R2V 3X9,Winter sports,1.0,1,0,0
6315,27,90650,"Team sports (Footbal, Baseball, Rugby, ...)",1.0,1,0,0
3873,18,2380,Endurance sports,1.0,0,1,0
4888,20,2494,I do not like Sports,2.0,1,0,0
14472,30,77566,"Olympic sports‎, Precision sports‎ (Golf, Bowl...",3.0,1,0,9


## Transform Data

In [8]:
def dict_project(d:Dict, cols:List[str]) -> Dict:
    """Return a new dict holding only the entries of `d` whose key appears in `cols`."""
    projected = {}
    for key, value in d.items():
        if key in cols:
            projected[key] = value
    return projected

In [9]:
class IndexerForVocab:
    """Maps vocabulary items to integer indices, with one shared index for unknown items."""

    def __init__(self, vocab_list:List[str], oov_index:int=0):
        """
        Build the lookup table: the i-th vocabulary item (zero based) gets index i + 1,
        and anything outside the vocabulary resolves to `oov_index`
        """
        self._oov = oov_index
        self._vocab_map = dict((token, position) for position, token in enumerate(vocab_list, start=1))

    def __len__(self):
        # one extra slot accounts for the out of vocabulary entry
        return 1 + len(self._vocab_map)

    def __repr__(self):
        return f"Map for {len(self)} keys with 1 OOV key"

    def index_of(self, item:str):
        """
        Look up a single item, falling back to the OOV index when it is unknown
        """
        if item in self._vocab_map:
            return self._vocab_map[item]
        return self._oov

    def index_of_mux(self, items:List[str]):
        """
        Look up every item of `items`, returning the indices as a list in the same order
        """
        return list(map(self.index_of, items))

### Age

Convert to a number and remove any outliers

In [10]:
# Obtained from Tensorflow Data Validation APIs data-exploration/tensorflow-data-validation.ipynb

MEAN_AGE, STD_AGE, MEDIAN_AGE, MAX_AGE = 31.74, 12.07, 29, 140

In [11]:
def fix_age(age_str:tf.string, default_age=MEDIAN_AGE) -> float:
    """
    Parse an age and return it standardized with MEAN_AGE and STD_AGE.

    The value is converted with `int(...)`. When the conversion fails (for example
    "50.5" or "bad_age_10") or the age lies outside [0, MAX_AGE], `default_age`
    is used in its place before standardizing.

    Args:
        age_str: raw age value, e.g. the string read from the CSV
        default_age: replacement for unparsable or out-of-range ages

    Returns:
        (age - MEAN_AGE) / STD_AGE as a float
    """
    try:
        age = int(age_str)
        if age < 0 or age > MAX_AGE:
            raise ValueError(f"{age} is not a valid age")
    except Exception:
        # Catch Exception instead of using a bare `except:` so that
        # KeyboardInterrupt / SystemExit still propagate, while every parsing
        # failure keeps falling back to the default age.
        age = default_age
    return (age - MEAN_AGE) / STD_AGE

#### Visual Tests

In [12]:
fix_age("50"), fix_age("50.5"), fix_age("-10"), fix_age("bad_age_10"), fix_age("300")

(1.5128417564208783,
 -0.22700911350455663,
 -0.22700911350455663,
 -0.22700911350455663,
 -0.22700911350455663)

### Zip Code

Prepare zip-code column for one-hot encoding each character

In [13]:
DEFAULT_ZIP_CODE, FIRST_K_ZIP_DIGITS = "00000", 2

zip_code_indexer = IndexerForVocab(string.digits + string.ascii_lowercase + string.ascii_uppercase)

In [14]:
def fix_zip_code_tensor(zip_code:tf.string, n_digits, indexer) -> tf.Tensor:
    """
    One-hot encode the leading characters of a zip code and concatenate them into one tensor.

    Args:
        zip_code: a string, or a tf.Tensor from which the first element is decoded as ASCII
        n_digits: maximum number of leading characters (after stripping whitespace) to encode
        indexer: object providing `index_of(char)` and `len(...)`, e.g. an IndexerForVocab

    Returns:
        The one-hot vectors of the encoded characters (each of depth `len(indexer)`)
        concatenated along axis 0. When the zip code cannot be processed, the first
        `n_digits` characters of DEFAULT_ZIP_CODE are encoded instead.

    NOTE(review): a zip code shorter than `n_digits` yields fewer one-hot vectors, so
    the output length is not fixed - confirm callers never pass such values.
    """
    try:
        if isinstance(zip_code, tf.Tensor):
            # NOTE(review): assumes the first element of the tensor holds the zip code bytes - confirm against the caller
            zip_code = zip_code.numpy()[0].decode('ascii', errors="ignore") # very inefficient way
        zip_digits = list(zip_code.strip()[:n_digits])
    except Exception:
        # Exception (not a bare `except:`) keeps the fallback for bad values
        # without swallowing KeyboardInterrupt / SystemExit
        zip_digits = list(DEFAULT_ZIP_CODE[:n_digits])
    one_hots = [tf.one_hot(indexer.index_of(d), len(indexer)) for d in zip_digits]
    return tf.concat(one_hots, 0)

def fix_zip_code(zip_code:str, n_digits, indexer) -> np.ndarray:
    """
    One-hot encode the leading characters of a zip code into one flat vector.

    Args:
        zip_code: raw zip code string
        n_digits: maximum number of leading characters (after stripping whitespace) to encode
        indexer: object providing `index_of_mux(chars)` and `len(...)`, e.g. an IndexerForVocab

    Returns:
        1-D array holding one block of `len(indexer)` values per encoded character.
        When `zip_code` cannot be processed (e.g. it is None), the first `n_digits`
        characters of DEFAULT_ZIP_CODE are encoded instead.

    NOTE(review): a zip code shorter than `n_digits` yields a shorter vector, so the
    output length is not fixed - confirm the data never contains such values.
    """
    try:
        zip_digits = list(zip_code.strip()[:n_digits])
    except Exception:
        # Exception (not a bare `except:`) keeps the fallback for values without
        # .strip(), without swallowing KeyboardInterrupt / SystemExit
        zip_digits = list(DEFAULT_ZIP_CODE[:n_digits])
    # indexing the identity matrix with the character indices selects one one-hot row per character
    one_hot_rows = np.eye(len(indexer))[indexer.index_of_mux(zip_digits)]
    return np.ravel(one_hot_rows)

#### Visual Tests

In [15]:
test_zip_code_indexer = IndexerForVocab(string.digits)

(fix_zip_code("43556", 10, test_zip_code_indexer),
fix_zip_code("43556", 2, test_zip_code_indexer),
fix_zip_code("43556", 4, test_zip_code_indexer),
fix_zip_code(None, 3, test_zip_code_indexer))

(array([0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        1., 0., 0., 0.]),
 array([0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.,
        0., 0., 0., 0., 0.]),
 array([0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 1., 0., 0., 0., 0.]),
 array([0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.]))

### Favorite Sports

Two approaches,
1. Consider the first `K` sports mentioned by each user and one-hot encode each separately
2. Multi label binarize all the sports as there are only 15 unique sports

In [16]:
FAV_SPORTS_UNKNOWN = "UNK_SPORT"
ALL_FAV_SPORTS = ['Olympic sports', 'Winter sports', 'Nothing', 'I do not like Sports', 'Equestrian sports', 'Skating sports', 'Precision sports', 'Hunting sports', 'Motor sports', 'Team sports', 'Individual sports', 'Other', 'Water sports', 'Indoor sports', 'Endurance sports']

fav_sports_binarizer = MultiLabelBinarizer()
fav_sports_binarizer.fit([ALL_FAV_SPORTS])

MultiLabelBinarizer(classes=None, sparse_output=False)

In [17]:
def fav_sports_multi_select_str_to_list(sports_str:"Union[str, tf.Tensor]") -> List[str]:
    """
    Split a multi-select favourite-sports answer into the individual selections.

    Commas inside parentheses do not separate selections, e.g. the commas of
    "Individual sports (Tennis, Archery, ...)" do not make new sports, so every
    parenthesised group that contains a comma is removed before splitting.

    Args:
        sports_str: the raw answer as a string, or a tf.Tensor from which the first
            element is decoded as ASCII

    Returns:
        The selections in their original order, with non-ASCII characters dropped
    """
    # The cheap `str` check runs first, so plain strings never reach the tensor check.
    if not isinstance(sports_str, str) and isinstance(sports_str, tf.Tensor):
        sports_str = sports_str.numpy()[0].decode('ascii', errors="ignore")
    else:
        sports_str = sports_str.encode("ascii", errors="ignore").decode("ascii") # remove non-ascii chars
    # `[^()]*` keeps every match inside one pair of parentheses. The previous greedy
    # `.*` ran from the first "(" to the last ")" and dropped all selections in between.
    sports = re.sub(r"\s*\([^()]*,[^()]*\)\s*", "", sports_str)
    return re.split(r"\s*,\s*", sports)

def fix_fav_sports_mlb(sports_str:str) -> List[int]:
    """Encode a favourite-sports answer as the single row produced by `fav_sports_binarizer.transform`."""
    selections = [fav_sports_multi_select_str_to_list(sports_str)]
    encoded_rows = fav_sports_binarizer.transform(selections)
    return encoded_rows[0]

def fix_fav_sports_firstk(sports_str:str, first_k:int, pad_constant:int) -> List[Union[str, int]]:
    """
    Return exactly `first_k` entries for a favourite-sports answer.

    Args:
        sports_str: raw multi-select answer
        first_k: number of entries to return
        pad_constant: filler appended on the right when fewer than `first_k` sports were selected

    Returns:
        The first `first_k` selected sports, right-padded with `pad_constant` when needed
    """
    sports = fav_sports_multi_select_str_to_list(sports_str)
    # The pad width is measured on the parsed list (the old code referenced an undefined name)
    right_pad_width = max(0, first_k - len(sports))
    # Slice the padded list itself; wrapping it in another list first left it untruncated and nested
    return (sports + [pad_constant] * right_pad_width)[:first_k]

#### Visual Tests

In [18]:
(
    fix_fav_sports_mlb("Individual sports (Tennis, Archery, ...), Indoor sports, Endurance sports, Skating sports"),
    fix_fav_sports_mlb("Skating sports"),
    fix_fav_sports_mlb("Individual sports (Tennis, Archery, ...)"),
    fix_fav_sports_mlb("Indoor sports, Endurance sports, Skating sports"),
)

(array([1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0]),
 array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0]),
 array([0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
 array([1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0]))

### Target

In [19]:
RATINGS_CARDINALITY = 5 # not zero based indexing i.e. ratings range from 1 to 5

In [20]:
def create_target_pd(rating_str:str):
    """
    One-hot encode a rating.

    Args:
        rating_str: rating as read from the CSV, e.g. "3.0"

    Returns:
        Integer vector of length RATINGS_CARDINALITY with a 1 at position `rating - 1`

    Raises:
        ValueError: if the rating is outside 1..RATINGS_CARDINALITY
    """
    rating = int(float(rating_str))
    if not 1 <= rating <= RATINGS_CARDINALITY:
        # Without this check a rating of 0 would index row -1 and silently be
        # encoded as the highest rating.
        raise ValueError(f"{rating_str} is not a valid rating, expected 1 to {RATINGS_CARDINALITY}")
    return np.eye(RATINGS_CARDINALITY, dtype=int)[rating - 1]

## Featurize

In [21]:
def transform_pd_X(df:pd.DataFrame, inp_cols:List[str]):
    """
    Turn the raw feature columns into one numeric feature vector per row.

    Original dataframe will be modified: every feature column is overwritten with
    its encoded (list or array) form and a new column "X" is added.

    Args:
        df: dataframe holding the raw string columns read from the CSV
        inp_cols: columns whose encoded values are concatenated, in this order, into "X"

    Returns:
        NumPy array built from the per-row concatenated feature vectors
    """
    # scalar features are wrapped in one-element lists so that they can be concatenated with the array features
    df[AGE] = df[AGE].apply(lambda age: [fix_age(age)])
    # use the shared constant rather than a literal, so the encoding follows the notebook configuration
    df[ZIP_CODE] = df[ZIP_CODE].apply(lambda zc: fix_zip_code(zc, n_digits=FIRST_K_ZIP_DIGITS, indexer=zip_code_indexer))
    df[FAVE_SPORTS] = df[FAVE_SPORTS].apply(fix_fav_sports_mlb)
    df[GENDER_F] = df[GENDER_F].apply(lambda gender_f: [int(gender_f)])
    df[GENDER_M] = df[GENDER_M].apply(lambda gender_m: [int(gender_m)])
    df[AD_NUM_FACES] = df[AD_NUM_FACES].apply(lambda ad_num_faces: [int(ad_num_faces)])
    # row-wise concatenation of the selected columns
    df["X"] = df[inp_cols].apply(np.concatenate, axis=1)
    # TODO: vectorize, else inefficient to sequentially loop over all example
    X = np.array([x for x in df["X"]])
    return X

In [22]:
def transform_pd_y(df:pd.DataFrame, target_col:str):
    """
    Encode the target column with `create_target_pd` and return the result as an array.

    Original dataframe will be modified: the encoded targets are stored in a new column "y".
    """
    encoded_targets = df[target_col].apply(create_target_pd)
    df["y"] = encoded_targets
    # TODO: vectorize, else inefficient to sequentially loop over all example
    return np.array(list(df["y"]))

In [23]:
def create_dataset_pd(inp_cols:List[str]=SELECTED_INP_COLS, target_col:str=TARGET_COL, fraction:float=1) -> tuple:
    """
    Prepare the dataset for training on a fraction of all input data.

    Args:
        inp_cols: feature columns that make up the model input
        target_col: column holding the rating to predict
        fraction: share of the rows to sample, passed to `DataFrame.sample(frac=...)`

    Returns:
        Tuple `(X, y)` with the outputs of `transform_pd_X` and `transform_pd_y`
        computed on the same sampled dataframe
    """
    df = ad_dataset_pd().sample(frac=fraction)
    X = transform_pd_X(df, inp_cols)
    y = transform_pd_y(df, target_col)
    return X, y

## Tensorboard

Monitor training and other stats

In [24]:
from tensorboard import notebook

In [25]:
%reload_ext tensorboard

Start tensorboard

In [26]:
%tensorboard --logdir logs --port 6006

ERROR: Failed to launch TensorBoard (exited with 255).
Contents of stderr:
W0430 21:14:07.963954 123145362190336 plugin_event_accumulator.py:294] Found more than one graph event per run, or there was a metagraph containing a graph_def, as well as one or more graph events.  Overwriting the graph with the newest event.
E0430 21:14:08.979719 4556328384 program.py:288] TensorBoard could not bind to port 6006, it was already in use
ERROR: TensorBoard could not bind to port 6006, it was already in use

In [27]:
notebook.list()

No known TensorBoard instances running.


## Model

Create a model and train using high level APIs like `tf.keras` and `tf.estimator`

In [28]:
%%time

# train_dataset = input_fn_train(BATCH_SIZE)
X, y = create_dataset_pd()

CPU times: user 12.4 s, sys: 266 ms, total: 12.6 s
Wall time: 13 s


In [55]:
# tf.keras.metrics.SensitivityAtSpecificity(name="ss")  # For false positive rate

keras_model_metrics = [
    "accuracy",
    tf.keras.metrics.TruePositives(name='tp'),
    tf.keras.metrics.FalsePositives(name='fp'),
    tf.keras.metrics.TrueNegatives(name='tn'),
    tf.keras.metrics.FalseNegatives(name='fn'), 
    tf.keras.metrics.Precision(name='precision'),
    tf.keras.metrics.Recall(name='recall'),
    tf.keras.metrics.AUC(name='auc')
]
train_histories = []

In [None]:
# DON'T CHANGE THE EPOCHS VALUE
BATCH_SIZE = 4096
EPOCHS = 1000

In [None]:
# Every run logs into its own timestamped sub-folder of "logs"
logdir = Path("logs")/datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
tensorboard_callback = tf.keras.callbacks.TensorBoard(
    logdir, 
    # histogram frequency scales with EPOCHS (EPOCHS / 20, rounded up) and is never below 1
    histogram_freq=max(1, ceil(EPOCHS / 20)), # to control the amount of logging
#     embeddings_freq=epochs,
)
print(f"Logging tensorboard data at {logdir}")

In [None]:
# One hidden Dense layer of 20 units with LeakyReLU activation, followed by a
# softmax layer with one output per rating class
model = tf.keras.Sequential([
    # the input width is taken from the number of columns of the feature matrix X
    tf.keras.layers.Dense(20, input_shape=(X.shape[1],), activation=tf.keras.layers.LeakyReLU()),
    tf.keras.layers.Dense(RATINGS_CARDINALITY , activation='softmax')
])

# Adam optimizer with gradient value clipping; categorical cross-entropy loss for the one-hot targets
model.compile(
    optimizer=tf.optimizers.Adam(
        learning_rate=0.003,
        clipvalue=0.5
    ), 
#     optimizer=tf.keras.optimizers.SGD(lr=lr, decay=1e-6, momentum=0.9, nesterov=True), 
#     optimizer=tf.keras.optimizers.RMSprop(lr),
#     loss=tf.nn.softmax_cross_entropy_with_logits,
    loss="categorical_crossentropy",
    metrics=keras_model_metrics
)

model.summary()

In [None]:
%%time

# Train the model and keep the returned history, so that the metrics of the latest run can be inspected later
train_histories.append(model.fit(
    X, y,
    BATCH_SIZE,
    epochs=EPOCHS, 
    callbacks=[tensorboard_callback, tfdocs.modeling.EpochDots()],
    validation_split=0.2,  # 20% of the samples are used for validation
    verbose=0
))

In [51]:
metrics_df = pd.DataFrame(train_histories[-1].history) # pick the latest training history

metrics_df.tail(1) # pick the last epoch's metrics

Unnamed: 0,loss,accuracy,tp,fp,tn,fn,precision,recall,auc,val_loss,val_accuracy,val_tp,val_fp,val_tn,val_fn,val_precision,val_recall,val_auc
999,1.001743,0.616771,14620.0,5769.0,109431.0,14180.0,0.717053,0.507639,0.869949,1.02381,0.613056,3616.0,1479.0,27321.0,3584.0,0.709715,0.502222,0.865014


`Tip:` You can copy the final metrics row from above and paste it using `Shift + Cmd + V` in our [sheet](https://docs.google.com/spreadsheets/d/1v-nYiDA3elM1UP9stkB42MK0bTbuLxYJE7qAYDP8FHw/edit#gid=925421130) to accurately place all values in the respective columns

**IMPORTANT**: Please don't forget to update git version ID column after you check-in.

### Model Metrics with p-value

TODO

## Export

Save the model for future reference

In [None]:
model.save((logdir/"keras_saved_model").as_posix(), save_format="tf")

## Predict

In [None]:
PredictionReport = namedtuple("PredictionReport", "probabilities predicted_rating confidence")

# transform_pd_X reads every feature column, so the example must provide all of them.
# Values are strings, matching how the CSV is read (dtype=str).
test_df = pd.DataFrame({
    AGE: ["45"],
    ZIP_CODE: ["94086"],
    FAVE_SPORTS: ["I do not like Sports"],
    GENDER_F: ["0"],
    GENDER_M: ["1"],
    AD_NUM_FACES: ["0"],
})

probabilities = model.predict(transform_pd_X(test_df, SELECTED_INP_COLS))
# argmax is a zero-based class index while ratings start at 1 (create_target_pd encodes rating r at index r - 1)
predicted_rating, confidence = np.argmax(probabilities) + 1, np.max(probabilities)

PredictionReport(probabilities, predicted_rating, confidence)

## Rough

### Featurize using Feature Columns

Create feature columns like one-hot, embeddings, bucketing from raw features created earlier

In [None]:
EXAMPLE_BATCH = next(iter(input_fn_train(3)))[0]

In [None]:
EXAMPLE_BATCH

In [None]:
def test_feature_column(feature_column):
    """Apply a DenseFeatures layer built from `feature_column` to EXAMPLE_BATCH and return the result as a NumPy array."""
    return tf.keras.layers.DenseFeatures(feature_column)(EXAMPLE_BATCH).numpy()

In [None]:
age_fc = tf.feature_column.numeric_column(AGE, normalizer_fn=lambda x: (x - MEAN_AGE) / STD_AGE)

In [None]:
zip_fcs = [
    tf.feature_column.indicator_column(
        tf.feature_column.categorical_column_with_vocabulary_list(
            f"{ZIP_CODE}{i}", vocabulary_list=list(string.digits), 
            num_oov_buckets=1)
    )
    for i in range(FIRST_K_ZIP_DIGITS)
]

In [None]:
EXAMPLE_BATCH[AGE], test_feature_column(age_fc)

In [None]:
{k: v for k, v in EXAMPLE_BATCH.items() if k.startswith(ZIP_CODE)}, test_feature_column(zip_fcs)

In [None]:
tf.keras.layers.concatenate(age_fc, zip_fcs[0])