# Neural Network Model

The aim of this notebook is to demo an end-to-end pipeline for ads prediction in TensorFlow.

In [80]:
import tensorflow as tf
import sys
from pathlib import Path
import datetime
import time
import numpy as np
import pandas as pd
from pprint import pprint
from typing import Dict, Any, Union, List
from functools import partial

# Record the framework and interpreter versions this notebook was executed with
runtime_banner = f"Using Tensorflow, {tf.__version__} on Python interpreter, {sys.version_info}"
print(runtime_banner)

Using Tensorflow, 2.1.0 on Python interpreter, sys.version_info(major=3, minor=7, micro=5, releaselevel='final', serial=0)


In [2]:
# The seed is taken from the wall clock, so it differs between runs; it is
# printed below so that a given run can be repeated with the same value.
RANDOM_SEED = int(time.time())

# Seed TensorFlow first, then NumPy, with the same value
for seed_fn in (tf.random.set_seed, np.random.seed):
    seed_fn(RANDOM_SEED)

print(f"Using random seed, {RANDOM_SEED}")

Using random seed, 1587796063


## Load Data

Dataset credits:
```
@inproceedings{roffo2016personality,
  title={Personality in computational advertising: A benchmark},
  author={Roffo, Giorgio and Vinciarelli, Alessandro},
  booktitle={4 th Workshop on Emotions and Personality in Personalized Systems (EMPIRE) 2016},
  pages={18},
  year={2016}
}
```

In [3]:
# Root folder of the dataset; the data file paths below are relative to DATA_FOLDER
DATA_FOLDER = Path("../../dataset/")
users_ads_rating_csv = DATA_FOLDER.joinpath("AllUsers_Ads_Ratings_df.csv")

# Name of the column to predict
TARGET_COL = "Rating"

# The bigger the batch, the faster the training, but the more RAM is needed
BATCH_SIZE = 4096

In [4]:
# Names of the CSV columns, grouped by what they describe.

# Identifiers
USER_ID = "UserId"
AD_ID = "AdId"
ADFILEPATH = "AdFilePath"

# Personal details
NAME = "Name"
LAST_NAME = "Last Name"
AGE = "Age"
GENDER = "Gender"
ZIP_CODE = "Cap/Zip-Code"
HOME_TOWN = "Home town"
HOME_COUNTRY = "Home country"
COUNTRIES_VISITED = "Countries visited"

# Work and income
INCOME = "Income"
PAYPAL = "Paypal"
TYPE_OF_JOB = "Type of Job"
WEEKLY_WORKING_HOURS = "Weekly working hours"

# Interests
FAVE_SPORTS = "Fave Sports"
TIMEPASS = "Timepass"
MOST_LISTENED_MUSICS = "Most listened musics"
MOST_READ_BOOKS = "Most read books"
MOST_VISITED_WEBSITES = "Most visited websites"
MOST_WATCHED_MOVIES = "Most watched movies"
MOST_WATCHED_TV_PROGRAMMES = "Most watched tv programmes"

# "fave" columns
FAVE1 = "fave1"
FAVE2 = "fave2"
FAVE3 = "fave3"
FAVE4 = "fave4"
FAVE5 = "fave5"
FAVE6 = "fave6"
FAVE7 = "fave7"
FAVE8 = "fave8"
FAVE9 = "fave9"
FAVE10 = "fave10"

# "unfave" columns
UNFAVE1 = "unfave1"
UNFAVE2 = "unfave2"
UNFAVE3 = "unfave3"
UNFAVE4 = "unfave4"
UNFAVE5 = "unfave5"
UNFAVE6 = "unfave6"

# Rating column
RATING = "Rating"

# Every column is read as a string to avoid any parsing errors, so each one
# gets the same string default value.
COL_DEFAULTS = dict.fromkeys(
    [
        USER_ID, AD_ID, AGE, ZIP_CODE, COUNTRIES_VISITED, FAVE_SPORTS, GENDER,
        HOME_COUNTRY, HOME_TOWN, INCOME, LAST_NAME, MOST_LISTENED_MUSICS,
        MOST_READ_BOOKS, MOST_VISITED_WEBSITES, MOST_WATCHED_MOVIES,
        MOST_WATCHED_TV_PROGRAMMES, NAME, PAYPAL, TIMEPASS, TYPE_OF_JOB,
        WEEKLY_WORKING_HOURS,
        FAVE1, FAVE10, FAVE2, FAVE3, FAVE4, FAVE5, FAVE6, FAVE7, FAVE8, FAVE9,
        UNFAVE1, UNFAVE2, UNFAVE3, UNFAVE4, UNFAVE5, UNFAVE6,
        ADFILEPATH, RATING,
    ],
    "**",
)

# Columns that are read from the CSV file; the rating column comes last
SELECTED_COLS = [
    AGE, ZIP_CODE, FAVE_SPORTS, GENDER, HOME_COUNTRY, HOME_TOWN, INCOME,
    MOST_LISTENED_MUSICS, MOST_READ_BOOKS, MOST_VISITED_WEBSITES,
    MOST_WATCHED_MOVIES, MOST_WATCHED_TV_PROGRAMMES,
    TIMEPASS, TYPE_OF_JOB, WEEKLY_WORKING_HOURS,
    FAVE1, FAVE2, FAVE3, FAVE4, FAVE5, FAVE6, FAVE7, FAVE8, FAVE9, FAVE10,
    UNFAVE1, UNFAVE2, UNFAVE3, UNFAVE4, UNFAVE5, UNFAVE6,
    RATING,
]

In [54]:
def ad_dataset(batch_size=BATCH_SIZE, shuffle=True):
    """Build a batched dataset of the SELECTED_COLS columns of the ratings CSV.

    Args:
        batch_size: Number of records per batch.
        shuffle: Whether the records are shuffled.

    Returns:
        The dataset created by ``tf.data.experimental.make_csv_dataset``. No
        label column is split off (``label_name=None``), so each element holds
        all selected columns, including the rating.
    """
    return tf.data.experimental.make_csv_dataset(
        users_ads_rating_csv.as_posix(),
        batch_size,
        # column_defaults is documented as a list with one item per selected
        # column. Iterating a dict yields its keys, so passing a mapping would
        # supply the column names instead of the intended default values.
        column_defaults=[COL_DEFAULTS[col] for col in SELECTED_COLS],
        select_columns=list(SELECTED_COLS),
        label_name=None,
        shuffle=shuffle,
        shuffle_buffer_size=1000,
        shuffle_seed=RANDOM_SEED,
        sloppy=True,
        ignore_errors=False # set true while training if required
    )

In [56]:
# Print a single batch of three records to inspect the raw columns
sample_batches = ad_dataset(3).take(1)
for sample_batch in sample_batches:
    pprint(sample_batch)

OrderedDict([('Age',
              <tf.Tensor: shape=(3,), dtype=string, numpy=array([b'26', b'22', b'26'], dtype=object)>),
             ('Cap/Zip-Code',
              <tf.Tensor: shape=(3,), dtype=string, numpy=array([b'60638', b'54942', b'60638'], dtype=object)>),
             ('Fave Sports',
              <tf.Tensor: shape=(3,), dtype=string, numpy=
array([b'Nothing', b'Team sports (Footbal, Baseball, Rugby, ...) ',
       b'Nothing'], dtype=object)>),
             ('Gender',
              <tf.Tensor: shape=(3,), dtype=string, numpy=array([b'F', b'M', b'F'], dtype=object)>),
             ('Home country',
              <tf.Tensor: shape=(3,), dtype=string, numpy=
array([b'United States of America', b'United States of America',
       b'United States of America'], dtype=object)>),
             ('Home town',
              <tf.Tensor: shape=(3,), dtype=string, numpy=array([b'Chicago', b'Greenville', b'Chicago'], dtype=object)>),
             ('Income',
              <tf.Tensor: shape=(

## Transform Data

### Age

Convert the age to a number and replace any invalid values or outliers with a default age (the median)

In [58]:
# Age statistics obtained from the Tensorflow Data Validation APIs,
# see data-exploration/tensorflow-data-validation.ipynb
MEAN_AGE = 31.74
STD_AGE = 12.07
MEDIAN_AGE = 29
MAX_AGE = 140

In [132]:
def fix_age(age_str:tf.string, default_age=MEDIAN_AGE) -> int:
    """Typecast age to an integer and replace invalid values with the default.

    Args:
        age_str: The age, as any value that ``int()`` can attempt to convert.
        default_age: Value returned when the age cannot be converted or lies
            outside the range ``0`` to ``MAX_AGE`` (inclusive).

    Returns:
        The converted age, or ``default_age`` for unparsable or out-of-range
        input.
    """
    try:
        age = int(age_str)
    except Exception:
        # Unlike a bare ``except:``, this lets KeyboardInterrupt and SystemExit
        # propagate instead of silently turning them into the default age.
        return default_age
    if age < 0 or age > MAX_AGE:
        return default_age
    return age

def fix_age_tf(example:Dict):
    """Replace the AGE entry of ``example`` with the output of ``fix_age``.

    ``fix_age`` is plain Python, so it is wrapped in ``tf.py_function`` with an
    ``int16`` output. ``example`` is updated in place and returned.
    """
    cleaned_age = tf.py_function(func=fix_age, inp=[example[AGE]], Tout=tf.int16)
    example[AGE] = cleaned_age
    return example

Visual Tests

In [133]:
# Valid age, decimal string, negative age, non-numeric text, age above MAX_AGE
(
    fix_age("50"),
    fix_age("50.5"),
    fix_age("-10"),
    fix_age("bad_age_10"),
    fix_age("300"),
)

(50, 29, 29, 29, 29)

In [134]:
# Fix the age one record at a time, then regroup the records into batches of 10
fixed_age_batches = ad_dataset(1, True).map(fix_age_tf).batch(10)
for age_batch in fixed_age_batches.take(5):
    pprint(age_batch[AGE])

<tf.Tensor: shape=(10,), dtype=int16, numpy=array([26, 22, 26, 26, 62, 24, 26, 24, 26, 26], dtype=int16)>
<tf.Tensor: shape=(10,), dtype=int16, numpy=array([26, 22, 22, 26, 22, 22, 22, 22, 62, 26], dtype=int16)>
<tf.Tensor: shape=(10,), dtype=int16, numpy=array([24, 22, 26, 22, 62, 26, 22, 62, 26, 62], dtype=int16)>
<tf.Tensor: shape=(10,), dtype=int16, numpy=array([22, 22, 26, 26, 62, 24, 22, 26, 26, 24], dtype=int16)>
<tf.Tensor: shape=(10,), dtype=int16, numpy=array([26, 22, 62, 26, 22, 22, 62, 62, 62, 24], dtype=int16)>


### Zip Code

Prepare the zip-code column for one-hot encoding of each character

In [84]:
# Fallback zip code: fix_zip_code returns its leading characters when a zip code cannot be read
DEFAULT_ZIP_CODE = "00000"

In [162]:
def fix_zip_code(zip_code:tf.string, n_digits) -> List[str]:
    """Extracts the first n_digits characters of the zip code as a list.

    Args:
        zip_code: Tensor whose first element is the ASCII-encoded zip code;
            only element 0 is read.
        n_digits: Maximum number of leading characters to return.

    Returns:
        The leading characters of the stripped zip code, one string per
        character. When the zip code cannot be read or decoded, the leading
        characters of DEFAULT_ZIP_CODE are returned instead.

    NOTE(review): a zip code (or DEFAULT_ZIP_CODE) with fewer than n_digits
    characters gives a list shorter than n_digits - confirm that callers
    which expect exactly n_digits values can cope with this.
    """
    try:
        decoded_zip = zip_code.numpy()[0].decode('ascii') # very inefficient way
        return list(decoded_zip.strip()[:n_digits])
    except Exception:
        # Unlike a bare ``except:``, this lets KeyboardInterrupt and SystemExit
        # propagate instead of silently turning them into the default zip code.
        return list(DEFAULT_ZIP_CODE[:n_digits])

def fix_zip_code_tf(example:Dict, n_digits=2):
    """Split the zip code of ``example`` into one column per leading character.

    The new columns are named ``f"{ZIP_CODE}{i}"`` for ``i`` in
    ``range(n_digits)`` and the original ZIP_CODE column is removed.
    ``example`` is updated in place and returned.
    """
    digit_tensors = tf.py_function(
        partial(fix_zip_code, n_digits=n_digits),
        [example[ZIP_CODE]],
        [tf.string] * n_digits,
    )
    new_columns = {}
    for position, digit in enumerate(digit_tensors):
        # Mark every output as a scalar, see
        # https://github.com/tensorflow/tensorflow/issues/24520#issuecomment-579421744
        digit.set_shape([])
        new_columns[f"{ZIP_CODE}{position}"] = digit
    example.update(new_columns)
    del example[ZIP_CODE]
    return example

Visual Tests

In [163]:
# A valid zip code cut to several lengths, then two inputs that fall back to the default
valid_zip = tf.constant([b"43556"], shape=(1,), dtype=tf.string)
numeric_zip = tf.constant([43556], shape=(1,), dtype=tf.int32)
(
    fix_zip_code(valid_zip, 10),
    fix_zip_code(valid_zip, 2),
    fix_zip_code(valid_zip, 4),
    fix_zip_code(numeric_zip, 4),
    fix_zip_code(None, 3),
)

(['4', '3', '5', '5', '6'],
 ['4', '3'],
 ['4', '3', '5', '5'],
 ['0', '0', '0', '0'],
 ['0', '0', '0'])

In [165]:
# Split the zip code one record at a time, then show only the derived zip-code columns
zip_batches = ad_dataset(1, True).map(fix_zip_code_tf).batch(5)
for zip_batch in zip_batches.take(3):
    zip_columns = {name: values for name, values in zip_batch.items() if name.startswith(ZIP_CODE)}
    pprint(zip_columns)

{'Cap/Zip-Code0': <tf.Tensor: shape=(5,), dtype=string, numpy=array([b'6', b'5', b'6', b'6', b'1'], dtype=object)>,
 'Cap/Zip-Code1': <tf.Tensor: shape=(5,), dtype=string, numpy=array([b'0', b'4', b'0', b'0', b'5'], dtype=object)>}
{'Cap/Zip-Code0': <tf.Tensor: shape=(5,), dtype=string, numpy=array([b'N', b'6', b'N', b'6', b'6'], dtype=object)>,
 'Cap/Zip-Code1': <tf.Tensor: shape=(5,), dtype=string, numpy=array([b'W', b'0', b'W', b'0', b'0'], dtype=object)>}
{'Cap/Zip-Code0': <tf.Tensor: shape=(5,), dtype=string, numpy=array([b'6', b'5', b'5', b'6', b'5'], dtype=object)>,
 'Cap/Zip-Code1': <tf.Tensor: shape=(5,), dtype=string, numpy=array([b'0', b'4', b'4', b'0', b'4'], dtype=object)>}


## Tensorboard

## Featurize

## Model

## Export