# Neural Network Model

The aim of the notebook is demo end to end pipeline for Ads prediction in Tensorflow

In [2]:
! ./setup.sh

In [1]:
import tensorflow as tf
import tensorflow_docs as tfdocs
import tensorflow_docs.modeling
import sys
from pathlib import Path
import datetime
import time
import numpy as np
import pandas as pd
from pprint import pprint
from typing import Dict, Any, Union, List
from functools import partial
import re
import string
from sklearn.preprocessing import MultiLabelBinarizer
from math import ceil
from collections import namedtuple

print(f"Using Tensorflow, {tf.__version__} on Python interpreter, {sys.version_info}")

Using Tensorflow, 2.1.0 on Python interpreter, sys.version_info(major=3, minor=6, micro=1, releaselevel='final', serial=0)


In [2]:
RANDOM_SEED = int(time.time())

tf.random.set_seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

print(f"Using random seed, {RANDOM_SEED}")

Using random seed, 1588211768


## Load Data

Dataset credits:
```
@inproceedings{roffo2016personality,
  title={Personality in computational advertising: A benchmark},
  author={Roffo, Giorgio and Vinciarelli, Alessandro},
  booktitle={4 th Workshop on Emotions and Personality in Personalized Systems (EMPIRE) 2016},
  pages={18},
  year={2016}
}
```

In [3]:
DATA_FOLDER = Path("../../dataset/")
BATCH_SIZE = 4096 # bigger the batch, faster the training but bigger the RAM needed
TARGET_COL = "Rating"

# data files path are relative DATA_FOLDER
users_ads_rating_csv = DATA_FOLDER/"users-ads-without-gcp-ratings_OHE_MLB.csv"

In [4]:
USER_ID = "UserId"
AD_ID = "AdId"
AGE = "Age"
ZIP_CODE = "CapZipCode"
COUNTRIES_VISITED = "Countriesvisited"
FAVE_SPORTS = "FaveSports"
GENDER = "Gender"
HOME_COUNTRY = "Homecountry"
HOME_TOWN = "Hometown"
INCOME = "Income"
LAST_NAME = "LastName"
MOST_LISTENED_MUSICS = "Mostlistenedmusics"
MOST_READ_BOOKS = "Mostreadbooks"
MOST_VISITED_WEBSITES = "Mostvisitedwebsites"
MOST_WATCHED_MOVIES = "Mostwatchedmovies"
MOST_WATCHED_TV_PROGRAMMES = "Mostwatchedtvprogrammes"
NAME = "Name"
PAYPAL = "Paypal"
TIMEPASS = "Timepass"
TYPE_OF_JOB = "TypeofJob"
WEEKLY_WORKING_HOURS = "Weeklyworkinghours"
FAVE1 = "fave1"
FAVE10 = "fave10"
FAVE2 = "fave2"
FAVE3 = "fave3"
FAVE4 = "fave4"
FAVE5 = "fave5"
FAVE6 = "fave6"
FAVE7 = "fave7"
FAVE8 = "fave8"
FAVE9 = "fave9"
UNFAVE1 = "unfave1"
UNFAVE2 = "unfave2"
UNFAVE3 = "unfave3"
UNFAVE4 = "unfave4"
UNFAVE5 = "unfave5"
UNFAVE6 = "unfave6"
ADFILEPATH = "AdFilePath"
GENDER_F = "Gender_F"
GENDER_M = "Gender_M"
RATING = "Rating"

# Read all columns as strings to avoid any errors
COL_DEFAULTS = {
    USER_ID: "**",
    AD_ID: "**",
    AGE: "**",
    ZIP_CODE: "**",
    COUNTRIES_VISITED: "**",
    FAVE_SPORTS: "**",
    GENDER: "**",
    HOME_COUNTRY: "**",
    HOME_TOWN: "**",
    INCOME: "**",
    LAST_NAME: "**",
    MOST_LISTENED_MUSICS: "**",
    MOST_READ_BOOKS: "**",
    MOST_VISITED_WEBSITES: "**",
    MOST_WATCHED_MOVIES: "**",
    MOST_WATCHED_TV_PROGRAMMES: "**",
    NAME: "**",
    PAYPAL: "**",
    TIMEPASS: "**",
    TYPE_OF_JOB: "**",
    WEEKLY_WORKING_HOURS: "**",
    FAVE1: "**",
    FAVE10: "**",
    FAVE2: "**",
    FAVE3: "**",
    FAVE4: "**",
    FAVE5: "**",
    FAVE6: "**",
    FAVE7: "**",
    FAVE8: "**",
    FAVE9: "**",
    UNFAVE1: "**",
    UNFAVE2: "**",
    UNFAVE3: "**",
    UNFAVE4: "**",
    UNFAVE5: "**",
    UNFAVE6: "**",
    ADFILEPATH: "**",
    GENDER_F: "**",
    GENDER_M: "**",
    RATING: "**"
}

# SELECTED_COLS = [AGE, ZIP_CODE, FAVE_SPORTS, GENDER, HOME_COUNTRY, HOME_TOWN, INCOME, MOST_LISTENED_MUSICS, MOST_READ_BOOKS, 
#                  MOST_VISITED_WEBSITES, MOST_WATCHED_MOVIES, MOST_WATCHED_TV_PROGRAMMES, TIMEPASS, TYPE_OF_JOB, WEEKLY_WORKING_HOURS, 
#                  FAVE1, FAVE2, FAVE3, FAVE4, FAVE5, FAVE6, FAVE7, FAVE8, FAVE9, FAVE10, UNFAVE1, UNFAVE2, UNFAVE3, UNFAVE4, UNFAVE5, 
#                  UNFAVE6, RATING]

SELECTED_INP_COLS = [AGE, ZIP_CODE, FAVE_SPORTS, GENDER_F, GENDER_M]
SELECTED_COLS = SELECTED_INP_COLS + [TARGET_COL]

In [5]:
def ad_dataset(batch_size=BATCH_SIZE, shuffle=True):
    return tf.data.experimental.make_csv_dataset(
        users_ads_rating_csv.as_posix(),
        batch_size,
        column_defaults={col:default for col, default in COL_DEFAULTS.items() if col in SELECTED_COLS},
        select_columns=list(SELECTED_COLS),
        label_name=None,
        shuffle=shuffle,
        shuffle_buffer_size=1000,
        shuffle_seed=RANDOM_SEED,
        sloppy=True,
        ignore_errors=False # set true while training if required
    )

In [6]:
for d in ad_dataset(3).take(1):
    pprint(d)

OrderedDict([('Age',
              <tf.Tensor: shape=(3,), dtype=string, numpy=array([b'24', b'26', b'22'], dtype=object)>),
             ('CapZipCode',
              <tf.Tensor: shape=(3,), dtype=string, numpy=array([b'NW1 1EU', b'60638', b'54942'], dtype=object)>),
             ('FaveSports',
              <tf.Tensor: shape=(3,), dtype=string, numpy=
array([b'Individual sports\xe2\x80\x8e (Tennis, Archery, ...)',
       b'Nothing', b'Team sports (Footbal, Baseball, Rugby, ...) '],
      dtype=object)>),
             ('Rating',
              <tf.Tensor: shape=(3,), dtype=string, numpy=array([b'1.0', b'1.0', b'5.0'], dtype=object)>),
             ('Gender_F',
              <tf.Tensor: shape=(3,), dtype=string, numpy=array([b'1', b'1', b'0'], dtype=object)>),
             ('Gender_M',
              <tf.Tensor: shape=(3,), dtype=string, numpy=array([b'0', b'0', b'1'], dtype=object)>)])


In [7]:
def ad_dataset_pd():
    return pd.read_csv(users_ads_rating_csv, usecols=SELECTED_COLS, dtype=str)

In [8]:
ad_dataset_pd().sample(3)

Unnamed: 0,Age,CapZipCode,FaveSports,Rating,Gender_F,Gender_M
25582,37,SS2 5JL,"Individual sports‎ (Tennis, Archery, ...)",5.0,0,1
27647,37,33024,Water sports,4.0,1,0
21765,19,J8Y6C9,"Team sports (Footbal, Baseball, Rugby, ...) , ...",1.0,1,0


## Transform Data

In [9]:
def dict_project(d:Dict, cols:List[str]) -> Dict:
    return {k:v for k, v in d.items() if k in cols}

In [10]:
class IndexerForVocab:
    def __init__(self, vocab_list:List[str], oov_index:int=0):
        """
        Creates a string indexer for the vocabulary with out of vocabulary (oov) indexing
        """
        self._vocab_map = {v:i+1 for i, v in enumerate(vocab_list)}
        self._oov = oov_index
        
    def __repr__(self):
        return f"Map for {len(self)} keys with 1 OOV key"
    
    def __len__(self):
        return len(self._vocab_map) + 1
        
    def index_of(self, item:str):
        """
        Index of item in the vocabulary
        """
        return self._vocab_map.get(item, self._oov)
    
    def index_of_mux(self, items:List[str]):
        return [self.index_of(i) for i in items]

### Age

Convert to a number and remove any outliers

In [11]:
# Obtained from Tensorflow Data Validation APIs data-exploration/tensorflow-data-validation.ipynb

MEAN_AGE, STD_AGE, MEDIAN_AGE, MAX_AGE = 31.74, 12.07, 29, 140

In [12]:
def fix_age(age_str:tf.string, default_age=MEDIAN_AGE) -> int:
    """Typecast age to an integer and update outliers with the default"""
    try:
        age = int(age_str)
        if age < 0 or age > MAX_AGE:
            raise ValueError(f"{age} is not a valid age")
    except:
        age = default_age
    normalized_age = (age - MEAN_AGE) / STD_AGE
    return normalized_age

def fix_age_tf(example:Dict, new_col_suffix=""):
    """Wrap in a py_function for TF to run inside its execution graph"""
#     example[AGE + new_col_suffix] = tf.py_function(fix_age, [example[AGE]], (tf.float32, ))
    example[AGE + new_col_suffix] = tf.py_function(fix_age, [example[AGE]], tf.float32)
    example[AGE + new_col_suffix] = tf.expand_dims(example[AGE + new_col_suffix], 0) # https://github.com/tensorflow/tensorflow/issues/24520#issuecomment-579421744
    return example

#### Visual Tests

In [13]:
fix_age("50"), fix_age("50.5"), fix_age("-10"), fix_age("bad_age_10"), fix_age("300")

(1.5128417564208783,
 -0.22700911350455663,
 -0.22700911350455663,
 -0.22700911350455663,
 -0.22700911350455663)

In [14]:
fix_age_tf_fn = partial(fix_age_tf, new_col_suffix="_encoded")
for d in ad_dataset(1, True).map(fix_age_tf_fn).batch(3).take(5):
    pprint(dict_project(d, [AGE, AGE + "_encoded"]))
    print()

{'Age': <tf.Tensor: shape=(3, 1), dtype=string, numpy=
array([[b'24'],
       [b'26'],
       [b'22']], dtype=object)>,
 'Age_encoded': <tf.Tensor: shape=(3, 1), dtype=float32, numpy=
array([[-0.6412593 ],
       [-0.47555923],
       [-0.8069594 ]], dtype=float32)>}

{'Age': <tf.Tensor: shape=(3, 1), dtype=string, numpy=
array([[b'22'],
       [b'62'],
       [b'62']], dtype=object)>,
 'Age_encoded': <tf.Tensor: shape=(3, 1), dtype=float32, numpy=
array([[-0.8069594],
       [ 2.5070422],
       [ 2.5070422]], dtype=float32)>}

{'Age': <tf.Tensor: shape=(3, 1), dtype=string, numpy=
array([[b'26'],
       [b'26'],
       [b'26']], dtype=object)>,
 'Age_encoded': <tf.Tensor: shape=(3, 1), dtype=float32, numpy=
array([[-0.47555923],
       [-0.47555923],
       [-0.47555923]], dtype=float32)>}

{'Age': <tf.Tensor: shape=(3, 1), dtype=string, numpy=
array([[b'62'],
       [b'22'],
       [b'62']], dtype=object)>,
 'Age_encoded': <tf.Tensor: shape=(3, 1), dtype=float32, numpy=
array([[ 2.5

### Zip Code

Prepare zip-code column for one-hot encoding each character

In [15]:
DEFAULT_ZIP_CODE, FIRST_K_ZIP_DIGITS = "00000", 2

zip_code_indexer = IndexerForVocab(string.digits + string.ascii_lowercase + string.ascii_uppercase)

In [16]:
def fix_zip_code_tensor(zip_code:tf.string, n_digits, indexer) -> List[str]:
    """Extracts the the first n_digits as a list"""
    zip_digits = []
    try:
        if isinstance(zip_code, tf.Tensor):
            zip_code = zip_code.numpy()[0].decode('ascii', errors="ignore") # very ineffecient way
        zip_digits = list(zip_code.strip()[:n_digits])
    except:
        zip_digits = list(DEFAULT_ZIP_CODE[:n_digits])
    return tf.concat( [
        tf.one_hot(
            indexer.index_of(d), len(indexer)
        ) for d in zip_digits
    ], 0 )

def fix_zip_code(zip_code:str, n_digits, indexer) -> List[str]:
    """Extracts the the first n_digits as a list"""
    zip_digits = []
    try:
        zip_digits = list(zip_code.strip()[:n_digits])
    except:
        zip_digits = list(DEFAULT_ZIP_CODE[:n_digits])
    return np.ravel(np.eye(len(indexer))[indexer.index_of_mux(zip_digits)])

def fix_zip_code_tf(example:Dict, n_digits=FIRST_K_ZIP_DIGITS, indexer=zip_code_indexer, new_col_suffix=""):
    """Creates new columns for the first n_digits in zip_code"""
    fix_zip_code_fn = partial(fix_zip_code, n_digits=n_digits, indexer=indexer)
    example[ZIP_CODE + new_col_suffix] = tf.py_function(fix_zip_code_fn, [example[ZIP_CODE]], tf.float32)
    example[ZIP_CODE + new_col_suffix].set_shape(len(indexer) * n_digits)
    return example

#### Visual Tests

In [17]:
test_zip_code_indexer = IndexerForVocab(string.digits)

(fix_zip_code("43556", 10, test_zip_code_indexer),
fix_zip_code("43556", 2, test_zip_code_indexer),
fix_zip_code("43556", 4, test_zip_code_indexer),
fix_zip_code(None, 3, test_zip_code_indexer))

(array([0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        1., 0., 0., 0.]),
 array([0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.,
        0., 0., 0., 0., 0.]),
 array([0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 1., 0., 0., 0., 0.]),
 array([0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.]))

In [18]:
test_zip_code_indexer = IndexerForVocab(string.digits)

(fix_zip_code(tf.constant([b"43556"], shape=(1,), dtype=tf.string), 10, test_zip_code_indexer),
fix_zip_code(tf.constant([b"43556"], shape=(1,), dtype=tf.string), 2, test_zip_code_indexer),
fix_zip_code(tf.constant([b"43556"], shape=(1,), dtype=tf.string), 4, test_zip_code_indexer),
fix_zip_code(tf.constant([43556], shape=(1,), dtype=tf.int32), 4, test_zip_code_indexer),\
fix_zip_code(None, 3, test_zip_code_indexer))

(array([0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.,
        0., 0., 0., 0.]),
 array([0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.,
        0., 0., 0., 0., 0.]),
 array([0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        1., 0., 0., 0., 0., 0., 0., 0., 0., 0.]),
 array([0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        1., 0., 0., 0., 0., 0., 0., 0., 0., 0.]),
 array([0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.]))

In [19]:
fix_zip_code_tf_fn = partial(fix_zip_code_tf, new_col_suffix="_encoded")
for d in ad_dataset(1, True).map(fix_zip_code_tf_fn).batch(5).take(3):
    pprint(dict_project(d, [ZIP_CODE, ZIP_CODE + "_encoded"]))
    print()

{'CapZipCode': <tf.Tensor: shape=(5, 1), dtype=string, numpy=
array([[b'NW1 1EU'],
       [b'60638'],
       [b'54942'],
       [b'54942'],
       [b'15613']], dtype=object)>,
 'CapZipCode_encoded': <tf.Tensor: shape=(5, 126), dtype=float32, numpy=
array([[0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0.

### Favorite Sports

Two approaches,
1. Consider the first `K` sports mentioned by each user and one-hot encode each separately
2. Multi label binarize all the sports as there are only 15 unique sports

In [20]:
FAV_SPORTS_UNKNOWN = "UNK_SPORT"
ALL_FAV_SPORTS = ['Olympic sports', 'Winter sports', 'Nothing', 'I do not like Sports', 'Equestrian sports', 'Skating sports', 'Precision sports', 'Hunting sports', 'Motor sports', 'Team sports', 'Individual sports', 'Other', 'Water sports', 'Indoor sports', 'Endurance sports']

fav_sports_binarizer = MultiLabelBinarizer()
fav_sports_binarizer.fit([ALL_FAV_SPORTS])

MultiLabelBinarizer(classes=None, sparse_output=False)

In [21]:
# Attempt to write purely in TF graph
# def fix_fav_sports(sports_str:str, topk=2, pad_constant="PAD_SPORT") -> List[str]:
#     sports = tf.strings.regex_replace(sports_str, r"\s*\(.*,.*\)\s*", "")
#     sports = tf.strings.regex_replace(sports, r"\s*,\s*", ",")
#     sports = tf.strings.split(sports, ",").numpy()[:topk]
#     tf.print(sports.shape[0])
#     right_pad_width = max(0, topk - sports.shape[0])
#     result = np.pad(sports, (0, right_pad_width), constant_values=pad_constant) 
#     return result

In [22]:
def fav_sports_multi_select_str_to_list(sports_str:Union[str, tf.Tensor]) -> List[str]:
    # remove commas that dont separate different user selections
    # example, commas inside paranthesis of "Individual sports (Tennis, Archery, ...)" dont make new sports
    if isinstance(sports_str, tf.Tensor):
        sports_str = sports_str.numpy()[0].decode('ascii', errors="ignore")
    else:
        sports_str = sports_str.encode("ascii", errors="ignore").decode("ascii") # remove non-ascii chars
    sports = re.sub(r"\s*\(.*,.*\)\s*", "", sports_str)
    return re.split(r"\s*,\s*", sports)

def fix_fav_sports_mlb(sports_str:str) -> List[int]:
    sports = fav_sports_multi_select_str_to_list(sports_str)
    return fav_sports_binarizer.transform([sports])[0]

def fix_fav_sports_firstk(sports_str:str, first_k:int, pad_constant:int) -> List[str]:
    sports = fav_sports_multi_select_str_to_list(sports_str)
    right_pad_width = first_k - len(sports_enc)
    result = [sports + [pad_constant] * right_pad_width][:first_k]
    return result

def fix_fav_sports_tf(example:Dict, first_k=2, pad_constant="PAD_SPORT", new_col_suffix:str=""):
    """Existing column will not be overriden with new_col_suffix"""
    example[FAVE_SPORTS + new_col_suffix] = tf.py_function(fix_fav_sports_mlb, [example[FAVE_SPORTS]], tf.float32)
    example[FAVE_SPORTS + new_col_suffix].set_shape(len(ALL_FAV_SPORTS))
    return example

#### Visual Tests

In [23]:
(
    fix_fav_sports_mlb(tf.constant([b"Individual sports (Tennis, Archery, ...), Indoor sports, Endurance sports, Skating sports"])),
    fix_fav_sports_mlb(tf.constant([b"Skating sports"])),
    fix_fav_sports_mlb(tf.constant([b"Individual sports (Tennis, Archery, ...)"])),
    fix_fav_sports_mlb(tf.constant([b"Indoor sports, Endurance sports, Skating sports"])),
)

(array([1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0]),
 array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0]),
 array([0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
 array([1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0]))

In [24]:
(
    fix_fav_sports_mlb("Individual sports (Tennis, Archery, ...), Indoor sports, Endurance sports, Skating sports"),
    fix_fav_sports_mlb("Skating sports"),
    fix_fav_sports_mlb("Individual sports (Tennis, Archery, ...)"),
    fix_fav_sports_mlb("Indoor sports, Endurance sports, Skating sports"),
)

(array([1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0]),
 array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0]),
 array([0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
 array([1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0]))

In [25]:
fix_fav_sports_tf_fn = partial(fix_fav_sports_tf, new_col_suffix="_new")
for d in ad_dataset(1, True).map(fix_fav_sports_tf_fn).batch(5).take(2):
    pprint(dict_project(d, [FAVE_SPORTS, FAVE_SPORTS + "_new"]))
    print()

{'FaveSports': <tf.Tensor: shape=(5, 1), dtype=string, numpy=
array([[b'Individual sports\xe2\x80\x8e (Tennis, Archery, ...)'],
       [b'Nothing'],
       [b'Team sports (Footbal, Baseball, Rugby, ...) '],
       [b'Team sports (Footbal, Baseball, Rugby, ...) '],
       [b'I do not like Sports']], dtype=object)>,
 'FaveSports_new': <tf.Tensor: shape=(5, 15), dtype=float32, numpy=
array([[0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]],
      dtype=float32)>}

{'FaveSports': <tf.Tensor: shape=(5, 1), dtype=string, numpy=
array([[b'I do not like Sports'],
       [b'Nothing'],
       [b'Nothing'],
       [b'Nothing'],
       [b'I do not like Sports']], dtype=object)>,
 'FaveSports_new': <tf.Tensor: shape

### Target

In [26]:
RATINGS_CARDINALITY = 5 # not zero based indexing i.e. ratings range from 1 to 5

In [27]:
def create_target(example:Dict):
    y = tf.one_hot(
        tf.cast(tf.strings.to_number(example[RATING], tf.float32), tf.int32), 
        RATINGS_CARDINALITY)
    example.pop(RATING)
    
    return example, y

In [28]:
def create_target_pd(rating_str:str):
    return np.eye(RATINGS_CARDINALITY, dtype=int)[int(float(rating_str)) - 1]

## Featurize

In [29]:
def create_dataset_tf() -> tf.data.Dataset:
    return ad_dataset(1, True).\
        map(fix_age_tf, tf.data.experimental.AUTOTUNE).\
        map(fix_zip_code_tf, tf.data.experimental.AUTOTUNE).\
        map(fix_fav_sports_tf, tf.data.experimental.AUTOTUNE).\
        map(create_target, tf.data.experimental.AUTOTUNE)

In [30]:
# Credits: https://www.tensorflow.org/tutorials/customization/custom_training_walkthrough?hl=en#create_a_tfdatadataset
def pack_features_vector(features:Dict, labels, cols:List[str]=[AGE]):
    """Pack the features into a single array for the list of cols"""
    # features = tf.stack(list(dict_project(features, cols).values()), axis=1)
    features = tf.concat(list(dict_project(features, cols).values()), axis=1)
    return features, labels

In [32]:
for d in create_dataset_tf().batch(2).map(pack_features_vector).take(2):
    pprint(d)

(<tf.Tensor: shape=(2, 1), dtype=float32, numpy=
array([[-0.6412593 ],
       [-0.47555923]], dtype=float32)>,
 <tf.Tensor: shape=(2, 1, 5), dtype=float32, numpy=
array([[[0., 1., 0., 0., 0.]],

       [[0., 1., 0., 0., 0.]]], dtype=float32)>)
(<tf.Tensor: shape=(2, 1), dtype=float32, numpy=
array([[-0.8069594],
       [-0.8069594]], dtype=float32)>,
 <tf.Tensor: shape=(2, 1, 5), dtype=float32, numpy=
array([[[0., 0., 0., 0., 0.]],

       [[0., 1., 0., 0., 0.]]], dtype=float32)>)


Note down the number of columns from here to provide in keras input layer

In [46]:
def transform_pd_X(df:pd.DataFrame, inp_cols:List[str]):
    """Original dataframe will be modified"""
    df[AGE] = df[AGE].apply(lambda age: [fix_age(age)])
    df[ZIP_CODE] = df[ZIP_CODE].apply(lambda zc: fix_zip_code(zc, n_digits=2, indexer=zip_code_indexer))
    df[FAVE_SPORTS] = df[FAVE_SPORTS].apply(fix_fav_sports_mlb)
    df[GENDER_F] = df[GENDER_F].apply(lambda gender_f: [int(gender_f)])
    df[GENDER_M] = df[GENDER_M].apply(lambda gender_m: [int(gender_m)])
    df["X"] = df[inp_cols].apply(np.concatenate, axis=1)
    # TODO: vectorize, else inefficient to sequentially loop over all example
    X = np.array([x for x in df["X"]])
    return X

In [47]:
def transform_pd_y(df:pd.DataFrame, target_col:str):
    """Original dataframe will be modified"""
    df["y"] = df[target_col].apply(create_target_pd)
    # TODO: vectorize, else inefficient to sequentially loop over all example
    y = np.array([y for y in df["y"]])
    return y

In [48]:
def create_dataset_pd(inp_cols:List[str]=SELECTED_INP_COLS, target_col:str=TARGET_COL, fraction:float=1) -> pd.DataFrame:
    """Prepare the dataset for training on a fraction of all input data"""
    df = ad_dataset_pd().sample(frac=fraction)
    return transform_pd_X(df, inp_cols), transform_pd_y(df, target_col)

## Input Pipeline

In [402]:
# Input builders
def input_fn_train(batch_size=10):
    return create_dataset().\
        shuffle(2 * batch_size).batch(batch_size, drop_remainder=True).\
        map(pack_features_vector, tf.data.experimental.AUTOTUNE).\
        cache().prefetch(tf.data.experimental.AUTOTUNE)

def input_fn_eval(batch_size=10, cache=True):
    # TODO: use dataset's skip & take to create train and validation datasets
    val_dataset = create_dataset(test_files).batch(batch_size)
    if cache: val_dataset = val_dataset.cache()
    return val_dataset.prefetch(tf.data.experimental.AUTOTUNE)

def input_fn_predict():
    # return tf.data.Dataset.from_tensor_slices({"x": tf.cast(X_test, tf.int32)}).batch(1)
    pass

### Visual Test

In [403]:
for d in input_fn_train(2).take(2):
    pprint(d)

(<tf.Tensor: shape=(2, 1), dtype=float32, numpy=
array([[-0.47555923],
       [-0.6412593 ]], dtype=float32)>,
 <tf.Tensor: shape=(2, 1, 5), dtype=float32, numpy=
array([[[0., 1., 0., 0., 0.]],

       [[0., 1., 0., 0., 0.]]], dtype=float32)>)
(<tf.Tensor: shape=(2, 1), dtype=float32, numpy=
array([[2.5070422],
       [2.5070422]], dtype=float32)>,
 <tf.Tensor: shape=(2, 1, 5), dtype=float32, numpy=
array([[[0., 1., 0., 0., 0.]],

       [[0., 1., 0., 0., 0.]]], dtype=float32)>)


## Tensorboard

Monitor training and other stats

In [36]:
from tensorboard import notebook

In [37]:
%reload_ext tensorboard

Start tensorboard

In [38]:
%tensorboard --logdir logs --port 6006

In [32]:
notebook.list()

Known TensorBoard instances:
  - port 6006: logdir logs (started 11:19:27 ago; pid 77286)


## Model

Create a model and train using high level APIs like `tf.keras` and `tf.estimator`

In [49]:
%%time

# train_dataset = input_fn_train(BATCH_SIZE)
X, y = create_dataset_pd()

CPU times: user 16.2 s, sys: 920 ms, total: 17.1 s
Wall time: 17.8 s


In [59]:
# DON'T CHANGE THE EPOCHS VALUE
BATCH_SIZE = 4096
EPOCHS = 1000
keras_model_metrics = ["accuracy"]
train_histories = []

In [60]:
logdir = Path("logs")/datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
tensorboard_callback = tf.keras.callbacks.TensorBoard(
    logdir, 
    histogram_freq=max(1, ceil(EPOCHS / 20)), # to control the amount of logging
#     embeddings_freq=epochs,
)
print(f"Logging tensorboard data at {logdir}")

Logging tensorboard data at logs/20200430-021723


In [61]:
model = tf.keras.Sequential([
    tf.keras.layers.Dense(20, input_shape=(X.shape[1],), activation=tf.keras.layers.LeakyReLU()),
    tf.keras.layers.Dense(RATINGS_CARDINALITY , activation='softmax')
])

model.compile(
    optimizer=tf.optimizers.Adam(
        learning_rate=0.003,
        clipvalue=0.5
    ), 
#     optimizer=tf.keras.optimizers.SGD(lr=lr, decay=1e-6, momentum=0.9, nesterov=True), 
#     optimizer=tf.keras.optimizers.RMSprop(lr),
#     loss=tf.nn.softmax_cross_entropy_with_logits,
    loss="categorical_crossentropy",
    metrics=keras_model_metrics
)

model.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_6 (Dense)              (None, 20)                2900      
_________________________________________________________________
dense_7 (Dense)              (None, 5)                 105       
Total params: 3,005
Trainable params: 3,005
Non-trainable params: 0
_________________________________________________________________


In [62]:
%%time

train_histories.append(model.fit(
    X, y,
    BATCH_SIZE,
    epochs=EPOCHS, 
    callbacks=[tensorboard_callback, tfdocs.modeling.EpochDots()],
    validation_split=0.2,
    verbose=0
))


Epoch: 0, accuracy:0.3800,  loss:1.5123,  val_accuracy:0.5437,  val_loss:1.4133,  
....................................................................................................
Epoch: 100, accuracy:0.6156,  loss:1.0363,  val_accuracy:0.6110,  val_loss:1.0493,  
....................................................................................................
Epoch: 200, accuracy:0.6161,  loss:1.0190,  val_accuracy:0.6136,  val_loss:1.0379,  
....................................................................................................
Epoch: 300, accuracy:0.6170,  loss:1.0124,  val_accuracy:0.6151,  val_loss:1.0313,  
....................................................................................................
Epoch: 400, accuracy:0.6149,  loss:1.0087,  val_accuracy:0.6128,  val_loss:1.0287,  
....................................................................................................
Epoch: 500, accuracy:0.6165,  loss:1.0083,  val_accuracy:0.6126,  val_l

In [67]:
histories_dict = train_histories[-1].history
for metric in histories_dict.keys():
    print(metric, histories_dict[metric][-1])

loss 1.004807571834988
accuracy 0.6155556
val_loss 1.0253095727496677
val_accuracy 0.61430556


## Export

Save the model for future reference

In [441]:
model.save((logdir/"keras_saved_model").as_posix(), save_format="tf")

INFO:tensorflow:Assets written to: logs/20200428-102840/keras_saved_model/assets


## Predict

In [448]:
PredictionReport = namedtuple("PredictionReport", "probabilities predicted_rating confidence")

test_df = pd.DataFrame({
    AGE: ["45"],
    ZIP_CODE: ["94086"],
    FAVE_SPORTS: ["I do not like Sports"]
})

probabilities = model.predict(transform_pd_X(test_df, SELECTED_INP_COLS))
predicted_rating, confidence = np.argmax(probabilities), np.max(probabilities)

PredictionReport(probabilities, predicted_rating, confidence)

PredictionReport(probabilities=array([[0.9737386 , 0.0085956 , 0.01389753, 0.00218609, 0.00158213]],
      dtype=float32), predicted_rating=0, confidence=0.9737386)

## Rough

### Featurize using Feature Columns

Create feature columns like one-hot, embeddings, bucketing from raw features created earlier

In [38]:
EXAMPLE_BATCH = next(iter(input_fn_train(3)))[0]

In [39]:
EXAMPLE_BATCH

<tf.Tensor: shape=(3, 142), dtype=float32, numpy=
array([[-0.8069594 ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  1.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
 

In [40]:
def test_feature_column(feature_column):
    feature_layer = tf.keras.layers.DenseFeatures(feature_column)
    return feature_layer(EXAMPLE_BATCH).numpy()

In [41]:
age_fc = tf.feature_column.numeric_column(AGE, normalizer_fn=lambda x: (x - MEAN_AGE) / STD_AGE)

In [42]:
zip_fcs = [
    tf.feature_column.indicator_column(
        tf.feature_column.categorical_column_with_vocabulary_list(
            f"{ZIP_CODE}{i}", vocabulary_list=list(string.digits), 
            num_oov_buckets=1)
    )
    for i in range(FIRST_K_ZIP_DIGITS)
]

In [43]:
EXAMPLE_BATCH[AGE], test_feature_column(age_fc)

TypeError: Only integers, slices (`:`), ellipsis (`...`), tf.newaxis (`None`) and scalar tf.int32/tf.int64 tensors are valid indices, got 'Age'

In [None]:
{k: v for k, v in EXAMPLE_BATCH.items() if k.startswith(ZIP_CODE)}, test_feature_column(zip_fcs)

In [None]:
tf.keras.layers.concatenate(age_fc, zip_fcs[0])