In [5]:
# Switch TensorFlow into eager mode up front; code further down in this
# notebook calls `.numpy()` on tensors, which needs eager execution.
import tensorflow as tf
tf.enable_eager_execution()
print(tf.__version__)  # the recorded run of this cell printed 1.15.0

1.15.0


# Deployment package

In [120]:
def dynamic_padding(inp, min_size=100):
    """Right-pad a 1-D string tensor with " " until it has `min_size` entries.

    Args:
        inp: 1-D string tensor (in this notebook: one character per entry).
        min_size: Minimum number of entries in the returned tensor.

    Returns:
        `inp` followed by as many " " entries as needed to reach `min_size`.
        An input that already has `min_size` or more entries is returned
        with no padding added.
    """
    # https://stackoverflow.com/questions/42334646/tensorflow-pad-unknown-size-tensor-to-a-specific-size
    # Clamp at zero: tf.pad rejects negative paddings, which the plain
    # subtraction produces whenever `inp` is already longer than `min_size`.
    pad_size = tf.maximum(min_size - tf.shape(inp)[0], 0)
    # Pad behind the name with spaces to align with the padding from
    # to_tensor(default_value=" ").
    paddings = [[0, pad_size]]
    return tf.pad(inp, paddings, mode="CONSTANT", constant_values=" ")

def x_preprocess(x, filter_size=100):
    """Encode a batch of names as fixed-width rows of letter indices.

    Each name is lower-cased, split into characters, padded with spaces or
    truncated to exactly `filter_size` characters, and every character is
    mapped to a number: a=1 ... z=26, anything else (spaces, digits,
    punctuation, non-ASCII characters) becomes 0.

    Args:
        x: Batch of names (list or 1-D string tensor).
        filter_size: Number of characters kept per name.

    Returns:
        float32 tensor of shape [len(x), filter_size].
    """
    chars = tf.strings.lower(x)
    chars = tf.strings.unicode_split(chars, input_encoding="UTF-8").to_tensor(default_value=" ")

    # Pad short batches up to filter_size, truncate long ones down to it.
    chars = tf.cond(
        tf.less(tf.shape(chars)[1], filter_size),
        true_fn=lambda: tf.map_fn(lambda name_chars: dynamic_padding(name_chars, filter_size), chars),
        false_fn=lambda: chars[:, :filter_size],
    )

    # Convert to numbers. Every entry holds exactly one character, so after
    # decoding the innermost axis has length 1 and index 0 selects its code point.
    codes = tf.strings.unicode_decode(chars, 'UTF-8').to_tensor()[:, :, 0] - 96  # make a=1

    # Zero out everything that is not a-z (e.g. space: 32 - 96 = -64).
    # Done as a single vectorised select instead of a per-character Python
    # branch, so it does not rely on eager-mode truthiness of tensors.
    out_of_range = tf.logical_or(tf.less(codes, 0), tf.greater(codes, 26))
    codes = tf.where(out_of_range, tf.zeros_like(codes), codes)

    return tf.cast(codes, tf.float32)

def to_tensor_format(input_name):
    """Encode every name in `input_name` with `x_preprocess`, one name at a time.

    Args:
        input_name: Sequence of names accepted by `tf.constant`.

    Returns:
        float32 tensor stacking the per-name results of `x_preprocess`.
        Because each name is passed as a one-element batch, every stacked
        entry keeps that leading batch axis of size 1.
    """
    names = tf.constant(input_name)

    def encode_single(name):
        # x_preprocess works on batches, so wrap the scalar name in a list.
        return x_preprocess([name])

    return tf.map_fn(encode_single, names, dtype=tf.float32)

# SavedModels already loaded in this kernel, keyed by path. The whole loaded
# object is kept (not just the signature) so it stays referenced for as long
# as its serving function is in use.
_LOADED_MODELS = {}


def _get_serving_fn(model_dir):
    """Return the "serving_default" signature of the SavedModel at `model_dir`, loading it once."""
    if model_dir not in _LOADED_MODELS:
        _LOADED_MODELS[model_dir] = tf.saved_model.load_v2(model_dir)
    return _LOADED_MODELS[model_dir].signatures["serving_default"]


def predict(instances, **kwargs):
    """Predict the gender for each name in `instances`.

    Args:
        instances: List of name strings, e.g. ["leo", "shilpa"].
        **kwargs: Accepted and ignored.

    Returns:
        dict with two parallel lists, one entry per input name:
          "gender":      "f" or "m"
          "probability": the model's confidence in the returned class.
        Both lists are empty when `instances` is empty.
    """
    # Nothing to score; also avoids building a tensor from an empty list,
    # which would not have string dtype.
    if len(instances) == 0:
        return {"gender": [], "probability": []}

    # The model is loaded on the first call only; later calls reuse it
    # instead of fetching it from GCS again.
    serving_fn = _get_serving_fn("gs://leo-models/gender_prediction/models/2/")

    # Input Pre-Process
    x_processed = to_tensor_format(instances)

    # Predict: one model call per name, then flatten to one score per name.
    predictions = tf.map_fn(lambda x: serving_fn(x)["dense"], x_processed)
    predictions = tf.reshape(predictions, [-1])

    # Classes
    class_names = tf.constant(["f", "m"], dtype=tf.string)

    # Predictions are output from sigmoid so float32 in range 0 -> 1
    # Round to integers for predicted class and string lookup for class name
    prediction_integers = tf.cast(tf.math.round(predictions), tf.int32)
    predicted_classes = tf.gather(class_names, prediction_integers)

    # Convert the sigmoid output into the probability of the predicted class:
    # 1 (male) keeps the sigmoid output p, 0 (female) becomes 1.0 - p.
    class_probability = tf.where(tf.less(predictions, 0.5), 1.0 - predictions, predictions)

    return {
        "gender": [gender.decode("utf-8") for gender in predicted_classes.numpy().tolist()],
        "probability": class_probability.numpy().tolist()
    }

In [121]:
# Smoke test: run the local predict() on a single name and display the result.
reply = predict(["leo"])
reply

{'gender': ['m'], 'probability': [0.905020534992218]}

In [122]:
# Same check with two names in one call; the result lists follow input order.
reply = predict(["leo", "shilpa"])
reply

{'gender': ['m', 'f'], 'probability': [0.905020534992218, 0.8464187979698181]}

# HTTP Request

In [1]:
from google.auth.transport import requests
from google.oauth2 import service_account

# Construct service account credentials using the service account key file.
credentials = service_account.Credentials.from_service_account_file('../credentials/ds-api-user.json')
credentials = credentials.with_scopes(['https://www.googleapis.com/auth/cloud-platform'])

# Create a requests Session object with the credentials.
session = requests.AuthorizedSession(credentials)

# Make an authenticated API request.
url = "https://ml.googleapis.com/v1/projects/toped-ds-sandbox/models/name_gender_prediction:predict"
# The request body is called `payload`, not `json`, so it cannot shadow the
# stdlib `json` module that another cell of this notebook imports and uses.
payload = {"instances":[{"name":"stephen leo"}, {"name":"marie stephen leo"}]}
response = session.post(url, json=payload)
print(response.json())

{'predictions': {'probability': [0.9067956805229187, 0.5227343440055847], 'gender': ['m', 'f']}}


# Create a JSON file with ~100 names for scalability testing

In [1]:
import pandas as pd

In [2]:
# Load the training names; the cells below read its "name" and "gender" columns.
names_df = pd.read_csv("../data/train.csv")

In [11]:
# Draw 50 female and 50 male names, then shuffle them together.
# The seed is fixed so a fresh kernel run reproduces the same 100 names
# (and therefore the same test file).
SAMPLE_SEED = 42
f_df = names_df[names_df["gender"]=="f"].sample(n=50, random_state=SAMPLE_SEED)
m_df = names_df[names_df["gender"]=="m"].sample(n=50, random_state=SAMPLE_SEED)
names_sampled_df = pd.concat([f_df, m_df]).sample(frac=1, random_state=SAMPLE_SEED)

In [39]:
import json

# Wrap the sampled names as {"instances": [{"name": ...}, ...]} and write
# that request body to disk.
name_records = names_sampled_df[["name"]].to_dict("records")
names_dict = {"instances": name_records}
with open("../data/names_100.json", "w") as out_file:
    json.dump(names_dict, out_file)

# Test the accuracy on an open-source Indonesian (ID) name-gender dataset

In [5]:
# Dataset: https://github.com/vck/indonesian-name-gender-dataset
import pandas as pd
import numpy as np

In [2]:
# Load the labelled names and wrap the "name" column as
# {"instances": [{"name": ...}, ...]} for the prediction request.
names_df = pd.read_csv("../data/opensource_id_name_gender.csv")
name_records = names_df[["name"]].to_dict(orient="records")
names_dict = {"instances": name_records}

In [3]:
# API call: send every name in names_dict to the deployed model.
from google.auth.transport import requests
from google.oauth2 import service_account

# Service-account credentials from the key file, limited to the
# cloud-platform scope.
credentials = (
    service_account.Credentials
    .from_service_account_file('../credentials/ds-api-user.json')
    .with_scopes(['https://www.googleapis.com/auth/cloud-platform'])
)

# Session that attaches those credentials to each request.
session = requests.AuthorizedSession(credentials)

# Authenticated prediction request; `response` is read again by the accuracy check.
url = "https://ml.googleapis.com/v1/projects/toped-ds-sandbox/models/name_gender_prediction:predict"
response = session.post(url, json=names_dict)
print(response.json())

{'predictions': {'probability': [0.9462770819664001, 0.9425076842308044, 0.9759393930435181, 0.9757801294326782, 0.9566485285758972, 0.926899790763855, 0.814211368560791, 0.9690533876419067, 0.9796415567398071, 0.9440237283706665, 0.907305121421814, 0.9769575595855713, 0.9721669554710388, 0.8664155602455139, 0.8885946273803711, 0.9823637008666992, 0.9568223357200623, 0.9706600308418274, 0.9398284554481506, 0.9531610012054443, 0.7496070265769958, 0.9583103656768799, 0.9307056069374084, 0.9142642617225647, 0.9731149673461914, 0.97516930103302, 0.8685902953147888, 0.9249585270881653, 0.9176986217498779, 0.9401564598083496, 0.7543016672134399, 0.9426888227462769, 0.9833259582519531, 0.5924967527389526, 0.9788156747817993, 0.8972565531730652, 0.9445080757141113, 0.9481704235076904, 0.9694526791572571, 0.7925046682357788, 0.9142762422561646, 0.8615747094154358, 0.9744142889976501, 0.9191139936447144, 0.9412290453910828, 0.9155130982398987, 0.8980475068092346, 0.9845119714736938, 0.8710614442

In [7]:
# Accuracy check: attach the predicted labels to the frame and compute the
# share of rows where the prediction equals the reference label.
names_df["pred_gender"] = response.json()["predictions"]["gender"]
is_correct = names_df["gender"] == names_df["pred_gender"]
accuracy = is_correct.sum() / len(names_df)
print("Accuracy on ID names-gender open source data: {:.2f}%".format(accuracy*100))

Accuracy on ID names-gender open source data: 95.46%
