In [8]:
from tensorflow.keras import layers
import math
import os
import math
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.layers.experimental.preprocessing import StringLookup
import json

In [9]:
f = open('mt_dict.json',)
mt_dict = json.load(f)
mt_dict = {int(k):v for k, v in mt_dict.items()}


f = open('wh_dict.json',)
wh_dict = json.load(f)
wh_dict = {int(k):v for k, v in wh_dict.items()}


f = open('wh_to_group.json',)
wh_to_group = json.load(f)
wh_to_group = {int(k):v for k, v in wh_to_group.items()}


f = open('wh_to_mid.json',)
wh_to_mid = json.load(f)
wh_to_mid = {int(k):v for k, v in wh_to_mid.items()}


In [10]:
CSV_HEADER = ['Wholesaler', 'material', 'hl_seq', 'group', 'mid']

CATEGORICAL_FEATURES_WITH_VOCABULARY = {
    "Wholesaler": list(wh_dict.values()),
    "material": list(mt_dict.values()),
    "group": list(np.unique(np.array(list(wh_to_group.values())))),
    "mid": list(np.unique(np.array(list(wh_to_mid.values())))),
#     "occupation": list(users.occupation.unique()),
}

USER_FEATURES = ["group", "mid"]

MATERIAL_FEATURES = ["deg_alc"]

#################
sequence_length = 4
step_size = 1
def create_model_inputs():
    return {
        "Wholesaler": layers.Input(name="Wholesaler", shape=(1,), dtype=tf.string),
        "material": layers.Input(
            name="material", shape=(sequence_length - 1,), dtype=tf.string
        ),
        "target_material": layers.Input(
            name="target_material", shape=(1,), dtype=tf.string
        ),
        "hl_seq": layers.Input(
            name="hl_seq", shape=(sequence_length - 1,), dtype=tf.float32
        ),
        "group": layers.Input(name="group", shape=(1,), dtype=tf.string),
        "mid": layers.Input(name="mid", shape=(1,), dtype=tf.string),
    }



############
def get_dataset_from_csv(csv_file_path, shuffle=False, batch_size=128):
    def process(features):
        material_string = features["material"]
        material = tf.strings.split(material_string, ",").to_tensor()

        # The last material id in the sequence is the target material.
        features["target_material"] = material[:, -1]
        features["material"] = material[:, :-1]

        hl_seq_string = features["hl_seq"]
        hl_seq = tf.strings.to_number(
            tf.strings.split(hl_seq_string, ","), tf.dtypes.float32
        ).to_tensor()

        # The last rating in the sequence is the target for the model to predict.
        target = (hl_seq[:, -1])
        features["hl_seq"] = hl_seq[:, :-1]

        return features, target

    dataset = tf.data.experimental.make_csv_dataset(
        csv_file_path,
        batch_size=batch_size,
        column_names=CSV_HEADER,
        num_epochs=1,
        header=False,
        field_delim="|",
        shuffle=shuffle,
    ).map(process)

    return dataset



def encode_input_features(
    inputs,
    include_user_id=True,
    include_user_features=True,
    include_material_features=True,
):

    encoded_transformer_features = []
    encoded_other_features = []

    other_feature_names = []
    if include_user_id:
        other_feature_names.append("Wholesaler")
    if include_user_features:
        other_feature_names.extend(USER_FEATURES)

    ## Encode user features
    for feature_name in other_feature_names:
        # Convert the string input values into integer indices.
        vocabulary = CATEGORICAL_FEATURES_WITH_VOCABULARY[feature_name]
        idx = StringLookup(vocabulary=vocabulary, mask_token=None, num_oov_indices=0)(
            inputs[feature_name]
        )
        # Compute embedding dimensions
        embedding_dims = int(math.sqrt(len(vocabulary)))
        # Create an embedding layer with the specified dimensions.
        embedding_encoder = layers.Embedding(
            input_dim=len(vocabulary),
            output_dim=embedding_dims,
            name=f"{feature_name}_embedding",
        )
        # Convert the index values to embedding representations.
        encoded_other_features.append(embedding_encoder(idx))

    ## Create a single embedding vector for the user features
    if len(encoded_other_features) > 1:
        encoded_other_features = layers.concatenate(encoded_other_features)
    elif len(encoded_other_features) == 1:
        encoded_other_features = encoded_other_features[0]
    else:
        encoded_other_features = None

    ## Create a material embedding encoder
    material_vocabulary = CATEGORICAL_FEATURES_WITH_VOCABULARY["material"]
    material_embedding_dims = int(math.sqrt(len(material_vocabulary)))
    # Create a lookup to convert string values to integer indices.
    material_index_lookup = StringLookup(
        vocabulary=material_vocabulary,
        mask_token=None,
        num_oov_indices=0,
        name="material_index_lookup",
    )
    # Create an embedding layer with the specified dimensions.
    material_embedding_encoder = layers.Embedding(
        input_dim=len(material_vocabulary),
        output_dim=material_embedding_dims,
        name=f"material_embedding",
    )
#     # Create a vector lookup for material features.
#     material_feature_vectors = deg_alc.to_numpy()
#     material_feature_lookup = layers.Embedding(
#         input_dim=material_feature_vectors.shape[0],
#         output_dim=material_feature_vectors.shape[1],
#         embeddings_initializer=tf.keras.initializers.Constant(material_feature_vectors),
#         trainable=False,
#         name="features_vector",
#     )
#     # Create a processing layer for genres.
#     material_embedding_processor = layers.Dense(
#         units=material_embedding_dims,
#         activation="relu",
#         name="process_material_embedding_with_genres",
#     )

    ## Define a function to encode a given material id.
    def encode_material(material_id):
        # Convert the string input values into integer indices.
        material_idx = material_index_lookup(material_id)
        material_embedding = material_embedding_encoder(material_idx)
        encoded_material = material_embedding
        if include_material_features:
            material_feature_vector = material_feature_lookup(material_idx)
            encoded_material = material_embedding_processor(
                layers.concatenate([material_embedding, material_feature_vector])
            )
        return encoded_material

    ## Encoding target_material_id
    target_material_id = inputs["target_material"]
    encoded_target_material = encode_material(target_material_id)

    ## Encoding sequence material_ids.
    sequence_material_ids = inputs["material"]
    encoded_sequence_material = encode_material(sequence_material_ids)
    # Create positional embedding.
    position_embedding_encoder = layers.Embedding(
        input_dim=sequence_length,
        output_dim=material_embedding_dims,
        name="position_embedding",
    )
    positions = tf.range(start=0, limit=sequence_length - 1, delta=1)
    encodded_positions = position_embedding_encoder(positions)
    # Retrieve sequence ratings to incorporate them into the encoding of the material.
    sequence_ratings = tf.expand_dims(inputs["hl_seq"], -1)
    # Add the positional encoding to the material encodings and multiply them by rating.
    encoded_sequence_material_with_poistion_and_rating = layers.Multiply()(
        [(encoded_sequence_material + encodded_positions), sequence_ratings]
    )

    # Construct the transformer inputs.
    for encoded_material in tf.unstack(
        encoded_sequence_material_with_poistion_and_rating, axis=1
    ):
        encoded_transformer_features.append(tf.expand_dims(encoded_material, 1))
    encoded_transformer_features.append(encoded_target_material)

    encoded_transformer_features = layers.concatenate(
        encoded_transformer_features, axis=1
    )

    return encoded_transformer_features, encoded_other_features


#################





include_user_id = True
include_user_features = True
include_material_features = False

hidden_units = [256, 128]
dropout_rate = 0.1
num_heads = 3


def create_model():
    inputs = create_model_inputs()
    transformer_features, other_features = encode_input_features(
        inputs, include_user_id, include_user_features, include_material_features
    )

    # Create a multi-headed attention layer.
    attention_output = layers.MultiHeadAttention(
        num_heads=num_heads, key_dim=transformer_features.shape[2], dropout=dropout_rate
    )(transformer_features, transformer_features)

    # Transformer block.
    attention_output = layers.Dropout(dropout_rate)(attention_output)
    x1 = layers.Add()([transformer_features, attention_output])
    x1 = layers.LayerNormalization()(x1)
    x2 = layers.LeakyReLU()(x1)
    x2 = layers.Dense(units=x2.shape[-1])(x2)
    x2 = layers.Dropout(dropout_rate)(x2)
    transformer_features = layers.Add()([x1, x2])
    transformer_features = layers.LayerNormalization()(transformer_features)
    features = layers.Flatten()(transformer_features)

    # Included the other features.
    if other_features is not None:
        features = layers.concatenate(
            [features, layers.Reshape([other_features.shape[-1]])(other_features)]
        )

    # Fully-connected layers.
    for num_units in hidden_units:
        features = layers.Dense(num_units)(features)
        features = layers.BatchNormalization()(features)
        features = layers.LeakyReLU()(features)
        features = layers.Dropout(dropout_rate)(features)

    outputs = layers.Dense(units=1)(features)
    model = keras.Model(inputs=inputs, outputs=outputs)
    return model


M = create_model()

################################


In [18]:
def transform_to_id(l, d):
    s=""
    for i in l:
        s+=d[i]+","
    return s[:-1]

def tune_HL(a):
    a=str(a)
    a=a.replace(", ", ",")
    a=a.replace("[", "")
    a=a.replace("]", "")
    return a


#########   Input  ##############
wh = 29606863; 
mat = [57005, 3372, 6013, 9974]; 
hl = [25.2,14.4,6.4,8.0]

##note: un-comment this if using log model
# hl = np.log(1+np.array(hl))

foo = {
    "Wholesaler": ["WH_1"],
    "material": [transform_to_id(mat, mt_dict)],
    "hl_seq": [tune_HL(hl)],
    "group": [wh_to_group[wh]],
    "mid": [wh_to_mid[wh]]
      }

In [19]:
pd.DataFrame(foo).to_csv("foo.csv", index=False, sep="|", header=False)


In [20]:
foo_dataset = get_dataset_from_csv("foo.csv", shuffle=False, batch_size=64)

In [24]:
M.load_weights("HL_model_log")
(M.predict(foo_dataset))

array([[1.7742944]], dtype=float32)