In [None]:
import os
import math
from zipfile import ZipFile
from urllib.request import urlretrieve
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.layers import StringLookup
import torch

Loading the dataset

In [None]:
# Interaction table (user_id, recipe_id, date, rating, review) and recipe table.
interactions_csv_path = "drive/MyDrive/RAW_interactions.csv"
recipes_csv_path = "drive/MyDrive/RAW_recipes.csv"

users = pd.read_csv(interactions_csv_path)
recipes = pd.read_csv(recipes_csv_path)

In [None]:
users.columns

Index(['user_id', 'recipe_id', 'date', 'rating', 'review'], dtype='object')

In [None]:
recipes.columns

Index(['name', 'id', 'minutes', 'contributor_id', 'submitted', 'tags',
       'nutrition', 'n_steps', 'steps', 'description', 'ingredients',
       'n_ingredients'],
      dtype='object')

Preprocessing the dataset

In [None]:
# Turn both id columns into prefixed string keys and store ratings as floats.
users["user_id"] = users["user_id"].map(lambda raw_id: f"user_{raw_id}")
users["recipe_id"] = users["recipe_id"].map(lambda raw_id: f"recipe_{raw_id}")
users["rating"] = users["rating"].map(float)

In [None]:
# The recipe key gets the same "recipe_" prefix as users["recipe_id"] so the
# two tables can be matched on it.
recipes["id"] = recipes["id"].apply(lambda x: f"recipe_{x}")

# The remaining descriptors become string categories too, each with its OWN
# prefix.  They previously all reused "recipe_", so e.g. a value of 30 in
# `minutes` turned into "recipe_30" and looked exactly like a recipe key.
for numeric_feature in ("minutes", "n_steps", "n_ingredients"):
    recipes[numeric_feature] = recipes[numeric_feature].apply(
        lambda x, prefix=numeric_feature: f"{prefix}_{x}"
    )

Treating the tags and the ingredients as one-hot encoded features for the model.

In [None]:
# Vocabulary of tag tokens: every distinct comma-separated piece found in the
# `tags` column.  Tokens are kept exactly as produced by the split.
recipe_tags = {
    token
    for tags_string in recipes.tags
    for token in tags_string.split(',')
}

In [None]:
# One 0/1 indicator column per tag token.
#
# `recipe_tags` was built by splitting `tags` on ",", so membership has to be
# tested against the same "," split.  The previous "|" split compared each tag
# with the whole unsplit string and therefore produced (almost) all-zero
# columns.
# NOTE(review): tokens are compared verbatim, including any whitespace or
# punctuation the "," split leaves on them -- confirm that is intended.
#
# The split is done once per row (not once per row *per tag*), and all columns
# are attached in a single concat instead of one insert per tag.
tag_token_sets = recipes["tags"].apply(lambda values: set(values.split(",")))
tag_indicator_columns = {
    tag: tag_token_sets.apply(lambda tokens, tag=tag: int(tag in tokens))
    for tag in recipe_tags
}
recipes = pd.concat(
    [
        # Dropping first keeps the cell re-runnable: an existing column with
        # the same name is replaced, exactly like `recipes[tag] = ...` did.
        recipes.drop(columns=list(tag_indicator_columns), errors="ignore"),
        pd.DataFrame(tag_indicator_columns, index=recipes.index),
    ],
    axis=1,
)

  recipes[tag] = recipes["tags"].apply(


In [None]:
# Vocabulary of ingredient tokens: every distinct comma-separated piece found
# in the `ingredients` column.
recipe_ig = {
    token
    for ingredients_string in recipes.ingredients
    for token in ingredients_string.split(",")
}

# One 0/1 indicator column per ingredient token.  Membership is tested against
# the same "," split that built the vocabulary; the previous "|" split compared
# each token with the whole unsplit string, so the columns were (almost) all 0.
# NOTE(review): tokens are compared verbatim, including any whitespace or
# punctuation the "," split leaves on them -- confirm that is intended.
ingredient_token_sets = recipes["ingredients"].apply(
    lambda values: set(values.split(","))
)
ingredient_indicator_columns = {
    ig: ingredient_token_sets.apply(lambda tokens, ig=ig: int(ig in tokens))
    for ig in recipe_ig
}
recipes = pd.concat(
    [
        # Replace (rather than duplicate) any existing column with the same
        # name, matching what `recipes[ig] = ...` did and keeping the cell
        # re-runnable.
        recipes.drop(columns=list(ingredient_indicator_columns), errors="ignore"),
        pd.DataFrame(ingredient_indicator_columns, index=recipes.index),
    ],
    axis=1,
)

In [None]:
# Sort the interactions by `date`, then group them per user so every user's
# history can be collected as ordered lists.
ratings_group = users.sort_values(by=["date"]).groupby("user_id")

user_keys = list(ratings_group.groups.keys())
recipe_histories = list(ratings_group["recipe_id"].apply(list))
rating_histories = list(ratings_group["rating"].apply(list))
date_histories = list(ratings_group["date"].apply(list))

# One row per user; the three list columns are parallel to each other.
ratings_data = pd.DataFrame(
    data={
        "user_id": user_keys,
        "recipe_ids": recipe_histories,
        "ratings": rating_histories,
        "timestamps": date_histories,
    }
)

In [None]:
ratings_data

Creating the recipe sequences per user (each sequence is 4 recipes long)

In [None]:
# Number of consecutive interactions in each window passed to create_sequences().
sequence_length = 4
# Offset between the start positions of two consecutive windows.
step_size = 2


def create_sequences(values, window_size, step_size):
    """Cut `values` into fixed-length, possibly overlapping windows.

    Windows start at 0, step_size, 2 * step_size, ... for as long as a full
    window fits.  If the last of those windows does not reach the end of
    `values`, one extra window covering the final `window_size` items is
    appended so the most recent items are never dropped.

    Unlike the earlier implementation, the trailing window is only added when
    it is not already present: when the regular windows tile `values` exactly
    (e.g. 4 items with window 4), the tail used to be emitted twice.

    Args:
        values: sliceable sequence (e.g. a list) of items in order.
        window_size: number of items per window; must be positive.
        step_size: offset between consecutive window starts; must be positive.

    Returns:
        A list of slices of `values`, each of length `window_size`.  Empty
        when `values` holds fewer than `window_size` items.

    Raises:
        ValueError: if `window_size` or `step_size` is not positive (either
            would previously loop forever).
    """
    if window_size <= 0 or step_size <= 0:
        raise ValueError("window_size and step_size must be positive")

    last_start = len(values) - window_size
    if last_start < 0:
        # Not enough items for even one full window.
        return []

    starts = list(range(0, last_start + 1, step_size))
    # Add the tail-aligned window only if the regular grid did not land on it.
    if starts[-1] != last_start:
        starts.append(last_start)

    return [values[start:start + window_size] for start in starts]


# Window both per-user histories with the same parameters so that the recipe
# windows and the rating windows stay position-aligned.
for history_column in ("recipe_ids", "ratings"):
    ratings_data[history_column] = ratings_data[history_column].apply(
        lambda history: create_sequences(history, sequence_length, step_size)
    )

# The timestamps column is not referenced after this point.
ratings_data.drop(columns=["timestamps"], inplace=True)

In [None]:
ratings_data

In [None]:
# Flatten the per-user lists of windows into one row per (user, window).
# Both frames are exploded with a fresh index so they can be glued side by side.
ratings_data_movies = ratings_data[["user_id", "recipe_ids"]].explode(
    "recipe_ids", ignore_index=True
)
ratings_data_rating = ratings_data[["ratings"]].explode("ratings", ignore_index=True)
ratings_data_transformed = pd.concat([ratings_data_movies, ratings_data_rating], axis=1)

# Users whose history is shorter than one window have an empty window list,
# which explodes to NaN; drop those rows.  `.copy()` makes the result an
# independent frame so the column assignments below do not write to a slice.
ratings_data_transformed = ratings_data_transformed[
    ratings_data_transformed["recipe_ids"].notna()
].copy()

# Serialise each window as a comma-joined string.  The CSV loader splits both
# columns on ","; leaving the recipe ids as Python lists would write their
# list repr to the CSV, and the pieces would no longer be plain "recipe_<id>"
# tokens.
ratings_data_transformed["recipe_ids"] = ratings_data_transformed["recipe_ids"].apply(
    lambda window: ",".join(window)
)
ratings_data_transformed["ratings"] = ratings_data_transformed["ratings"].apply(
    lambda window: ",".join([str(value) for value in window])
)

ratings_data_transformed = ratings_data_transformed.rename(
    columns={"recipe_ids": "sequence_recipe_ids", "ratings": "sequence_ratings"},
)

In [None]:
ratings_data_transformed

Splitting the data into train and test sets

In [None]:
# Random row-wise split: a row goes to the training set when its uniform draw
# is <= 0.85, otherwise to the test set.  The generator is seeded so that
# Restart & Run All reproduces the same split (it used to change on every run).
# NOTE(review): windows of one user overlap, so a row-wise split can place
# near-identical windows on both sides -- consider splitting per user.
SPLIT_SEED = 42
split_rng = np.random.default_rng(SPLIT_SEED)
random_selection = split_rng.random(len(ratings_data_transformed.index)) <= 0.85
train_data = ratings_data_transformed[random_selection]
test_data = ratings_data_transformed[~random_selection]

# Written without header and with "|" as the field separator, because the
# window columns themselves contain commas.
train_data.to_csv("train_data.csv", index=False, sep="|", header=False)
test_data.to_csv("test_data.csv", index=False, sep="|", header=False)

In [None]:
# Column names of the header-less CSV files, in the order they were written.
CSV_HEADER = ratings_data_transformed.columns.tolist()

# Distinct string keys per categorical feature, used as lookup vocabularies.
# Both vocabularies come from the interaction table.
CATEGORICAL_FEATURES_WITH_VOCABULARY = dict(
    user_id=users["user_id"].unique().tolist(),
    recipe_id=users["recipe_id"].unique().tolist(),
    # "minutes": list(recipes.minutes.unique()),
    # "n_steps": list(recipes.n_steps.unique()),
    # "n_ingredients": list(recipes.n_ingredients.unique()),
)

# USER_FEATURES = ["sex", "age_group", "occupation"]

# RECIPE_FEATURES = ["minutes","n_steps","n_ingredients"]

Initializing the model

In [None]:
def get_dataset_from_csv(csv_file_path, shuffle=False, batch_size=128):
    """Build a batched `tf.data` dataset of (features, target) from a split CSV.

    The file is read without a header, using "|" as field delimiter and
    `CSV_HEADER` as column names, for a single epoch.

    Args:
        csv_file_path: path (or file pattern) handed to `make_csv_dataset`.
        shuffle: whether `make_csv_dataset` should shuffle the rows.
        batch_size: number of rows per batch.

    Returns:
        A dataset whose elements are `(features, target)`; see `process`.
    """

    def process(features):
        """Split the two window columns and peel the last position off as target."""
        # "a,b,c,d" -> dense string tensor with one column per position.
        id_tokens = tf.strings.split(features["sequence_recipe_ids"], ",").to_tensor()

        # The last recipe id of each window is the recipe whose rating is predicted.
        features["target_recipe_id"] = id_tokens[:, -1]
        features["sequence_recipe_ids"] = id_tokens[:, :-1]

        rating_tokens = tf.strings.split(features["sequence_ratings"], ",")
        rating_values = tf.strings.to_number(
            rating_tokens, tf.dtypes.float32
        ).to_tensor()

        # The last rating of each window is the label; the rest stay as features.
        target = rating_values[:, -1]
        features["sequence_ratings"] = rating_values[:, :-1]

        return features, target

    raw_batches = tf.data.experimental.make_csv_dataset(
        csv_file_path,
        batch_size=batch_size,
        column_names=CSV_HEADER,
        num_epochs=1,
        header=False,
        field_delim="|",
        shuffle=shuffle,
    )

    return raw_batches.map(process)

In [None]:
def create_model_inputs():
    """Return the Keras `Input` layers of the model, keyed by feature name.

    The two sequence inputs hold `sequence_length - 1` positions, because the
    final position of every window is fed separately as `target_recipe_id`
    (and its rating is the label).
    """
    history_length = sequence_length - 1
    # (feature name, number of positions, dtype) -- in model-input order.
    input_specs = (
        ("user_id", 1, tf.string),
        ("sequence_recipe_ids", history_length, tf.string),
        ("target_recipe_id", 1, tf.string),
        ("sequence_ratings", history_length, tf.float32),
        # "minutes", "n_steps" and "n_ingredients" string inputs are currently
        # disabled.
    )
    return {
        feature: layers.Input(name=feature, shape=(width,), dtype=dtype)
        for feature, width, dtype in input_specs
    }

Encoding the input features for the recipe

In [None]:
def encode_input_features(
    inputs,
    include_user_id=True,
    include_user_features=False,
    include_recipe_features=True,
):
    """Turn the raw model inputs into embedded tensors.

    Args:
        inputs: dict of Keras inputs as returned by `create_model_inputs()`.
        include_user_id: when true, `user_id` is embedded and returned as the
            "other" (non-sequence) feature.
        include_user_features: accepted for interface compatibility; not used
            by this function.
        include_recipe_features: when true, every recipe embedding is
            concatenated with that recipe's fixed tag and ingredient indicator
            vectors and projected back to the embedding width by a Dense layer.

    Returns:
        `(encoded_transformer_features, encoded_other_features)`: the first is
        the rated recipe sequence (position-encoded and scaled by its ratings)
        followed by the target recipe, concatenated along axis 1; the second
        is the user embedding, or `None` when no other feature is enabled.
    """
    # With mask_token=None and one OOV bucket, StringLookup maps unknown
    # strings to index 0 and vocabulary entry i to index i + 1.  Every table
    # indexed by those ids therefore needs len(vocabulary) + 1 rows; sizing
    # them with len(vocabulary) put the last vocabulary entry out of range.
    num_oov_indices = 1

    encoded_transformer_features = []
    encoded_other_features = []

    other_feature_names = []
    if include_user_id:
        other_feature_names.append("user_id")
    # if not include_recipe_features:
    #     other_feature_names.extend(RECIPE_FEATURES)

    ## Encode the non-sequence features.
    for feature_name in other_feature_names:
        # Convert the string input values into integer indices.
        vocabulary = CATEGORICAL_FEATURES_WITH_VOCABULARY[feature_name]
        idx = StringLookup(
            vocabulary=vocabulary, mask_token=None, num_oov_indices=num_oov_indices
        )(inputs[feature_name])
        # Embedding width grows with the square root of the vocabulary size.
        embedding_dims = int(math.sqrt(len(vocabulary)))
        embedding_encoder = layers.Embedding(
            input_dim=len(vocabulary) + num_oov_indices,
            output_dim=embedding_dims,
            name=f"{feature_name}_embedding",
        )
        # Convert the index values to embedding representations.
        encoded_other_features.append(embedding_encoder(idx))

    ## Collapse the non-sequence features into a single tensor (or None).
    if len(encoded_other_features) > 1:
        encoded_other_features = layers.concatenate(encoded_other_features)
    elif len(encoded_other_features) == 1:
        encoded_other_features = encoded_other_features[0]
    else:
        encoded_other_features = None

    ## Recipe id -> index -> trainable embedding.
    recipe_vocabulary = CATEGORICAL_FEATURES_WITH_VOCABULARY["recipe_id"]
    recipe_embedding_dims = int(math.sqrt(len(recipe_vocabulary)))
    recipe_index_lookup = StringLookup(
        vocabulary=recipe_vocabulary,
        mask_token=None,
        num_oov_indices=num_oov_indices,
        name="recipe_index_lookup",
    )
    recipe_embedding_encoder = layers.Embedding(
        input_dim=len(recipe_vocabulary) + num_oov_indices,
        output_dim=recipe_embedding_dims,
        name="recipe_embedding",
    )

    if include_recipe_features:
        # Index the recipe table by its string key so rows can be put into
        # lookup-index order (they used to be taken in `recipes` row order,
        # which does not match the order of the recipe vocabulary).
        recipes_by_id = recipes.drop_duplicates(subset="id").set_index("id")

        def frozen_indicator_lookup(columns, name):
            """Non-trainable table: lookup index -> indicator vector of a recipe."""
            # sorted(): a DataFrame cannot be indexed with a set, and a fixed
            # column order keeps the table layout stable between runs.
            vectors = (
                recipes_by_id[sorted(columns)]
                # Row i <-> recipe_vocabulary[i]; recipes that are missing
                # from the recipe table get an all-zero vector.
                .reindex(recipe_vocabulary, fill_value=0)
                .to_numpy()
            )
            # Leading all-zero row(s) for the out-of-vocabulary index.
            oov_rows = np.zeros((num_oov_indices, vectors.shape[1]), dtype=vectors.dtype)
            vectors = np.vstack([oov_rows, vectors])
            return layers.Embedding(
                input_dim=vectors.shape[0],
                output_dim=vectors.shape[1],
                embeddings_initializer=tf.keras.initializers.Constant(vectors),
                trainable=False,
                name=name,
            )

        recipe_tags_lookup = frozen_indicator_lookup(recipe_tags, "tags_vector")
        recipe_ig_lookup = frozen_indicator_lookup(recipe_ig, "ig_vector")
        # Projects [embedding | tags | ingredients] back to the embedding width.
        recipe_embedding_processor = layers.Dense(
            units=recipe_embedding_dims,
            activation="relu",
            name="process_recipe_embedding_with_tags",
        )

    def encode_recipe(recipe_id):
        """Embed a tensor of recipe id strings (any number of positions)."""
        recipe_idx = recipe_index_lookup(recipe_id)
        recipe_embedding = recipe_embedding_encoder(recipe_idx)
        encoded_recipe = recipe_embedding
        if include_recipe_features:
            recipe_tags_vector = recipe_tags_lookup(recipe_idx)
            recipe_ig_vector = recipe_ig_lookup(recipe_idx)
            encoded_recipe = recipe_embedding_processor(
                layers.concatenate(
                    [recipe_embedding, recipe_tags_vector, recipe_ig_vector]
                )
            )
        return encoded_recipe

    ## Encode the target recipe and the rated recipe sequence.
    encoded_target_recipe = encode_recipe(inputs["target_recipe_id"])
    encoded_sequence_recipes = encode_recipe(inputs["sequence_recipe_ids"])

    # Learned positional embedding for the sequence_length - 1 rated positions.
    position_embedding_encoder = layers.Embedding(
        input_dim=sequence_length,
        output_dim=recipe_embedding_dims,
        name="position_embedding",
    )
    positions = tf.range(start=0, limit=sequence_length - 1, delta=1)
    encoded_positions = position_embedding_encoder(positions)
    # Trailing axis so each rating broadcasts over the embedding dimension.
    sequence_ratings = tf.expand_dims(inputs["sequence_ratings"], -1)
    # Add the positional encoding to each recipe encoding, then scale by its rating.
    encoded_sequence_recipes_with_position_and_rating = layers.Multiply()(
        [(encoded_sequence_recipes + encoded_positions), sequence_ratings]
    )

    # Transformer input: every rated position, followed by the target recipe.
    for encoded_recipe in tf.unstack(
        encoded_sequence_recipes_with_position_and_rating, axis=1
    ):
        encoded_transformer_features.append(tf.expand_dims(encoded_recipe, 1))
    encoded_transformer_features.append(encoded_target_recipe)

    encoded_transformer_features = layers.concatenate(
        encoded_transformer_features, axis=1
    )

    return encoded_transformer_features, encoded_other_features

Creating the model

In [None]:
# Feature switches that create_model() forwards to encode_input_features().
include_user_id = False
include_user_features = False
include_recipe_features = False

# Widths of the fully-connected layers stacked after the transformer block.
hidden_units = [256, 128]
# Dropout rate used by the attention layer, the transformer block and the dense head.
dropout_rate = 0.1
# Number of heads of the MultiHeadAttention layer.
num_heads = 3


def create_model():
    """Assemble the rating-prediction model.

    The encoded recipe sequence goes through one self-attention transformer
    block, is flattened, optionally joined with the non-sequence features, and
    is fed through the dense head described by `hidden_units`; the output is a
    single linear unit.

    Returns:
        An uncompiled `keras.Model` mapping the input dict to one value.
    """
    inputs = create_model_inputs()
    print(inputs)
    sequence_tokens, side_features = encode_input_features(
        inputs, include_user_id, include_user_features, include_recipe_features
    )
    print(sequence_tokens)

    # Self-attention over the sequence; the key size equals the token width.
    token_width = sequence_tokens.shape[2]
    self_attention = layers.MultiHeadAttention(
        num_heads=num_heads, key_dim=token_width, dropout=dropout_rate
    )
    attended = self_attention(sequence_tokens, sequence_tokens)

    # First residual branch: dropout -> add -> layer norm.
    attended = layers.Dropout(dropout_rate)(attended)
    residual = layers.Add()([sequence_tokens, attended])
    residual = layers.LayerNormalization()(residual)

    # Second residual branch: activation -> width-preserving dense -> dropout.
    projected = layers.LeakyReLU()(residual)
    projected = layers.Dense(units=projected.shape[-1])(projected)
    projected = layers.Dropout(dropout_rate)(projected)
    block_output = layers.Add()([residual, projected])
    block_output = layers.LayerNormalization()(block_output)
    features = layers.Flatten()(block_output)

    # Append the non-sequence features, flattened to one vector per example.
    if side_features is not None:
        side_vector = layers.Reshape([side_features.shape[-1]])(side_features)
        features = layers.concatenate([features, side_vector])

    # Dense head: Dense -> BatchNorm -> LeakyReLU -> Dropout per entry.
    for width in hidden_units:
        features = layers.Dense(width)(features)
        features = layers.BatchNormalization()(features)
        features = layers.LeakyReLU()(features)
        features = layers.Dropout(dropout_rate)(features)

    outputs = layers.Dense(units=1)(features)
    return keras.Model(inputs=inputs, outputs=outputs)


# Build the model with the module-level feature switches and hyper-parameters.
model = create_model()

Setting the metrics (MAE, NDCG and MRR) and the loss function for training the model. Note that the loss actually compiled below is mean squared error; MAE is tracked as a metric.

In [None]:
# Not used by this cell; left in place in case other cells rely on the name.
from keras.metrics import top_k_categorical_accuracy

# The ranking metrics below come from tensorflow_ranking.  Its import had been
# commented out while `tfr` was still referenced, so the cell failed with a
# NameError.  The import is optional: without the package the model is trained
# and evaluated with MAE only.
try:
    import tensorflow_ranking as tfr
except ImportError:
    tfr = None
    print("tensorflow_ranking is not installed; skipping the NDCG / MRR metrics.")

mae_metric = keras.metrics.MeanAbsoluteError()

eval_metrics = []
if tfr is not None:
    eval_metrics = [
        tfr.keras.metrics.get(key="ndcg", name="metric/ndcg", ragged=False),
        tfr.keras.metrics.get(key="mrr", name="metric/mrr", ragged=False),
    ]

model.compile(
    optimizer=keras.optimizers.Adagrad(learning_rate=0.01),
    loss=keras.losses.MeanSquaredError(),
    # One flat list of metrics.  'top_k_categorical_accuracy' was dropped: it
    # is a classification metric, while the model has a single regression
    # output.
    metrics=[mae_metric, *eval_metrics],
)

# Read the training data.
train_dataset = get_dataset_from_csv("train_data.csv", shuffle=True, batch_size=265)

# Fit the model with the training data.
model.fit(train_dataset, epochs=5)

# Read the test data.
test_dataset = get_dataset_from_csv("test_data.csv", batch_size=265)

# Evaluate the model on the test data.  evaluate() reports the loss plus every
# compiled metric, so the result is read by name rather than unpacked into two
# variables (which failed whenever more than one metric was compiled).
test_results = model.evaluate(test_dataset, verbose=0, return_dict=True)
test_mae = test_results[mae_metric.name]
print(f"Test MAE: {round(test_mae, 3)}")


Evaluating the test set

In [None]:
# Displays the test loss followed by the compiled metrics, in compile order.
model.evaluate(test_dataset, verbose=0)

# Values recorded from an earlier run -- presumably loss, MAE, NDCG and MRR;
# TODO confirm against the metric list compiled at that time:
# [0.9269737601280212,
#  0.5811803936958313,
#  0.9725525379180908 - ndcg,
#  0.9725525379180908]