In [None]:
# Copyright 2023 Google LLC. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

<img align="left" width="150" src="https://services.google.com/fh/files/misc/feedgen_logo.png" alt="feedgen_logo" />

# FeedGen
**Optimise Google Shopping feeds with Generative AI**

**Disclaimer: This is not an official Google product.**

**FeedGen** is an open-source tool that uses Google Cloud's state-of-the-art Large Language Models (LLMs) in Vertex AI and novel prompt-tuning to generate optimised Google Shopping ad titles and descriptions. It helps merchants and advertisers surface and fix quality issues in their Shopping feeds using Generative AI in a configurable, user-friendly and privacy-preserving manner.

More information available at [github.com/google/feedgen](https://github.com/google/feedgen).

## Get Started

In [None]:
#@title Authenticate your user for this colab session
import logging
from google.colab import auth

# Authenticates the current Colab user; later cells obtain credentials via
# `google.auth.default()` and the Google Cloud client libraries.
auth.authenticate_user()
# The root logger level controls which `logging.*` calls in later cells are shown.
logging.getLogger().setLevel(logging.INFO) # Change to logging.DEBUG for more fine-grained logs

In [None]:
#@title Install dependencies
!pip install tensorflow_text google-cloud-aiplatform==1.25.0

###❗ <span style="font-size: large">Do not forget to click the "Restart runtime" button above (if prompted), and then re-run the authentication cell</span>

## Input

Choose how you want to provide the input data (Google Sheets or Google Cloud BigQuery) and run the associated cells below.

In [None]:
from gspread.utils import is_scalar
#@title Common configurable parameters { run: 'auto' }
# #@markdown Select the desired input data source then run the associated cells below:
# desired_data_source = "Google Sheets" #@param ["Google Sheets", "Google Cloud BigQuery"]
# is_sheets_data_source = desired_data_source == 'Google Sheets'
# is_bigquery_data_source = desired_data_source == 'Google Cloud BigQuery'

from typing import Sequence

def convert_comma_seperated_str_to_list(
    comma_separated_string: str) -> Sequence[str]:
  """Splits a comma-separated string into a list of trimmed, non-empty values.

  Whitespace around each value is stripped and empty entries (e.g. produced by
  a trailing comma or by consecutive commas) are dropped, so that the result
  can be used directly as a list of column names.

  Args:
    comma_separated_string: The raw string, e.g. "Brand, Size, Color".

  Returns:
    The list of values in their original order, or an empty list if the input
    is empty or contains no values.
  """
  if not comma_separated_string:
    return []
  stripped_values = (
      value.strip() for value in comma_separated_string.split(','))
  return [value for value in stripped_values if value]

# User-configurable parameters (rendered as a Colab form). Each value names a
# column of the input feed or a threshold used by the validation cells below.

#@markdown Choose the desired input data source:
FEED_INPUT_DATA_SOURCE = "Google Sheets" #@param ["Google Sheets", "Google Cloud BigQuery"]
# Convenience flags derived from the selected data source.
IS_SHEETS_DATA_SOURCE = FEED_INPUT_DATA_SOURCE == 'Google Sheets'
IS_BIGQUERY_DATA_SOURCE = FEED_INPUT_DATA_SOURCE == 'Google Cloud BigQuery'

#@markdown Enter the column representing the ID of each feed item in your input:
FEED_ITEM_ID = "Item ID" # @param {type:"string"}

#@markdown Enter the column representing your brand name in the input:
FEED_BRAND_NAME = "Brand" # @param {type:"string"}

#@markdown Enter the column containing the current title of each feed item in your input:
FEED_ITEM_TITLE = "Title" # @param {type:"string"}

#@markdown Enter the column containing the current description of each feed item in your input:
FEED_ITEM_DESCRIPTION = "Description" # @param {type:"string"}

#@markdown Enter comma-separated column names representing the main feed item features that **must** be present in the generated titles and descriptions (existing titles and descriptions missing any one of those features will be flagged as invalid):
FEED_ITEM_MAIN_FEATURES = "Brand, Size, Color" #@param {type:"string"}
# The raw form string is replaced by the parsed list of column names.
FEED_ITEM_MAIN_FEATURES = convert_comma_seperated_str_to_list(FEED_ITEM_MAIN_FEATURES)

#@markdown Enter which features from `FEED_ITEM_MAIN_FEATURES` need to be matched *exactly* as-is (e.g. the brand name):
FEED_ITEM_EXACT_MATCH_FEATURES = "Brand" #@param {type:"string"}
# The raw form string is replaced by the parsed list of column names.
FEED_ITEM_EXACT_MATCH_FEATURES = convert_comma_seperated_str_to_list(FEED_ITEM_EXACT_MATCH_FEATURES)

#@markdown Enter a [Cosine Similarity](https://www.tensorflow.org/api_docs/python/tf/keras/losses/CosineSimilarity) threshold value for matching features in the titles and descriptions *semantically*:
FEED_ITEM_FEATURE_SIMILARITY_THRESHOLD = 0.6 #@param {type:"slider", min:0, max:1, step:0.1}

#@markdown Enter the minimum valid length for feed item descriptions (existing descriptions with lengths below that threshold will be flagged as invalid):
FEED_ITEM_DESCRIPTION_LENGTH_THRESHOLD = 500 #@param {type:"integer"}

#@markdown Enter the name of a column that contains a metric you would like to use for sorting the input feed items (e.g. *Clicks*).
#@markdown Low quality items (e.g. with the least amount of 'Clicks') will be sorted and processed first.
#@markdown <br>Leave empty to use the default sorting (which sorts the lowest quality feed items as identified by FeedGen's validation rules first):
FEED_SORT_METRIC = "" #@param {type:"string"}

# Non-user defined parameters
# Names of the columns that the validation and inference cells write to.
FEED_OUTPUT_ITEM_ID = 'Item ID' # Fixed for the output regardless of `FEED_ITEM_ID`
FEED_OUTPUT_TITLE = 'Original Title' # Fixed for the output regardless of `FEED_ITEM_TITLE`
FEED_OUTPUT_DESCRIPTION = 'Original Description' # Fixed for the output regardless of `FEED_ITEM_DESCRIPTION`
FEED_OUTPUT_GENERATED_TITLE = 'Generated Title'
FEED_OUTPUT_GENERATED_TITLE_STATUS = 'Generated Title Status'
FEED_OUTPUT_GENERATED_TITLE_SIMILARITY_SCORE = 'Generated Title Features Similarity Score'
FEED_OUTPUT_GENERATED_TITLE_VARIANTS = 'Generated Title Variants'
FEED_OUTPUT_GENERATED_TITLE_MODEL_PARAMS = 'Generated Title Model Parameters'
FEED_OUTPUT_MISSING_FEATURES_IN_TITLE = 'Missing Features in Original Title'
FEED_OUTPUT_MISSING_FEATURES_TITLE_SIMILARITY_SCORE = 'Original Title Features Similarity Score'
FEED_OUTPUT_MISSING_FEATURES_IN_GENERATED_TITLE = 'Missing Features in Generated Title'
FEED_OUTPUT_MISSING_FEATURES_IN_DESCRIPTION = 'Missing Features in Original Description'
FEED_OUTPUT_MISSING_FEATURES_DESCRIPTION_SIMILARITY_SCORE = 'Original Description Features Similarity Score'
FEED_OUTPUT_MISSING_FEATURES_IN_GENERATED_DESCRIPTION = 'Missing Features in Generated Description'
FEED_OUTPUT_GENERATED_DESCRIPTION = 'Generated Description'
FEED_OUTPUT_GENERATED_DESCRIPTION_STATUS = 'Generated Description Status'
FEED_OUTPUT_GENERATED_DESCRIPTION_SIMILARITY_SCORE = 'Generated Description Features Similarity Score'
FEED_OUTPUT_GENERATED_DESCRIPTION_VARIANTS = 'Generated Description Variants'
FEED_OUTPUT_GENERATED_DESCRIPTION_MODEL_PARAMS = 'Generated Description Model Parameters'

# Status values stored in the "... Status" output columns: a generated text
# with missing features needs approval, otherwise it is pre-approved.
FEED_OUTPUT_STATUS_NEEDS_APROVAL = 'Needs Approval'
FEED_OUTPUT_STATUS_PRE_APPROVED = 'Pre-Approved'

# Column names of the input feed quality summary.
FEED_SUMMARY_LOW_QUALITY_TITLES = 'Low Quality Original Titles'
FEED_SUMMARY_LOW_QUALITY_DESCRIPTIONS = 'Low Quality Original Descriptions'
FEED_SUMMARY_MISSING_DESCRIPTIONS = 'Missing Original Descriptions'
FEED_SUMMARY_SHORT_DESCRIPTIONS = f'Original Descriptions < {FEED_ITEM_DESCRIPTION_LENGTH_THRESHOLD} Characters'

# Validation rules
# At least one main feature is required: the validation and generation steps
# are all driven by the list of main features.
if not FEED_ITEM_MAIN_FEATURES:
  raise ValueError(
      'Invalid input! Please make sure at least ONE main feature is specified '
      'in "FEED_ITEM_MAIN_FEATURES"')

### Google Sheets

In [None]:
#@title Configurable parameters { run: 'auto' }

#@markdown Enter your spreadsheet ID:
SPREADSHEET_ID = "sheet-id-goes-here" #@param {type:"string"}

#@markdown Enter the main worksheet name which contains the input feed data:
INPUT_SHEET_NAME = "Input Feed" #@param {type:"string"}

# Validation rules
# Only enforced when Google Sheets is the selected data source, so that this
# cell can be run unchanged when BigQuery is used instead.
if IS_SHEETS_DATA_SOURCE and (not SPREADSHEET_ID or not INPUT_SHEET_NAME):
  raise ValueError(
      'Invalid input! Please make sure at least '
      '"SPREADSHEET_ID" AND "INPUT_SHEET_NAME" '
      'are provided.')


In [None]:
#@title Fetch data from the input spreadsheet
#@markdown The first row in each worksheet will be considered the **column headers** row.
import pandas as pd
import gspread
from google.auth import default

# Remains None unless Google Sheets is the selected data source; the BigQuery
# cells assign it in that case.
input_feed_data = None

if IS_SHEETS_DATA_SOURCE:
  creds, _ = default()
  sheets_client = gspread.authorize(creds)
  spreadsheet = sheets_client.open_by_key(SPREADSHEET_ID)

  input_feed_values = spreadsheet.worksheet(INPUT_SHEET_NAME).get_all_values()
  # Fail with a clear message instead of an IndexError (or a DataFrame without
  # columns) when the worksheet has no header row.
  if not input_feed_values or not input_feed_values[0]:
    raise ValueError(
        f'Invalid input! The worksheet "{INPUT_SHEET_NAME}" is empty. '
        'Please make sure its first row contains the column headers.')
  # First row -> column headers, remaining rows -> feed items.
  input_feed_data = pd.DataFrame(
      input_feed_values[1:], columns=input_feed_values[0])


### Google Cloud BigQuery

In [None]:
#@title Configurable parameters { run: 'auto' }

#@markdown Enter the Google Cloud Project ID associated with your BigQuery data:
GCP_BIGQUERY_PROJECT_ID = "gcp-project-id-goes-here" # @param {type:"string"}

#@markdown Enter the SQL query you would like to execute to pull data from BigQuery
#@markdown (*Expand this cell to view a sample query for the [Google Merchant Center Data Transfer](https://cloud.google.com/bigquery/docs/merchant-center-transfer)*):
BQ_INPUT_QUERY = "SELECT * FROM `dataset.table` WHERE column = \"value\" ORDER BY column" #@param {type:"string"}

# Google Merchant Center BigQuery Data Transfer sample query
# NOTE(review): `bg_input_query_sample` is not referenced by any other visible
# cell; it appears to be a template to copy into `BQ_INPUT_QUERY` (and the
# `bg_` prefix is presumably a typo for `bq_`) - confirm before renaming.
gcp_mc_transfer_dataset_name = 'merchant_center' # Dataset where the transfer has been stored
gcp_mc_transfer_merchant_id = '0' # Enter the <merchant_id> if you are using an individual Merchant, or the <aggregator_id> if you are using an MCA account.
# The column aliases match the default column names of the common parameters
# cell ("Item ID", "Brand", "Title", "Description", "Color").
bg_input_query_sample = f"""
SELECT
  P.product_id AS `Item ID`,
  P.brand AS `Brand`,
  P.title AS `Title`,
  P.description AS `Description`,
  P.color AS `Color`,
  P.material AS `Material`,
  P.link AS `Landing Page`,
  P.image_link AS `Image Link`,
  P.sale_price.value AS `Price`,
FROM
  `{GCP_BIGQUERY_PROJECT_ID}.{gcp_mc_transfer_dataset_name}.Product_{gcp_mc_transfer_merchant_id}` AS P
ORDER BY `Item ID`;
"""

# Validation rules
# Only enforced when BigQuery is the selected data source.
if IS_BIGQUERY_DATA_SOURCE and (
    not GCP_BIGQUERY_PROJECT_ID or not BQ_INPUT_QUERY):
  raise ValueError(
      'Invalid input! Please make sure at least '
      '"GCP_BIGQUERY_PROJECT_ID" AND "BQ_INPUT_QUERY" '
      'are provided.')


In [None]:
#@title Fetch data from GCP BigQuery
%%bigquery fetched_feed --project $GCP_BIGQUERY_PROJECT_ID
$BQ_INPUT_QUERY

In [None]:
#@title Store the fetched data into a Pandas DataFrame
# `fetched_feed` is the variable named in the `%%bigquery` cell; it only
# replaces `input_feed_data` when BigQuery is the selected data source.
if IS_BIGQUERY_DATA_SOURCE:
  input_feed_data = fetched_feed

## Preprocessing and Data Overview

In [None]:
#@title Helper functions
import re
from typing import Mapping, Optional, Sequence, Tuple

import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text # do not remove unused import as it's required in runtime

from tqdm import tqdm


# Registers tqdm with pandas; later cells call `progress_apply` on DataFrames.
tqdm.pandas()
# Multilingual Universal Sentence Encoder loaded from TF Hub; used as the
# default embedding model of `get_missing_semantic_match_features`.
USE_MODEL = hub.load("https://tfhub.dev/google/universal-sentence-encoder-multilingual/3")


def remove_spaces(text: str) -> str:
  """Returns `text` with every space character (' ') removed."""
  return ''.join(text.split(' '))

def get_missing_semantic_match_features(
    ground_truth_features: Mapping[str, str],
    target_text: str,
    model: tf.Module = USE_MODEL) -> Mapping[str, float]:
  """Finds the features whose values have no semantic match in a text.

  The text is split into words and both the feature values and the words are
  embedded with `model`. For every feature, the highest cosine similarity
  between its value and any word of the text is computed; the feature counts
  as missing when that score is below `FEED_ITEM_FEATURE_SIMILARITY_THRESHOLD`.

  Args:
    ground_truth_features: Mapping of feature name to the feature value that
      is expected to be represented in the text.
    target_text: The text to search for the feature values.
    model: Callable that maps a list of strings to their embeddings.

  Returns:
    Mapping of missing feature name to its best similarity score. Empty when
    no features are given. When the text contains no words, all features are
    reported as missing with a score of 0.0.
  """
  feature_keys = list(ground_truth_features.keys())
  feature_values = list(ground_truth_features.values())
  if not feature_keys:
    # Nothing to compare; this also avoids calling the model with an empty batch.
    return {}

  target_text_words = (
      tf.keras.preprocessing.text.text_to_word_sequence(target_text))
  logging.debug('target_text_words: %r', target_text_words)
  if not target_text_words:
    # Without any word to compare against, the arg-max below would reduce over
    # an empty axis. No feature can be present in an empty text.
    return {key: 0.0 for key in feature_keys}

  embeddings_features = model(feature_values)
  embeddings_target = model(target_text_words)
  # Cosine similarity of every (feature, word) pair: dot product over the
  # embedding axis divided by the product of the two norms. The new axis makes
  # the feature embeddings broadcast against all word embeddings.
  similarity = tf.reduce_sum(
      embeddings_features[:, tf.newaxis] * embeddings_target, axis=-1)
  similarity = tf.math.divide(similarity, tf.norm(
      embeddings_features[:, tf.newaxis], axis=-1) * tf.norm(
          embeddings_target, axis=-1))

  # Best matching word (and its score) per feature.
  indices = tf.math.argmax(similarity, axis=1).numpy()
  max_similarity = tf.math.reduce_max(similarity, axis=1).numpy()

  feature_cosine_similarity_words_and_score = {}

  for i in range(len(feature_keys)):
    feature_cosine_similarity_words_and_score[feature_keys[i]] = (
        feature_values[i], target_text_words[indices[i]], max_similarity[i])

  logging.debug(
      'feature_cosine_similarity_words_and_score: %r',
      feature_cosine_similarity_words_and_score)

  return {
      key: cosine_similarity
      for key, (_, _, cosine_similarity)
      in feature_cosine_similarity_words_and_score.items()
      if cosine_similarity < FEED_ITEM_FEATURE_SIMILARITY_THRESHOLD}

def get_missing_features_in_text(
    item_row: pd.Series,
    text: str,
    main_features: Sequence[str] = FEED_ITEM_MAIN_FEATURES) -> Tuple[str, str]:
  """Identifies which main features of a feed item are missing from a text.

  Features listed in `FEED_ITEM_EXACT_MATCH_FEATURES` must appear verbatim in
  the text (case-insensitively, with or without spaces); all other features
  are matched semantically via `get_missing_semantic_match_features`.

  Args:
    item_row: The feed item; must contain a value for every main feature.
    text: The text (e.g. a title or description) to check.
    main_features: Names of the features that should be present in the text.

  Returns:
    A tuple of two comma-separated strings: the names of the missing features,
    and their similarity scores formatted with two decimals. Missing exact
    match features come first and always have a score of 0.00.
  """
  exact_match_features = {}
  semantic_match_features = {}
  # Coerce to `str` first (as `calculate_feature_missing_count` does) so that
  # non-string cell values such as None or NaN do not raise here.
  text = str(text).lower()

  for feature in main_features:
    feature_value = str(item_row[feature]).lower()
    if feature in FEED_ITEM_EXACT_MATCH_FEATURES:
      exact_match_features[feature] = feature_value
    else:
      semantic_match_features[feature] = feature_value

  logging.debug('exact_match_features: %r', exact_match_features)
  logging.debug('semantic_match_features: %r', semantic_match_features)

  # An exact match feature is missing when it has no value, or when its value
  # is found neither as-is nor after removing spaces from both sides.
  missing_exact_match_features = [
      feature for feature, feature_value in exact_match_features.items()
      if len(feature_value) == 0 or (
          feature_value not in text and remove_spaces(feature_value) not in remove_spaces(text))]
  logging.debug('Missing exact match features: %r', missing_exact_match_features)

  # Skip the embedding model entirely when every feature is an exact match
  # feature; there is nothing to embed in that case.
  missing_semantic_match_features = (
      get_missing_semantic_match_features(
          ground_truth_features=semantic_match_features,
          target_text=text)
      if semantic_match_features else {})
  logging.debug(
      'Missing semantic match features: %r', missing_semantic_match_features)

  missing_feature_keys = (
      missing_exact_match_features + list(missing_semantic_match_features.keys()))
  missing_feature_similarity_scores = (
      [0] * len(missing_exact_match_features)
      + list(missing_semantic_match_features.values()))

  logging.debug(
      'missing_feature_keys: %r', missing_feature_keys)
  logging.debug(
      'missing_feature_similarity_scores: %r', missing_feature_similarity_scores)

  return (
      ', '.join(missing_feature_keys),
      ', '.join([f'{value:.2f}' for value in missing_feature_similarity_scores]))

def calculate_feature_missing_count(
    item_row: pd.Series,
    item_key: str,
    feature: str) -> int:
  """Checks whether one feature of a feed item is missing from one of its texts.

  Args:
    item_row: The feed item.
    item_key: Name of the column holding the text to check.
    feature: Name of the column holding the feature value to look for.

  Returns:
    1 if the feature is missing from the text, 0 otherwise.
  """
  expected_value = str(item_row[feature]).lower()
  haystack = str(item_row[item_key]).lower()

  # Features that are not flagged for exact matching are compared semantically.
  if feature not in FEED_ITEM_EXACT_MATCH_FEATURES:
    missing_scores = get_missing_semantic_match_features(
        {feature: expected_value}, haystack)
    return int(feature in missing_scores)

  # Exact matching: an empty value always counts as missing.
  if not expected_value:
    return 1
  is_present = (
      expected_value in haystack
      or remove_spaces(expected_value) in remove_spaces(haystack))
  return 0 if is_present else 1

def calculate_feature_missing_counts(
    item_key: str,
    main_features: Sequence[str] = FEED_ITEM_MAIN_FEATURES) -> Tuple[Mapping[str, int], Sequence[str]]:
  """Counts, per main feature, the input rows whose text lacks that feature.

  A feature that is missing from the `item_key` column in every row of
  `input_feed_data` is dropped from the returned list of main features, and a
  warning is logged.

  Args:
    item_key: Name of the input column holding the text to check.
    main_features: Names of the features to evaluate.

  Returns:
    A tuple of (mapping of retained feature name to its number of rows with
    the feature missing, list of retained feature names).

  Raises:
    ValueError: If no main feature is retained.
  """
  missing_counts_by_feature = {}
  retained_features = []

  for feature in main_features:
    logging.info(
        'Evaluating input for the main feature: "%s" and column: "%s"...',
        feature,
        item_key)
    per_row_missing = input_feed_data.progress_apply(
        lambda row: calculate_feature_missing_count(row, item_key, feature),
        axis=1)
    missing_total = sum(per_row_missing)

    if missing_total != len(input_feed_data):
      missing_counts_by_feature[feature] = missing_total
      retained_features.append(feature)
      continue

    logging.warning(
        'WARNING! The feature "%s" is missing in all values of the '
        'column "%s", yet is defined in "FEED_ITEM_MAIN_FEATURES". '
        'This feature will be removed from the set of main features.',
        feature, item_key)

  if not retained_features:
    raise ValueError(
        'Invalid input! All "FEED_ITEM_MAIN_FEATURES" have been removed as '
        'they do not exist in the provided input. '
        'Please correct your input data and try again.')
  return missing_counts_by_feature, retained_features


In [None]:
#@title Calculate validation metrics for the input data
from tqdm import tqdm
tqdm.pandas()

# Count the rows with missing features for titles first, then for descriptions.
# The second call receives the features retained by the first one, so
# `main_features` ends up holding only the features retained by both calls.
features_missing_in_titles, main_features = calculate_feature_missing_counts(FEED_ITEM_TITLE)
features_missing_in_descriptions, main_features = calculate_feature_missing_counts(FEED_ITEM_DESCRIPTION, main_features)

logging.info(
    'Identifying missing exact and semantic features for the column: "%s"...',
    FEED_ITEM_TITLE)
# `get_missing_features_in_text` returns a pair per row; `result_type='expand'`
# spreads it over the two new columns.
input_feed_data[[
    FEED_OUTPUT_MISSING_FEATURES_IN_TITLE,
    FEED_OUTPUT_MISSING_FEATURES_TITLE_SIMILARITY_SCORE,
]] = input_feed_data.progress_apply(
    lambda row: get_missing_features_in_text(row, row[FEED_ITEM_TITLE], main_features),
    axis='columns',
    result_type='expand')

logging.info(
    'Identifying missing exact and semantic features for the column: "%s"...',
    FEED_ITEM_DESCRIPTION)
# Same as above, for the original descriptions.
input_feed_data[[
    FEED_OUTPUT_MISSING_FEATURES_IN_DESCRIPTION,
    FEED_OUTPUT_MISSING_FEATURES_DESCRIPTION_SIMILARITY_SCORE,
]] = input_feed_data.progress_apply(
    lambda row: get_missing_features_in_text(row, row[FEED_ITEM_DESCRIPTION], main_features),
    axis='columns',
    result_type='expand')

logging.info('Creating an overview of quality issues in the input feed...')
total_input_rows = len(input_feed_data)

# One boolean row filter per summary column.
summary_row_filters = {
    # Titles / descriptions with at least one missing main feature.
    FEED_SUMMARY_LOW_QUALITY_TITLES: (
        input_feed_data[FEED_OUTPUT_MISSING_FEATURES_IN_TITLE].str.len() > 0),
    FEED_SUMMARY_LOW_QUALITY_DESCRIPTIONS: (
        input_feed_data[FEED_OUTPUT_MISSING_FEATURES_IN_DESCRIPTION].str.len() > 0),
    # Descriptions that are identical to the title.
    FEED_SUMMARY_MISSING_DESCRIPTIONS: (
        input_feed_data[FEED_ITEM_DESCRIPTION] == input_feed_data[FEED_ITEM_TITLE]),
    # Descriptions shorter than the configured minimum length.
    FEED_SUMMARY_SHORT_DESCRIPTIONS: (
        input_feed_data[FEED_ITEM_DESCRIPTION].str.len()
        < FEED_ITEM_DESCRIPTION_LENGTH_THRESHOLD),
}

# Single-row DataFrame holding, per column, the percentage of input rows
# selected by the corresponding filter (rounded to two decimals).
input_feed_summary = pd.DataFrame({
    summary_column: [round(
        len(input_feed_data[row_filter]) / total_input_rows * 100, 2)]
    for summary_column, row_filter in summary_row_filters.items()})

def _count_listed_features(comma_separated_features: str) -> int:
  """Returns the number of feature names in a ', '-joined string."""
  return len([
      feature for feature in comma_separated_features.split(', ') if feature])


if FEED_SORT_METRIC and FEED_SORT_METRIC not in input_feed_data.columns:
  # Make the fallback to the default sorting visible instead of silent.
  logging.warning(
      'The sort metric "%s" is not a column of the input data; '
      'falling back to the default sorting.', FEED_SORT_METRIC)

if FEED_SORT_METRIC and FEED_SORT_METRIC in input_feed_data.columns:
  # NOTE(review): data fetched from Google Sheets consists of strings, so this
  # sort is lexicographic for such input - confirm whether the metric column
  # should be converted to a numeric type first.
  input_feed_data.sort_values(by=FEED_SORT_METRIC, inplace=True)
else:
  # Default sorting: items with the most missing features first. The number of
  # missing features is counted from the comma-separated feature names rather
  # than taken as the string length, which would rank items by how long the
  # feature names are instead of by how many features are missing.
  input_feed_data['missing_features_count'] = (
      input_feed_data[FEED_OUTPUT_MISSING_FEATURES_IN_TITLE].map(
          _count_listed_features)
      + input_feed_data[FEED_OUTPUT_MISSING_FEATURES_IN_DESCRIPTION].map(
          _count_listed_features))
  input_feed_data.sort_values(
        by='missing_features_count', ascending=False, inplace=True)
  # The helper column is only needed for sorting.
  input_feed_data.drop(columns='missing_features_count', inplace=True)

# Later cells use the reduced feature list (features missing from every row
# have been removed by the validation above).
FEED_ITEM_MAIN_FEATURES = main_features


In [None]:
#@title Preview the fetched data
input_feed_data.head()

In [None]:
#@title Display an overview of the quality issues in the input feed
input_feed_summary.head()

## Inference

FeedGen supports the use of both foundational models from Google Cloud as well as models that have been fine-tuned on proprietary data, and applies prompt-tuning to generate titles and descriptions respectively. Refer to this [guide](https://cloud.google.com/vertex-ai/docs/generative-ai/text/text-overview) for an overview of prompt design, and this [guide](https://cloud.google.com/vertex-ai/docs/generative-ai/models/tune-models) for how to fine-tune and deploy custom models.

In [None]:
#@title Configurable parameters { run: 'auto' }

#@markdown Enter the language in which you would like to generate titles and descriptions:
GENERATION_LANGUAGE = "English" #@param ["English", "French", "German"] {allow-input: true}

#@markdown <hr>Google Cloud Vertex PaLM settings

#@markdown Enter the Google Cloud Project ID where the Vertex API is enabled:
GCP_VERTEX_PROJECT_ID = "gcp-project-id-goes-here" #@param {type:"string"}

#@markdown Enter the Google Cloud Project location where the Vertex API is configured to run:
GCP_VERTEX_LOCATION = "gcp-project-location-goes-here" #@param {type:"string"}

#@markdown Enter the desired Vertex model name to use (refer to the guide provided [here](https://cloud.google.com/vertex-ai/docs/generative-ai/learn/models) for an overview of available model names and types).
#@markdown <br>If you have opted to [fine-tune](https://cloud.google.com/vertex-ai/docs/generative-ai/models/tune-models) a model with your own data, enter the `model_resource_name` value (`projects/<project_number>/locations/<region>/models/<model_id>`) instead:
VERTEX_MODEL_NAME = "google/text-bison@001" #@param {type:"string"}
# A full resource name (starting with 'projects/') denotes a fine-tuned model.
IS_FINE_TUNED_MODEL = VERTEX_MODEL_NAME.startswith('projects/')

#@markdown <hr>Title optimisation settings

#@markdown Input the additional features you would like to use in the prompt for generating titles, beyond what is already defined in `FEED_ITEM_MAIN_FEATURES`:
TITLE_PROMPT_FEATURES = "Description" #@param {type:"string"}
TITLE_PROMPT_FEATURES = convert_comma_seperated_str_to_list(TITLE_PROMPT_FEATURES)
# Main features first, followed by the additional ones (without duplicates of
# the main features and without empty names).
TITLE_PROMPT_FEATURES = FEED_ITEM_MAIN_FEATURES + [
    feature for feature in TITLE_PROMPT_FEATURES
    if feature and feature not in FEED_ITEM_MAIN_FEATURES]

#@markdown Choose the desired prompt type:
TITLE_PROMPT_TYPE = "FEW_SHOT" #@param ["ZERO_SHOT", "FEW_SHOT"]
IS_TITLE_PROMPT_FEW_SHOT = TITLE_PROMPT_TYPE == 'FEW_SHOT'
# Number of few-shot examples selected automatically when no IDs are entered.
DEFAULT_MAX_FEW_SHOT_PROMPT_ITEMS = 5

#@markdown **For Few-Shot prompting**: Enter the best *n* (e.g. 5, max 10) `FEED_ITEM_ID` samples from your input.
#@markdown These samples will be used for prompt-tuning to teach the LLM to generate similar titles.
#@markdown <br>Leave empty to have FeedGen automatically select the top 5 high-quality items for you:
TITLE_FEW_SHOT_TOP_ITEM_IDS = "" #@param {type:"string"}
TITLE_FEW_SHOT_TOP_ITEM_IDS = convert_comma_seperated_str_to_list(TITLE_FEW_SHOT_TOP_ITEM_IDS)

#@markdown Refer to this [guide](https://cloud.google.com/vertex-ai/docs/generative-ai/text/test-text-prompts) for more information on these parameters:
TITLE_TEMPERATURE = 0.2 #@param {type:"number"}
TITLE_MAX_OUTPUT_TOKENS = 40 #@param {type:"integer"}
TITLE_TOP_K = 1 #@param {type:"integer"}
TITLE_TOP_P = 0 #@param {type:"number"}

#@markdown <hr>Description generation settings

#@markdown Input the additional features you would like to use in the prompt for generating descriptions, beyond what is already defined in `FEED_ITEM_MAIN_FEATURES`:
DESCRIPTION_PROMPT_FEATURES = "Title, Highlights" #@param {type:"string"}
DESCRIPTION_PROMPT_FEATURES = convert_comma_seperated_str_to_list(DESCRIPTION_PROMPT_FEATURES)
# Main features first, followed by the additional ones (without duplicates of
# the main features and without empty names).
DESCRIPTION_PROMPT_FEATURES = FEED_ITEM_MAIN_FEATURES + [
    feature for feature in DESCRIPTION_PROMPT_FEATURES
    if feature and feature not in FEED_ITEM_MAIN_FEATURES]

#@markdown Choose the desired prompt type:
DESCRIPTION_PROMPT_TYPE = "ZERO_SHOT" #@param ["ZERO_SHOT", "FEW_SHOT"]
IS_DESCRIPTION_PROMPT_FEW_SHOT = DESCRIPTION_PROMPT_TYPE == 'FEW_SHOT'

#@markdown **For Few-Shot prompting**: Enter the best *n* (e.g. 5, max 10) `FEED_ITEM_ID` samples from your input.
#@markdown These samples will be used for prompt-tuning to teach the LLM to generate similar descriptions.
#@markdown <br>Leave empty to have FeedGen automatically select the top 5 high-quality items for you:
DESCRIPTION_FEW_SHOT_TOP_ITEM_IDS = "" #@param {type:"string"}
DESCRIPTION_FEW_SHOT_TOP_ITEM_IDS = convert_comma_seperated_str_to_list(DESCRIPTION_FEW_SHOT_TOP_ITEM_IDS)

#@markdown Refer to this [guide](https://cloud.google.com/vertex-ai/docs/generative-ai/text/test-text-prompts) for more information on these parameters:
DESCRIPTION_TEMPERATURE = 0.2 #@param {type:"number"}
DESCRIPTION_MAX_OUTPUT_TOKENS = 256 #@param {type:"integer"}
DESCRIPTION_TOP_K = 40 #@param {type:"integer"}
DESCRIPTION_TOP_P = 0.8 #@param {type:"number"}

# Validation rules
# The Vertex settings are always required; the few-shot ID limits only apply
# when the corresponding prompt type is FEW_SHOT.
if not GCP_VERTEX_PROJECT_ID or not GCP_VERTEX_LOCATION or not VERTEX_MODEL_NAME:
  raise ValueError(
      'Invalid input! Please make sure at least '
      '"GCP_VERTEX_PROJECT_ID" AND "GCP_VERTEX_LOCATION" AND "VERTEX_MODEL_NAME" '
      'are provided.')
if IS_TITLE_PROMPT_FEW_SHOT and len(TITLE_FEW_SHOT_TOP_ITEM_IDS) > 10:
  raise ValueError(
    'Invalid input! Too many "TITLE_FEW_SHOT_TOP_ITEM_IDS" were provided. '
    'Please include a maximum of 10 items to avoid running into API token limits.')
if IS_DESCRIPTION_PROMPT_FEW_SHOT and len(DESCRIPTION_FEW_SHOT_TOP_ITEM_IDS) > 10:
  raise ValueError(
    'Invalid input! Too many "DESCRIPTION_FEW_SHOT_TOP_ITEM_IDS" were provided. '
    'Please include a maximum of 10 items to avoid running into API token limits.')


In [None]:
#@title Initialize the model and configurable prompts
from google.cloud import aiplatform
from vertexai.preview.language_models import TextGenerationModel

aiplatform.init(project=GCP_VERTEX_PROJECT_ID, location=GCP_VERTEX_LOCATION)

# Fine-tuned models are looked up among the tuned models available for the
# configured project and location; any other name is loaded as a pretrained
# model.
if IS_FINE_TUNED_MODEL:
  available_tuned_models = TextGenerationModel.list_tuned_model_names()
  if VERTEX_MODEL_NAME in available_tuned_models:
    model = TextGenerationModel.get_tuned_model(VERTEX_MODEL_NAME)
  else:
    raise ValueError(
        f"The provided model name: '{VERTEX_MODEL_NAME}' is not in the list of "
        f'available tuned models: {available_tuned_models}. Please enter a '
        'valid model name or change the associated GCP project and/or location.'
    )
else:
  model = TextGenerationModel.from_pretrained(VERTEX_MODEL_NAME)

# The main features are listed as plain comma-separated text; interpolating
# the list directly would put its Python representation (brackets and quotes)
# into the prompt.
main_features_text = ', '.join(FEED_ITEM_MAIN_FEATURES)

# Prompt templates contain one "{<column name>}" placeholder per prompt
# feature; the placeholders are filled in per feed item at generation time.
title_prompt = [
    f'Generate a Google Shopping ad title in 15 words or less in {GENERATION_LANGUAGE} using the given product information.',
    f'Include all of the following features in the generated title: "{main_features_text}".',
    'Product information:',
]
title_prompt.extend(f'{column_name}: "{{{column_name}}}".' for column_name in TITLE_PROMPT_FEATURES)
title_prompt = '\n'.join(title_prompt)

# The few-shot variant ends with a placeholder for the item's existing title.
title_prompt_few_shot = (
  f'{title_prompt}\n'
  f'The generated title is: {{{FEED_ITEM_TITLE}}}'
)
title_gen_model_params = {
  'temperature': TITLE_TEMPERATURE,
  'max_output_tokens': TITLE_MAX_OUTPUT_TOKENS,
  'top_k': TITLE_TOP_K,
  'top_p': TITLE_TOP_P,
}

description_prompt = [
    f'Generate a detailed description in {GENERATION_LANGUAGE} for a product that has the following information:',
]
description_prompt.extend(f'{column_name}: "{{{column_name}}}".' for column_name in DESCRIPTION_PROMPT_FEATURES)
description_prompt = '\n'.join(description_prompt)

# The few-shot variant ends with a placeholder for the item's existing
# description.
description_prompt_few_shot = (
  f'{description_prompt}\n'
  f'The generated description is: {{{FEED_ITEM_DESCRIPTION}}}'
)
description_gen_model_params = {
  'temperature': DESCRIPTION_TEMPERATURE,
  'max_output_tokens': DESCRIPTION_MAX_OUTPUT_TOKENS,
  'top_k': DESCRIPTION_TOP_K,
  'top_p': DESCRIPTION_TOP_P,
}


In [None]:
#@title Helper functions
import re
from typing import Mapping, Optional, Sequence, Tuple

import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_text # do not remove unused import as it's required in runtime


def generate_prompt(
    base_prompt: str,
    item_row: pd.Series,
) -> str:
  """Fills the "{<column name>}" placeholders of a prompt template.

  Every placeholder that names a key of `item_row` is replaced by the string
  form of that item's value; any other "{...}" placeholder is removed.

  The substitution is done in a single pass over the template, so braces that
  are part of an item's value are kept as-is: they are neither treated as
  placeholders nor stripped from the resulting prompt.

  Args:
    base_prompt: The prompt template.
    item_row: The feed item providing the placeholder values.

  Returns:
    The prompt with all placeholders resolved.
  """
  values_by_placeholder = {
      f'{{{key}}}': str(item_row[key]) for key in item_row.keys().to_list()}
  # Known placeholders are tried first (longest first, so that a placeholder
  # is never shadowed by a shorter one); the generic pattern then catches the
  # placeholders without a matching key, which are replaced by ''.
  known_placeholders = sorted(values_by_placeholder, key=len, reverse=True)
  placeholder_pattern = '|'.join(
      [re.escape(placeholder) for placeholder in known_placeholders]
      + [r'\{.*?\}'])
  return re.sub(
      placeholder_pattern,
      lambda match: values_by_placeholder.get(match.group(0), ''),
      base_prompt)

def generate_few_shot_prompt(
    base_prompt: str,
    item_row: pd.Series,
    few_shot_item_rows: pd.DataFrame,
) -> str:
  """Builds a few-shot prompt: one prompt per example, then the item's prompt.

  Args:
    base_prompt: The prompt template applied to every row.
    item_row: The feed item to generate text for; its prompt comes last.
    few_shot_item_rows: The example feed items, used in row order.

  Returns:
    All generated prompts joined by a blank line.
  """
  example_prompts = [
      generate_prompt(base_prompt, example_row)
      for _, example_row in few_shot_item_rows.iterrows()]
  item_prompt = generate_prompt(base_prompt, item_row)
  return '\n\n'.join(example_prompts + [item_prompt])

def generate_text(
    base_prompt: str,
    item_row: pd.Series,
    few_shot_item_rows: Optional[pd.DataFrame],
    model_generation_params: Mapping[str, str],
) -> str:
  """Generates text for a feed item with the module-level `model`.

  Args:
    base_prompt: The prompt template.
    item_row: The feed item to generate text for.
    few_shot_item_rows: Example items for few-shot prompting; when None or
      empty, a prompt for `item_row` alone is used.
    model_generation_params: Keyword arguments passed to `model.predict`.

  Returns:
    The text of the model's response.
  """
  has_few_shot_examples = (
      few_shot_item_rows is not None and not few_shot_item_rows.empty)
  prompt = (
      generate_few_shot_prompt(base_prompt, item_row, few_shot_item_rows)
      if has_few_shot_examples
      else generate_prompt(base_prompt, item_row))

  logging.debug('Prompt: [%s]\nModel params: [%r]', prompt, model_generation_params)
  generated_text = model.predict(prompt, **model_generation_params).text
  logging.debug('Response: [%s]', generated_text)

  return generated_text

def generate_and_evaluate_text(
    base_prompt: str,
    item_row: pd.Series,
    item_key: str,
    few_shot_item_rows: Optional[pd.DataFrame],
    model_generation_params: Mapping[str, str],
    main_features: Sequence[str] = FEED_ITEM_MAIN_FEATURES,
    regeneration_max_retries: int = 3,
    regeneration_temperature_step: float = 0.2,
    regeneration_top_k_step: int = 10,
    regeneration_top_p_step: float = 0.2,
) -> Tuple[str, str, str, str, str, str]:
  """Generates a text for a feed item, retrying until no feature is missing.

  The text is generated up to `regeneration_max_retries + 1` times. Every
  retry changes the model parameters relative to the initial ones: the
  temperature is raised for descriptions, top_k and top_p are raised for any
  other `item_key`. Generation stops early once a variant contains all main
  features. The variant with the fewest missing features is returned (the
  earliest one in case of a tie).

  Args:
    base_prompt: The prompt template.
    item_row: The feed item to generate text for.
    item_key: Name of the column that the text is generated for; its current
      value is withheld from the prompt of this item.
    few_shot_item_rows: Optional example items for few-shot prompting.
    model_generation_params: Initial model parameters ('temperature', 'top_k',
      'top_p', ...); not modified.
    main_features: Names of the features that the text should contain.
    regeneration_max_retries: Maximum number of additional attempts.
    regeneration_temperature_step: Temperature increase per retry.
    regeneration_top_k_step: top_k increase per retry.
    regeneration_top_p_step: top_p increase per retry (top_p is capped at 1.0).

  Returns:
    A tuple of (generated text, comma-separated missing features,
    comma-separated similarity scores of the missing features, model
    parameters used, string representation of all generated variants,
    approval status).
  """
  # The item's own value for `item_key` must not leak into its prompt.
  reduced_item_row = item_row.drop(labels=[item_key])
  min_missing_features_count = None
  generated_variants = {}
  output_dict = {}

  for i in range(regeneration_max_retries + 1):
    generation_params = dict(model_generation_params)
    if i >= 1:
      if item_key == FEED_ITEM_DESCRIPTION:
        generation_params['temperature'] += (i * regeneration_temperature_step)
      else:
        generation_params['top_k'] += (i * regeneration_top_k_step)
        # top_p is a probability; keep it within its valid range.
        generation_params['top_p'] = min(
            1.0, generation_params['top_p'] + (i * regeneration_top_p_step))
    generated_text = generate_text(base_prompt, reduced_item_row, few_shot_item_rows, generation_params)
    if generated_text:
      missing_features, features_cosine_similarity = get_missing_features_in_text(item_row, generated_text, main_features)
    else:
      # An empty response contains none of the features; there is nothing to
      # evaluate.
      missing_features = ', '.join(main_features)
      features_cosine_similarity = ', '.join(['0.00' for _ in range(len(main_features))])
    missing_features_list = list(filter(bool, missing_features.split(', ')))

    generated_variants[f'VARIANT_{i+1}'] = {
        'generated_text': generated_text,
        'missing_features': missing_features,
        'missing_features_similarity': features_cosine_similarity,
        'model_params': str(generation_params),
    }
    if (min_missing_features_count is None
        or len(missing_features_list) < min_missing_features_count):
      min_missing_features_count = len(missing_features_list)
      output_dict = generated_variants[f'VARIANT_{i+1}']
    if min_missing_features_count == 0:
      break

  needs_approval = len(output_dict['missing_features']) > 0

  return (
      output_dict['generated_text'],
      output_dict['missing_features'],
      output_dict['missing_features_similarity'],
      output_dict['model_params'],
      str(generated_variants),
      FEED_OUTPUT_STATUS_NEEDS_APROVAL if needs_approval else FEED_OUTPUT_STATUS_PRE_APPROVED)


In [None]:
#@title Perform inference for titles and descriptions
from tqdm import tqdm
tqdm.pandas()


def _select_few_shot_products(is_few_shot, item_ids):
  """Returns the input rows to use as few-shot examples, or None.

  IDs are compared as strings, because IDs entered in the form are always
  strings while the ID column of the input may hold numbers.

  Args:
    is_few_shot: Whether few-shot prompting is enabled.
    item_ids: The IDs (values of the `FEED_ITEM_ID` column) of the examples.

  Returns:
    The matching rows of `input_feed_data`, or None if few-shot prompting is
    disabled or no IDs are given.
  """
  if not is_few_shot or not item_ids:
    return None
  wanted_ids = {str(item_id) for item_id in item_ids}
  selected_rows = input_feed_data[
      input_feed_data[FEED_ITEM_ID].astype(str).isin(wanted_ids)]
  if selected_rows.empty:
    logging.warning(
        'No input rows matched the few-shot item IDs %r in the column "%s".',
        item_ids, FEED_ITEM_ID)
  return selected_rows


# The items at the end of the input data are used as the default examples.
max_few_shot_items = min(DEFAULT_MAX_FEW_SHOT_PROMPT_ITEMS, len(input_feed_data))
best_quality_items = input_feed_data.iloc[-max_few_shot_items:][FEED_ITEM_ID].to_list()

if IS_TITLE_PROMPT_FEW_SHOT and not TITLE_FEW_SHOT_TOP_ITEM_IDS:
  TITLE_FEW_SHOT_TOP_ITEM_IDS = best_quality_items
if IS_DESCRIPTION_PROMPT_FEW_SHOT and not DESCRIPTION_FEW_SHOT_TOP_ITEM_IDS:
  DESCRIPTION_FEW_SHOT_TOP_ITEM_IDS = best_quality_items

title_few_shot_products = _select_few_shot_products(
    IS_TITLE_PROMPT_FEW_SHOT, TITLE_FEW_SHOT_TOP_ITEM_IDS)
description_few_shot_products = _select_few_shot_products(
    IS_DESCRIPTION_PROMPT_FEW_SHOT, DESCRIPTION_FEW_SHOT_TOP_ITEM_IDS)

logging.info('Generating titles for the given input...')
input_feed_data[[
    FEED_OUTPUT_GENERATED_TITLE,
    FEED_OUTPUT_MISSING_FEATURES_IN_GENERATED_TITLE,
    FEED_OUTPUT_GENERATED_TITLE_SIMILARITY_SCORE,
    FEED_OUTPUT_GENERATED_TITLE_MODEL_PARAMS,
    FEED_OUTPUT_GENERATED_TITLE_VARIANTS,
    FEED_OUTPUT_GENERATED_TITLE_STATUS]] = input_feed_data.progress_apply(
        lambda row: generate_and_evaluate_text(
            base_prompt=title_prompt_few_shot if IS_TITLE_PROMPT_FEW_SHOT else title_prompt,
            item_row=row,
            item_key=FEED_ITEM_TITLE,
            few_shot_item_rows=title_few_shot_products,
            model_generation_params=title_gen_model_params),
        axis='columns',
        result_type='expand')

logging.info('Generating descriptions for the given input...')
input_feed_data[[
    FEED_OUTPUT_GENERATED_DESCRIPTION,
    FEED_OUTPUT_MISSING_FEATURES_IN_GENERATED_DESCRIPTION,
    FEED_OUTPUT_GENERATED_DESCRIPTION_SIMILARITY_SCORE,
    FEED_OUTPUT_GENERATED_DESCRIPTION_MODEL_PARAMS,
    FEED_OUTPUT_GENERATED_DESCRIPTION_VARIANTS,
    FEED_OUTPUT_GENERATED_DESCRIPTION_STATUS]] = input_feed_data.progress_apply(
        lambda row: generate_and_evaluate_text(
            base_prompt=description_prompt_few_shot if IS_DESCRIPTION_PROMPT_FEW_SHOT else description_prompt,
            item_row=row,
            item_key=FEED_ITEM_DESCRIPTION,
            few_shot_item_rows=description_few_shot_products,
            model_generation_params=description_gen_model_params),
        axis='columns',
        result_type='expand')

input_feed_data.head()

## Output

Make a copy of the following [template](https://docs.google.com/spreadsheets/d/1Ro91GhHaurph5zaqgr4n1PDqFZwuln-jpwam3irYq5k/edit#gid=1221408551) Google Sheets spreadsheet, then paste the ID of your copy into the form below and run the cells that follow to output the generated data. Read the instructions in the **Getting Started** worksheet to familiarise yourself with how to use the sheet, particularly how it can be used to set up a supplemental feed in Merchant Center.

In [None]:
#@title Configurable parameters { run: 'auto' }

#@markdown Enter the output spreadsheet ID:
OUTPUT_SPREADSHEET_ID = "template-sheet-id-goes-here" #@param {type:"string"}

# Titles of the worksheets, inside the output spreadsheet, that the output
# cell looks up (or creates) for the summary and the generated feed.
OUTPUT_SUMMARY_WORKSHEET_NAME = 'Summary'
OUTPUT_GENERATED_WORKSHEET_NAME = 'Generated'

# Fail fast when the spreadsheet ID form field has been left blank.
if not OUTPUT_SPREADSHEET_ID:
  raise ValueError('Invalid input! Please make sure "OUTPUT_SPREADSHEET_ID" is provided.')

In [None]:
#@title Write the generated data to the output Google Sheets spreadsheet
import gspread
from gspread_dataframe import set_with_dataframe
from google.auth import default


def _get_empty_worksheet(spreadsheet, worksheet_name, data):
  """Returns the named worksheet, cleared if it exists or created otherwise.

  Args:
    spreadsheet: The gspread spreadsheet in which to look up the worksheet.
    worksheet_name: Title of the worksheet to clear or create.
    data: The DataFrame that is going to be written; only used to size a
      newly created worksheet.

  Returns:
    The cleared or newly created worksheet.
  """
  try:
    worksheet = spreadsheet.worksheet(worksheet_name)
    worksheet.clear()
  except gspread.exceptions.WorksheetNotFound:
    worksheet = spreadsheet.add_worksheet(
        worksheet_name,
        # One extra row for the column header written alongside the data.
        rows=len(data) + 1,
        cols=max(len(data.columns), 1))
  return worksheet


creds, _ = default()
sheets_client = gspread.authorize(creds)
output_spreadsheet = sheets_client.open_by_key(OUTPUT_SPREADSHEET_ID)

# Columns that always lead the output sheet, in this order; any other input
# column is appended after them.
fixed_order_columns = [
    FEED_OUTPUT_ITEM_ID,
    FEED_OUTPUT_GENERATED_TITLE_STATUS,
    FEED_OUTPUT_GENERATED_DESCRIPTION_STATUS,
    FEED_OUTPUT_TITLE,
    FEED_OUTPUT_GENERATED_TITLE,
    FEED_OUTPUT_DESCRIPTION,
    FEED_OUTPUT_GENERATED_DESCRIPTION,

    FEED_OUTPUT_MISSING_FEATURES_IN_TITLE,
    FEED_OUTPUT_MISSING_FEATURES_TITLE_SIMILARITY_SCORE,
    FEED_OUTPUT_MISSING_FEATURES_IN_GENERATED_TITLE,
    FEED_OUTPUT_GENERATED_TITLE_SIMILARITY_SCORE,
    FEED_OUTPUT_GENERATED_TITLE_MODEL_PARAMS,
    FEED_OUTPUT_GENERATED_TITLE_VARIANTS,

    FEED_OUTPUT_MISSING_FEATURES_IN_DESCRIPTION,
    FEED_OUTPUT_MISSING_FEATURES_DESCRIPTION_SIMILARITY_SCORE,
    FEED_OUTPUT_MISSING_FEATURES_IN_GENERATED_DESCRIPTION,
    FEED_OUTPUT_GENERATED_DESCRIPTION_SIMILARITY_SCORE,
    FEED_OUTPUT_GENERATED_DESCRIPTION_MODEL_PARAMS,
    FEED_OUTPUT_GENERATED_DESCRIPTION_VARIANTS,
]

# Rename on a copy rather than in place: `input_feed_data` keeps its original
# column names, so this cell and the cells above it stay safe to re-run.
renamed_feed_data = input_feed_data.rename(columns={
    FEED_ITEM_ID: FEED_OUTPUT_ITEM_ID,
    FEED_ITEM_TITLE: FEED_OUTPUT_TITLE,
    FEED_ITEM_DESCRIPTION: FEED_OUTPUT_DESCRIPTION})

# Reorder the columns, then sort by approval status. `sort_values` returns a
# new frame, which avoids sorting a column subset in place.
output_feed_data = renamed_feed_data[
    fixed_order_columns +
    [col for col in renamed_feed_data if col not in fixed_order_columns]
].sort_values(
    by=[FEED_OUTPUT_GENERATED_TITLE_STATUS,
        FEED_OUTPUT_GENERATED_DESCRIPTION_STATUS])

output_sheet = _get_empty_worksheet(
    output_spreadsheet, OUTPUT_GENERATED_WORKSHEET_NAME, output_feed_data)
set_with_dataframe(
    output_sheet, output_feed_data, include_column_header=True)

summary_sheet = _get_empty_worksheet(
    output_spreadsheet, OUTPUT_SUMMARY_WORKSHEET_NAME, input_feed_summary)
set_with_dataframe(
    summary_sheet, input_feed_summary, include_column_header=True)

### How was the sample data generated?

All data in the template Google Sheets spreadsheet was generated using the `Faker` library as shown by the cells below.

In [None]:
#@title Install dependencies
# Faker is imported by the sample-data generation cell below.
!pip install Faker

In [None]:
#@title Generate sample data with Faker
#@markdown The generated data format resembles that of a typical Google Merchant Center Shopping feed.
import random

import pandas as pd
from faker import Faker

SIZES = ['XS', 'S', 'M', 'L', 'XL']
DESCRIPTION_MAX_LENGTHS = [400, 700, 1000]
SAMPLE_ITEMS_COUNT = 500
SAMPLE_RANDOM_SEED = 42

fake = Faker(['en'])
# Seed both random sources so that re-running this cell reproduces the same
# feed. A dedicated `random.Random` instance is used for the click counts so
# the global `random` state is left untouched.
Faker.seed(SAMPLE_RANDOM_SEED)
clicks_rng = random.Random(SAMPLE_RANDOM_SEED)

# The first entry is the header row; data rows are appended after it.
rows = [['Item ID', 'Title', 'Description', 'Brand', 'Size', 'Color', 'Clicks']]

for i in range(1, SAMPLE_ITEMS_COUNT + 1):
  random_brand = fake.company()
  random_size = fake.random_element(elements=SIZES)
  random_color = fake.safe_color_name()
  random_description_max_length = fake.random_element(
      elements=DESCRIPTION_MAX_LENGTHS)
  # Every 3rd item omits the brand from its title, every 4th the size and
  # every 5th the color. The blank entries are still joined, so such titles
  # contain stray ', ' separators.
  # NOTE(review): presumably intentional, to mimic low-quality feed entries -
  # confirm.
  title = ', '.join([
       '' if i % 3 == 0 else random_brand,
       fake.catch_phrase(),
       '' if i % 4 == 0 else random_size,
       '' if i % 5 == 0 else random_color,
  ])
  # Every 6th item omits the brand from its description.
  description = ', '.join([
      '' if i % 6 == 0 else random_brand,
      fake.text(max_nb_chars=random_description_max_length),
  ])
  row = [
      i,
      title,
      # Every 7th item reuses its title as the description.
      title if i % 7 == 0 else description,
      random_brand,
      random_size,
      random_color,
      clicks_rng.randint(1, 10000),
  ]
  rows.append(row)

generated_feed = pd.DataFrame(rows[1:], columns=rows[0])
generated_feed.head()

In [None]:
#@title Output to csv
#@markdown The file will be saved in the default 'home' folder on Colab (`/content/`) and can be downloaded from there.
# Persist the sample feed, leaving the DataFrame index out of the file.
generated_feed.to_csv(path_or_buf='/content/faker_sample.csv', index=False)