# Copycat Demo

This is a demo of the copycat AI.

## Git-on-Borg set-up
If this is the first time you are using GoB in this runtime, go to https://professional-services.googlesource.com/new-password and follow the steps. Eventually you should be shown a box with some bash code in it. Copy and paste the code into the cell below, replacing the text `[YOUR AUTH SCRIPT GOES HERE]`. Then run that cell.


In [None]:
%%bash
[YOUR AUTH SCRIPT GOES HERE]

## Install copycat through pip

In [None]:
%pip install 'git+https://professional-services.googlesource.com/solutions/copycat#egg=copycat&subdirectory=py'

In [None]:
import copycat

from IPython.display import Markdown, display
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

import gspread
from google.colab import auth
from google.cloud import aiplatform
from google.auth import default
import vertexai


In [None]:
# Patch asyncio so that an already-running event loop can be re-entered.
# NOTE(review): presumably required because the notebook kernel already runs a
# loop while copycat issues async requests — confirm.
import nest_asyncio
nest_asyncio.apply()

In [None]:
# Register tqdm's progress-bar helpers (e.g. `progress_apply`) on pandas objects.
tqdm.pandas()

# Authenticate Vertex AI and gSpread

Complete

1.   **_PROJECT_ID** with the GCP Project ID where you want to use VertexAI (make sure you have the vertexAI and Spreadsheets API enabled)
2.   **_LOCATION** with the [location](https://cloud.google.com/vertex-ai/docs/general/locations#available-regions) where to run vertexAI

If you run into ADC authentication errors then it might be required to re-authenticate GCP credentials in your runtime. Uncomment and run the cell below to resolve this.

In [None]:
# !gcloud auth application-default login --scopes 'https://www.googleapis.com/auth/cloud-platform','https://www.googleapis.com/auth/spreadsheets','https://www.googleapis.com/auth/drive'

In [None]:
# Authenticate the Colab user; must run before any credentials are requested.
auth.authenticate_user()

# Colab form fields: both must be filled in before running this cell.
_PROJECT_ID = ""  # @param {type:"string"}
_LOCATION = ""  # @param {type:"string"}

# Point the Vertex AI SDK at the chosen project and region.
vertexai.init(project=_PROJECT_ID, location=_LOCATION)
# Request default credentials scoped for Google Sheets and Google Drive; the
# second return value of `default` is not needed here.
creds, _ = default(scopes=[
    'https://www.googleapis.com/auth/spreadsheets',
    'https://www.googleapis.com/auth/drive'
]
)
# Shared gspread client used by every sheet read/write below.
GSPREAD_CLIENT = gspread.authorize(creds)


# Load data from google sheets

First I load in the input data that has been downloaded from google ads reports.
For this, enter the URLs and sheet names for the google sheets containing the
data for each report. If the google sheets only have a single tab, you don't
need to set the sheet name, it will take the first sheet by default. The linked
sheet here is the
[CopyCat Data Template](https://docs.google.com/spreadsheets/d/1u-Ex25XkVKFzn0r0LuxROeAtbGsOzlnmU15PzHignXw/edit?gid=0#gid=0)

In [None]:
def load_data_from_google_sheet(
    url: str, worksheet_name: str, client: gspread.Client, skip_rows: int = 0
) -> pd.DataFrame:
  """Reads one tab of a Google Sheet into a pandas dataframe.

  Args:
    url: URL of the spreadsheet to open.
    worksheet_name: Name of the tab to read. If empty, the first tab is used.
    client: Authorized gspread client used to open the spreadsheet.
    skip_rows: Number of leading rows to ignore before the header row.

  Returns:
    A dataframe whose column names come from row `skip_rows` of the tab and
    whose records are all of the rows after it.
  """
  spreadsheet = client.open_by_url(url)
  tab = (
      spreadsheet.worksheet(worksheet_name)
      if worksheet_name
      else spreadsheet.sheet1
  )
  rows = tab.get_all_values()

  header = rows[skip_rows]
  records = rows[skip_rows + 1 :]
  return pd.DataFrame.from_records(records, columns=header)


def write_data_to_google_sheet(
    sheet_name: str, data: pd.DataFrame, client: gspread.Client
) -> gspread.Worksheet:
  """Writes data to a Google Sheet."""
  values = data.values.tolist()
  column_names = data.columns.tolist()
  values.insert(0, column_names)
  worksheet = client.create(sheet_name)

  column_letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"

  worksheet.sheet1.update(
      f"A1:{column_letters[len(values[0])-1]}{len(values)+1}", values
  )
  return worksheet


# Colab form fields: URL and tab name for each of the three input reports. All
# three default to different tabs of the same template spreadsheet.
ad_report_url = "https://docs.google.com/spreadsheets/d/1u-Ex25XkVKFzn0r0LuxROeAtbGsOzlnmU15PzHignXw/edit?gid=1482290088#gid=1482290088"  # @param {type:"string"}
ad_report_sheet_name = "existing_ads"  # @param {type:"string"}
search_keyword_report_url = "https://docs.google.com/spreadsheets/d/1u-Ex25XkVKFzn0r0LuxROeAtbGsOzlnmU15PzHignXw/edit?gid=1482290088#gid=1482290088"  # @param {type:"string"}
search_keyword_report_sheet_name = "existing_keywords"  # @param {type:"string"}
new_keywords_url = "https://docs.google.com/spreadsheets/d/1u-Ex25XkVKFzn0r0LuxROeAtbGsOzlnmU15PzHignXw/edit?gid=1482290088#gid=1482290088"  # @param {type:"string"}
new_keywords_sheet_name = "new_keywords"  # @param {type:"string"}

# Existing ads (headline/description columns per campaign and ad group).
ad_report_data = load_data_from_google_sheet(
    ad_report_url, ad_report_sheet_name, GSPREAD_CLIENT, skip_rows=0
)
# Existing keywords per campaign and ad group.
search_keyword_report_data = load_data_from_google_sheet(
    search_keyword_report_url,
    search_keyword_report_sheet_name,
    GSPREAD_CLIENT,
    skip_rows=0,
)
# New keywords for which ad copy will be generated.
new_keywords_data = load_data_from_google_sheet(
    new_keywords_url, new_keywords_sheet_name, GSPREAD_CLIENT, skip_rows=0
)

# Process data

This combines the keywords and the ad copy together, and formats it so that it's
ready for copycat. In the final solution this will be handled by the google
sheet.

In [None]:
# Work on a copy so the raw report loaded from the sheet is left untouched.
clean_ad_report_data = ad_report_data.copy()

# Text columns only: names starting with "Headline" / "Description" that do not
# end in "position".
headline_cols = [
    name
    for name in clean_ad_report_data.columns
    if name.startswith("Headline") and not name.endswith("position")
]
description_cols = [
    name
    for name in clean_ad_report_data.columns
    if name.startswith("Description") and not name.endswith("position")
]

# Collapse the per-slot columns into one list per row, dropping empty cells and
# the "--" placeholder. Headlines starting with "{" are dropped as well.
clean_ad_report_data["headlines"] = pd.Series(
    [
        [
            text
            for text in row
            if text != "--" and text and not text.startswith("{")
        ]
        for row in clean_ad_report_data[headline_cols].values.tolist()
    ],
    index=clean_ad_report_data.index,
)
clean_ad_report_data["descriptions"] = pd.Series(
    [
        [text for text in row if text != "--" and text]
        for row in clean_ad_report_data[description_cols].values.tolist()
    ],
    index=clean_ad_report_data.index,
)

# Key the ads by (campaign, ad_group) and keep only the two list columns.
clean_ad_report_data = clean_ad_report_data.set_index(["campaign", "ad_group"])
clean_ad_report_data = clean_ad_report_data[["headlines", "descriptions"]]

# Rows without any usable headline are discarded.
clean_ad_report_data = clean_ad_report_data.loc[
    clean_ad_report_data["headlines"].apply(len).gt(0)
]

print("Unique counts: ")
print(clean_ad_report_data.astype(str).nunique())

In [None]:
# Build the keyword table in one pipeline: select the relevant columns, strip
# surrounding brackets and double quotes from the keywords, drop duplicate
# rows and key the result by (campaign, ad_group).
keywords_data = (
    search_keyword_report_data[["campaign", "ad_group", "keywords"]]
    .assign(keywords=lambda frame: frame["keywords"].str.strip('[]"'))
    .drop_duplicates()
    .set_index(["campaign", "ad_group"])
)
# Rows whose keywords are empty after stripping are discarded.
keywords_data = keywords_data[keywords_data["keywords"].ne("")]

print(f"{keywords_data['keywords'].nunique()} unique keyword lists")

In [None]:
# Pair every keyword list with the ads of the same (campaign, ad_group); rows
# without a match on either side are dropped by the inner join on the index.
train_data = keywords_data.join(clean_ad_report_data, how="inner")

n_train_rows_message = (
    f"{len(train_data):,} unique combiations of keywords, headlines and"
    " descriptions"
)
print(n_train_rows_message)

In [None]:
# The new keywords are the inputs to generate ads for; identical rows are
# dropped so that each one is generated only once.
test_data = new_keywords_data.drop_duplicates()
print(f"{len(test_data):,} new lists of keywords")

In [None]:
# Preview the first rows of the training data.
train_data.head()

In [None]:
# Preview the first rows of the new keywords.
test_data.head()

# Copycat Parameters

In [None]:
# Colab form fields configuring the brand, the training step and generation.
# Brand name inserted into the prompts.
COMPANY_NAME = "YOUR_BRAND"  # @param {type:"string"}
# When True, a style guide is generated from the training data further below.
USE_STYLE_GUIDE = False  # @param {type:"boolean"}
# Embedding model passed to `Copycat.create_from_pandas`.
EMBEDDING_MODEL_NAME = "text-embedding-004"  # @param {type:"string"}
AD_FORMAT = "responsive_search_ad"  # @param {type:"string"}
LANGUAGE = "english"  # @param {type:"string"}

# Generation settings passed unchanged to every copycat generation call.
NUM_IN_CONTEXT_EXAMPLES = 5  # @param {type:"integer"}
CHAT_MODEL_NAME = "gemini-1.5-pro-preview-0514" # @param ["gemini-1.5-flash-preview-0514", "gemini-1.0-pro", "gemini-1.5-pro-preview-0514"]
TEMPERATURE = 1  # @param {type:"slider", min:0, max:2, step:0.01}
TOP_K = 40  # @param {type:"integer"}
TOP_P = 0.95  # @param {type:"number"}

# Style Guide Generation

Copycat's distinctive performance mainly comes from its ability to leverage a
style guide to generate the ad copies and stick closely to an advertiser's style.

To generate a style guide:

1.  Export the training data to a google sheet (run the first cell below).
2.  Navigate to
    [AI Studio](https://aistudio-preprod.corp.google.com/app/prompts/new_chat)
3.  On the top right, select `gemini 1.5 pro`.
4.  Copy and paste the following prompt in the chat box, replace YOUR_BRAND with
    your brand's name:

*This is an Ad report for YOUR_BRAND. Use this information to write a
comprehensive style guide for this brand's ad copies that can serve as
instruction for another model to generate ad copies for YOUR_BRAND by only
serving it keywords and this guide. Ensure that you capture strong phrases,
slogans and brand names of YOUR_BRAND in the guide.*

5.  Use the + to the right of the chat box to add the training data (from
    Google Drive) and any other style or branding guides you might have.

6.  Run the model and copy its response by selecting the 3 dots on the top right
    of the response message, copying the markdown and pasting it below in
    `style_guide`.

7.  Feel free to make any changes to the style guide to better match your brand.

In [None]:
if not USE_STYLE_GUIDE:
  # Style guide disabled: the generation calls receive an empty guide.
  style_guide = ""
else:
  # Fill the prompt template with the brand name and the training data as CSV.
  style_guide_prompt = """
  Below is an ad report for {company_name}, containing their ads (headlines and descriptions)
  that they use on Google Search Ads for the corresponding keywords. Headlines and descriptions are lists, and
  Google constructs ads by combining those headlines and descriptions together into ads. Therefore the headlines and descriptions
  should be sufficiently varied that Google is able to try lots of different combinations in order to find what works best.

  Use the ad report to write a comprehensive style guide for this brand's ad copies that can serve as
  instruction for a copywriter to write new ad copies for {company_name} for new lists of keywords.

  Ensure that you capure strong phrases, slogans and brand names of {company_name} in the guide.
  \n\n{style_guide}
  """.format(
      company_name=COMPANY_NAME,
      style_guide=train_data.reset_index(drop=True).astype(str).to_csv(),
  )

  # Ask the model for the guide, using fixed sampling settings.
  response = copycat.ad_copy_generator.generative_models.GenerativeModel(
      "gemini-1.5-pro-preview-0514",
      generation_config=dict(temperature=0.95, top_k=20, top_p=0.95),
  ).generate_content(style_guide_prompt)

  # The guide is the text of the first part of the first candidate.
  style_guide = response.candidates[0].content.parts[0].text

  display(Markdown(style_guide))

# Copycat Generation

First we set up the copycat model. This will process the existing ads and put
them into a vectorstore, so they can be easily retrieved.

In [None]:
# Build the copycat model from the (keywords, headlines, descriptions) training
# data using the embedding model and ad format chosen above.
# NOTE(review): on_invalid_ad="skip" presumably drops training ads that fail
# the ad format's validation instead of raising — confirm against copycat.
model = copycat.Copycat.create_from_pandas(
    training_data=train_data,
    embedding_model_name=EMBEDDING_MODEL_NAME,
    ad_format=AD_FORMAT,
    on_invalid_ad="skip",
)

First let's look at how the prompt looks. Here is an example:

In [None]:
# Render, as markdown, the first generation request built for the first row of
# the new keywords.
# NOTE(review): unlike the generation cells below, no
# `keywords_specific_instructions` are passed here, so this preview may differ
# from the prompt that is actually sent — confirm.
display(Markdown(
  model.construct_text_generation_requests_for_new_ad_copy(
      keywords=[test_data["keywords"].values[0]],
      system_instruction_kwargs = dict(
          company_name=COMPANY_NAME,
          language=LANGUAGE,
      ),
      style_guide=style_guide,
      num_in_context_examples=NUM_IN_CONTEXT_EXAMPLES,
      model_name=CHAT_MODEL_NAME,
      temperature=TEMPERATURE,
      top_k=TOP_K,
      top_p=TOP_P,
  )[0].to_markdown()
))

Now let's see a single response:

In [None]:
# Generate an ad for the first row of the new keywords only, as a smoke test
# before running the full batch. The call takes lists and returns one response
# per entry, hence the single-element lists and the trailing `[0]`.
response = model.generate_new_ad_copy(
    keywords=[test_data["keywords"].values[0]],
    keywords_specific_instructions=[
        test_data["keywords_instructions"].values[0]
    ],
    system_instruction_kwargs=dict(
        company_name=COMPANY_NAME,
        language=LANGUAGE,
    ),
    style_guide=style_guide,
    num_in_context_examples=NUM_IN_CONTEXT_EXAMPLES,
    model_name=CHAT_MODEL_NAME,
    temperature=TEMPERATURE,
    top_k=TOP_K,
    top_p=TOP_P,
)[0]

# Show the generated ad, plus the error message if generation did not succeed.
display(Markdown(str(response.google_ad)))
if not response.success:
  print(response.error_message)

If that looks good, let's generate for all the test data. You can select the
batch size to control how many ads to generate in parallel. If the batch size is
large then you will process the data faster, but you may hit quota limits.

In [None]:
BATCH_SIZE = 15  # @param {type:"integer"}

if BATCH_SIZE < 1:
  raise ValueError(f"BATCH_SIZE must be a positive integer, got {BATCH_SIZE}.")

generated_ads = []
# A plain int that is at least 1, so splitting also works for empty test data.
n_batches = max(1, int(np.ceil(len(test_data) / BATCH_SIZE)))

# Split the row positions rather than the dataframe itself: passing a
# dataframe to np.array_split relies on the deprecated `DataFrame.swapaxes`.
# The resulting batches contain the same rows in the same order.
for batch_positions in tqdm(
    np.array_split(np.arange(len(test_data)), n_batches), desc="Batch"
):
  if len(batch_positions) == 0:
    # Only happens when there is no test data at all; nothing to generate.
    continue
  keywords_batch = test_data.iloc[batch_positions]
  generated_ads.extend(
      model.generate_new_ad_copy(
          keywords=keywords_batch["keywords"].values.tolist(),
          keywords_specific_instructions=keywords_batch[
              "keywords_instructions"
          ].values.tolist(),
          system_instruction_kwargs=dict(
              company_name=COMPANY_NAME,
              language=LANGUAGE,
          ),
          style_guide=style_guide,
          num_in_context_examples=NUM_IN_CONTEXT_EXAMPLES,
          model_name=CHAT_MODEL_NAME,
          temperature=TEMPERATURE,
          top_k=TOP_K,
          top_p=TOP_P,
      )
  )

# Positions (in generation order) of the ads that have fewer headlines or
# descriptions than the ad format allows. Computed once, after all batches.
incomplete_keywords = [
    position
    for position, generated_ad in enumerate(generated_ads)
    if len(generated_ad.google_ad.headlines) < model.ad_format.max_headlines
    or len(generated_ad.google_ad.descriptions)
    < model.ad_format.max_descriptions
]

# Responses come back in row order, so they align with the test data index.
test_data["generated_ad_object"] = pd.Series(
    generated_ads, index=test_data.index
)

In [None]:
def _add_existing_ad_to_instructions(
    row: pd.Series,
    max_headlines: int = max_headlines,
    max_descriptions: int = 4,
) -> str:
  instructions = row["keyword_instructions"]
  headlines = row["generated_ad_object"].google_ad.headlines
  descriptions = row["generated_ad_object"].google_ad.descriptions

  n_existing_headlines = len(headlines)
  n_existing_descriptions = len(descriptions)
  n_required_headlines = max_headlines - n_existing_headlines
  n_required_descriptions = max_descriptions - n_existing_descriptions

  if (0 < n_required_headlines < max_headlines) and (
      0 < n_required_descriptions < max_descriptions
  ):
    instructions += (
        f"\n\nThis ad already has {n_existing_headlines} headlines and"
        f" {n_existing_descriptions} descriptions. You only need to add"
        f" {n_required_headlines} headlines and"
        f" {n_required_descriptions} descriptions to complete the ad. The"
        " existing headlines and descriptions are: \n- headlines:"
        f" {headlines}\n- descriptions: {descriptions}"
    )
  elif (0 < n_required_headlines < max_headlines) and (
      n_required_descriptions == 0
  ):
    instructions += (
        f"\n\nThis ad already has {n_existing_headlines} headlines and"
        f" {n_existing_descriptions} descriptions. You only need to add"
        f" {n_required_headlines} headlines to complete the ad. You do not need"
        " to generate any descriptions, as there are enough already. The"
        " existing headlines and descriptions are: \n- headlines:"
        f" {headlines}\n- descriptions: {descriptions}"
    )
  elif (0 < n_required_headlines < max_headlines) and (
      n_existing_descriptions == 0
  ):
    instructions += (
        f"\n\nThis ad already has {n_existing_headlines} headlines. You only"
        f" need to add {n_required_headlines} headlines to complete the ad, as"
        f" well as {n_required_descriptions} descriptions. The existing"
        f" headlines are: \n- headlines: {headlines}"
    )
  elif (n_required_headlines == 0) and (
      0 < n_required_descriptions < max_descriptions
  ):
    instructions += (
        f"\n\nThis ad already has {n_existing_headlines} headlines and"
        f" {n_existing_descriptions} descriptions. You only need to add"
        f" {n_required_descriptions} descriptions to complete the ad. You do"
        " not need to generate any headlines, as there are enough already. The"
        " existing headlines and descriptions are: \n- headlines:"
        f" {headlines}\n- descriptions: {descriptions}"
    )
  elif (n_required_headlines == max_headlines) and (
      0 < n_required_descriptions < max_descriptions
  ):
    instructions += (
        f"\n\nThis ad already has {n_existing_descriptions} descriptions. You"
        f" only need to add {n_required_descriptions} descriptions to complete"
        f" the ad, as well as {n_required_headlines} headlines. The existing"
        f" descriptions are: \n- descriptions: {descriptions}"
    )

  return instructions

In [None]:
# Maximum number of extra generation rounds for ads that are still incomplete.
MAX_RETRIES = 5

test_data["is_complete"] = test_data["generated_ad_object"].apply(
    lambda x: model.ad_copy_evaluator.is_complete(x.google_ad)
)
fraction_complete = test_data["is_complete"].mean()
n_retries = 0
while fraction_complete < 1.0 and n_retries < MAX_RETRIES:
  n_retries += 1
  print(
      f"{n_retries} / {MAX_RETRIES}: Fraction complete {fraction_complete:.1%}"
  )

  incomplete_rows = test_data.loc[~test_data["is_complete"]].copy()

  # Size the batches for the rows that are left. Reusing the batch count of
  # the full dataset would create empty batches (and empty model calls) as
  # soon as fewer rows than batches remain.
  n_retry_batches = max(1, int(np.ceil(len(incomplete_rows) / BATCH_SIZE)))

  extended_incomplete_ads = []

  # Split row positions rather than the dataframe itself: passing a dataframe
  # to np.array_split relies on the deprecated `DataFrame.swapaxes`.
  for batch_positions in tqdm(
      np.array_split(np.arange(len(incomplete_rows)), n_retry_batches),
      desc="Batch",
  ):
    keywords_batch = incomplete_rows.iloc[batch_positions]
    extended_incomplete_ads.extend(
        model.generate_new_ad_copy(
            keywords=keywords_batch["keywords"].values.tolist(),
            keywords_specific_instructions=keywords_batch[
                "keywords_instructions"
            ].values.tolist(),
            system_instruction_kwargs=dict(
                company_name=COMPANY_NAME,
                language=LANGUAGE,
            ),
            style_guide=style_guide,
            num_in_context_examples=NUM_IN_CONTEXT_EXAMPLES,
            model_name=CHAT_MODEL_NAME,
            temperature=TEMPERATURE,
            top_k=TOP_K,
            top_p=TOP_P,
        )
    )

  extended_incomplete_ads = pd.Series(
      extended_incomplete_ads, index=incomplete_rows.index
  )
  # Append the newly generated copy to the existing ad, then let copycat
  # remove whatever is invalid for the ad format.
  for index, extended_incomplete_ad in extended_incomplete_ads.items():
    existing_ad = test_data.loc[index, "generated_ad_object"].google_ad
    existing_ad.headlines.extend(extended_incomplete_ad.google_ad.headlines)
    existing_ad.descriptions.extend(
        extended_incomplete_ad.google_ad.descriptions
    )
    copycat.ad_copy_generator.remove_invalid_headlines_and_descriptions(
        existing_ad, model.ad_format
    )

  test_data["is_complete"] = test_data["generated_ad_object"].apply(
      lambda x: model.ad_copy_evaluator.is_complete(x.google_ad)
  )
  fraction_complete = test_data["is_complete"].mean()

print(f"Fraction complete: {fraction_complete:.1%}")

In [None]:
# Show every generated ad below its keywords; failed generations additionally
# print their error message.
for response in test_data["generated_ad_object"].tolist():
  print("")
  print(response.keywords)
  display(Markdown(str(response.google_ad)))
  if response.success:
    continue
  print(response.error_message)

Finally we export back to Google Sheets. By default this exports to the
Google Sheet where the new keywords were taken from, but to a different tab
named "Copycat Generated Sample". Make sure that tab exists, otherwise this
won't work.

In [None]:
def extract_headlines(response):
  """Returns the ad's headlines as a Series labelled "Headline 1", "Headline 2", ..."""
  headline_texts = response.google_ad.headlines
  labels = [f"Headline {n}" for n in range(1, len(headline_texts) + 1)]
  return pd.Series(headline_texts, index=labels)


def extract_descriptions(response):
  """Returns the ad's descriptions as a Series labelled "Description 1", ..."""
  description_texts = response.google_ad.descriptions
  labels = [
      f"Description {n}" for n in range(1, len(description_texts) + 1)
  ]
  return pd.Series(description_texts, index=labels)


# One column per headline / description slot. Ads with fewer items than the
# widest ad get "--" in the missing slots.
headlines = (
    test_data["generated_ad_object"].apply(extract_headlines).fillna("--")
)
descriptions = (
    test_data["generated_ad_object"].apply(extract_descriptions).fillna("--")
)

# Combine keywords, headlines and descriptions row by row via the shared
# index; `reset_index` then turns that index into a regular column.
results = (
    test_data[["keywords"]]
    .copy()
    .merge(headlines, left_index=True, right_index=True)
    .merge(descriptions, left_index=True, right_index=True)
    .reset_index()
)
# Last expression of the cell: displays the table in the notebook.
results

In [None]:
results_sheet_name = "Copycat Generated Sample"  # @param {type:"string"}

# NOTE: despite its name, `worksheet` is the spreadsheet opened from the URL;
# the individual tab is looked up by name below and must already exist.
worksheet = GSPREAD_CLIENT.open_by_url(new_keywords_url)

# Header row followed by the data rows.
values = results.values.tolist()
values.insert(0, results.columns.values.tolist())

# Use gspread's own A1 helper for the bottom-right cell, so that tables wider
# than 26 columns (AA, AB, ...) work and the range covers exactly the data.
bottom_right_cell = gspread.utils.rowcol_to_a1(len(values), len(values[0]))
worksheet.worksheet(results_sheet_name).update(
    f"A1:{bottom_right_cell}", values
)

print(f"Generated ads stored in google sheet: {worksheet.url}")