In [None]:
from datetime import datetime, timedelta

# Notebook parameters

In [None]:
# Day to process: yesterday relative to the machine's local clock, as YYYY-MM-DD.
# NOTE(review): datetime.now() is timezone-naive, so "yesterday" follows the
# timezone of whatever host runs the notebook — confirm that is intended.
sample_day = (datetime.now() - timedelta(days=1)).date().isoformat()
# Bucket folder holding the daily input CSVs.
source_folder1 = 'gs://dsart_nearline1/pipelines/bird1/'
# Bucket folder the generated digest JSON is uploaded to.
target_folder = 'gs://dsart_nearline1/pipelines/digest/'

In [None]:
# Show the resolved day as a quick sanity check before any data is fetched.
sample_day

# Dependencies

In [None]:
import json
import os
import re
import time

import numpy
import pandas
import vertexai
import vertexai.preview.generative_models as generative_models
from dotenv import load_dotenv
from vertexai.generative_models import GenerativeModel

# Load data

In [None]:
def clean_column_names(df):
    """Remove carriage-return characters from the column labels of ``df``.

    The frame is modified in place and nothing is returned. When at least one
    label changes, the old-to-new mapping is printed.

    Args:
        df: DataFrame whose column labels are strings.
    """
    renames = {name: name.replace('\r', '') for name in df.columns if '\r' in name}
    if renames:
        print('renaming', renames)
        df.rename(columns=renames, inplace=True)

In [None]:
# Name of the day's CSV in the bucket, the local copy's name, and the full source URI.
file1 = f'{sample_day}.csv'
file1_local = f'{sample_day}_bird1.csv'
file1_source = f'{source_folder1}{file1}'
file1_source, file1_local

In [None]:
# Download the day's CSV from the bucket to the working directory.
! gsutil cp {file1_source} {file1_local}

In [None]:
# Load the downloaded CSV. lineterminator='\n' makes LF the only record
# separator; presumably this keeps '\r' characters inside post text from
# splitting rows, at the cost of a stray '\r' on headers of CRLF files
# (handled by clean_column_names) — TODO confirm.
df1 = pandas.read_csv(file1_local, lineterminator='\n')

In [None]:
# Strip '\r' from df1's column names; clean_column_names mutates df1 in place.
clean_column_names(df1)

In [None]:
# Preview the loaded frame.
df1

## Configure Vertex AI Model

In [None]:
# Initialise the Vertex AI SDK and create the model handle that call_vertex_ai uses.
# NOTE(review): the project id, region and model version are hard-coded here;
# consider moving them next to the other notebook parameters.
vertexai.init(project="deep-mark-425321-r7", location="us-central1")
vertex_model = GenerativeModel("gemini-1.5-flash-001")

In [None]:
# Sampling settings passed to every generate_content call.
generation_config = dict(
    max_output_tokens=1024,
    temperature=1,
    top_p=0.95,
)

In [None]:
# Every listed harm category uses the same blocking threshold.
safety_settings = dict.fromkeys(
    (
        generative_models.HarmCategory.HARM_CATEGORY_HATE_SPEECH,
        generative_models.HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT,
        generative_models.HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT,
        generative_models.HarmCategory.HARM_CATEGORY_HARASSMENT,
    ),
    generative_models.HarmBlockThreshold.BLOCK_MEDIUM_AND_ABOVE,
)

In [None]:
def call_vertex_ai(prompt):
    """Send ``prompt`` to the configured model and return its raw response.

    Uses the module-level ``vertex_model``, ``generation_config`` and
    ``safety_settings``.

    Args:
        prompt: Prompt text, sent as the single content item of the request.

    Returns:
        Whatever ``vertex_model.generate_content`` returns.
    """
    request_options = {
        'generation_config': generation_config,
        'safety_settings': safety_settings,
    }
    return vertex_model.generate_content([prompt], **request_options)

In [None]:
def extract_json(v):
    """Cut the outermost ``{...}`` span out of a model reply.

    The reply may wrap the JSON object in prose or code fences, leave a
    trailing comma before a closing brace/bracket, or omit the closing brace
    entirely. This function repairs those cases and returns the text between
    the first ``{`` and the last ``}`` inclusive.

    Args:
        v: Raw reply text.

    Returns:
        The candidate JSON object as a string, or ``''`` when the text
        contains no ``{`` at all. The result is not guaranteed to be valid
        JSON; the caller still has to parse it.
    """
    # Drop trailing commas even when whitespace/newlines sit between the comma
    # and the closing brace or bracket (e.g. ',\n}' or ', ]').
    v = re.sub(r',\s*([}\]])', r'\1', v)
    # A reply with no closing brace at all gets one appended.
    if '}' not in v:
        v += '}'
    start = v.find('{')
    if start == -1:
        # No object to extract; avoid slicing from index -1.
        return ''
    end = v.rfind('}')
    return v[start:end + 1]

In [None]:
def parse_vertex_response(res):
    """Parse the JSON object contained in a model response.

    Best effort: any ordinary failure (the response exposes no text, or the
    extracted text is not valid JSON) yields ``None`` instead of raising.
    ``KeyboardInterrupt`` and ``SystemExit`` are deliberately not swallowed,
    so a long run can still be interrupted.

    Args:
        res: Response object exposing the reply through a ``text`` attribute.

    Returns:
        The decoded JSON value, or ``None`` when it cannot be obtained.
    """
    try:
        # Reading .text can itself fail, so it must stay inside the try.
        raw_text = res.text
        return json.loads(extract_json(raw_text))
    except Exception:
        return None

# Generate digests

In [None]:
# Maximum number of posts kept per category before the prompt is built (applied in make_digest).
top_n = 250

In [None]:
# Row count per category label, shown for inspection (value_counts skips missing labels by default).
category_counts = dict(df1['category_label'].value_counts())
category_counts

In [None]:
# Distinct category labels to build a digest for. Missing labels are dropped:
# each entry is later concatenated with a string and used as a df1 column name,
# both of which fail on NaN. This also matches value_counts(), which ignores
# missing labels by default.
categories = list(df1['category_label'].dropna().unique())
categories

In [None]:
# Columns copied from the selected posts into each digest's 'links' records (see make_digest).
links_cols = [
    'id', 'hash', 'fid', 'text', 'topic_label',
    'num_like', 'num_recast', 'num_reply',
    'predict_like'
]

In [None]:
# Instruction header shared by every prompt; make_prompt appends the posts after "POSTS:".
# The requested reply is a JSON object with "title", "summary" and "links" keys,
# which is exactly what make_digest checks for.
BASE_PROMPT = """
INSTRUCTIONS:
  - Summarize the following posts.
  - Write a catch phrase title.
  - Write 2 or 3 paragraphs to describe the main themes and stories covered in the posts.
  - Use a journalistic style.
  - Include 5 links to reference the most interesting post ids.
  - Output the result in json format.
  - Make sure you don't use " in the json title and summary and avoid invalid json.

RESPONSE FORMAT:
{
  "title": "...catch phrase...",
  "summary": "... 2 or 3 paragraphs ...",
  "links": ["uuid1", "uuid2", "uuid3", "uuid4", "uuid5"]
}

POSTS:
"""
print(BASE_PROMPT)

In [None]:
def make_prompt(df_text, base_prompt=None):
    """Build the model prompt: the instruction header plus one tagged block per post.

    Each post is rendered as ``<id>``, its text and ``</id>`` on separate
    lines, preceded by a blank line, so the model can refer to posts by id.
    Rows whose ``text`` is not a string (for example a missing value) are
    skipped, because they carry nothing to summarise and would otherwise break
    string concatenation.

    Args:
        df_text: DataFrame with an 'id' and a 'text' column, one post per row.
        base_prompt: Instruction header placed before the posts. Defaults to
            the module-level ``BASE_PROMPT``.

    Returns:
        The complete prompt string.
    """
    if base_prompt is None:
        base_prompt = BASE_PROMPT
    # Collect pieces and join once instead of growing a string in the loop.
    parts = [base_prompt]
    for cast_id, cast_text in zip(df_text['id'], df_text['text']):
        if not isinstance(cast_text, str):
            continue
        parts.append(f"\n<{cast_id}>\n{cast_text}\n</{cast_id}>\n")
    return "".join(parts)

In [None]:
def make_digest(category, min_predict_like=20):
    """Generate the digest for one category, or ``None`` when that is not possible.

    Reads the module-level ``df1``, ``top_n`` and ``links_cols``. Posts are
    selected by keeping rows whose 'predict_like' exceeds ``min_predict_like``,
    taking the ``top_n`` rows with the highest value in the column named
    ``category``, and ordering them by 'timestamp'. The selection is sent to
    the model and the reply is validated before it is returned.

    Args:
        category: Category label; must also be the name of a column in ``df1``.
        min_predict_like: Exclusive lower bound on 'predict_like' for a post to
            be considered. Defaults to 20.

    Returns:
        A dict with 'title', 'summary', 'key' (the category) and 'links' (one
        record per cited post that is present in the selection, restricted to
        ``links_cols``). ``None`` when no post passes the filter or the model
        reply lacks the expected structure, so callers never store a
        half-formed digest.
    """
    print('make_digest', category)
    print('before filtering:')
    print(df1[[category, 'predict_like']].describe())
    df_tmp = df1[df1['predict_like'] > min_predict_like]
    df_tmp = df_tmp.sort_values(category, ascending=False).reset_index(drop=True)
    df_tmp = df_tmp[:top_n]
    df_tmp = df_tmp.sort_values('timestamp').reset_index(drop=True)
    print('after filtering:')
    print(df_tmp[[category, 'predict_like']].describe())
    if df_tmp.empty:
        # Nothing to summarise; asking the model anyway would yield an invented digest.
        print('no posts left after filtering')
        return None
    prompt = make_prompt(df_tmp)
    print('prompt', len(prompt))
    result = call_vertex_ai(prompt)
    digest = parse_vertex_response(result)
    is_valid = (
        isinstance(digest, dict)
        and all(field in digest for field in ('title', 'summary', 'links'))
        and isinstance(digest['links'], list)
    )
    if not is_valid:
        print('bad result')
        # Reading .text may raise (parse_vertex_response guards the same
        # access), so one bad response must not abort the whole run.
        try:
            print(result.text)
        except Exception as err:
            print('response text unavailable:', err)
        return None
    print('digest')
    print(json.dumps(digest, indent=4))
    # Keep only cited ids that really exist in the selection and expand them to full records.
    links = df_tmp[df_tmp['id'].isin(digest['links'])][links_cols].to_dict(orient='records')
    print('links')
    print(json.dumps(links, indent=4))
    digest['key'] = category
    digest['links'] = links
    return digest

In [None]:
# Collects one digest dict per category; filled by the loop below and written to JSON at the end.
data = []

In [None]:
# Start from an empty list so re-running this cell does not append duplicate digests.
data = []
for c in categories:
    banner = f"{c}{'#' * 32}"
    print(banner)
    digest = make_digest(c)
    # make_digest sets 'key' only on a digest that passed validation, so this
    # keeps malformed model replies out of the saved output.
    if isinstance(digest, dict) and 'key' in digest:
        data.append(digest)
    # Pause after every attempt, successful or not, since each attempt may have
    # issued a model call (presumably the pause is for rate limiting — confirm).
    time.sleep(15)
    print(banner)
    print()
    print()

# Save output

In [None]:
# Number of digests collected by the loop.
len(data)

In [None]:
# Local file the digests are written to, named after the processed day.
output_file = f'{sample_day}.json'
output_file

In [None]:
# Serialise all collected digests as one JSON array and write it to the local output file.
serialized = json.dumps(data)
with open(output_file, mode='w') as out_fp:
    out_fp.write(serialized)

In [None]:
# Upload the digest file to the target bucket folder.
! gsutil cp {output_file} {target_folder}