## Songshu Annotation via LLM (Google Gemini)

In [None]:
import google.generativeai as genai
from google.generativeai.types import HarmCategory, HarmBlockThreshold

import regex as re
import time
from pathlib import Path

In [None]:
# Get an API key from aistudio.google.com (requires a Google account).
# Either paste it into GOOGLE_AI_STUDIO below, or -- to keep the secret out of
# the notebook source -- leave the string empty and export the key as the
# GOOGLE_API_KEY environment variable before starting the kernel.
import os

GOOGLE_AI_STUDIO = ''  # <-- input your API key
# A pasted key takes precedence; the environment variable is only the fallback.
genai.configure(api_key=GOOGLE_AI_STUDIO or os.environ.get('GOOGLE_API_KEY'))

In [None]:
# Show every Google AI model reachable with the configured API key,
# together with the generation methods each one supports
for model_info in genai.list_models():
    print(f"{model_info.name!s} \t\t {model_info.supported_generation_methods!s}")


In [None]:
# Generation parameters passed to the model
generation_config = dict(
    temperature=0.0,
    top_p=1,
    top_k=1,
    max_output_tokens=128 * 1024,   # 128k; different LLMs have different values; check model documentation
)

# Every harm category is set to BLOCK_NONE, because no censorship of the
# annotated text is wanted
safety_settings = {
    category: HarmBlockThreshold.BLOCK_NONE
    for category in (
        HarmCategory.HARM_CATEGORY_HATE_SPEECH,
        HarmCategory.HARM_CATEGORY_HARASSMENT,
        HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT,
        HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT,
    )
}

# Candidate models. The LAST entry is the one actually used; reorder the
# entries (or comment some out) to pick a different model.
_MODEL_CANDIDATES = (
    "gemini-1.5-pro-latest",    # <-- this is supposedly the best Google model (but more expensive)
    "gemini-1.5-flash-latest",  # <-- this is a fast (but less accurate) model
    "gemini-1.5-pro-exp-0801",  # <-- this is currently (Aug. 2024-08) in preview mode
)
model_name = _MODEL_CANDIDATES[-1]

# High-level instructions (system prompt/instruction) go here; to use them,
# also uncomment the 'system_instruction' entry below
SYSTEM_PROMPT = '''
[to be determined]
'''

_model_kwargs = {
    "model_name": model_name,
    #"system_instruction": SYSTEM_PROMPT,
    "generation_config": generation_config,
    "safety_settings": safety_settings,
}
model = genai.GenerativeModel(**_model_kwargs)

# Bare expression so the notebook displays the configured model
model

### Input file to be annotated

In [53]:
# Input file - text to be annotated
fin = 'wudi1of3.txt'

# Read the whole file, trim surrounding whitespace, then cut it into segments
# at the ##### delimiter (the delimiter has been manually added to the file)
_raw_text = Path(fin).read_text(encoding='utf-8')
source_text = _raw_text.strip().split('#####')

In [55]:
# Display the input file name without its extension; the same stem is used
# further down to build the name of the annotated output file
Path(fin).stem

'wudi1of3'

In [None]:
# Sanity check of the split: how many segments there are, and what the
# 0th segment contains
_segment_count = len(source_text)
(_segment_count, source_text[0])

In [None]:
# Print the size (no. of characters) of each segment, one per line
for seg_len in map(len, source_text):
    print(seg_len)

In [None]:
# Create a chat session with an empty history.
# NOTE: the annotation loop further down replaces this session with a fresh
# one whenever the segment index is even, so this initial session only
# matters when the loop starts on an odd-numbered segment.
chat = model.start_chat(history=[])

In [None]:
# Prompt to be sent to LLM via API calls.
# Placeholders filled in per segment with str.format():
#   {idx}                  - segment number, echoed into the <segment> tag
#   {text_to_be_annotated} - the Classical Chinese text of the segment

PROMPT = """
You are an expert in the Classical Chinese language and ancient Chinese history, especially the Liu Song dynasty (5th century). You will be annotating a text enclosed in the <text> tag at the very end of this prompt. This is a multi-step prompt, consisting of 5 steps.

=== STEPS BEGIN ===
All of the following outputs from all steps will be enclosed in a single <segment number='{idx}'> tag.

Step 1:
Please tokenize the Classical Chinese passage into word tokens (a word can be a single-character or multi-character chunk). Simply separate the tokens using a single space. Retain the punctuation marks. Retain the empty line as a visual aid. For this step output the word-tokenized text within the tag <step1>.

Step 2:
Using the output from Step 1 (tokenized text), enclosed each word token with the XML-like tag <ne> (for named entity) only if the token is a named entity (NE), such as personal name, geographic location, date, job title, etc. Do not use a tag for a non-NE. For this step output the annotated text within the tag <step2>. 

Step 3:
Using the output from Step 2, for each identified named entity within the <ne> tag, replace the <ne> tag with one of the following specific tags based on the named entity's classification:
<p> for a personal name;
<g> for a geographical name;
<o> for the name of a political office or job title;
<et> for an emperor's temple name 廟號 (e.g., 高祖);
<ep> for an emperor's posthumous name 諡號 (e.g., 武帝);
<era> for an imperial era name (e.g., 永初);
<dy> for a dynastic name (e.g., 晉, 漢);
<k> for a kinship term, such as 父, 母, 子, 女, 甥, 繼母, 從弟, 從兄, 從子, 從叔, 兄子, 弟子, 伯, 叔, 舅, 祖父, 從嫂, 從叔;
<d> for a date or time.
For this step output the annotated text within the tag <step3>.

Step 4:
Use the the output from Step 3, for each named entity identified as a personal name (<p>), attempt to recover the full name associated with this named entity by adding the surname; place this full name as an attribute to the <p> tag. For example, <p fn="劉裕">裕</p>, who is the founding emperor of the Liu Song Dynasty 劉裕 with the surnamed 劉. Output the entire annotated text. For a named entity identified as a date, if it is a year, attempt to prefix it with the proper era name, and place this full year designation (e.g., 永初二年, 隆安三年) as an attribute to the <d> tag, e.g., <d fd='永初二年'>二年</d>. Output the annotated text within the tag <step4>.

Step 5:
Provide a summary of the annotations made at the end within a <summary> tag.

=== STEPS END ===

The text to be annotated is:
<text>
{text_to_be_annotated}
</text>
"""

SLEEP_TIME = 30     # seconds to pause after each segment before sending the next one
START_SEGMENT = 0   # if a run stops at Segment n, set this to n and re-run the cell to resume

# output file - (hopefully) annotated
fstem = Path(fin).stem  # retrieve name of input file without file extension
# genai.list_models() prints names in the form 'models/...'; a '/' inside the
# file name would be interpreted as a directory separator, so neutralise it.
fon = f"{fstem}_annotated.{model_name.replace('/', '_')}.txt"


def _write_response(response, fo, idx, separator=False):
    """Write the text of every chunk of *response* to the open file *fo*.

    Args:
        response: the object returned by ``chat.send_message()``.
        fo: writable text file handle for the annotated output.
        idx: segment number, used only in diagnostic messages.
        separator: when true, write a line of 80 underscores after each chunk.

    A chunk whose ``.text`` accessor raises ``ValueError`` (the response holds
    no text part) is recorded as a marker line in the output instead of
    aborting the whole run.
    """
    for chunk in response:
        try:
            text = chunk.text
        except ValueError as err:
            print(f'!!!! Segment {idx}: response contained no text ({err})')
            text = f'[NO TEXT RETURNED FOR SEGMENT {idx}]'
        fo.write(text + '\n')
        if separator:
            fo.write("_"*80 + '\n')
    fo.flush()


# Opened in append mode so that a resumed run adds to the existing output.
with open(fon, 'a', encoding='utf-8', newline='\n') as fo:

    for idx, src in enumerate(source_text):
        if idx < START_SEGMENT: continue  # resume support (see START_SEGMENT)
        if src.startswith('#'): continue  # skip the #### lines
        text_to_be_annotated = src.strip()
        if not text_to_be_annotated:
            # e.g. a delimiter at the very start/end of the file yields an
            # empty segment; there is nothing to annotate, so do not call the API
            print(f'Segment {idx}: empty, skipped')
            continue

        prompt = PROMPT.format(text_to_be_annotated=text_to_be_annotated, idx=idx)
        print(f'Segment {idx}:')
        print(prompt)

        # Start a fresh chat (empty history) on every even-numbered segment,
        # i.e. the history is carried over for at most two segments
        # (each segment makes two chat.send_message() calls).
        if idx % 2 == 0:
            chat = model.start_chat(history=[])

        start_time = time.perf_counter()  # not affected by system clock changes
        response = chat.send_message(prompt)
        _write_response(response, fo, idx)

        # if there are too many output tokens, we might need this prompt to
        # force the model to complete the text-generation process
        response = chat.send_message('Please continue.')
        _write_response(response, fo, idx, separator=True)
        elapsed_time = time.perf_counter() - start_time
        print(f"Elapsed time for Segment {idx}: {elapsed_time:.2f} seconds")

        print('='*25)
        print(f'**** Sleeping {SLEEP_TIME} seconds....')
        time.sleep(SLEEP_TIME)
        print('**** Now continuing...')

print("\n\n***** ALL DONE!!!")