In [None]:
# load input sentences
from pandas import read_json, Series
# Parse the Label Studio export and key the resulting frame by task id.
CTD_RE_V1 = read_json('../../label_studio/export/CTD_RE_v1.json').set_index('id')
# Every entry of the `data` column is a record whose 'text' field holds the
# sentence; gather those into a Series that shares the frame's id index.
sentences = Series(
    data=list(map(lambda record: record['text'], CTD_RE_V1.data)),
    index=CTD_RE_V1.index,
)

In [None]:
# Training split for fine-tuning the llama models: 2000 sentences drawn at
# random, with random_state pinned so every run selects the same rows.
sample = sentences.sample(random_state=1, n=2000)
# Bare expression so the notebook renders the sampled Series.
sample

In [None]:
# save the ids of sampled train set
from csv import writer

# All training ids are written as a single comma-separated row so that later
# cells can reload exactly the same split. newline='' leaves line-ending
# handling to the csv writer.
with open('data/random_2000/sampled_train_ids.csv', 'w', newline='') as outfile:
    wr = writer(outfile)
    wr.writerow(list(sample.index.values))
    # No explicit close(): leaving the `with` block closes the file.

In [None]:
# randomly select 100 other sentences for inference
import random

# Fixed seed so the same 100 test ids are drawn on every run.
random.seed(10)
# Candidates are every sentence id that is not in the training sample, kept in
# the original index order so the seeded draw stays reproducible.
sampled_test_ids = random.sample([i for i in sentences.index if i not in sample.index], 100)

# save the ids of sampled test set
# (`writer` is the csv.writer imported in the cell that saves the train ids.)
with open('data/random_2000/sampled_test_ids.csv', 'w', newline='') as outfile:
    wr = writer(outfile)
    wr.writerow(sampled_test_ids)
    # No explicit close(): leaving the `with` block closes the file.

In [None]:
from pydantic import BaseModel, Field
from typing import List

# One named entity: a name string plus a type label. Used as the subject and
# object field type of Relation.
# NOTE(review): the docstring and the Field descriptions are runtime text
# (presumably surfaced in the schema / format instructions given to the
# models - confirm), so rewording them changes what the models are shown.
class Entity(BaseModel):
    """Data model for an entity."""
    # Name of the entity, stored as a plain string.
    entity_name: str = Field(description = "Entity name of the chemical compound or gene/protein") 
    # Declared as a free-form str: the "Chemical" / "Gene/Protein" restriction
    # is stated only in the description text, not expressed in the type.
    entity_type: str = Field(description = "Indicates the type of the entity, must be \"Chemical\" or \"Gene/Protein\"")

# A (subject, relation phrase, object) triple linking two Entity records.
# NOTE(review): "decribes" in the two entity descriptions is a typo for
# "describes". The strings are left untouched because Field descriptions are
# runtime text (presumably part of the schema shown to the models - confirm
# before editing, since a change alters the prompt).
class Relation(BaseModel):
    """Data model for a relation."""
    # Entity acting as the subject of the relation.
    subject_entity: Entity = Field(description = "Entity that decribes the subject of the relation and the entity should be chemical compound or gene/protein")
    # Declared as a plain str: the four allowed phrases (increases, decreases,
    # affects, binds) are named in the description only, not in the type.
    relation_phrase: str = Field(description = "Relation phrase which must be one of the four relations: increases, decreases, affects and binds")
    # Entity acting as the object of the relation.
    object_entity: Entity = Field(description = "Entity that decribes the object of the relation and the entity should be chemical compound or gene/protein")

# Top-level annotation result for one sentence: a list of Relation triples.
# This is the schema handed to the output parser, the OpenAI program and the
# Claude structured-output chain elsewhere in this notebook.
# NOTE(review): "a annotation" / "decribe" / "hypothesis" are wording slips,
# but the docstring and description are runtime text (presumably included in
# the schema shown to the models - confirm), so they are left unchanged here.
class Results(BaseModel):
    """Data model for a annotation result."""
    # All relations extracted from one input sentence; no default is supplied.
    relations: List[Relation] = Field(description = "Relationships between entities that decribe scientific observations and hypothesis")

In [None]:
# Hand-annotated few-shot examples. Each entry pairs an input sentence ('text')
# with its expected 'relations', written in the shape of the Results model.
# Every relation_phrase must be one of the four labels the Relation model
# names: increases, decreases, affects, binds.
examples = [

    # task 21001
    {
        'text': 'Tetrandrine triggered LC3B expression and induced autophagy in CAL 27 cells.',
        'relations': [{"subject_entity": {'entity_name':'Tetrandrine', 'entity_type':'Chemical'},
                       "relation_phrase": 'increases',
                       "object_entity": {'entity_name':'LC3B', 'entity_type':'Gene/Protein'}}]
    },

    # task 21002
    {
        'text': 'Tetrandrine and cepharanthine induce apoptosis through caspase cascade regulation, cell cycle arrest, MAPK activation and PI3K/Akt/mTOR signal modification in glucocorticoid resistant human leukemia Jurkat T cells.',
        'relations': [{"subject_entity": {'entity_name':'Tetrandrine', 'entity_type':'Chemical'},
                       "relation_phrase": 'decreases',
                       "object_entity": {'entity_name':'mTOR', 'entity_type':'Gene/Protein'}},
                       {"subject_entity": {'entity_name':'cepharanthine', 'entity_type':'Chemical'},
                       "relation_phrase": 'decreases',
                       "object_entity": {'entity_name':'mTOR', 'entity_type':'Gene/Protein'}}]
    },

    # task 21011
    {
        'text': 'CONCLUSIONS: These findings are consistent with the idea that theophylline suppresses the production of proinflammatory cytokines via inhibition of NF-kappaB activation through preservation of the IkappaBalpha protein in monocytes/macrophages and T cells.',
        'relations': [{"subject_entity": {'entity_name':'theophylline', 'entity_type':'Chemical'},
                       # Was misspelled 'affacts', which is not one of the four
                       # allowed labels and would be shown to the model verbatim.
                       "relation_phrase": 'affects',
                       "object_entity": {'entity_name':'IkappaBalpha', 'entity_type':'Gene/Protein'}}]
    },

    # task 21052
    {
        'text': 'We then examined the activity of urokinase-type plasminogen activator (uPA), the rate-limiting enzyme in the TGF-beta2 activation cascade, in t-flavanone-treated human keratinocytes.',
        'relations': [{"subject_entity": {'entity_name':'TGF-beta2', 'entity_type':'Gene/Protein'},
                       "relation_phrase": 'binds',
                       "object_entity": {'entity_name':'urokinase-type plasminogen activator', 'entity_type':'Gene/Protein'}}]
    }
]

In [None]:
# convert examples to previously defined pydantic data model
def toFewShot(example):
    """Turn one raw example into the dict consumed by the few-shot prompt.

    Args:
        example: dict with a 'text' entry (the sentence) and a 'relations'
            entry (list of relation dicts in the shape of the Results model).

    Returns:
        dict with two keys: 'input text' (the sentence) and 'output' (the
        relations validated through Results, dumped to JSON, with every brace
        doubled so a later template-formatting pass leaves them as literals).
    """
    source_text = example['text']
    validated = Results.model_validate({"relations": example["relations"]})
    escaped_json = validated.model_dump_json().replace("{", "{{").replace("}", "}}")
    return {'input text': source_text, 'output': escaped_json}

# Apply toFewShot to every hand-written example, preserving their order.
fewshot_examples = list(map(toFewShot, examples))

In [None]:

# generate prompt with fewshot examples
from langchain.output_parsers import PydanticOutputParser
from langchain.prompts.few_shot import FewShotPromptTemplate
from langchain.prompts.prompt import PromptTemplate

# Template used to render a single worked example (sentence, then its output).
example_prompt = PromptTemplate(
    template="Input text: {input text}\nOutput:\n{output}",
    input_variables=["input text", "output"],
)

# Parser bound to the Results schema; it supplies the format instructions
# that are injected into the prompt prefix below.
parser = PydanticOutputParser(pydantic_object=Results)

# Task instruction + format instructions, followed by the few-shot examples.
# The suffix is empty: the sentence to annotate is appended by the callers.
prompt = FewShotPromptTemplate(
    prefix="""Given an input text sentence, extract the fact relations in the input text.
    {format_instructions}
    """,
    examples=fewshot_examples,
    example_prompt=example_prompt,
    suffix="",
    input_variables=["input"],
    partial_variables={"format_instructions": parser.get_format_instructions()},
)

# Preview of the template string: rendered prompt with its braces doubled
# again, plus the trailing section that carries the {input} placeholder.
escaped_preview = prompt.format().replace("{", "{{").replace("}", "}}")
print(escaped_preview + "\n\nInput text: {input}\nOutput:\n")

In [None]:
from llama_index.program import OpenAIPydanticProgram
from llama_index.llms import OpenAI

# Structured-output program: GPT-4 is asked to fill in the Results schema.
# prompt.format() renders the instructions and few-shot examples; their braces
# are doubled again because the string is handed over as a template in which
# {input} is the only placeholder meant to be substituted.
# The trailing section now reads "Input text: " (single colon) so the query
# has the same shape as the few-shot examples; it previously said
# "Input text:: ".
program = OpenAIPydanticProgram.from_defaults(
    output_cls=Results,
    llm=OpenAI(model="gpt-4"),
    prompt_template_str=prompt.format().replace("{", "{{").replace("}", "}}") + "\n\nInput text: {input}\nOutput:\n",
    verbose=False,
)

In [None]:
# annotate the train sentences and save output as json files
# 2000 files with 3h40m
from tqdm import tqdm
from json import dumps
for task_id in tqdm(sample.index):
    # Query the model BEFORE opening the output file: opening in "w" mode
    # creates/truncates the file immediately, so a failed request would
    # otherwise leave an empty JSON file behind for this task.
    annotation_json = dumps(program(input = sentences[task_id]).model_dump(), indent=4)
    with open("data/random_2000/output_gpt/task" + str(task_id)+"_gpt_annotation.json", "w") as outfile:
        outfile.write(annotation_json)

In [None]:
# annotate the train sentences and save output as json files
from langchain_anthropic.experimental import ChatAnthropicTools
from tqdm import tqdm
from json import dumps
# load train and test sentence ids
from csv import reader


def _read_id_row(path):
    """Return the ids stored in the first row of the CSV file at `path` as ints.

    The file is opened in a `with` block so it is closed even if parsing
    fails. Raises IndexError if the file has no rows and ValueError if a cell
    is not an integer.
    """
    # newline='' lets the csv reader handle line endings itself.
    with open(path, "r", newline='') as id_file:
        return list(map(int, list(reader(id_file, delimiter=","))[0]))


# ids of the train sample
sampled_train_ids = _read_id_row("data/random_2000/sampled_train_ids.csv")

# ids of the test sample
sampled_test_ids = _read_id_row("data/random_2000/sampled_test_ids.csv")

# Claude chain that returns output structured according to the Results schema.
chain = ChatAnthropicTools(model="claude-3-opus-20240229").with_structured_output(Results)

# Instructions + few-shot examples, rendered once and reused for every request.
# The braces are NOT doubled here: the string goes to chain.invoke() as plain
# text and is never run through a template formatter, so doubling them would
# send literal "{{" / "}}" to the model.
claude_instructions = prompt.format()


def _annotate_with_claude(task_ids, output_dir):
    """Annotate each sentence in `task_ids` with Claude, one JSON file per task.

    Args:
        task_ids: iterable of sentence ids (labels into `sentences`).
        output_dir: existing directory that receives
            task<ID>_claude_annotation.json for every id.
    """
    for task_id in tqdm(task_ids):
        # Call the model before opening the file so a failed request does not
        # leave an empty output file behind.
        output = chain.invoke(claude_instructions + "\n\n### Input text: " + sentences[task_id] + "\n### Output:\n")
        with open(output_dir + "/task" + str(task_id)+"_claude_annotation.json", "w") as outfile:
            outfile.write(dumps(output, indent=4))


# annotate the train sentences and save output as json files
_annotate_with_claude(sampled_train_ids, "data/random_2000/output_claude")

# annotate the test sentences and save output as json files
_annotate_with_claude(sampled_test_ids, "data/random_2000/test_sample_output/output_claude")