# Previous Module 
* PubMed abstracts from 5 common disease categories were fetched and written to .json files.
* These 5 disease categories are cancer, chronic, cardiovascular, neurological, and infectious.

In [9]:
import spacy
import requests
import os
from dotenv import load_dotenv
import json
import time

# Load PubMed Abstracts 
Created a generator to load those PubMed abstracts.

In [2]:
def load_json_files(files, path):
    """Lazily yield the parsed contents of each JSON file in *files*.

    Args:
        files: Iterable of file names located inside ``path``.
        path: Directory that contains the files.

    Yields:
        The object decoded by ``json.load`` for each file, in the order the
        names are given.
    """
    for filename in files:
        file_loc = os.path.join(path, filename)
        with open(file_loc, 'r', encoding='utf-8') as f:
            content = json.load(f)
        # Yield only after the file is closed: the consumer may spend a long
        # time on each item, and an abandoned generator would otherwise keep
        # the handle open until it is garbage-collected.
        yield content

# Text Cleanup
* Annotated datasets are necessary for BERT model fine-tuning.
* For those JSON files, I kept only the text bodies of the abstracts and removed the title, authors, comments, etc.
* Publicly available NER corpora often cover a single aspect or multiple categories, but not the ones I needed. When using the chat interface of a couple of LLM web services, I observed that the annotation process appeared to be relatively simple. Considering affordability, I chose the DeepSeek API to annotate diseases, genes, and chemicals in these abstracts. However, this API doesn't perform well on long text.
* Finally, I decided to split all text by sentence and let the API annotate one sentence at a time.

In [3]:
def clean_text(raw_abstract):
    """Extract the abstract body from a raw PubMed text record.

    The record is split on blank lines, and the paragraph immediately after
    the first one starting with "Author information:" is taken as the body.

    Args:
        raw_abstract: Raw record text; ``None`` or an empty string is
            accepted and treated as unusable.

    Returns:
        The body paragraph, or ``None`` when the record is empty, has no
        "Author information:" paragraph, has nothing after that paragraph,
        or the following paragraph starts with one of the rejected prefixes
        ('DOI', 'Comment', 'Publisher', 'BACKGROUND', 'RECENT').
    """
    if not raw_abstract:
        return None

    paragraphs = raw_abstract.split('\n\n')
    author_info_index = next(
        (i for i, p in enumerate(paragraphs)
         if p.startswith('Author information:')),
        None,
    )
    if author_info_index is None:
        return None

    body_index = author_info_index + 1
    if body_index >= len(paragraphs):
        # The author block is the last paragraph, so there is no body to
        # return (indexing here used to raise IndexError).
        return None

    abstract_body = paragraphs[body_index]
    if abstract_body.startswith(('DOI', 'Comment', 'Publisher', 'BACKGROUND', 'RECENT')):
        return None
    return abstract_body

# Load the spaCy "en_core_web_sm" pipeline once at module level so every
# call to split_by_sentences reuses the same loaded model.
nlp = spacy.load("en_core_web_sm")


def split_by_sentences(text):
    """Return the sentences of *text* as a list of strings.

    The text is run through the module-level ``nlp`` pipeline and the text
    of each sentence span in ``doc.sents`` is collected in order.
    """
    parsed = nlp(text)
    sentences = []
    for span in parsed.sents:
        sentences.append(span.text)
    return sentences

def sentences_generator(data):
    """Yield one list of sentences for each usable abstract in *data*.

    Each item of ``data`` is indexed with the key ``'abstract'``. Records
    for which ``clean_text`` returns ``None`` are skipped. Newlines inside
    a sentence are replaced with single spaces.

    Yields:
        A list of sentence strings for one abstract.
    """
    for record in data:
        body = clean_text(record['abstract'])
        if body is None:
            continue
        yield [
            sentence.replace("\n", " ")
            for sentence in split_by_sentences(body)
        ]
            


# Text Annotation
* Engineering a clear prompt is the key to producing correct annotations.
* Used a generator for optimal performance (sentences are annotated lazily, one at a time).

In [17]:
def text_annotation(text, max_retries=3, retry_delay=2):
    """Ask the chat-completions API to annotate *text* and return its reply.

    Builds an NER / relation-extraction prompt around ``text`` and posts it
    to the module-level ``API_URL``, authenticating with ``API_KEY``.

    Args:
        text: The text to annotate.
        max_retries: Maximum number of request attempts.
        retry_delay: Base delay in seconds; the wait before the next attempt
            is ``retry_delay * attempt_number`` (linear backoff).

    Returns:
        The ``content`` field of the first choice in the response (expected
        to be a JSON string; it is not parsed here), or ``None`` when every
        attempt failed.
    """
    # NOTE(review): the prompt repeats the "INSTRUCTIONS:" heading and asks
    # for "0" (zero) rather than the conventional BIO "O" tag. It is left
    # unchanged here because already-generated annotations and downstream
    # code may depend on the exact output -- confirm before editing.
    prompt = f"""
            TASK: Perform biomedical named entity recognition (NER) and relation extraction on the PubMed abstract.
            Return ONLY a valid JSON object for the following biomedical text analysis. Do not include any other text, explanations, or markdown formatting.
            
            TEXT TO PROCESS:
            {text}
            
            INSTRUCTIONS:
            INSTRUCTIONS:
            1. TOKENIZATION:
               - Split text into tokens (whitespace-based)
               
            
            2. ENTITY ANNOTATION (BIO scheme):
               - Entity types: gene/protein (B-gene), disease (B-disease), chemical (B-chemical)
               - Use "B-" for first token, "I-" for continuation tokens
               - Annotate multi-word entities consistently
               - Assign "0" for non-entity tokens
            
            3. RELATION EXTRACTION:
               - Relations between adjacent entities only (max distance=3 tokens)
               - Valid relation types: "causes", "treats", "regulates", "associated_with", "protein_of"
               - Evidence must be either "explicit" or "implicit"
            
            
            OUTPUT REQUIREMENTS:
            - Strictly valid JSON only (no Markdown, no comments)
            - Escape all special characters
            - No trailing commas
            - Maintain original token order

            OUTPUT FORMAT:
            {{
              "tokens": ["token1", "token2"],
              "ner_tags": ["B-gene", "0"],
              "relations": [
                {{
                  "head": 0,
                  "tail": 3,
                  "type": "regulates",
                  "evidence": "explicit"
                }}
              ]
            }}
            
            IMPORTANT:
            - If uncertain about an entity/relation, omit it
            - Prioritize precision over recall
            - Return ONLY the JSON object with no additional text
            """

    headers = {
        "Authorization": f"Bearer {API_KEY}",
        "Content-Type": "application/json"
    }

    payload = {
        "model": "deepseek-chat",
        "messages": [
            {"role": "system", "content": "Return ONLY valid JSON. No explanations."},
            {"role": "user", "content": prompt}
        ],
        "temperature": 0.3,
        "max_tokens": 4096,
        "response_format": {"type": "json_object"}
    }

    for attempt in range(max_retries):
        try:
            response = requests.post(API_URL, headers=headers, json=payload, timeout=60)
            response.raise_for_status()  # Raise HTTP errors (4xx/5xx)
            result = response.json()
            return result["choices"][0]["message"]["content"]
        except (requests.RequestException, ValueError, KeyError,
                IndexError, TypeError) as e:
            # ValueError covers json.JSONDecodeError and the plain ValueError
            # that older requests versions raise for an undecodable body.
            # KeyError/IndexError/TypeError cover a response whose shape is
            # not {"choices": [{"message": {"content": ...}}]}, e.g. an empty
            # "choices" list or a null field.
            if attempt == max_retries - 1:
                print(f"Failed after {max_retries} retries for text (first 50 chars): '{text[:50]}...' | Error: {str(e)}")
                return None
            time.sleep(retry_delay * (attempt + 1))  # Linear backoff

    # Reached only when max_retries <= 0 and no request was attempted.
    return None

def annotate_text_generator(text_chunks):
    """Annotate each text in *text_chunks* and yield the parsed results.

    Args:
        text_chunks: Sequence of texts (``len()`` is used for the progress
            message); each is passed to ``text_annotation``.

    Yields:
        The object decoded from the JSON string returned by
        ``text_annotation``. Texts whose annotation failed (``None`` or an
        empty reply) or whose reply is not valid JSON are skipped with a
        message, so one bad sentence does not abort the remaining ones.
    """
    total = len(text_chunks)
    for i, text in enumerate(text_chunks):
        print(f"Processing text {i+1}/{total}...")
        annotation_json = text_annotation(text)

        # text_annotation returns None once its retries are exhausted;
        # json.loads(None) would raise TypeError.
        if not annotation_json:
            print(f"Skipping text {i+1}/{total}: no annotation returned")
            continue

        # Parse before yielding so the try block covers only the decoding.
        try:
            annotation = json.loads(annotation_json)
        except (json.JSONDecodeError, TypeError) as e:
            print(f"Skipping text {i+1}/{total}: invalid JSON annotation | Error: {e}")
            continue
        yield annotation

In [19]:
# Directory to list for input files (placeholder path; replace with the
# real location before running).
file_path = '/your/file/path/data_ouputs'
# NOTE(review): os.listdir returns entries in arbitrary order and includes
# every entry in the directory, not only .json files -- confirm this is what
# the later all_files[2:] slice expects.
all_files = os.listdir(file_path)

In [6]:
# Load environment variables from the .env file (placeholder path).
load_dotenv('/your/file/path/.env')
# os.getenv returns None when DEEPSEEK_API_KEY is not set.
API_KEY = os.getenv("DEEPSEEK_API_KEY")
# Chat-completions endpoint that text_annotation posts to.
API_URL = "https://api.deepseek.com/v1/chat/completions"

In [None]:
from pathlib import Path

# Annotations are written to this file, one JSON object per line.
output_file = Path('annotations_gen.jsonl')
output_file.unlink(missing_ok=True)  # Clear previous runs

# Lazily load every listed file except the first two entries.
# NOTE(review): the reason for skipping two entries is not visible here
# (possibly non-data entries or already-processed files) -- confirm.
data_gen = load_json_files(all_files[2:], file_path)

with output_file.open('a', encoding='utf-8') as f:
    for data in data_gen:
        try:
            # sentences_generator yields one list of sentences per abstract,
            # so `sentence` here is that list; annotate_text_generator then
            # yields one annotation per sentence in it.
            for sentence in sentences_generator(data):
                for annotation in annotate_text_generator(sentence):
                    f.write(json.dumps(annotation, ensure_ascii=False) + '\n')
        except Exception as e:
            # Any error raised inside the loops above abandons the remaining
            # abstracts of this file and moves on to the next file.
            # NOTE(review): data[0] assumes `data` is a non-empty sequence.
            print(f"Error processing {data[0]}: {str(e)}")
            continue

# Next: PubMedBert Finetune
I took advantage of the Colab notebook GPU and demonstrate the fine-tuning here: https://colab.research.google.com/drive/1Fi5rBEabFVF3TlTHqee_-EvYR8Ag00Ak#scrollTo=Zg1CdIIPE4W2