In [None]:
!pip install transformers peft bitsandbytes
!pip install datasets
!pip install openai==0.28
!pip install nltk
!pip install num2words



In [None]:
import os
from getpass import getpass

# Keep the token out of the notebook: reuse HF_TOKEN when the environment
# already provides it, otherwise ask for it with a hidden prompt.
if not os.environ.get('HF_TOKEN'):
    os.environ['HF_TOKEN'] = getpass("Hugging Face token (HF_TOKEN): ")

In [None]:
import datasets

from datasets import load_dataset

# Load the train and test splits of DoctorFLAN, in that order.
train_dataset, test_dataset = (
    load_dataset("FreedomIntelligence/DoctorFLAN", split=split_name)
    for split_name in ('train', 'test')
)

In [None]:
import os
from getpass import getpass

import openai

# Keep the (paid) key out of the notebook: prefer the OPENAI_API_KEY
# environment variable and fall back to a hidden interactive prompt.
openai.api_key = os.environ.get("OPENAI_API_KEY") or getpass("OpenAI API key: ")

def translate_text(text, target_language, max_retries=5, initial_delay=1.0):
    """Translate ``text`` into ``target_language`` with the OpenAI chat API.

    Args:
        text: Source text. Values that are not strings (for example the NaN
            pandas uses for empty CSV cells) or that contain only whitespace
            are returned unchanged without calling the API.
        target_language: Language name interpolated into the system prompt.
        max_retries: How many times a transient API failure is retried.
        initial_delay: Seconds to wait before the first retry; the wait is
            doubled after every further failure.

    Returns:
        The model's reply with surrounding whitespace stripped, or ``text``
        itself when there was nothing to translate.

    Raises:
        openai.error.OpenAIError: The last transient error once all retries
            are used up; any other API error propagates immediately.
    """
    import time  # local import so this cell does not depend on another one

    # Nothing to translate: avoid a wasted (and possibly failing) request.
    if not isinstance(text, str) or not text.strip():
        return text

    # Failures worth retrying; many requests are issued concurrently, so
    # rate-limit responses are expected.
    transient_errors = (
        openai.error.RateLimitError,
        openai.error.APIError,
        openai.error.Timeout,
        openai.error.ServiceUnavailableError,
        openai.error.APIConnectionError,
    )

    attempts = max(int(max_retries), 0) + 1
    delay = initial_delay
    for attempt in range(attempts):
        try:
            response = openai.ChatCompletion.create(
                model="gpt-4o-mini",
                messages=[
                    {"role": "system", "content": f"You are a helpful translator. Translate the following text to {target_language}. Please keep it as close to the original text meaning as possible. Do not shorten or summarize"},
                    {"role": "user", "content": text}
                ],
            )
        except transient_errors:
            if attempt == attempts - 1:
                raise
            time.sleep(delay)
            delay *= 2
        else:
            return response.choices[0].message['content'].strip()


In [None]:
# Write each split to a CSV file; the following cell reads these files back
# with pandas. The value of the last call is what the cell displays.
train_dataset.to_csv("train_data.csv", index=False)
test_dataset.to_csv("test_data.csv", index=False)

Creating CSV from Arrow format:   0%|          | 0/92 [00:00<?, ?ba/s]

Creating CSV from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

1418431

In [None]:
import pandas as pd
# Load the two CSV exports as pandas DataFrames (train first, then test).
df_train, df_test = map(pd.read_csv, ("train_data.csv", "test_data.csv"))

In [None]:
# Draw a fixed-seed random subset of each split (1700 train rows, 200 test
# rows) and renumber the rows from 0.
df_train = (
    df_train
    .sample(n=1700, random_state=42)
    .reset_index(drop=True)
)
df_test = (
    df_test
    .sample(n=200, random_state=42)
    .reset_index(drop=True)
)

In [None]:
# Preview of the sampled training rows before any translation is applied
df_train.head(n=5)

Unnamed: 0,category,input,output,id
0,Definitive Diagnosis,在分析病人的病史、症状、体征和检查结果后，先给出最终的诊断，再解释这一诊断的依据和过程。<病...,根据病人的病史、症状、体征和检查结果，最终的诊断是慢性硬膜下血肿。\n\n这一诊断的依据和过...,30542
1,Disease Inquiry,请提供关于特定疾病的基础知识，包括概述、发病机理、易感人群、是否具有传染性以及发病率。详细介...,原发性心脏淋巴瘤是一种十分罕见的恶性肿瘤，它直接起源于心脏的淋巴组织。根据<书本知识>提供的...,74869
2,Treatment Plan,针对确诊的疾病，提出一个全面的治疗方案，可能包括药物治疗、手术治疗、物理治疗、生活方式调整等...,考虑到患者的年龄、发作的性质以及间隔时间，在没有神经系统阳性发现和正常的脑电图检查的情况下，...,48499
3,Symptom Inquiry,描述症状的诊断过程、检查方法以及可能伴随的其他症状。包括用于诊断症状的医学检查、区分类似症状...,带血粘脓鼻涕的症状可能与多种疾病相关，诊断过程需要综合患者的病史、体检和必要的检查结果。下面...,88564
4,Next Examinations,根据病人的症状和初步诊断，推荐接下来应进行的一系列检查或者检验，以便于进一步确诊。<病例描述...,根据患者的症状和体征，结合其有8年乙肝病史，可以推断患者可能存在乙型肝炎病毒引起的急性或慢性...,23176


In [None]:
# Translate a single training row to English as a quick sanity check
idx = 0
example_row = df_train.loc[idx]
print("Category: ", example_row["category"])
for label, field in (("Input: ", "input"), ("Output: ", "output")):
    print(label, translate_text(example_row[field], "English"))

Category:  Next Examinations
Input:  Based on the patient's symptoms and preliminary diagnosis, it is recommended to conduct a series of further tests or examinations in order to confirm the diagnosis. <Case description>: Male, 25 years old, discovered microscopic hematuria during a physical examination 2 weeks ago, with no other symptoms. To confirm the diagnosis,
Output:  When facing a 25-year-old male patient with asymptomatic microscopic hematuria, we first consider possible kidney or urinary system diseases. For asymptomatic hematuria, morphological (urine red blood cell phase) analysis is indeed an important initial examination because it can help differentiate between renal origin hematuria and non-renal origin hematuria.

Based on the patient's symptoms and textbook knowledge, the recommended examination and test process should include:

1. Urinalysis:
   - Morphological analysis of urine red blood cells (such as urine red blood cell phase): Red blood cells in renal origin hema

In [None]:
import threading
from tqdm import tqdm

def translate_batch(df, idx, target_language):
    """Translate the "input" and "output" cells of one row of ``df`` in place.

    ``idx`` is a positional row number; it is converted to the frame's index
    label before the translated text is written back with ``.loc``.
    """
    row_label = df.index[idx]
    for column in ("input", "output"):
        source_text = df.iloc[idx][column]
        df.loc[row_label, column] = translate_text(source_text, target_language)

# Usage
batch_size = 50  # rows translated concurrently; mind OpenAI's rate limits

# Translate every row of both splits. The row count is taken from each frame
# itself, so the positions handed to translate_batch always exist in it.
for split_name, split_df in (("train", df_train), ("test", df_test)):
    total_rows = len(split_df)

    with tqdm(total=total_rows, desc=f"Translating {split_name}") as pbar:
        for start_idx in range(0, total_rows, batch_size):
            stop_idx = min(start_idx + batch_size, total_rows)

            # One thread per row; the whole batch is joined before the next
            # batch starts.
            # NOTE(review): an exception raised inside a worker thread does not
            # stop this loop, so a failed row would stay untranslated.
            threads = [
                threading.Thread(target=translate_batch, args=(split_df, row_idx, "English"))
                for row_idx in range(start_idx, stop_idx)
            ]
            for thread in threads:
                thread.start()
            for thread in threads:
                thread.join()

            pbar.update(stop_idx - start_idx)

Translating: 100%|██████████| 200/200 [02:01<00:00,  1.65it/s]


In [None]:
# Prefix every prompt with its category label, then remove the columns that
# are not kept. Both frames are modified in place.
for frame in (df_train, df_test):
    frame['input'] = frame['category'] + ". " + frame['input']
    frame.drop(columns=['category', 'id'], inplace=True)

In [None]:
import re
from nltk.tokenize import word_tokenize
from num2words import num2words

def process_text(text):
    """Normalise a piece of text and return it with single-space separation.

    The substitutions run in this order:
      1. a parenthesised group is replaced by its content padded with spaces;
      2. tokens starting with ``http``/``www`` are removed;
      3. tokens containing ``@`` between non-space characters are removed;
      4. each run of digits is replaced by ``num2words`` of its integer value;
      5. every character outside letters, digits, whitespace and ``. , ! ? " ' -``
         becomes a space.
    Finally all whitespace runs are collapsed and the ends are trimmed.
    """
    def _spell_out(match):
        # Convert the matched digit run to its word form.
        return num2words(int(match.group()))

    substitutions = (
        (r'\(([^)]+)\)', r' \1 '),
        (r'http\S+|www\S+|https\S+', ''),
        (r'\S+@\S+', ''),
        (r'\d+', _spell_out),
        (r'[^a-zA-Z0-9\s.,!?"\'-]', ' '),
    )

    cleaned = text
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)

    words = cleaned.split()
    return ' '.join(words)


# Run process_text over the 'input' and 'output' columns of both splits
for frame in (df_train, df_test):
    for column in ('input', 'output'):
        frame[column] = frame[column].apply(process_text)

In [None]:
# First ten training rows after the processing steps above
df_train.head(n=10)

Unnamed: 0,input,output
0,Definitive Diagnosis. Based on the patient's m...,"Based on the patient's medical history, sympto..."
1,Disease Inquiry. Primary cardiac lymphoma is a...,Primary cardiac lymphoma is a rare form of can...
2,Treatment Plan. Once a disease has been diagno...,"The patient's age, type, and frequency of seiz..."
3,Symptom Inquiry. Diagnosing the symptoms of bl...,The presence of bloody or pus-like nasal disch...
4,Next Examinations. Based on the symptoms and i...,"The patient's symptoms, physical examination f..."
5,"Inquiry Prompts. The symptoms, medical history...",In order to accurately assess the patient's co...
6,Initial Diagnosis. Based on the symptoms and t...,Based on the symptoms and findings from the po...
7,Initial Diagnosis. Based on the symptoms and e...,Based on the information provided and textbook...
8,Initial Diagnosis. Based on the symptoms and t...,The patient has been initially diagnosed with ...
9,"Medication Inquiry. ""You're welcome!""","The ""Calm Mind and Sleep Support Brain Syrup"" ..."


In [None]:
# First ten test rows after the processing steps above
df_test.head(n=10)

Unnamed: 0,input,output
0,Case Summary. Medical Report Chief Complaint T...,Chief complaint Diarrhea accompanied by a red ...
1,Case Summary. Medical Report Chief Complaint T...,Chief complaint Coughing for several days. Pre...
2,Case Summary. Medical Report Chief Complaint T...,Chief complaint Diarrhea. Present illness hist...
3,Case Summary. Medical Report Chief Complaint T...,Chief complaint Diarrhea for one day. Present ...
4,Case Summary. Medical Report Chief Complaint T...,Chief complaint Fever for three days. Present ...
5,Case Summary. Diagnosis and Treatment Report C...,Chief complaint Jaundice Present illness histo...
6,Case Summary. Medical Report Chief Complaint T...,Chief complaint Diarrhea for one day accompani...
7,Case Summary. Medical Report Chief Complaint A...,Chief complaint Cough for one month Present il...
8,Case Summary. Medical Report Chief Complaint T...,Chief complaint Decreased frequency of bowel m...
9,Case Summary. Medical Report Chief Complaint T...,Chief Complaint Diarrhea for one day. Present ...


In [None]:
import json

# Export the training frame as a JSON array with one object per row.
json_data = df_train.to_dict(orient='records')

with open('train.json', 'w') as json_file:
    json_file.write(json.dumps(json_data, indent=4))


In [None]:
import json

# Export the test frame as a JSON array with one object per row.
json_data = df_test.to_dict(orient='records')

with open('test.json', 'w') as json_file:
    json_file.write(json.dumps(json_data, indent=4))
