In [None]:
import os

from huggingface_hub import login

# Take the access token from the HF_TOKEN environment variable rather than
# writing it into the notebook, where it would be saved and shared with the file.
# When HF_TOKEN is unset, token is None and login() asks for the token interactively.
login(token=os.environ.get("HF_TOKEN"))


In [None]:
from datasets import load_dataset, concatenate_datasets

# --- Turkish dataset: keep both its train and test splits ---
turkish_ds = load_dataset("alibayram/doktorsitesi")
turkish_train, turkish_test = turkish_ds["train"], turkish_ds["test"]

print(f"Turkish train size: {len(turkish_train)}")
print(f"Turkish test size: {len(turkish_test)}")

# --- "chatdoctor-icliniq" config of lavita/medical-qa-datasets: only the test split is read ---
medqa_ds = load_dataset("lavita/medical-qa-datasets", "chatdoctor-icliniq")
medqa_test = medqa_ds["test"]

print(f"MedQA test size: {len(medqa_test)}")

# Rename the text columns to question_content / question_answer ...
medqa_test_formatted = (
    medqa_test
    .rename_column("input", "question_content")
    .rename_column("answer_chatdoctor", "question_answer")
)
# ... and add doctor_title / doctor_speciality columns filled with empty strings
# (renaming columns does not change the row count, so len(medqa_test) is the right length).
medqa_test_formatted = (
    medqa_test_formatted
    .add_column("doctor_title", [""] * len(medqa_test))
    .add_column("doctor_speciality", [""] * len(medqa_test))
)

# --- MedQA-USMLE (4 options): train split, plus the test split when one exists ---
mcqa_ds = load_dataset("GBaker/MedQA-USMLE-4-options")
mcqa_train = mcqa_ds["train"]
if "test" in mcqa_ds:
    mcqa_test = mcqa_ds["test"]
else:
    mcqa_test = None

print(f"MedQA-USMLE train size: {len(mcqa_train)}")

Turkish train size: 150105
Turkish test size: 37527
MedQA test size: 7321
MedQA-USMLE train size: 10178


In [None]:
!pip install deep_translator




In [None]:
from deep_translator import GoogleTranslator

def translate_text(texts, source='en', target='tr', translator=None):
    """Translate strings one at a time on a best-effort basis.

    Args:
        texts: Iterable of strings to translate.
        source: Source language code passed to GoogleTranslator (default 'en').
        target: Target language code passed to GoogleTranslator (default 'tr').
        translator: Optional object exposing ``translate(text)``. When omitted,
            a single ``GoogleTranslator(source=source, target=target)`` is
            created and reused for every item.

    Returns:
        A list with one entry per input, in input order. An entry is the
        original value when the input was empty/None, when translating it
        raised, or when the translator returned None.
    """
    if translator is None:
        # Build the translator once instead of once per text.
        translator = GoogleTranslator(source=source, target=target)

    translations = []
    for text in texts:
        if not text:
            # None or "": nothing to translate, so skip the request.
            translations.append(text)
            continue
        try:
            translated = translator.translate(text)
        except Exception as e:
            # Deliberate best-effort: report the failure and keep going.
            print(f"Translation error: {e}")
            translated = text  # fallback to original if error
        if translated is None:
            # Defensive: never replace real text with a missing value.
            translated = text
        translations.append(translated)
    return translations


In [None]:
import datasets

# English source text: the 'question' and 'answer' columns of the MedQA-USMLE train split
questions_en, answers_en = mcqa_train['question'], mcqa_train['answer']

# Translate the questions first, then the answers
questions_tr = translate_text(questions_en)
answers_tr = translate_text(answers_en)

# Build the translated dataset; the doctor_* columns are filled with empty strings
mcqa_train_translated = datasets.Dataset.from_dict(dict(
    doctor_title=[""] * len(mcqa_train),
    doctor_speciality=[""] * len(mcqa_train),
    question_content=questions_tr,
    question_answer=answers_tr,
))


In [None]:
# English source text: the 'input' and 'answer_chatdoctor' columns of the icliniq test split
questions_en, answers_en = medqa_test['input'], medqa_test['answer_chatdoctor']

# Translate the questions first, then the answers
questions_tr = translate_text(questions_en)
answers_tr = translate_text(answers_en)

# Build the translated dataset; the doctor_* columns are filled with empty strings
medqa_test_translated = datasets.Dataset.from_dict(dict(
    doctor_title=[""] * len(medqa_test),
    doctor_speciality=[""] * len(medqa_test),
    question_content=questions_tr,
    question_answer=answers_tr,
))


Translation error: Hello doctor,There is a feeling of little itching inside my vagina. I noticed something weird inside. --> No translation was found using the current translator. Try another translator?


In [None]:
# Stack the Turkish train split and the two translated datasets into one training set
merged_train = concatenate_datasets(
    [
        turkish_train,
        mcqa_train_translated,
        medqa_test_translated,
    ]
)
print(f"Merged train size: {len(merged_train)}")


Merged train size: 167604


In [None]:
import os

from huggingface_hub import login

# Authenticate for the upload using the HF_TOKEN environment variable instead
# of a token literal stored in the notebook. When HF_TOKEN is unset, token is
# None and login() asks for the token interactively.
login(token=os.environ.get("HF_TOKEN"))


In [None]:
# Publish the merged training set to the Hub
merged_train.push_to_hub(repo_id="ahmadtab/merged_medical_qa_dataset_translated")


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ? shards/s]

Creating parquet from Arrow format:   0%|          | 0/168 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/ahmadtab/merged_medical_qa_dataset_translated/commit/9be808106ddab8f92b65c31d16239382083d2ebe', commit_message='Upload dataset', commit_description='', oid='9be808106ddab8f92b65c31d16239382083d2ebe', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/ahmadtab/merged_medical_qa_dataset_translated', endpoint='https://huggingface.co', repo_type='dataset', repo_id='ahmadtab/merged_medical_qa_dataset_translated'), pr_revision=None, pr_num=None)

In [None]:
from datasets import load_dataset, DatasetDict, concatenate_datasets

# Reload the merged training data previously pushed to the Hub (train split only)
merged_train = load_dataset("ahmadtab/merged_medical_qa_dataset_translated", split="train")

# Use the test split of the Turkish dataset as the test set
turkish_test = load_dataset("alibayram/doktorsitesi", split="test")

# Bundle both splits so they are published together as one dataset
final_dataset = DatasetDict({
    "train": merged_train,
    "test": turkish_test
})

# Print sizes
print("Train size:", len(final_dataset["train"]))
print("Test size:", len(final_dataset["test"]))

# Push updated dataset
final_dataset.push_to_hub("ahmadtab/merged_medical_qa_dataset_tr_with_test")


Train size: 167604
Test size: 37527


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ? shards/s]

Creating parquet from Arrow format:   0%|          | 0/168 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ? shards/s]

Creating parquet from Arrow format:   0%|          | 0/38 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/ahmadtab/merged_medical_qa_dataset_tr_with_test/commit/ce51125515a52bf8a3509b583017ab0e57d8dad7', commit_message='Upload dataset', commit_description='', oid='ce51125515a52bf8a3509b583017ab0e57d8dad7', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/ahmadtab/merged_medical_qa_dataset_tr_with_test', endpoint='https://huggingface.co', repo_type='dataset', repo_id='ahmadtab/merged_medical_qa_dataset_tr_with_test'), pr_revision=None, pr_num=None)