# Converting the ArabAcquis Corpus to JSON Format
This Colab notebook presents a Python script designed to process a parallel English-Arabic text corpus from the ArabAcquis dataset. It reads line-aligned sentences from separate English and Arabic input files, pairs the corresponding sentences, and then structures this bilingual data into a single, well-formatted JSON output file. This conversion creates an organized and machine-readable dataset, ideal for further processing tasks.

*   **Input files:**   
    *   `ac_test_en.txt` (The English text file)
    *   `test_en_ref_ar.txt` (The corresponding Arabic text file)
*   **Output file:**  `ArabAcquis_Original.json`

In [None]:
# Mount Google Drive so that files stored there are reachable under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Make the corpus folder on the mounted Drive the current working directory.
%cd /content/drive/MyDrive/ColabData/ArabAcquis

/content/drive/MyDrive/ColabData/ArabAcquis


In [None]:
import json

def load_parallel_corpus(file_path_en, file_path_ar):
    """Read both sides of a line-aligned English/Arabic corpus.

    Each file is decoded as UTF-8 and read in full. Lines are returned
    exactly as read, including any trailing newline characters.

    Args:
        file_path_en: Path to the English text file.
        file_path_ar: Path to the Arabic text file.

    Returns:
        A tuple ``(lines_en, lines_ar)`` holding the raw lines of the
        English file and the Arabic file, in that order.
    """
    with open(file_path_en, 'r', encoding='utf-8') as english_source:
        with open(file_path_ar, 'r', encoding='utf-8') as arabic_source:
            english_lines = list(english_source)
            arabic_lines = list(arabic_source)
    return english_lines, arabic_lines

def convert_to_json(lines_en, lines_ar):
    """Pair aligned English and Arabic lines into JSON-ready records.

    Lines are paired by position and stripped of surrounding whitespace.
    If one input has more lines than the other, the surplus lines have no
    counterpart and are left out of the result, exactly as before; a
    warning now reports how many were dropped so that a misaligned corpus
    does not go unnoticed.

    Args:
        lines_en: Iterable of English lines.
        lines_ar: Iterable of Arabic lines, aligned with ``lines_en``.

    Returns:
        A list of dicts of the form ``{'english': ..., 'arabic': ...}``,
        one per aligned line pair.
    """
    import warnings
    from itertools import zip_longest

    # A unique sentinel marks the exhausted side; a real line can never be it.
    missing = object()
    data = []
    unpaired = 0
    for en, ar in zip_longest(lines_en, lines_ar, fillvalue=missing):
        if en is missing or ar is missing:
            unpaired += 1
            continue
        data.append({'english': en.strip(), 'arabic': ar.strip()})

    if unpaired:
        warnings.warn(
            f"English and Arabic inputs differ in length; "
            f"{unpaired} unpaired line(s) were dropped.",
            stacklevel=2,
        )
    return data

def save_to_json(data, output_file):
    """Serialize ``data`` as indented JSON into ``output_file``.

    The file is written as UTF-8 with a four-space indent. Non-ASCII
    characters (such as Arabic text) are written as-is rather than as
    ``\\uXXXX`` escapes.

    Args:
        data: JSON-serializable object to write.
        output_file: Path of the file to create or overwrite.
    """
    with open(output_file, mode='w', encoding='utf-8') as sink:
        json.dump(
            data,
            sink,
            indent=4,
            ensure_ascii=False,
        )

def main():
    """Convert the ArabAcquis English/Arabic text files into one JSON file.

    Loads both text files, pairs their lines, and writes the resulting
    records to the output JSON path.
    """
    # NOTE(review): these paths are relative to the current working
    # directory. An earlier notebook cell already changes into a folder named
    # ArabAcquis, so they resolve to a nested ArabAcquis/ subfolder; confirm
    # that this layout is intended.
    english_path = 'ArabAcquis/ac_test_en.txt'
    arabic_path = 'ArabAcquis/test_en_ref_ar.txt'
    json_path = 'ArabAcquis/ArabAcquis_Original.json'

    english_lines, arabic_lines = load_parallel_corpus(english_path, arabic_path)
    records = convert_to_json(english_lines, arabic_lines)
    save_to_json(records, json_path)

# Run the conversion when this code is executed directly rather than imported.
if __name__ == "__main__":
    main()
