# Transforming JSON to Parquet
This notebook implements and tests the conversion of a line-delimited JSON file (one JSON object per line) to the Parquet format, in order to reduce the size of the dataset on disk.

---

## Imports

In [1]:
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import json
import os

## Function Implementation

In [2]:
def transform_JSON_to_parquet(json_file_path, parquet_dir, parquet_file_name, batch_size=1000):
    """
    Transforms a line-delimited JSON file into a Parquet file in batches.

    The input is read line by line and every non-blank line is parsed as one
    JSON document. Parsed records are accumulated and handed to
    process_batch() each time batch_size records have been collected, and once
    more at the end for any remainder.

    Errors are not propagated to the caller: they are reported with print()
    and the function returns. Batches that were already handed to
    process_batch() before the failure are not rolled back.

    Parameters:
    - json_file_path (str): Absolute path to the JSON file (UTF-8, one JSON document per line).
    - parquet_dir (str): Directory to store the output Parquet file.
    - parquet_file_name (str): Name of the output Parquet file.
    - batch_size (int, optional): Number of records to process in each batch. Default is 1000.

    Returns:
    None
    """
    try:
        # JSON text is UTF-8; without an explicit encoding open() falls back to
        # the platform's locale encoding, which mis-decodes non-ASCII data on
        # systems whose default is not UTF-8.
        with open(json_file_path, 'r', encoding='utf-8') as f:
            batch_data = []
            batch_count = 0

            for line in f:
                # Blank lines (e.g. an empty line at the end of the file) hold
                # no record; json.loads() would reject them and abort the
                # whole conversion.
                if not line.strip():
                    continue

                batch_data.append(json.loads(line))
                if len(batch_data) == batch_size:
                    # Process the current batch
                    process_batch(batch_data, parquet_dir, parquet_file_name, batch_count)
                    batch_data = []
                    batch_count += 1

            # Process any remaining data
            if batch_data:
                process_batch(batch_data, parquet_dir, parquet_file_name, batch_count)

        print(f"All batches processed successfully from {json_file_path}.")
    except FileNotFoundError:
        print(f"Error: The file {json_file_path} was not found.")
    except json.JSONDecodeError as e:
        print(f"Error: Failed to decode JSON. Details: {e}")
    except Exception as e:
        print(f"An unexpected error occurred while loading the JSON file: {e}")

def process_batch(batch_data, parquet_dir, parquet_file_name, batch_count):
    """
    Processes a batch of data and appends it to the Parquet file.

    The batch is turned into a pandas DataFrame and then into an Arrow Table.
    If the target file already exists, its full contents are read back,
    concatenated with the new batch and the whole file is rewritten; otherwise
    a new file is created. The output directory is created if it is missing.

    Parameters:
    - batch_data (list): List of JSON objects in the current batch.
    - parquet_dir (str): Directory to store the output Parquet file.
    - parquet_file_name (str): Name of the output Parquet file.
    - batch_count (int): The current batch number (used in progress messages only).

    Returns:
    None

    Raises:
    Any exception raised while building or writing the batch is reported with
    print() and then re-raised unchanged.
    """
    try:
        # Create a DataFrame from the batch data
        df = pd.DataFrame(batch_data)
        print(f"DataFrame for batch {batch_count} created successfully.")

        # Convert DataFrame to Arrow Table
        table = pa.Table.from_pandas(df)
        print(f"Arrow Table for batch {batch_count} created successfully.")

        # Construct the file path for the Parquet file
        parquet_file_path = os.path.join(parquet_dir, parquet_file_name)

        # Make sure the output directory exists so the write below does not
        # fail on a missing folder. An empty parquet_dir means "current
        # directory", which os.makedirs() would reject.
        if parquet_dir:
            os.makedirs(parquet_dir, exist_ok=True)

        # Check if the Parquet file already exists
        if os.path.exists(parquet_file_path):
            # NOTE: every call re-reads and rewrites the entire file, so the
            # total work grows with the number of batches already written.
            # Read the existing Parquet file
            existing_table = pq.read_table(parquet_file_path)
            # Combine the existing data with the new batch
            combined_table = pa.concat_tables([existing_table, table])
            # Write the combined data back to the Parquet file
            pq.write_table(combined_table, parquet_file_path)
        else:
            # Write the new batch to the Parquet file
            pq.write_table(table, parquet_file_path)
        print(f"Batch {batch_count} written successfully to {parquet_file_path}")

    except pa.lib.ArrowInvalid as e:
        # Must be listed before ValueError: ArrowInvalid is a subclass of
        # ValueError, so the generic handler would otherwise shadow this one.
        print(f"Error: Arrow Table creation failed for batch {batch_count}: {e}")
        raise
    except ValueError as e:
        print(f"Error: Could not create DataFrame from batch data: {e}")
        raise
    except Exception as e:
        print(f"An unexpected error occurred while writing batch {batch_count} to the Parquet file: {e}")
        raise

## Function Testing

In [3]:
# NOTE: machine-specific absolute path; point base_dir at the local dataset folder before running.
base_dir = r"C:\Users\Fatema Kotb\Documents\CUFE 25\Year 04\Fall\CMPS454 Natural Language Processing\Fatema\7. Group Project\NLP-Project\0 Datasets\provided_datasets"

json_file_path = os.path.join(base_dir, "PIZZA_dev.json")
parquet_dir = base_dir
parquet_file_name = "PIZZA_dev.parquet"

# process_batch() appends to an existing Parquet file, so an output left over
# from a previous run would make every re-run of this cell duplicate all rows.
# Start from a clean file, but only when the input is present, so a missing
# input never costs us the previous output.
parquet_file_path = os.path.join(parquet_dir, parquet_file_name)
if os.path.isfile(json_file_path) and os.path.exists(parquet_file_path):
    os.remove(parquet_file_path)

transform_JSON_to_parquet(json_file_path, parquet_dir, parquet_file_name, batch_size=100000)

DataFrame for batch 0 created successfully.
Arrow Table for batch 0 created successfully.
Batch 0 written successfully to C:\Users\Fatema Kotb\Documents\CUFE 25\Year 04\Fall\CMPS454 Natural Language Processing\Fatema\7. Group Project\NLP-Project\0 Datasets\provided_datasets\PIZZA_dev.parquet
All batches processed successfully from C:\Users\Fatema Kotb\Documents\CUFE 25\Year 04\Fall\CMPS454 Natural Language Processing\Fatema\7. Group Project\NLP-Project\0 Datasets\provided_datasets\PIZZA_dev.json.
