In [1]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
import os
from tqdm.notebook import tqdm

In [2]:
%%html
<style>
/* Make the background of ipywidget output areas transparent. */
.cell-output-ipywidget-background {
    background-color: transparent !important;
}
/* Point the Jupyter widget text color and font size at the VS Code editor
   theme variables (presumably so the tqdm widget matches the editor theme). */
:root {
    --jp-widgets-color: var(--vscode-editor-foreground);
    --jp-widgets-font-size: var(--vscode-editor-font-size);
}  
</style>

In [3]:
# Create (or reuse) the Spark session named "TestTransformFile" for this
# notebook, with spark.sql.shuffle.partitions set to "4".
spark = SparkSession.builder.appName("TestTransformFile").config(
    "spark.sql.shuffle.partitions", "4"
).getOrCreate()

In [4]:
def convert_csv_to_other_format(
    csv_file_name: str,
    output: bool = True,
    format: str = "parquet",
    data_dir: str = "./work/data",
) -> None:
    """Read a CSV file with Spark and rewrite it in another file format.

    The CSV is read from ``{data_dir}/{csv_file_name}`` with a header row and
    schema inference, then written to ``{data_dir}/{stem}.{format}`` where
    ``stem`` is the CSV file name without its final extension. Any existing
    output at that path is overwritten. A three-step tqdm progress bar tracks
    path setup, reading, and writing.

    Args:
        csv_file_name: Name of the CSV file inside ``data_dir``.
        output: When true, print the size in MB of the source CSV and of the
            written output.
        format: Spark writer format name (e.g. ``"parquet"``, ``"orc"``); also
            used as the extension of the output path.
        data_dir: Directory holding the input and receiving the output.
            Defaults to the notebook's ``./work/data`` folder.

    Returns:
        None. Results are written to disk and, optionally, printed.
    """
    with tqdm(total=3) as pbar:
        csv_path = f"{data_dir}/{csv_file_name}"
        # splitext removes only the final extension, so names that contain
        # extra dots (e.g. "sales.2024.csv") keep their full stem instead of
        # being cut at the first dot.
        stem = os.path.splitext(csv_file_name)[0]
        other_file_name = f"{stem}.{format}"
        other_path = f"{data_dir}/{other_file_name}"
        pbar.update()

        if output:
            print(
                csv_file_name,
                round(os.path.getsize(csv_path) / 1024 / 1024, 2),
                "MB"
            )

        data_csv = spark.read.csv(csv_path, header=True, inferSchema=True)
        pbar.update()
        data_csv.write.format(format).save(other_path, mode='overwrite')
        pbar.update()

        if output:
            # The output path is listed as a directory; only the regular
            # files directly inside it are summed (sub-directories are
            # not descended into).
            with os.scandir(other_path) as entries:
                total_bytes = sum(
                    entry.stat().st_size
                    for entry in entries
                    if entry.is_file()
                )
            print(
                other_file_name,
                round(total_bytes / 1024 / 1024, 2),
                "MB"
            )

In [5]:
# Convert the customers CSV to Parquet and print both sizes.
convert_csv_to_other_format(
    "customers-100000.csv",
    output=True,
    format="parquet",
)

  0%|          | 0/3 [00:00<?, ?it/s]

customers-100000.csv 17.17 MB
customers-100000.parquet 9.29 MB


In [6]:
# Convert the orders CSV to Parquet and print both sizes.
convert_csv_to_other_format(
    "orders-from-customers-100000.csv",
    output=True,
    format="parquet",
)

  0%|          | 0/3 [00:00<?, ?it/s]

orders-from-customers-100000.csv 29.23 MB
orders-from-customers-100000.parquet 17.14 MB


In [7]:
# Convert the customers CSV to ORC and print both sizes.
convert_csv_to_other_format(
    "customers-100000.csv",
    output=True,
    format="orc",
)

  0%|          | 0/3 [00:00<?, ?it/s]

customers-100000.csv 17.17 MB


customers-100000.orc 5.23 MB


In [8]:
# Convert the orders CSV to ORC and print both sizes.
convert_csv_to_other_format(
    "orders-from-customers-100000.csv",
    output=True,
    format="orc",
)

  0%|          | 0/3 [00:00<?, ?it/s]

orders-from-customers-100000.csv 29.23 MB
orders-from-customers-100000.orc 10.25 MB
