In [12]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
import os
from tqdm.notebook import tqdm

In [20]:
%%html
<style>
/* Make the ipywidget output container's background transparent so the
   surrounding cell background shows through (presumably for the tqdm
   progress-bar widget used in this notebook; confirm). */
.cell-output-ipywidget-background {
    background-color: transparent !important;
}
/* Point the Jupyter widget text color and font size at the VS Code editor's
   CSS variables, so widget text follows the editor settings. */
:root {
    --jp-widgets-color: var(--vscode-editor-foreground);
    --jp-widgets-font-size: var(--vscode-editor-font-size);
}  
</style>

In [3]:
# Notebook-wide Spark session. `getOrCreate()` hands back an already-active
# session when there is one, so re-running this cell is safe.
spark = (
    SparkSession.builder
    .appName("TestTransformFile")
    # Use 4 shuffle partitions instead of Spark's default of 200.
    .config("spark.sql.shuffle.partitions", "4")
    .getOrCreate()
)

In [17]:
def _dir_size_bytes(dir_path):
    """Return the combined size in bytes of every regular file under ``dir_path``."""
    total = 0
    # Walk recursively so files inside nested (e.g. partition) folders are counted.
    for root, _subdirs, file_names in os.walk(dir_path):
        for file_name in file_names:
            file_path = os.path.join(root, file_name)
            if os.path.isfile(file_path):
                total += os.path.getsize(file_path)
    return total


def _to_mb(size_bytes):
    """Convert a byte count to mebibytes, rounded to two decimals for display."""
    return round(size_bytes / 1024 / 1024, 2)


def convert_csv_to_parquet(csv_file_name, output=True, data_dir="./work/data"):
    """Convert a CSV file to a Parquet dataset stored next to it.

    The CSV is read with the notebook-level ``spark`` session (header row,
    inferred schema) and written to ``<data_dir>/<stem>.parquet``, replacing
    any existing dataset at that path. A three-step ``tqdm`` bar tracks
    path setup, the CSV read, and the Parquet write.

    Parameters
    ----------
    csv_file_name : str
        Name of the CSV file, relative to ``data_dir``.
    output : bool, default True
        When true, print the size of the CSV before the conversion and the
        size of the resulting Parquet directory after it, both in MB.
    data_dir : str, default "./work/data"
        Directory that holds the CSV and receives the Parquet output.

    Returns
    -------
    None
    """
    with tqdm(total=3) as pbar:
        csv_path = os.path.join(data_dir, csv_file_name)
        # splitext drops only the final extension, so "orders.2024.csv" maps to
        # "orders.2024.parquet" rather than being truncated at the first dot.
        parquet_dir_name = f"{os.path.splitext(csv_file_name)[0]}.parquet"
        parquet_path = os.path.join(data_dir, parquet_dir_name)
        pbar.update()

        if output:
            print(csv_file_name, _to_mb(os.path.getsize(csv_path)), "MB")

        data_csv = spark.read.csv(csv_path, header=True, inferSchema=True)
        pbar.update()
        data_csv.write.parquet(parquet_path, mode="overwrite")
        pbar.update()

        if output:
            print(
                parquet_dir_name + "/",
                _to_mb(_dir_size_bytes(parquet_path)),
                "MB",
            )

In [18]:
# Convert the customers extract and print the CSV and Parquet sizes.
convert_csv_to_parquet(csv_file_name="customers-100000.csv", output=True)

  0%|          | 0/3 [00:00<?, ?it/s]

customers-100000.csv 17.17 MB
customers-100000.parquet/ 9.29 MB


In [19]:
# Convert the orders extract and print the CSV and Parquet sizes.
convert_csv_to_parquet(csv_file_name="orders-from-customers-100000.csv", output=True)

  0%|          | 0/3 [00:00<?, ?it/s]

orders-from-customers-100000.csv 29.23 MB
orders-from-customers-100000.parquet/ 17.14 MB
