# Generate parquet output with PySpark

Parquet is a columnar format with desirable properties for larger datasets. For example, we may only be interested in a subset of the columns; Parquet allows reading individual columns without having to parse every line.

I'll use PySpark to do this; the notebook also serves as my personal reference for installing Spark and adjusting its memory usage on a Kaggle instance.

In [None]:
! pip install pyspark

In [None]:
from pathlib import Path
from pyspark.sql import SparkSession


# Create (or reuse) the Spark session, requesting 12g of driver memory
# through the builder configuration.
spark = (
    SparkSession.builder
    .config("spark.driver.memory", "12g")
    .getOrCreate()
)
# Set the number of shuffle partitions to twice the context's default
# parallelism.
spark.conf.set(
    "spark.sql.shuffle.partitions",
    spark.sparkContext.defaultParallelism * 2,
)
# A bare expression in the middle of a notebook cell is not displayed (only
# the last expression of a cell is), so print the setting explicitly to
# confirm the driver memory value the session reports.
print(spark.conf.get("spark.driver.memory"))


prefix = "../input/riiid-test-answer-prediction"
for path in Path(prefix).glob("*.csv"):
    name = path.name.split(".")[0]
    print(f"writing {path} to {name}")
    %time df = spark.read.csv(path.as_posix(), header=True, inferSchema=True)
    df.printSchema()
    
    if "train" in path.name:
        # dataset is 5.7GB, so each partition should be ~1GB with 4 parts
        %time df.repartitionByRange(4, "user_id", "timestamp").write.parquet(name, mode="overwrite")
    else:
        %time df.repartition(1).write.parquet(name, mode="overwrite")