# PySpark Repartition

**Libraries and imports**

In [3]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql import types

In [4]:
# Check the pyspark location (path of the package's __init__.py that was imported)
pyspark.__file__

'/usr/local/Cellar/apache-spark/3.3.0/libexec/python/pyspark/__init__.py'

## Spark Session

In [5]:
# Build (or reuse, via getOrCreate) the SparkSession for this notebook.
# The "local[4]" master runs Spark locally rather than on a cluster.
spark = (
    SparkSession.builder
    .appName("pyspark-repartition.sandbox")
    .master("local[4]")
    .getOrCreate()
)

22/08/31 15:07:41 WARN Utils: Your hostname, mpb-m1max.local resolves to a loopback address: 127.0.0.1; using 192.168.1.12 instead (on interface en0)
22/08/31 15:07:41 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/08/31 15:07:42 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


## Repartition Function

In [6]:
# Repartition function
def dataset_repartition(
    spark_session: SparkSession,
    n_partitions: int = 4,
    input_path: str = None,
    output_path: str = None,
) -> "pyspark.sql.DataFrame":
    """
    Read a parquet dataset, repartition it into n_partitions and write the
    result as parquet to output_path, overwriting whatever is already there.

    Args:
        spark_session (pyspark.sql.session.SparkSession): spark session used for the repartition
        n_partitions (int): number of partitions for the written dataset (must be >= 1)
        input_path (str): dir [or file] path of the parquet data to be repartitioned
        output_path (str): dir path of the location where the repartitioned data will be stored

    Returns:
        pyspark.sql.DataFrame: the DataFrame as read from input_path. Note that
        this is the input DataFrame, not the repartitioned one that was written.

    Raises:
        ValueError: if n_partitions is lower than 1, or if input_path or
            output_path is missing/empty.
    """
    # Validate every argument up front so that bad input fails fast with a
    # clear message instead of an opaque error raised from inside Spark.
    if n_partitions < 1:
        raise ValueError("n_partitions must be greater than 0")
    if not input_path:
        raise ValueError("input_path must be a non-empty path")
    if not output_path:
        raise ValueError("output_path must be a non-empty path")

    # Parquet files carry their own schema; the CSV-only "header" option that
    # used to be set here had no effect on a parquet read and was dropped.
    df = spark_session.read.parquet(input_path)

    # repartition() returns a new DataFrame; it is written straight to disk
    # and `df` itself is left untouched.
    df.repartition(n_partitions).write.parquet(output_path, mode="overwrite")
    return df

## Loop over FHV dataset

In [6]:
# Repartition the FHV data for October, November and December 2021,
# handling one monthly directory per iteration.
year = 2021

for month in range(10, 13):  # end of range is exclusive -> months 10, 11, 12
    # Input and output directories share the same <year>/<zero-padded month> layout
    fhv_input_path = f"../data/raw/fhv/{year}/{month:02d}/"
    fhv_output_path = f"../data/part/fhv/{year}/{month:02d}/"

    dataset_repartition(
        spark_session=spark,
        n_partitions=4,
        input_path=fhv_input_path,
        output_path=fhv_output_path,
    )

                                                                                

In [11]:
# Stop the SparkSession once the repartition loop above has finished
spark.stop()