# Spark Optimization

## Getting Started

- Create Conda Environment

  ```bash
  conda create -n spark python=3.11
  conda activate spark
  conda update -n base conda
  conda update python
  pip install --upgrade pip
  ```

- Install PySpark

  ```bash
  pip install pyspark==3.5.5
  ```

- Install ipykernel (Jupyter Notebook kernel support)

  ```bash
  pip install ipykernel
  ```

- Install wget to download remote files for processing

  ```bash
  pip install wget
  ```

In [None]:
from pyspark.sql import SparkSession

# Configure the session one setting at a time, then create it (or reuse an
# already-running one): application name, local master using all available
# cores, and "spark.sql.shuffle.partitions" set to 4.
session_builder = SparkSession.builder
session_builder = session_builder.appName("spark-optimizations")
session_builder = session_builder.master("local[*]")
session_builder = session_builder.config("spark.sql.shuffle.partitions", "4")

spark = session_builder.getOrCreate()

In [None]:
import contextlib
import os

import wget

# Sample CSV files - https://github.com/datablist/sample-csv-files?tab=readme-ov-file

# Alternative dataset (Wikimedia clickstream), kept for reference:
# url = "https://dumps.wikimedia.org/other/clickstream/2017-11/clickstream-jawiki-2017-11.tsv.gz"
url = "https://drive.google.com/uc?id=1N1xoxgcw2K3d-49tlchXAWw4wuxLj7EV&export=download"
tmp_dir = "/tmp/"
local_path = os.path.join(tmp_dir, "customers-100000.csv")

# Drop any copy left behind by a previous run before downloading again.
# Attempting the removal and ignoring "file not found" avoids the
# check-then-delete race of os.path.exists() followed by os.remove().
# NOTE(review): presumably needed so the new download is written to exactly
# `local_path` rather than under a de-duplicated name - confirm against wget.
with contextlib.suppress(FileNotFoundError):
    os.remove(local_path)

wget.download(url, local_path)

In [None]:
# Read the downloaded CSV from the local filesystem into a DataFrame.
# The first line is treated as the header and column types are inferred
# from the data.
csv_uri = "file://" + local_path
csv_reader = spark.read.options(header=True, inferSchema=True)
df = csv_reader.csv(csv_uri)

df.show(n=5)  # preview the first 5 rows

In [None]:
# Cache and Persist Dataframe
from pyspark import StorageLevel

# cache() is persist() with the default storage level, which for DataFrames
# is MEMORY_AND_DISK_DESER (not memory-only). It is lazy: nothing is stored
# until an action runs, so count() is used to materialize the cache.
df.cache()
df.count()

# A storage level can only be assigned while the DataFrame has none set, so
# the cached data must be released before persisting with an explicit level.
df.unpersist()

df.persist(StorageLevel.MEMORY_AND_DISK)  # explicit level: memory, spilling to disk
df.count()  # action that materializes the persisted data

df.unpersist()  # release the persisted data from memory and disk

In [None]:
# Broadcast Join

from pyspark.sql.functions import broadcast

# Mark one side of the self-join with a broadcast hint, then join both sides
# on the shared "Customer Id" column.
broadcast_side = broadcast(df)
df_selfjoin = df.join(broadcast_side, on="Customer Id")

df_selfjoin.show(n=5)  # preview the first 5 joined rows

In [None]:
from pyspark.sql.functions import when

# Conditional column: rows whose "Index" is greater than 2 get the string
# "Phone 1", every other row gets the string "Phone 2".
# NOTE(review): plain strings passed to when()/otherwise() are literal
# values, not column references - confirm that literals are intended here.
phone_label = when(df["Index"] > 2, "Phone 1").otherwise("Phone 2")

# Overwrite "First Name" with that value and preview the first 5 rows.
relabelled = df.withColumn("First Name", phone_label)
relabelled.show(5)