In [None]:
from pyspark.sql import SparkSession

# Settings used to build the Spark session for this cell.
APP_NAME = "Tutorial: Jupyter Application"
MASTER_URL = "spark://master:7077"
DRIVER_HOST = "jupyter"
DRIVER_BIND_ADDRESS = "0.0.0.0"

# Parenthesised builder chain instead of backslash continuations: a stray
# trailing space after a backslash would otherwise be a syntax error.
session = (
    SparkSession.builder
    .appName(APP_NAME)
    .master(MASTER_URL)
    .config("spark.driver.host", DRIVER_HOST)
    .config("spark.driver.bindAddress", DRIVER_BIND_ADDRESS)
    .getOrCreate()
)
print("Spark session established.")

try:
    # Build a tiny in-memory DataFrame and print it as a smoke test.
    columns = ["row", "number"]
    data = [("row1", 1), ("row2", 2), ("row3", 3)]
    data_frame = session.createDataFrame(data, schema=columns)
    data_frame.show()
finally:
    # Stop the session even when the work above raises, so a failed run
    # does not leave the application holding resources on the cluster.
    session.stop()
    print("Spark session closed.")

In [None]:
import pandas as pd
from IPython.display import display

def print_table(data_frame, title="Styled Data Table"):
    """Display a DataFrame as a styled HTML table in the notebook output.

    The frame is converted with ``toPandas()``, decorated with a green header,
    bordered cells, a large left-aligned caption and alternating row colours,
    and then handed to ``display``. The index column is hidden.

    Args:
        data_frame: Object exposing ``toPandas()`` (e.g. a PySpark DataFrame).
            The whole frame is converted, so callers should limit it first.
        title: Caption rendered above the table.

    Returns:
        None. The styled table is emitted through ``display``.
    """
    header_rule = {
        "selector": "thead th",
        "props": [
            ("background-color", "#4CAF50"),
            ("color", "black"),
            ("text-align", "center"),
            ("padding", "10px"),
        ],
    }
    cell_rule = {
        "selector": "tbody td",
        "props": [
            ("border", "1px solid #ddd"),
            ("text-align", "center"),
            ("padding", "5px"),
        ],
    }
    caption_rule = {
        "selector": "caption",
        "props": [
            ("caption-side", "top"),
            ("font-size", "24px"),
            ("font-weight", "bold"),
            ("text-align", "left"),
        ],
    }

    def stripe_rows(values):
        # Invoked by Styler.apply with its default axis; one style string is
        # produced per position, so even/odd positions alternate colours.
        styles = []
        for position in range(len(values)):
            if position % 2 == 0:
                styles.append("background-color: white")
            else:
                styles.append("background-color: #d4f7dc")
        return styles

    pandas_frame = data_frame.toPandas()

    styler = pandas_frame.style.set_table_styles([header_rule, cell_rule, caption_rule])
    styler = styler.set_caption(title)
    styler = styler.apply(stripe_rows)
    styler = styler.hide(axis="index")

    display(styler)


In [None]:
from pyspark.sql import SparkSession


# Settings used to build the Spark session for this cell.
APP_NAME = "Tutorial: DataFrame Basic Operation"
MASTER_URL = "spark://master:7077"
DRIVER_HOST = "jupyter"
DRIVER_BIND_ADDRESS = "0.0.0.0"

# Parenthesised builder chain instead of backslash continuations: a stray
# trailing space after a backslash would otherwise be a syntax error.
session = (
    SparkSession.builder
    .appName(APP_NAME)
    .master(MASTER_URL)
    .config("spark.driver.host", DRIVER_HOST)
    .config("spark.driver.bindAddress", DRIVER_BIND_ADDRESS)
    .getOrCreate()
)

try:
    session.sparkContext.setLogLevel("WARN")
    print("Spark session established.")

    # Load the CSV with a header row and inferred column types, then register
    # it as the temp view "persons".
    csv_path = "/resources/persons.csv"
    print(f"Reading CSV file from: {csv_path}")
    data_frame = session.read.options(header=True, inferSchema=True).csv(csv_path)
    data_frame.createOrReplaceTempView("persons")

    # print_table (defined in an earlier cell) converts the whole frame to
    # pandas, so only the first 10 rows are passed in.
    print_table(data_frame.limit(10), "Persons")
finally:
    # Stop the session even if reading or rendering fails (missing file,
    # schema inference error, ...), so the application does not linger on
    # the cluster.
    session.stop()
    print("Spark session closed.")