<a href="https://colab.research.google.com/github/statlib/learn-duckdb/blob/main/notebooks/spark-api.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# https://motherduck.com/blog/making-pyspark-code-faster-with-duckdb/
# https://duckdb.org/docs/api/python/spark_api.html

In [1]:
!pip install --upgrade duckdb

Collecting duckdb
  Downloading duckdb-0.9.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (16.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.5/16.5 MB[0m [31m34.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: duckdb
  Attempting uninstall: duckdb
    Found existing installation: duckdb 0.8.1
    Uninstalling duckdb-0.8.1:
      Successfully uninstalled duckdb-0.8.1
Successfully installed duckdb-0.9.1


In [23]:
from duckdb.experimental.spark.sql import SparkSession as session
import duckdb.experimental.spark.sql.functions as F
import pandas as pd

In [25]:
spark = session.builder.getOrCreate()

df = pd.DataFrame({
    'age': [34, 45, 23, 56],
    'name': ['Joan', 'Peter', 'John', 'Bob']
})

df = spark.createDataFrame(df)


df = (
    df.withColumn(
        'location', F.lit('Seattle'),
    )
    .withColumn(
        'flag', F.lit(True)
    )
)
res = df.select(
    F.col('age'),
    F.col('name'),
    F.col('location'),
    F.col('flag')
)
print(res)

┌───────┬─────────┬──────────┬─────────┐
│  age  │  name   │ location │  flag   │
│ int64 │ varchar │ varchar  │ boolean │
├───────┼─────────┼──────────┼─────────┤
│    34 │ Joan    │ Seattle  │ true    │
│    45 │ Peter   │ Seattle  │ true    │
│    23 │ John    │ Seattle  │ true    │
│    56 │ Bob     │ Seattle  │ true    │
└───────┴─────────┴──────────┴─────────┘



In [26]:
print(res.collect())

[Row(age=34, name='Joan', location='Seattle', flag=True), Row(age=45, name='Peter', location='Seattle', flag=True), Row(age=23, name='John', location='Seattle', flag=True), Row(age=56, name='Bob', location='Seattle', flag=True)]


In [27]:
df = spark.createDataFrame([
    ['red', 'banana', 1, 10], ['blue', 'banana', 2, 20], ['red', 'carrot', 3, 30],
    ['blue', 'grape', 4, 40], ['red', 'carrot', 5, 50], ['black', 'carrot', 6, 60],
    ['red', 'banana', 7, 70], ['red', 'grape', 8, 80]], schema=['color', 'fruit', 'v1', 'v2'])
df.show()

┌─────────┬─────────┬───────┬───────┐
│  color  │  fruit  │  v1   │  v2   │
│ varchar │ varchar │ int32 │ int32 │
├─────────┼─────────┼───────┼───────┤
│ red     │ banana  │     1 │    10 │
│ blue    │ banana  │     2 │    20 │
│ red     │ carrot  │     3 │    30 │
│ blue    │ grape   │     4 │    40 │
│ red     │ carrot  │     5 │    50 │
│ black   │ carrot  │     6 │    60 │
│ red     │ banana  │     7 │    70 │
│ red     │ grape   │     8 │    80 │
└─────────┴─────────┴───────┴───────┘



In [29]:
df.groupBy('color').avg().show()

┌─────────┐
│  color  │
│ varchar │
├─────────┤
│ blue    │
│ red     │
│ black   │
└─────────┘



In [40]:
(
    df.filter(
        (F.col("color") == "red") & (F.col("fruit") != "banana"))
        .groupBy(col("color"))
        .agg(
            F.mean(df.v2),
            F.max(df.v2)
        )
)

┌─────────┬────────────────────┬─────────┐
│  color  │      mean(v2)      │ max(v2) │
│ varchar │       double       │  int32  │
├─────────┼────────────────────┼─────────┤
│ red     │ 53.333333333333336 │      80 │
└─────────┴────────────────────┴─────────┘