# Basic Structured Operations: `expr` and `selectExpr`

Examples taken from [Spark: The Definitive Guide](https://github.com/databricks/Spark-The-Definitive-Guide)

## Imports and Initializing Spark Session

In [1]:
# Imports

from pyspark.sql import SparkSession
import pyspark.sql.functions as F
import pyspark.sql.types as T

In [2]:
# Create (or reuse) the SparkSession for this notebook, running locally

spark = (
    SparkSession.builder
    .appName("Basic Structured Operations")
    .master("local[*]")
    .getOrCreate()
)

## Loading Data

In [23]:
# Explicit schema: every column is a nullable integer

schema = T.StructType(
    [
        T.StructField(column_name, T.IntegerType(), True)
        for column_name in ("id", "savings", "debt")
    ]
)

In [24]:
# Read the comma-separated balance file (first row is the header) with the explicit schema

df = (
    spark.read
    .schema(schema)
    .options(sep=",", header=True)
    .csv("../data/balance.csv")
)

In [25]:
# Print the loaded DataFrame as a text table to check the CSV was parsed as expected

df.show()

+---+-------+------+
| id|savings|  debt|
+---+-------+------+
|  1| 125365| 54821|
|  2|  54872|   254|
|  3|    125| 12548|
|  4| 587465|635548|
|  5|  45879| 69852|
|  6|  12465|     0|
|  7|  25312| 12596|
|  8| 785236|953124|
|  9| 125963|256325|
| 10|  46325|  1289|
+---+-------+------+



In [26]:
# Print each column's name, type, and nullability to confirm the explicit schema was applied

df.printSchema()

root
 |-- id: integer (nullable = true)
 |-- savings: integer (nullable = true)
 |-- debt: integer (nullable = true)



## Some Examples Using `expr` and `selectExpr`

### Select all columns and an additional one with the net balance (savings - debt)

In [28]:
# Mode 1: DataFrame API — build the new column from Column objects with `withColumn`

(
    df
    .withColumn("net_balance", F.col("savings") - F.col("debt"))
    .show()
)

+---+-------+------+-----------+
| id|savings|  debt|net_balance|
+---+-------+------+-----------+
|  1| 125365| 54821|      70544|
|  2|  54872|   254|      54618|
|  3|    125| 12548|     -12423|
|  4| 587465|635548|     -48083|
|  5|  45879| 69852|     -23973|
|  6|  12465|     0|      12465|
|  7|  25312| 12596|      12716|
|  8| 785236|953124|    -167888|
|  9| 125963|256325|    -130362|
| 10|  46325|  1289|      45036|
+---+-------+------+-----------+



In [32]:
# Mode 2: `expr` — each SQL expression string is parsed into a Column and passed to `select`

(
    df
    .select(
        F.expr("*"),
        F.expr("savings-debt as net_balance"),
    )
    .show()
)

+---+-------+------+-----------+
| id|savings|  debt|net_balance|
+---+-------+------+-----------+
|  1| 125365| 54821|      70544|
|  2|  54872|   254|      54618|
|  3|    125| 12548|     -12423|
|  4| 587465|635548|     -48083|
|  5|  45879| 69852|     -23973|
|  6|  12465|     0|      12465|
|  7|  25312| 12596|      12716|
|  8| 785236|953124|    -167888|
|  9| 125963|256325|    -130362|
| 10|  46325|  1289|      45036|
+---+-------+------+-----------+



In [34]:
# Mode 3: `selectExpr` — pass the SQL expression strings directly, without wrapping them in `expr`

(
    df
    .selectExpr(
        "*",
        "savings-debt as net_balance",
    )
    .show()
)

+---+-------+------+-----------+
| id|savings|  debt|net_balance|
+---+-------+------+-----------+
|  1| 125365| 54821|      70544|
|  2|  54872|   254|      54618|
|  3|    125| 12548|     -12423|
|  4| 587465|635548|     -48083|
|  5|  45879| 69852|     -23973|
|  6|  12465|     0|      12465|
|  7|  25312| 12596|      12716|
|  8| 785236|953124|    -167888|
|  9| 125963|256325|    -130362|
| 10|  46325|  1289|      45036|
+---+-------+------+-----------+

