In [1]:
from pyspark.sql import SparkSession, DataFrame
from pyspark.sql.functions import col, lit, concat, avg, countDistinct, upper, sum, max, min

# Obtain the notebook-wide Spark session: getOrCreate() hands back an already
# running session when there is one, otherwise it starts one named 'sql'.
spark = (
    SparkSession.builder
    .appName('sql')
    .getOrCreate()
)

25/10/14 12:46:53 WARN Utils: Your hostname, kenans-MacBook-Pro.local resolves to a loopback address: 127.0.0.1; using 192.0.0.2 instead (on interface en0)
25/10/14 12:46:53 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/10/14 12:46:53 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/10/14 12:46:53 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
25/10/14 12:46:53 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.
25/10/14 12:46:53 WARN Utils: Service 'SparkUI' could not bind on port 4042. Attempting port 4043.


In [2]:
def with_full_name(df_transform: DataFrame) -> DataFrame:
    """Append a ``full_name`` column: ``first_name``, a space, then ``last_name``.

    Written to be passed to ``DataFrame.transform``.

    Args:
        df_transform: Frame that exposes ``first_name`` and ``last_name`` columns.

    Returns:
        The frame produced by ``withColumn`` with ``full_name`` added.

    NOTE(review): Spark's ``concat`` is expected to yield null when any part is
    null - confirm whether null names can reach this function.
    """
    name_parts = [col('first_name'), lit(' '), col('last_name')]
    return df_transform.withColumn('full_name', concat(*name_parts))


def with_full_name2(df_transform: DataFrame) -> DataFrame:
    """Append a ``full_name2`` column: ``first_name`` followed by a fixed suffix.

    Written to be passed to ``DataFrame.transform``.

    Args:
        df_transform: Frame that exposes a ``first_name`` column.

    Returns:
        The frame produced by ``withColumn`` with ``full_name2`` added.
    """
    # Constant text has to be wrapped in lit(); a bare string passed to concat
    # would be read as a column name.
    suffix = lit(' is a user of the system')
    sentence = concat(col('first_name'), suffix)
    return df_transform.withColumn('full_name2', sentence)


# Four sample users; the schema list names the tuple positions.
df = spark.createDataFrame(
    [
        (1, "John", 'Doe'),
        (2, 'Jane', 'Doe'),
        (3, 'Mike', 'Smith'),
        (4, 'Mary', 'Smith'),
    ],
    schema=['id', 'first_name', 'last_name'],
)

# Chain the two column-adding transforms: full_name first, then full_name2.
extended_df = (
    df
    .transform(with_full_name)
    .transform(with_full_name2)
)

# truncate=False prints the long full_name2 strings without cutting them off.
extended_df.show(truncate=False)

                                                                                

+---+----------+---------+----------+----------------------------+
|id |first_name|last_name|full_name |full_name2                  |
+---+----------+---------+----------+----------------------------+
|1  |John      |Doe      |John Doe  |John is a user of the system|
|2  |Jane      |Doe      |Jane Doe  |Jane is a user of the system|
|3  |Mike      |Smith    |Mike Smith|Mike is a user of the system|
|4  |Mary      |Smith    |Mary Smith|Mary is a user of the system|
+---+----------+---------+----------+----------------------------+



In [4]:
def with_description(df_transform: DataFrame) -> DataFrame:
    """Append a ``description`` sentence stating each product's quantity.

    The text has the form ``"<product> is available in a quantity of <quantity>."``.
    In this notebook ``quantity`` holds integers; ``concat`` renders it as text
    without an explicit cast (visible in the ``show()`` output).

    Args:
        df_transform: Frame that exposes ``product`` and ``quantity`` columns.

    Returns:
        The frame produced by ``withColumn`` with ``description`` added.
    """
    sentence = concat(
        col('product'),
        lit(' is available in a quantity of '),
        col('quantity'),
        lit('.'),
    )
    return df_transform.withColumn('description', sentence)


def with_total_price(df_transform: DataFrame) -> DataFrame:
    """Append a ``total_price`` column computed as ``quantity * price``.

    Args:
        df_transform: Frame that exposes ``quantity`` and ``price`` columns.

    Returns:
        The frame produced by ``withColumn`` with ``total_price`` added.
    """
    line_total = col('quantity') * col('price')
    return df_transform.withColumn('total_price', line_total)


# Product catalogue: (id, product, quantity, price) per row.
df = spark.createDataFrame(
    [
        (1, 'Apple', 10, 0.5),
        (2, 'Banana', 20, 0.3),
        (3, 'Orange', 30, 0.7),
        (4, 'Kiwi', 40, 1.2),
        (5, 'Peach', 50, 1.5),
        (6, 'Plum', 60, 1.3),
        (7, 'Pear', 70, 0.8),
        (8, 'Pineapple', 80, 2.0),
        (9, 'Grape', 90, 2.5),
        (10, 'Melon', 100, 3.0),
        (11, 'Grapes', 30, 1.1),
    ],
    schema=['id', 'product', 'quantity', 'price'],
)

# total_price is added before description, which fixes the column order
# shown in the output.
extended_df = (
    df
    .transform(with_total_price)
    .transform(with_description)
)

# truncate=False keeps the full description sentence visible.
extended_df.show(truncate=False)

+---+---------+--------+-----+-----------+-------------------------------------------+
|id |product  |quantity|price|total_price|description                                |
+---+---------+--------+-----+-----------+-------------------------------------------+
|1  |Apple    |10      |0.5  |5.0        |Apple is available in a quantity of 10.    |
|2  |Banana   |20      |0.3  |6.0        |Banana is available in a quantity of 20.   |
|3  |Orange   |30      |0.7  |21.0       |Orange is available in a quantity of 30.   |
|4  |Kiwi     |40      |1.2  |48.0       |Kiwi is available in a quantity of 40.     |
|5  |Peach    |50      |1.5  |75.0       |Peach is available in a quantity of 50.    |
|6  |Plum     |60      |1.3  |78.0       |Plum is available in a quantity of 60.     |
|7  |Pear     |70      |0.8  |56.0       |Pear is available in a quantity of 70.     |
|8  |Pineapple|80      |2.0  |160.0      |Pineapple is available in a quantity of 80.|
|9  |Grape    |90      |2.5  |225.0      |G

In [5]:
# Rebuild the product catalogue so this cell does not depend on earlier cells
# for its data: (id, product, quantity, price) per row.
df = spark.createDataFrame(
    [
        (1, 'Apple', 10, 0.5),
        (2, 'Banana', 20, 0.3),
        (3, 'Orange', 30, 0.7),
        (4, 'Kiwi', 40, 1.2),
        (5, 'Peach', 50, 1.5),
        (6, 'Plum', 60, 1.3),
        (7, 'Pear', 70, 0.8),
        (8, 'Pineapple', 80, 2.0),
        (9, 'Grape', 90, 2.5),
        (10, 'Melon', 100, 3.0),
        (11, 'Grapes', 30, 1.1),
    ],
    schema=['id', 'product', 'quantity', 'price'],
)

# Baseline: the frame exactly as created
print("Original DataFrame:")
df.show()

# Projection: keep only two of the four columns
print("\nSelect specific columns:")
df.select("product", "quantity").show()

# Row filter (where is an alias of filter)
print("\nFiltering data (quantity > 50):")
df.where(col("quantity") > 50).show()

# Derived column computed from two existing ones
print("\nAdding a new column (total_price = quantity * price):")
df.withColumn(
    "total_price",
    col("quantity") * col("price"),
).show()

# Ordering, largest quantity first (orderBy is an alias of sort)
print("\nSorting data (by quantity):")
df.orderBy(col("quantity").desc()).show()

# Per-product aggregation
print("\nGrouping and aggregating data (average price):")
(
    df
    .groupBy("product")
    .agg(avg("price").alias("avg_price"))
    .show()
)

# Number of distinct product names
print("\nCount distinct products:")
df.select(
    countDistinct("product").alias('count_distinct_product'),
).show()

# String function applied column-wise
print("\nString operations (uppercase product names):")
df.withColumn(
    "product_upper",
    upper(col("product")),
).show()

# Whole-frame aggregate. Here `sum` is pyspark.sql.functions.sum from the
# notebook's import cell, which shadows the Python builtin of the same name.
print("\nAggregate functions (sum of quantities):")
df.select(
    sum("quantity").alias('sum_of_quantities'),
).show()

# Price range. `max` and `min` are likewise the Spark column functions,
# not the Python builtins.
print("\nFinding maximum and minimum price:")
df.select(
    max("price").alias('max_price'),
    min("price").alias('min_price'),
).show()

Original DataFrame:
+---+---------+--------+-----+
| id|  product|quantity|price|
+---+---------+--------+-----+
|  1|    Apple|      10|  0.5|
|  2|   Banana|      20|  0.3|
|  3|   Orange|      30|  0.7|
|  4|     Kiwi|      40|  1.2|
|  5|    Peach|      50|  1.5|
|  6|     Plum|      60|  1.3|
|  7|     Pear|      70|  0.8|
|  8|Pineapple|      80|  2.0|
|  9|    Grape|      90|  2.5|
| 10|    Melon|     100|  3.0|
| 11|   Grapes|      30|  1.1|
+---+---------+--------+-----+


Select specific columns:
+---------+--------+
|  product|quantity|
+---------+--------+
|    Apple|      10|
|   Banana|      20|
|   Orange|      30|
|     Kiwi|      40|
|    Peach|      50|
|     Plum|      60|
|     Pear|      70|
|Pineapple|      80|
|    Grape|      90|
|    Melon|     100|
|   Grapes|      30|
+---------+--------+


Filtering data (quantity > 50):
+---+---------+--------+-----+
| id|  product|quantity|price|
+---+---------+--------+-----+
|  6|     Plum|      60|  1.3|
|  7|     Pear|