In [0]:
from pyspark.sql.types import (
    StructType,
    StructField,
    IntegerType,
    DateType,
    StringType,
    DecimalType
)

# Explicit schema for orders.csv: one nullable column per field, in file order.
orders_schema = (
    StructType()
    .add("order_id", IntegerType(), True)
    .add("user_id", IntegerType(), True)
    .add("product_id", IntegerType(), True)
    .add("order_date", DateType(), True)
    .add("status", StringType(), True)
)

# Explicit schema for products.csv: one nullable column per field, in file order.
products_schema = StructType([
    StructField("product_id", IntegerType(), True),
    StructField("name", StringType(), True),
    StructField("category", StringType(), True),
    # A bare DecimalType() means DecimalType(10, 0), i.e. scale 0, which cannot
    # hold the fractional part of a price. Give it an explicit scale instead.
    # NOTE(review): assumes prices carry at most two decimal places - confirm
    # against the data and widen the scale if needed.
    StructField("price", DecimalType(10, 2), True)
])

# Explicit schema for users.csv: one nullable column per field, in file order.
users_schema = (
    StructType()
    .add("user_id", IntegerType(), True)
    .add("name", StringType(), True)
    .add("age", IntegerType(), True)
    .add("country", StringType(), True)
    .add("email", StringType(), True)
)

In [0]:
# Load the three CSV files with their explicit schemas; the first line of each
# file is treated as a header row rather than data.
orders = (
    spark.read
    .schema(orders_schema)
    .option("header", "true")
    .csv("/Volumes/workspace/test/vol/orders.csv")
)
products = (
    spark.read
    .schema(products_schema)
    .option("header", "true")
    .csv("/Volumes/workspace/test/vol/products.csv")
)
users = (
    spark.read
    .schema(users_schema)
    .option("header", "true")
    .csv("/Volumes/workspace/test/vol/users.csv")
)



In [0]:
# Quick look at the users frame: sample rows, schema, column names, summary stats.
users.show(5)
users.printSchema()
# A bare expression is only rendered when it is the last line of a cell, so the
# column list has to be printed explicitly to appear in the output.
print(users.columns)
users.describe().show()

In [0]:
# Project just the product name and price and display the first ten rows.
products.select(["name", "price"]).show(n=10)

In [0]:
# `col` has to be imported here: this is the first cell that uses it, and without
# the import the cell raises NameError on a fresh kernel (Restart & Run All).
from pyspark.sql.functions import col

# Display the `name` column under the display label "User Name".
users.select(col("name").alias("User Name")).show()

In [0]:
from pyspark.sql.functions import col

# Completed orders for product 114. `product_id` is declared as an integer
# column, so compare against an integer literal instead of relying on Spark to
# implicitly cast the string "114".
orders.filter(
    (col("status") == "Completed") & (col("product_id") == 114)
).show()

In [0]:
# Show orders with `status` duplicated into a new `new_status` column.
orders.withColumn("new_status", orders["status"]).show(n=5)
# Show users with `name` exposed as `full_name`; `users` itself is not modified.
users.withColumnRenamed(existing="name", new="full_name").show(n=5)

In [0]:
from pyspark.sql.functions import lit

# NOTE(review): `lit` is imported but not used in this cell, and `net_price` is
# currently an unmodified copy of `price` - confirm whether an adjustment
# (e.g. a constant factor) was intended.
products.withColumn("net_price", products["price"]).show(n=5)

In [0]:
# Five youngest users (ascending age), then five oldest (descending age).
users.orderBy(col("age").asc()).show(n=5)
users.orderBy("age", ascending=False).show(n=5)

In [0]:
# Number of orders per status value.
orders.groupBy("status").count().show()

In [0]:
# Rebind `users` to a frame without the `email` column. From this point on,
# `users` no longer matches the frame displayed by earlier cells in this view.
users = users.drop("email")